![alt text](pandas.png "Title")

In [0]:
import pandas as pd

# Dataframes: basic alterations

## Test data

In [0]:
# Toy dataset: one list of values per column name, one value per patient.
data = {
    'gender': ['M', 'F', 'F'],
    'age': [20, 25, 23],
}
# Patient identifiers, used as row labels for the frames built from `data`.
patients = [10010, 10011, 10012]

In [0]:
def create_df(data: dict, patients: list, columns=('age', 'gender', 'race')) -> pd.DataFrame:
    """Build the demo DataFrame used throughout this notebook.

    Parameters
    ----------
    data : dict
        Mapping of column name -> list of values (one value per patient).
    patients : list
        Row labels (patient identifiers), one per row of `data`.
    columns : sequence of str, optional
        Columns to include, in display order. A name that is not a key of
        `data` produces a column filled with NaN. The default keeps the
        original three-column layout ('age', 'gender', 'race').

    Returns
    -------
    pd.DataFrame
        A new frame indexed by `patients`.
    """
    # list(...) so that any sequence (tuple, list, Index) is accepted.
    return pd.DataFrame(data, index=patients, columns=list(columns))
    
# Build the working DataFrame. 'race' is not a key of `data`, so that column is filled with NaN.
df= create_df(data, patients)
df

## Delete a column

In [0]:
# Remove a column with the `del` statement: df itself is modified.
del df['race']
df

# `del` works **in-place**: it modifies the object directly and does not return anything.

In [0]:
# Alternatively, use drop() on the column axis.
# `columns=[...]` is the keyword spelling of drop([...], axis=1).
df = create_df(data, patients)  # same three-column frame as before
df = df.drop(columns=['race'])
df

# This time it was not in-place: drop() returned a new DataFrame, which I assigned back to `df`, overwriting the original object

## Rename columns

In [0]:
# In-place edit of the 'columns' attribute: every column is renamed at once.
# The list must provide one name per existing column, in the current column order.
df.columns = ['age', 'sex']
df

In [0]:
# We could also pass a dictionary (old name -> new name) listing only the columns we want to rename.
# By default rename() returns a new DataFrame and leaves df untouched.
df.rename(columns={'age': 'agen'})                  # Not in-place

# df.rename(columns={'age': 'agen'}, inplace=True)  # in-place option

## Change columns order

In [0]:
# df.columns holds the column labels, in the order they appear in the df.
# It is a pandas object, not a plain list:
cols = df.columns
print(type(cols))

# It is iterable, so a plain Python list can be built from it:
list(cols)

In [0]:
# Reorder the columns by selecting them in the desired order -- here, reversed:
reversed_cols = cols[::-1]
df = df[reversed_cols]
df

## Add new rows

In [0]:
# Start from a fresh two-column frame (no 'race' column), indexed by patient id.
df = pd.DataFrame(data, index= patients, columns=['age', 'gender'])
df

In [0]:
# One way is to assign to loc with an index label that does not exist yet,
# passing a list with one value per column (in column order):
df.loc[10013] = [40, 'M']
df

In [0]:
# This syntax, without loc, won't work: df[key] = ... assigns a *column*,
# and a 2-item list does not match the number of rows.
# The error is caught and printed so that "Run All" can continue past this cell.
try:
    df[10015] = [50, 'F']
except ValueError as err:
    print('ValueError:', err)

# Why? Without loc, `df[10015] = ...` creates a new *column* labelled 10015 from the list, so pandas expects one value per row. Its length (2) does not match the number of rows (4).

In [0]:
# If the list has one value per row, the assignment succeeds -- but it adds
# a new *column* labelled 10015 rather than a row, which is not what we intended:
df[10015] = [50, 'F', 'test', 'test2']
df

In [0]:
# Without a labelled index, rows are numbered 0, 1, 2, ...
df = pd.DataFrame(data, columns=['age', 'gender'])

# The current row count is therefore the next free label:
n_rows = df.shape[0]

# Assigning to that label appends the new row:
df.loc[n_rows] = [50, 'F']

df

In [0]:
# Alternatively, we can use pd.concat(), which concatenates DataFrames/Series
# along a particular axis, with many options (index handling, missing values,
# mismatched columns...).
# First, wrap the new row in its own one-row DataFrame, labelled 4:
new_row = pd.DataFrame(
    data=[[48, 'M']],
    index=[4],
    columns=['age', 'gender'],
)
new_row

In [0]:
# concat() stacks the frames row-wise (axis=0 is the default) and returns a new
# DataFrame, so the result is assigned back to df. Index labels are kept as they are.
df = pd.concat([df, new_row])
df

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.concat.html

## Delete rows

In [0]:
# Reset df to the original test data before deleting rows.
df=create_df(data, patients)
df

In [0]:
# We can use drop() on rows too: labels are looked up on the row axis (axis=0)
# by default, and a list of labels removes them all in one call.
df.drop([10011, 10012], inplace=True)
df

## Changing values

In [0]:
# Reset df to the original test data before changing values.
df=create_df(data, patients)
df

In [0]:
# In-place modification of a single value: loc[row label, column label] addresses one cell.
df.loc[10010, 'age'] = 80
df

In [0]:
# Alternatively, another DataFrame can carry the replacement values.
# The update is done on matching index values, so the row is labelled 10010.
new_row = pd.DataFrame(
    {'gender': ['F'], 'age': [77]},
    index=[10010],
    columns=['age', 'gender'],
)
new_row

In [0]:
# update() modifies df in place (it returns None): where index and column labels match,
# df's values are overwritten by the non-NA values of new_row.
df.update(new_row)
df

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.update.html

In [0]:
# Broadcasting: assigning a single (scalar) value creates a column in which every row holds that value.
df=create_df(data, patients)
df['Broadcast_example'] = 'Test'
df

In [0]:
# Arithmetic on a column is vectorised: the operation is applied element-wise.
df = create_df(data, patients)
df['months'] = df['age'] * 12
df

In [0]:
# Using pandas map() to apply a small transformation.
# map() accepts a dict directly: each value is replaced by its dict entry, and
# a value with no entry becomes NaN instead of raising KeyError.
gender = {'M': 'Male', 'F': 'Female'}
df['gender_long'] = df['gender'].map(gender)
df

## Sorting by values

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.sort_values.html

In [0]:
# sort_values() is not in-place by default: it returns a sorted copy, so the result is assigned back to df.
df = df.sort_values('gender')

# In-place sorting, with more options: several sort keys, each with its own direction
# (here gender ascending, then age descending within each gender).
df.sort_values(['gender','age'], inplace=True, ascending=[True, False])
df

__________________________________________________
Nicolas Dupuis, Methodology and Innovation (IDAR C&SP), 2020+