![pandas logo](pandas.png "pandas")

# Practice pandas

## Run these cells

In [36]:
import pandas as pd
import numpy as np
import random
%matplotlib inline

In [37]:
# Number of simulated subjects. It is defined here so the notebook survives
# "Restart Kernel -> Run All". Keep it at 200 or more: the setup cells assign
# missing values through .loc to row labels up to 198 (dm) and 1199 (vs), and
# .loc silently appends a new row when the label does not exist yet.
n_subjects = 200

# Seed the stdlib generator so every run produces the same demo data.
random.seed(42)

# Demographics: one row per subject.
#   subjid - consecutive integer ids starting at 10000
#   sex    - 'F' or 'M', drawn at random
#   age    - random integer in 18..64, stored with the nullable Int64 dtype so
#            the column stays integer once missing values are introduced
data = {
    'subjid': range(10000, 10000 + n_subjects),
    'sex'   : random.choices(['F', 'M'], k=n_subjects),
    'age'   : pd.Series(random.choices(range(18, 65), k=n_subjects), dtype=pd.Int64Dtype())
}

dm = pd.DataFrame(data)

# Inject a few missing values for the exercises:
# row 2 lacks age, row 57 lacks sex, row 198 lacks both.
dm.loc[2,   'age'] = np.nan
dm.loc[198, 'age'] = np.nan
dm.loc[198, 'sex'] = np.nan
dm.loc[57,  'sex'] = np.nan
dm

Unnamed: 0,subjid,sex,age
0,10000,M,55
1,10001,M,21
2,10002,M,
3,10003,F,50
4,10004,M,39
...,...,...,...
195,10195,M,48
196,10196,F,62
197,10197,M,47
198,10198,,


In [47]:
# Vital signs in long format: one row per (param, subject, visit).
# n_subjects is expected to be defined by an earlier cell of this notebook.
patients = list(range(10000, 10000 + n_subjects))
visits = [1, 2, 3]
param = ['heart rate', 'systolic blood pressure']

# Number of rows generated for each parameter (every subject at every visit).
n_rows_per_param = len(patients) * len(visits)

# Row order is: all 'heart rate' rows, then all 'systolic blood pressure' rows;
# within a parameter, subjects ascending, each with visits 1, 2, 3.
# The sorted() on 'param' yields that order because 'heart rate' sorts before
# 'systolic blood pressure', which is what lines the two 'result' ranges up
# with the right parameter.
data = {
    'subjid': sorted(patients * len(visits)) * len(param),
    'visit' : visits * len(param) * len(patients),
    'param' : sorted(param * len(visits) * len(patients)),
    'result': [random.randint(50, 150)  for _ in range(n_rows_per_param)] +  # heart rate
              [random.randint(100, 180) for _ in range(n_rows_per_param)]    # systolic blood pressure
}

vs = pd.DataFrame(data, columns=['subjid', 'visit', 'param', 'result'])

# Cast to float before inserting NaN: writing NaN into an int64 column relies
# on silent upcasting, which recent pandas versions deprecate and then reject.
vs['result'] = vs['result'].astype('float64')

# Inject two missing results for the exercises. Label 1199 only exists when
# n_subjects >= 200; with fewer subjects .loc would append a new row instead.
vs.loc[2, 'result'] = np.nan
vs.loc[1199, 'result'] = np.nan
vs

Unnamed: 0,subjid,visit,param,result
0,10000,1,heart rate,70.0
1,10000,2,heart rate,83.0
2,10000,3,heart rate,
3,10001,1,heart rate,76.0
4,10001,2,heart rate,99.0
...,...,...,...,...
1195,10198,2,systolic blood pressure,169.0
1196,10198,3,systolic blood pressure,109.0
1197,10199,1,systolic blood pressure,133.0
1198,10199,2,systolic blood pressure,137.0


## Exercise 1

In [None]:
# What's the shape of the dm dataframe? What's its memory size?

# INSERT YOUR SOLUTION 

In [None]:
# Create a study_id column in both dm & vs dataframes. Give any value you want

# INSERT YOUR SOLUTION 

In [None]:
# Add a column in dm named 'A-S'. It holds the concatenation of age and sex, with a dash as delimiter.
# In this column, sex must be spelled out as MALE or FEMALE

# INSERT YOUR SOLUTION

In [None]:
# Add a 'Year of birth' column in dm, using age and today's date as reference

# INSERT YOUR SOLUTION

In [None]:
# Which subjects were born in a leap year?

# INSERT YOUR SOLUTION

In [None]:
# Infer a 'DOB' column using Jan 1st and midnight for birth date. Series type should be a datetime.

# INSERT YOUR SOLUTION

In [None]:
# Label the row index in dm using the concatenation of dm.study_id and dm.subjid

# INSERT YOUR SOLUTION 

In [None]:
# Retain the following column order in dm: study_id, subjid, age, sex

# INSERT YOUR SOLUTION

## Exercise 2

In [None]:
# Filter dm to show only records with at least one missing value

# INSERT YOUR SOLUTION

In [None]:
# Display dm.age Series with only missing values

# INSERT YOUR SOLUTION

In [None]:
# Create a dictionary with 5 keys
#  'missing_age'      : the value will be a tuple of subjid with missing age in dm
#  'missing_sex'      : the value will be a tuple of subjid with missing sex in dm
#  'missing_some_data': the value will be an iterable object of unique subjid with a missing value in either age or sex in dm
#  'missing_all_data' : the value will be an iterable object of unique subjid with a missing value in both age and sex in dm
#  'no_missing'       : the value will be an integer: the number of subjects with no missing values for age and sex in dm

# INSERT YOUR SOLUTION

In [None]:
# What's the type for Series age? And why?

# INSERT YOUR SOLUTION

In [None]:
# in vs, apply a LOCF to 'result'
# i.e. when result is missing, carry forward the value from the previous visit (by subject and param)

# INSERT YOUR SOLUTION

## Exercise 3

In [None]:
# How many female subjects over 40 years old in dm?

# INSERT AT LEAST 2 SOLUTIONS

In [None]:
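# What's the average age by sex and race?
# NOTE: dm as built above has no 'race' column - add one first (any values you want), or group by sex only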

# INSERT YOUR SOLUTION

In [None]:
# Build a list of subjid for the 5 oldest subject(s) in descending order

# INSERT YOUR SOLUTION

In [39]:
# Optional: Create a plot showing age distribution by sex

# INSERT YOUR SOLUTION

In [None]:
# Add a variable in dm with this value: 
# "I'm <...> year", replacing the ... with the actual age

# INSERT YOUR SOLUTION

In [None]:
# Add a variable in dm with this value for all rows: "hello@world@how's it going?"
# Split this variable by delimiter @ into as many variables in dm as needed

# INSERT YOUR SOLUTION

In [None]:
# Delete the variables created in this Exercise

# INSERT AT LEAST 2 SOLUTIONS

## Exercise 4

In [19]:
# Build a dataframe dm_fixed for subjects 10100 to 10120.
# Assign them a random age (different from dm, from 18 to 65) but make sure you use the same gender as in dm

# INSERT YOUR SOLUTION 

In [None]:
# Change the age values in dm using the values from dm_fixed

# INSERT YOUR SOLUTION 

In [None]:
# Reset the row index in dm to use a default index (i.e. a range of integers)

# INSERT YOUR SOLUTION 

## Exercise 5

In [None]:
# Add age and sex from dm to the vs dataframe, by subjid

# INSERT YOUR SOLUTION 

In [None]:
# Display the highest heart rate across the visits by subject

# INSERT YOUR SOLUTION 

In [None]:
# Optional: plot the highest heart rate by age

# INSERT YOUR SOLUTION 

## Exercise 6

In [None]:
# In vs, drop age and sex variables
# Instead, in vs, create a visit=0 with param='age' and param='sex' with values coming from dm

# INSERT YOUR SOLUTION 

In [None]:
# Now remove these visit=0 records

# INSERT YOUR SOLUTION 

## Exercise 7

In [None]:
# The baseline for a parameter in vs is defined at visit 1.
# Add a new variable in vs to display the change from baseline for all parameters

# INSERT YOUR SOLUTION 

In [None]:
# Display the 5 highest absolute changes from baseline for heart rate at visit 3

# INSERT YOUR SOLUTION 

In [None]:
# Add a 'High change HR' flag (True/False) in dm
# Set it to True if the subject had at least a 15% increase or more than a 5% decrease in change from baseline for heart rate.
# Set to False otherwise

# INSERT YOUR SOLUTION 

In [None]:
# Add a 'review' flag in dm: 
# if 'High change HR' is True and age>60 (or age missing), set the flag to 'Review'.

# INSERT YOUR SOLUTION 

## Exercise 8

In [None]:
# Save the dm and vs dataframes to your IDAR Shiny home. Save them as .xpt and .xlsx. 

# INSERT YOUR SOLUTION 

In [None]:
# In a new Jupyter notebook, open dm.xpt and dm.xlsx
# Are they equal?

# INSERT YOUR SOLUTION IN A NEW NOTEBOOK