In [None]:
import pandas as pd
import janitor
import seaborn as sns
import scipy.stats
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Let pandas render up to 100 columns before truncating wide frames.
pd.options.display.max_columns = 100

## Influenza and family 

#### 3.1

At least one parent has influenza

#### 3.2

Both parents have influenza

#### 3.3

No, both children can have influenza at the same time

#### 3.4

At least one child has influenza

#### 3.5

First child has influenza

#### 3.6

$C = A_1 \cup A_2$

#### 3.7

$D = B \cup C$

#### 3.8 

Mother does not have influenza

#### 3.9

Father does not have influenza

#### 3.10

$\bar{C} = \bar{A}_1 \cap \bar{A}_2$

#### 3.11

$\bar{D} = \bar{B} \cap \bar{C}$

#### 3.12

In [3]:
# Marginal probabilities of A1 and A2, and the joint probability of both.
pA1 = pA2 = 0.1
pA1A2 = 0.02

In [4]:
# By definition of independence, A1 and A2 are independent
# iff P(A1 and A2) == P(A1) * P(A2).
# Compare with a tolerance instead of `==`: a product of binary floats is
# rarely exact (0.1 * 0.1 is 0.010000000000000002), so exact equality could
# report "not independent" for values that are equal on paper.
bool(np.isclose(pA1 * pA2, pA1A2))

False

#### 3.13

In [5]:
# Marginal probabilities of A3 and A4, and the joint probability of both.
pA3 = pA4 = 0.2
pA3A4 = 0.1

In [7]:
# Inclusion-exclusion: P(A3 or A4) = P(A3) + P(A4) - P(A3 and A4).
pA3orA4 = (pA3 + pA4) - pA3A4
pA3orA4

0.30000000000000004

#### 3.14

In [8]:
# Father has influenza given mother has:
#   P(A2 | A1) = P(A1 and A2) / P(A1)
# The conditioning event is the mother (A1), so the denominator must be pA1.
# Dividing by pA2 only gave the same number because pA1 == pA2 here.
pA1A2 / pA1

0.19999999999999998

#### 3.15

In [9]:
# Product of the marginals P(not A1) * P(A2). This equals the joint
# probability P(A2 and not A1) only if A1 and A2 are independent.
(1 - pA1) * pA2

0.09000000000000001

In [14]:
# Father has influenza given mother has not:
#   P(A2 | not A1) = P(A2 and not A1) / P(not A1)
pA2notA1 = pA2 - pA1A2  # P(A2 and not A1) = P(A2) - P(A1 and A2)
pNotA1 = 1 - pA1        # complement of A1
pA2notA1 / pNotA1

0.08888888888888889

### Mental health 

#### 3.16

In [None]:
# NOTE(review): no visible cell loads `df` before this point, so on a fresh
# kernel this section fails - add the data-loading cell for this dataset.
# Median of the 'Difference' column.
df['Difference'].median()

#### 2.17

In [None]:
# Box plot of the 'Difference' column.
sns.boxplot(x='Difference', data=df)

#### 2.18

In [None]:
# Tag each row 'low' when its Before value is strictly below the median of
# Before, and 'high' otherwise (values equal to the median count as 'high').
is_below_median = df['Before'] < df['Before'].median()
df['base_level'] = is_below_median.map({True: 'low', False: 'high'})

In [None]:
# Compare the 'Difference' distribution between the 'low' and 'high' groups.
sns.boxplot(x='base_level', y='Difference', data=df)

### Hypertension

In [None]:
# Read the hypertension workbook and index the rows by participant.
df = pd.read_excel('data/hypertension.xlsx')
df = df.set_index('Participant')
# Preview five randomly chosen rows.
df.sample(n=5)

In [None]:
# Scatter plot of standing systolic against standing diastolic pressure.
# The original call had a misplaced bracket, so `df[...]` was indexed with a
# tuple containing a Series and raised before anything was drawn. Passing the
# frame plus column names gives seaborn the two variables it needs.
sns.scatterplot(data=df, x='Standing systolic', y='Standing diastolic')

#### 2.19

In [None]:
# For each pressure type, store recumbent minus standing as '<Type> diff'.
for pressure in ('systolic', 'diastolic'):
    recumbent = df[f'Recumbent {pressure}']
    standing = df[f'Standing {pressure}']
    df[f'{pressure.capitalize()} diff'] = recumbent - standing

In [None]:
# Mean and median of the systolic difference, shown as a pair.
systolic_diff = df['Systolic diff']
systolic_diff.mean(), systolic_diff.median()

In [None]:
# Mean and median of the diastolic difference, shown as a pair.
diastolic_diff = df['Diastolic diff']
diastolic_diff.mean(), diastolic_diff.median()

#### 2.20

In [None]:
# Two box plots side by side on a shared y-axis so the spreads are comparable.
fig, (ax_systolic, ax_diastolic) = plt.subplots(nrows=1, ncols=2, sharey=True)
sns.boxplot(y='Systolic diff', data=df, ax=ax_systolic)
sns.boxplot(y='Diastolic diff', data=df, ax=ax_diastolic)

#### 2.22

In [None]:
# 10th and 90th percentiles of the systolic difference.
df['Systolic diff'].quantile(q=[0.1, 0.9])

### Pulmonary Disease

In [None]:
# Load the FEV data, normalise the column names (pyjanitor's clean_names)
# and index the rows by 'id'.
df = (
    pd.read_csv('data/FEV.DAT.txt')
    .clean_names()
    .set_index('id')
)
# Preview five randomly chosen rows.
df.sample(n=5)

#### 2.23

In [None]:
# One box plot per column for the first three columns, each in its own panel.
cols = df.columns[:3]
f, axes = plt.subplots(nrows=1, ncols=len(cols), figsize=(15, 8))
for idx, col in enumerate(cols):
    panel = sns.boxplot(data=df, y=col, ax=axes[idx])
    panel.set_title(col)

In [None]:
# One bar plot per remaining column (from the fourth column onward).
cols = df.columns[3:]
f, axes = plt.subplots(nrows=1, ncols=len(cols), figsize=(15, 8))
for idx, col in enumerate(cols):
    panel = sns.barplot(data=df, y=col, ax=axes[idx])
    panel.set_title(col)

#### 2.24

In [None]:
# FEV by age, split by sex.
sns.boxplot(x='age', y='fev', hue='sex', data=df);

In [None]:
# FEV by height, split by sex.
sns.boxplot(x='hgt', y='fev', hue='sex', data=df);

In [None]:
# FEV by the 'smoke' column, split by sex.
sns.boxplot(x='smoke', y='fev', hue='sex', data=df);

#### 2.25

In [None]:
# Bin age into right-closed intervals (2, 4], (4, 9], (9, 14], (14, 19].
age_bin_edges = [2, 4, 9, 14, 19]
df['age_ctg'] = pd.cut(df['age'], bins=age_bin_edges)

In [None]:
# FEV by age group, split by sex.
sns.boxplot(x='age_ctg', y='fev', hue='sex', data=df);

In [None]:
# Mean FEV per (age group, sex), reshaped so each sex becomes a column.
mean_fev = df.groupby(['age_ctg', 'sex'])['fev'].mean()
mean_fev.unstack()

### Nutrition

In [None]:
# Load the VALID data, normalise the column names (pyjanitor's clean_names)
# and index the rows by 'id'.
raw = pd.read_csv('data/VALID.DAT.txt')
df = raw.clean_names().set_index('id')
# Preview five randomly chosen rows.
df.sample(n=5)

#### 2.26

In [None]:
# Summary statistics for every column (default quartiles spelled out).
df.describe(percentiles=[0.25, 0.5, 0.75])

In [None]:
# Box plots for every column except the last two.
df.iloc[:, :-2].plot.box()

In [None]:
# Box plots for the last two columns only.
df.iloc[:, -2:].plot.box()

#### 2.27

The DR values are generally higher and less spread out than the FFQ values.

#### 2.28 

In [None]:
# Quintile cut points (20%, 40%, 60%, 80%) plus the maximum for every column.
quintile_levels = [0.2, 0.4, 0.6, 0.8, 1.0]
df.quantile(q=quintile_levels)

#### 2.29

In [None]:
# For each source suffix, store tfat * 9 / cal * 100 as 'tfat_<suffix>_cal'.
for source in ('dr', 'ffq'):
    df[f'tfat_{source}_cal'] = df[f'tfat_{source}'] * 9 / df[f'cal_{source}'] * 100

In [None]:
# Summary statistics for the two derived 'tfat_*_cal' columns.
fat_cal_cols = ['tfat_dr_cal', 'tfat_ffq_cal']
df[fat_cal_cols].describe()

#### 2.30

In [None]:
# Quintile cut points plus the maximum for the two derived 'tfat_*_cal' columns.
fat_cal_cols = ['tfat_dr_cal', 'tfat_ffq_cal']
df[fat_cal_cols].quantile(q=[0.2, 0.4, 0.6, 0.8, 1.0])

### Environmental Health, Pediatrics

In [None]:
# Load the LEAD data, normalise the column names (pyjanitor's clean_names)
# and index the rows by 'id'.
df = (
    pd.read_csv('data/LEAD.DAT.txt')
    .clean_names()
    .set_index('id')
)
# Preview five randomly chosen rows.
df.sample(n=5)

In [None]:
# Recode the numeric group codes as readable labels.
# `.replace` leaves values that are not keys untouched, so re-running this
# cell is a no-op. With `.map`, a second run would turn every (already
# relabelled) value into NaN.
df['group'] = df['group'].replace({1: 'control', 2: 'exposed'})

In [None]:
# Number of rows in each group.
df['group'].value_counts()

#### 2.31

In [None]:
# Distribution of 'ageyrs' within each group.
sns.boxplot(x='group', y='ageyrs', data=df)

In [None]:
# Bar plot of the 'sex' column for each group (bar height is seaborn's
# default estimator, the mean).
sns.barplot(x='group', y='sex', data=df)

#### 2.32

In [None]:
# Distribution of 'iqp' within each group.
sns.boxplot(x='group', y='iqp', data=df)

In [None]:
# Distribution of 'iqv' within each group.
sns.boxplot(x='group', y='iqv', data=df)

### Cardiovascular Disease

In [None]:
# Load the APC data, normalise the column names (pyjanitor's clean_names)
# and index the rows by 'sample'.
raw = pd.read_csv('data/APC.csv')
df = raw.clean_names().set_index('sample')

#### 2.33

In [None]:
# Row-wise statistics over the two columns 'a' and 'b'.
replicates = df[['a', 'b']]
df['mean'] = replicates.mean(axis=1)
# ddof=0 gives the population standard deviation (divide by n, not n - 1).
df['std'] = replicates.std(axis=1, ddof=0)
# Coefficient of variation as a percentage of the mean.
df['cv'] = 100 * df['std'] / df['mean']

#### 2.34

In [None]:
# Average coefficient of variation across all rows.
df['cv'].mean()

### Microbiology

In [None]:
# Load the soy plant data, normalise the column names (pyjanitor's
# clean_names) and index the rows by 'sample'.
df = (
    pd.read_csv('data/soy_plants.csv')
    .clean_names()
    .set_index('sample')
)
# Preview five randomly chosen rows.
df.sample(n=5)

In [None]:
# Reshape wide to long with pyjanitor's pivot_longer: former column names go
# to 'soil', their values go to 'weight'.
df_piv = df.pivot_longer(names_to='soil', values_to='weight')

#### 2.35

In [None]:
# Summary statistics per 'soil' value.
by_soil = df_piv.groupby('soil')
by_soil.describe()

#### 2.36

In [None]:
# Distribution of weight for each soil value.
sns.boxplot(x='soil', y='weight', data=df_piv)

#### 2.37

Inoculation outperforms the second group

### Endocrinology

In [None]:
# Load the BONEDEN data, normalise the column names (pyjanitor's clean_names)
# and index the rows by 'id'.
raw = pd.read_csv('data/BONEDEN.DAT.txt')
df = raw.clean_names().set_index('id')
# Preview five randomly chosen rows.
df.sample(n=5)

#### 2.38 

In [None]:
# ls2 - ls1, the mean of the pair, and the difference as a percentage of
# that mean.
ls_first, ls_second = df['ls1'], df['ls2']
df['ls_bmd_diff_abs'] = ls_second - ls_first
df['ls_bmd_mean'] = (ls_second + ls_first) / 2
df['ls_bmd_diff_rel'] = 100 * df['ls_bmd_diff_abs'] / df['ls_bmd_mean']

In [None]:
# Summary statistics of the relative ls difference (percent).
df.loc[:, 'ls_bmd_diff_rel'].describe()

#### 2.39

In [None]:
# Difference pyr2 - pyr1, then binned into left-closed intervals
# [0, 10), [10, 20), [20, 30), [30, 40), [40, 100).
df['pyr_diff'] = df['pyr2'] - df['pyr1']
pyr_bin_edges = [0, 10, 20, 30, 40, 100]
df['pyr_grp'] = pd.cut(df['pyr_diff'], bins=pyr_bin_edges, right=False)

In [None]:
# Relative ls difference against pyr_diff; colour by pyr group, marker by zyg.
sns.scatterplot(x='pyr_diff', y='ls_bmd_diff_rel', hue='pyr_grp', style='zyg', data=df)
# Thin dashed reference line at zero (no difference).
plt.axhline(0, linestyle='--', color='black', linewidth=1)

#### 2.40

The larger the difference (`pyr_diff`), the lower the bone density.

#### 2.41-2.43

In [None]:
# fn2 - fn1, the mean of the pair, and the difference as a percentage of
# that mean.
fn_first, fn_second = df['fn1'], df['fn2']
df['fn_bmd_diff_abs'] = fn_second - fn_first
df['fn_bmd_mean'] = (fn_second + fn_first) / 2
df['fn_bmd_diff_rel'] = 100 * df['fn_bmd_diff_abs'] / df['fn_bmd_mean']

In [None]:
# Summary statistics of the relative fn difference (percent).
df.loc[:, 'fn_bmd_diff_rel'].describe()

In [None]:
# Relative fn difference against pyr_diff; colour by pyr group, marker by zyg.
sns.scatterplot(x='pyr_diff', y='fn_bmd_diff_rel', hue='pyr_grp', style='zyg', data=df)
# Thin dashed reference line at zero (no difference).
plt.axhline(0, linestyle='--', color='black', linewidth=1)

#### 2.44-2.46

In [None]:
# fs2 - fs1, the mean of the pair, and the difference as a percentage of
# that mean.
fs_first, fs_second = df['fs1'], df['fs2']
df['fs_bmd_diff_abs'] = fs_second - fs_first
df['fs_bmd_mean'] = (fs_second + fs_first) / 2
df['fs_bmd_diff_rel'] = 100 * df['fs_bmd_diff_abs'] / df['fs_bmd_mean']

In [None]:
# Summary statistics of the relative fs difference (percent).
df.loc[:, 'fs_bmd_diff_rel'].describe()

In [None]:
# Relative fs difference against pyr_diff; colour by pyr group, marker by zyg.
sns.scatterplot(x='pyr_diff', y='fs_bmd_diff_rel', hue='pyr_grp', style='zyg', data=df)
# Thin dashed reference line at zero (no difference).
plt.axhline(0, linestyle='--', color='black', linewidth=1)

### Cardiovascular Disease

Lack of dataset :(