In [None]:
import pandas as pd
import janitor
import seaborn as sns
import scipy.stats
import numpy as np
import matplotlib.pyplot as plt

## Infectious Disease

In [None]:
# Read the hospital data set, then normalise the column names with clean_names()
# (provided by the `janitor` import); later cells rely on names such as `dur_stay`.
hospital_raw = pd.read_csv('data/HOSPITAL.DAT.txt')
df = hospital_raw.clean_names()

In [None]:
# Preview the first five rows.
df.head(n=5)

##### 2.1 

In [None]:
# Arithmetic mean of the `dur_stay` column.
df['dur_stay'].mean()

In [None]:
# Median of the `dur_stay` column.
df['dur_stay'].median()

##### 2.2

In [None]:
# Standard deviation of `dur_stay` (pandas defaults to the sample SD, ddof=1).
df['dur_stay'].std()

In [None]:
# Range of `dur_stay`, shown as a (minimum, maximum) pair.
stay = df['dur_stay']
(stay.min(), stay.max())

#### 2.3

In [None]:
# Horizontal boxplots of `dur_stay`, one box per `antibio` value.
# The trailing semicolon hides the Axes repr.
sns.boxplot(x='dur_stay', y='antibio', orient='h', data=df);

#### 2.4

The median is also multiplied by the same constant.

In [None]:
# Median of the doubled data next to twice the median of the original data.
stay = df['dur_stay']
(stay * 2).median(), stay.median() * 2

#### 2.5

The mode is also multiplied by the same constant.

In [None]:
# Section 2.5 is about the mode, so compare the mode(s) of the doubled data with
# twice the mode(s) of the original data (the previous cell showed min/max instead).
# Series.mode() returns every modal value, hence the list conversion for a compact
# side-by-side display.
(df.dur_stay*2).mode().tolist(), (df.dur_stay.mode()*2).tolist()

#### 2.6

The geometric mean is also multiplied by the same constant.

In [None]:
# Geometric mean of the doubled data next to twice the geometric mean of the original.
stay = df['dur_stay']
scipy.stats.gmean(stay * 2), scipy.stats.gmean(stay) * 2

#### 2.7

The range is also multiplied by the same constant.

In [None]:
# (min, max) of the doubled data next to the doubled (min, max) of the original data.
stay = df['dur_stay']
doubled = stay * 2
(doubled.min(), doubled.max()), (stay.min() * 2, stay.max() * 2)

### Health promotion

In [None]:
# NOTE(review): this cell repeats the geometric-mean comparison from section 2.6 verbatim
# and still reads `dur_stay` from the frame loaded earlier; it looks like a leftover under
# this heading — confirm whether it can be removed.
scipy.stats.gmean(df.dur_stay*2), scipy.stats.gmean(df.dur_stay)*2

In [None]:
# Eighteen recorded values, stored in a single `time` column and labelled 1..18.
times = [
    12.8, 12.2, 12.25, 12.18, 11.53, 12.47,
    12.30, 12.08, 11.72, 11.57, 11.73, 12.67,
    11.92, 11.67, 11.80, 12.33, 12.55, 11.83,
]
df = pd.DataFrame({'time': times}, index=range(1, 19))
df.head()

#### 2.8 

In [None]:
# Arithmetic mean of the `time` column.
df['time'].mean()

#### 2.9

In [None]:
# Standard deviation of `time` (pandas defaults to the sample SD, ddof=1).
df['time'].std()

#### 2.10

In [None]:
# Express each time in integer hundredths.
# Round before casting: astype(int) truncates toward zero, so a product that lands just
# below a whole number because of binary floating point (e.g. 4.35 * 100 evaluates to
# 434.99999999999994) would otherwise come out one unit too small.
df['time_100'] = (df.time*100).round().astype(int)
df.sample(5)

In [None]:
# Mean of the times expressed in hundredths.
df['time_100'].mean()

In [None]:
# Standard deviation of the times expressed in hundredths.
df['time_100'].std()

#### 2.12

In [None]:
# Boxplot of the `time` column. The trailing semicolon suppresses the Axes repr,
# matching the other plotting cells in this notebook.
sns.boxplot(data=df, x='time');

In [None]:
# Upper fence for outliers: the third quartile plus 1.5 interquartile ranges.
iqr = scipy.stats.iqr(df['time'])
third_quartile = np.quantile(df['time'], 0.75)
upper_limit = third_quartile + 1.5 * iqr

In [None]:
# Does a value of 12.97 lie beyond the upper fence (`upper_limit`)?
candidate_time = 12.97
candidate_time > upper_limit

### Cardiovascular Disease

In [None]:
# Load the cholesterol measurements, indexed by subject.
df = pd.read_excel('data/Cholesterol.xlsx').set_index('Subject')
# Change in cholesterol, Before minus After. Assign with bracket syntax: attribute
# assignment (`df.Difference = ...`) only updates a column that already exists and
# otherwise just sets a Python attribute, which the later plots using
# x='Difference' / y='Difference' could not find.
df['Difference'] = df['Before'] - df['After']

In [None]:
# Show five randomly chosen rows.
df.sample(n=5)

#### 2.13

In [None]:
# Mean of the Before - After differences.
df.Difference.mean()

#### 2.14

In [None]:
# NOTE(review): same expression as the cell under 2.13; presumably repeated so the
# mean is shown next to the standard deviation in the following cell — confirm.
df.Difference.mean()

In [None]:
# Standard deviation of the Before - After differences (pandas defaults to ddof=1).
df.Difference.std()

#### 2.16

In [None]:
# Median of the Before - After differences.
df.Difference.median()

#### 2.17

In [None]:
# Boxplot of the `Difference` column. The trailing semicolon suppresses the Axes repr,
# matching the other plotting cells in this notebook.
sns.boxplot(data=df, x='Difference');

#### 2.18

In [None]:
# Label each subject 'low' when their Before value is strictly below the median of
# Before, and 'high' otherwise (values equal to the median count as 'high').
below_median = df['Before'] < df['Before'].median()
df['base_level'] = below_median.map({True: 'low', False: 'high'})

In [None]:
# Distribution of `Difference` within each `base_level` group. The trailing semicolon
# suppresses the Axes repr, matching the other plotting cells in this notebook.
sns.boxplot(data=df, x='base_level', y='Difference');

### Hypertension

In [None]:
# Load the blood-pressure readings, index them by participant, and show five random rows.
hypertension_raw = pd.read_excel('data/hypertension.xlsx')
df = hypertension_raw.set_index('Participant')
df.sample(n=5)

#### 2.19

In [None]:
# For each pressure type, store the recumbent-minus-standing difference in a new column
# ('Systolic diff', then 'Diastolic diff').
for pressure in ('systolic', 'diastolic'):
    label = pressure.capitalize()
    df[f'{label} diff'] = df[f'Recumbent {pressure}'] - df[f'Standing {pressure}']

In [None]:
# (mean, median) of the systolic difference.
systolic_diff = df['Systolic diff']
systolic_diff.mean(), systolic_diff.median()

In [None]:
# (mean, median) of the diastolic difference.
diastolic_diff = df['Diastolic diff']
diastolic_diff.mean(), diastolic_diff.median()

#### 2.20

In [None]:
# Side-by-side boxplots of the two differences on a shared y-axis.
# Drawing them in a loop mirrors the multi-panel cells later in the notebook and, because
# the cell no longer ends in an expression, keeps the Axes repr out of the output.
f, axes = plt.subplots(1, 2, sharey=True)
for ax, col in zip(axes, ['Systolic diff', 'Diastolic diff']):
    sns.boxplot(data=df, y=col, ax=ax)

#### 2.22

In [None]:
# 10th and 90th percentiles of the systolic difference.
systolic_diff = df['Systolic diff']
systolic_diff.quantile(q=[0.1, 0.9])

### Pulmonary Disease

In [None]:
# Read the FEV data set, normalise column names via clean_names() (from the `janitor`
# import), index by `id`, and show five random rows.
fev_raw = pd.read_csv('data/FEV.DAT.txt')
df = fev_raw.clean_names().set_index('id')
df.sample(n=5)

#### 2.23

In [None]:
# One boxplot panel per column for the first three columns, each titled with its column name.
cols = df.columns[:3]
f, axes = plt.subplots(nrows=1, ncols=len(cols), figsize=(15, 8))
for col, ax in zip(cols, axes):
    sns.boxplot(data=df, y=col, ax=ax)
    ax.set_title(col)

In [None]:
# One barplot panel per column for the columns after the third, each titled with its column name.
cols = df.columns[3:]
f, axes = plt.subplots(nrows=1, ncols=len(cols), figsize=(15, 8))
for col, ax in zip(cols, axes):
    sns.barplot(data=df, y=col, ax=ax)
    ax.set_title(col)

#### 2.24

In [None]:
# `fev` distribution for each `age` value, with separate boxes per `sex`.
sns.boxplot(x='age', y='fev', hue='sex', data=df);

In [None]:
# `fev` distribution for each `hgt` value, with separate boxes per `sex`.
sns.boxplot(x='hgt', y='fev', hue='sex', data=df);

In [None]:
# `fev` distribution for each `smoke` value, with separate boxes per `sex`.
sns.boxplot(x='smoke', y='fev', hue='sex', data=df);

#### 2.25

In [None]:
# Bin `age` into right-closed bands (pd.cut's default): (2, 4], (4, 9], (9, 14], (14, 19].
age_edges = [2, 4, 9, 14, 19]
df['age_ctg'] = pd.cut(df['age'], bins=age_edges)

In [None]:
# `fev` distribution within each age band, with separate boxes per `sex`.
sns.boxplot(x='age_ctg', y='fev', hue='sex', data=df);

In [None]:
# Mean `fev` per age band and sex, pivoted so each `sex` value becomes a column.
# `age_ctg` is categorical (it comes from pd.cut), and grouping on a categorical key
# without an explicit `observed=` raises a FutureWarning in recent pandas; spelling out
# observed=False keeps the current output while silencing the warning.
df.groupby(['age_ctg', 'sex'], observed=False).fev.mean().unstack()

### Nutrition

In [None]:
# Read the nutrition data set, normalise column names via clean_names() (from the
# `janitor` import), index by `id`, and show five random rows.
valid_raw = pd.read_csv('data/VALID.DAT.txt')
df = valid_raw.clean_names().set_index('id')
df.sample(n=5)

#### 2.26

In [None]:
# Summary statistics for every numeric column.
summary = df.describe()
summary

In [None]:
# Boxplots of every column except the last two (those are drawn in the next cell,
# presumably because they are on a different scale — confirm).
# The trailing semicolon suppresses the Axes repr, matching the other plotting cells.
df.iloc[:,:-2].plot(kind='box');

In [None]:
# Boxplots of the last two columns only.
# The trailing semicolon suppresses the Axes repr, matching the other plotting cells.
df.iloc[:,-2:].plot(kind='box');

#### 2.27

DR values are larger and have less spread than the FFQ values.

#### 2.28 

In [None]:
# Upper boundary of each quintile (20%, 40%, 60%, 80%, 100%) for every column.
quintile_levels = [.2, .4, .6, .8, 1]
df.quantile(quintile_levels)

#### 2.29

In [None]:
# For each source (dr, ffq): total fat times 9, divided by calories, times 100, i.e.
# fat as a percentage of calories (the factor 9 is presumably kcal per gram of fat — confirm).
for source in ('dr', 'ffq'):
    df[f'tfat_{source}_cal'] = df[f'tfat_{source}'] * 9 / df[f'cal_{source}'] * 100

In [None]:
# Summary statistics of the two fat-share columns.
# Fixes the misspelt method name (`describeribe`), which raised AttributeError.
df[['tfat_dr_cal', 'tfat_ffq_cal']].describe()

#### 2.30

In [None]:
# Upper boundary of each quintile for the two fat-share columns.
fat_share = df[['tfat_dr_cal', 'tfat_ffq_cal']]
fat_share.quantile([.2, .4, .6, .8, 1])