## Python statistics essential training - 03_05_categorical

Standard imports

In [1]:
import numpy as np
import scipy.stats
import pandas as pd

In [2]:
import matplotlib
import matplotlib.pyplot as pp

import pandas.plotting

from IPython import display
from ipywidgets import interact, widgets

%matplotlib inline

In [3]:
import re
import mailbox
import csv

In [4]:
# Load the dataset from whickham.csv (relative path, resolved against the
# kernel's working directory) into a DataFrame.
smoking = pd.read_csv('whickham.csv')

In [5]:
# Summarize the frame: row count, column names, dtypes and non-null counts.
smoking.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1314 entries, 0 to 1313
Data columns (total 3 columns):
outcome    1314 non-null object
smoker     1314 non-null object
age        1314 non-null int64
dtypes: int64(1), object(2)
memory usage: 30.9+ KB


In [6]:
# Preview the first five rows to see what the raw values look like.
smoking.head()

Unnamed: 0,outcome,smoker,age
0,Alive,Yes,23
1,Alive,Yes,18
2,Dead,Yes,71
3,Alive,No,67
4,Alive,No,64


In [7]:
# Number of rows in each smoker category, rendered as a one-column table.
smoking['smoker'].value_counts().to_frame()

Unnamed: 0,smoker
No,732
Yes,582


In [8]:
# Number of rows for each outcome (Alive / Dead), rendered as a one-column table.
smoking['outcome'].value_counts().to_frame()

Unnamed: 0,outcome
Alive,945
Dead,369


In [9]:
# Same breakdown as fractions of the total: normalize=True divides each count
# by the number of rows.
smoking['outcome'].value_counts(normalize=True).to_frame()

Unnamed: 0,outcome
Alive,0.719178
Dead,0.280822


In [11]:
# Outcome proportions computed separately within each smoker group
# (normalize=True makes the fractions sum to 1 inside every group).
bysomker = smoking.groupby('smoker')['outcome'].value_counts(normalize=True)

In [12]:
# Display the outcome proportions within each smoker group.
bysomker

smoker  outcome
No      Alive      0.685792
        Dead       0.314208
Yes     Alive      0.761168
        Dead       0.238832
Name: outcome, dtype: float64

In [13]:
bysomker.index # The index is a MultiIndex with 2 levels: smoker and outcome.

MultiIndex(levels=[['No', 'Yes'], ['Alive', 'Dead']],
           codes=[[0, 0, 1, 1], [0, 1, 0, 1]],
           names=['smoker', 'outcome'])

In [15]:
bysomker.unstack() # unstack moves an index level (by default the innermost one, here outcome) into the columns.

outcome,Alive,Dead
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1
No,0.685792,0.314208
Yes,0.761168,0.238832


In [16]:
# From the above, smokers appear to have a higher survival rate than non-smokers (about 76% vs 69% alive).
# To analyse this counterintuitive result we use STRATIFICATION: splitting the data into age groups with pandas' cut.

In [19]:
# Stratify by age: pd.cut maps every age onto one of the labelled bins and
# returns an ordered categorical column.
# Ages beyond the last bin edge (64) match no bin and are stored as NaN.
# See the pandas.cut documentation for how bin edges are treated.
smoking['ageGroup'] = pd.cut(
    x=smoking['age'],
    bins=[0, 30, 40, 53, 64],
    labels=['0-30', '30-40', '40-53', '53-64'],
)

In [23]:
smoking['ageGroup'].head() # Why NaN? The bins stop at 64, so anyone older than 64 (smoker or not) falls outside every age group.

0     0-30
1     0-30
2      NaN
3      NaN
4    53-64
Name: ageGroup, dtype: category
Categories (4, object): [0-30 < 30-40 < 40-53 < 53-64]

In [24]:
# Raw ages of the first five rows, for comparison with their ageGroup values.
smoking['age'].head()

0    23
1    18
2    71
3    67
4    64
Name: age, dtype: int64

In [25]:
# Stratify the proportions: outcome fractions within each (ageGroup, smoker) pair.
# ageGroup is categorical, so pass observed=True explicitly: only category
# combinations that actually occur are grouped, and pandas >= 2.1 does not emit
# a FutureWarning about the changing default. Rows whose ageGroup is NaN
# (ages above 64) are excluded by groupby.
byage = (
    smoking
    .groupby(['ageGroup', 'smoker'], observed=True)['outcome']
    .value_counts(normalize=True)
)

In [26]:
# Stratification = arrangement of things into different groups.
byage

ageGroup  smoker  outcome
0-30      No      Alive      0.981818
                  Dead       0.018182
          Yes     Alive      0.975610
                  Dead       0.024390
30-40     No      Alive      0.955224
                  Dead       0.044776
          Yes     Alive      0.940678
                  Dead       0.059322
40-53     No      Alive      0.876106
                  Dead       0.123894
          Yes     Alive      0.802395
                  Dead       0.197605
53-64     No      Alive      0.669291
                  Dead       0.330709
          Yes     Alive      0.580645
                  Dead       0.419355
Name: outcome, dtype: float64

In [27]:
# Move the outcome level into columns: one row per (ageGroup, smoker) pair.
byage.unstack()

Unnamed: 0_level_0,outcome,Alive,Dead
ageGroup,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
0-30,No,0.981818,0.018182
0-30,Yes,0.97561,0.02439
30-40,No,0.955224,0.044776
30-40,Yes,0.940678,0.059322
40-53,No,0.876106,0.123894
40-53,Yes,0.802395,0.197605
53-64,No,0.669291,0.330709
53-64,Yes,0.580645,0.419355


In [28]:
# Keep only the Alive proportions so the groups can be compared at a glance.
byage.unstack().drop(columns='Dead')

Unnamed: 0_level_0,outcome,Alive
ageGroup,smoker,Unnamed: 2_level_1
0-30,No,0.981818
0-30,Yes,0.97561
30-40,No,0.955224
30-40,Yes,0.940678
40-53,No,0.876106
40-53,Yes,0.802395
53-64,No,0.669291
53-64,Yes,0.580645


In [29]:
# Observation: Within every age group, non-smokers have a higher survival rate than smokers.
# Observation: This data exhibits Simpson's paradox: the unstratified data suggested the opposite.

In [30]:
# Simpson's paradox (probability and statistics)
# 1) A phenomenon appears in several groups of data, but disappears or reverses when those groups are combined.