# 08_05: Summarizing and visualizing categorical data

In [None]:
import math
import collections
import dataclasses
import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as pp

In [None]:
# load the Whickham table and store the two string-valued variables as
# pandas categoricals, which saves memory and CPU
smoking = pd.read_csv('whickham.csv').astype({
    'smoker': 'category',
    'outcome': 'category',
})

In [None]:
# preview the first five rows
smoking.head(n=5)

In [None]:
# number of records in each smoker category
smoking['smoker'].value_counts()

In [None]:
# number of records in each outcome category
smoking['outcome'].value_counts()

In [None]:
# same tally, expressed as fractions of the total
smoking['outcome'].value_counts(normalize=True)

In [None]:
# specifying observed=True includes only categories that are represented in the data
# it makes no difference here but it suppresses a pandas warning

# outcome fractions within each smoker group
(
    smoking
    .groupby('smoker', observed=True)['outcome']
    .value_counts(normalize=True)
)

In [None]:
# outcome fractions within each smoker group, pivoted so that
# outcomes become columns
(
    smoking
    .groupby('smoker', observed=True)['outcome']
    .value_counts(normalize=True)
    .unstack()
)

In [None]:
# bin ages into three intervals (pd.cut makes them right-closed by default):
# (15, 40], (40, 65], (65, 90]
age_edges = [15, 40, 65, 90]
smoking['agegroup'] = pd.cut(smoking['age'], bins=age_edges)
smoking.agegroup

In [None]:
# number of records in each age bin
smoking['agegroup'].value_counts()

In [None]:
# outcome fractions for every (age bin, smoker) combination present in the
# data, with outcomes as columns
(
    smoking
    .groupby(['agegroup', 'smoker'], observed=True)['outcome']
    .value_counts(normalize=True)
    .unstack()
)

In [None]:
# one figure, two side-by-side bar charts of the raw counts
pp.figure(figsize=(6, 2))

# left panel: outcome counts
pp.subplot(1, 2, 1)
smoking['outcome'].value_counts().plot.bar(color=['C0', 'C1'])
pp.title('outcome')

# right panel: smoker counts
pp.subplot(1, 2, 2)
smoking['smoker'].value_counts().plot.bar(color=['C2', 'C3'])
pp.title('smoker');

In [None]:
# outcome counts as a horizontal bar chart
smoking['outcome'].value_counts().plot.barh(color=['C0', 'C1'], figsize=(3, 2))

In [None]:
# outcome counts as a pie chart; pie wedges are colored through matplotlib's
# `colors` keyword (plural) -- pie() has no `color` keyword, so passing `color`
# either fails or is ignored depending on the pandas version
smoking.outcome.value_counts().plot(kind='pie', colors=['C0','C1'], figsize=(3,2))

In [None]:
# table of outcome fractions: one row per smoker group, one column per outcome
grouped = (
    smoking
    .groupby('smoker', observed=True)['outcome']
    .value_counts(normalize=True)
    .unstack()
)
grouped

In [None]:
# side-by-side bars: one cluster per smoker group, one bar per outcome
axes = grouped.plot.bar(figsize=(4, 3))
# anchor the legend's upper-right corner slightly past the right edge of the axes
axes.legend(bbox_to_anchor=(1.1, 1), loc='upper right');

In [None]:
# stacked bars: one bar per smoker group, outcome fractions stacked on top of each other
axes = grouped.plot.bar(stacked=True, figsize=(4, 3))
# anchor the legend's upper-right corner past the right edge of the axes
axes.legend(bbox_to_anchor=(1.25, 1), loc='upper right');

In [None]:
# outcome fractions broken down by age bin and smoker status,
# with outcomes as columns
regrouped = (
    smoking
    .groupby(['agegroup', 'smoker'], observed=True)['outcome']
    .value_counts(normalize=True)
    .unstack()
)
regrouped

In [None]:
# one cluster of bars per (age bin, smoker) row, one bar per outcome
regrouped.plot.bar(figsize=(6, 4));

In [None]:
# keep only the 'Alive' fraction for each (age bin, smoker) pair, then pivot
# the remaining index so the table can be plotted directly
regrouped = (
    smoking
    .groupby(['agegroup', 'smoker'], observed=True)['outcome']
    .value_counts(normalize=True)
    .loc[:, :, 'Alive']
    .unstack()
)
regrouped

In [None]:
# bar chart of the table above: one cluster per row, one colored bar per column
regrouped.plot.bar(figsize=(4, 3), color=['C2', 'C3']);
pp.ylabel('fraction alive after 20 years');