# Chapter 5 - Basic Math and Statistics
## Segment 4 - Summarizing categorical data using pandas

In [3]:
import numpy as np
import pandas as pd

### The basics

In [9]:
# Load the mtcars table, then give its twelve columns explicit names.
address = 'Data/mtcars.csv'
cars = pd.read_csv(address)
cars.columns = [
    'car_names',
    'mpg', 'cyl', 'disp', 'hp', 'drat', 'wt', 'qsec',
    'vs', 'am', 'gear', 'carb',
]
# Use the car names as row labels; the car_names column itself stays in place.
cars.index = cars['car_names']
cars

Unnamed: 0_level_0,car_names,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
car_names,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Mazda RX4,Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
Mazda RX4 Wag,Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
Datsun 710,Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
Hornet 4 Drive,Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
Hornet Sportabout,Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2
Valiant,Valiant,18.1,6,225.0,105,2.76,3.46,20.22,1,0,3,1
Duster 360,Duster 360,14.3,8,360.0,245,3.21,3.57,15.84,0,0,3,4
Merc 240D,Merc 240D,24.4,4,146.7,62,3.69,3.19,20.0,1,0,4,2
Merc 230,Merc 230,22.8,4,140.8,95,3.92,3.15,22.9,1,0,4,2
Merc 280,Merc 280,19.2,6,167.6,123,3.92,3.44,18.3,1,0,4,4


In [14]:
# Tally how often each distinct value occurs in the carb column.
carb = cars['carb']
carb.value_counts()

4    10
2    10
1     7
3     3
8     1
6     1
Name: carb, dtype: int64

In [25]:
# Subset of columns used for the categorical summaries that follow.
categorical_columns = ['cyl', 'vs', 'am', 'gear', 'carb']
cars_cat = cars[categorical_columns]
cars_cat.head()

Unnamed: 0_level_0,cyl,vs,am,gear,carb
car_names,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Mazda RX4,6,0,1,4,4
Mazda RX4 Wag,6,0,1,4,4
Datsun 710,4,1,1,4,1
Hornet 4 Drive,6,1,0,3,1
Hornet Sportabout,8,0,0,3,2


In [24]:
# Group the rows of cars_cat by their gear value, then preview the
# first five rows of every group.
gears_group = cars_cat.groupby(by='gear')
gears_group.head(n=5)

Unnamed: 0_level_0,cyl,vs,am,gear,carb
car_names,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Mazda RX4,6,0,1,4,4
Mazda RX4 Wag,6,0,1,4,4
Datsun 710,4,1,1,4,1
Hornet 4 Drive,6,1,0,3,1
Hornet Sportabout,8,0,0,3,2
Valiant,6,1,0,3,1
Duster 360,8,0,0,3,4
Merc 240D,4,1,0,4,2
Merc 230,4,1,0,4,2
Merc 450SE,8,0,0,3,3


In [23]:
# Per-gear summary statistics (count, mean, std, min, quartiles, max) for each column.
gears_group.describe()

Unnamed: 0_level_0,cyl,cyl,cyl,cyl,cyl,cyl,cyl,cyl,vs,vs,...,am,am,carb,carb,carb,carb,carb,carb,carb,carb
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
gear,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
3,15.0,7.466667,1.187234,4.0,8.0,8.0,8.0,8.0,15.0,0.2,...,0.0,0.0,15.0,2.666667,1.175139,1.0,2.0,3.0,4.0,4.0
4,12.0,4.666667,0.984732,4.0,4.0,4.0,6.0,6.0,12.0,0.833333,...,1.0,1.0,12.0,2.333333,1.302678,1.0,1.0,2.0,4.0,4.0
5,5.0,6.0,2.0,4.0,4.0,6.0,8.0,8.0,5.0,0.2,...,1.0,1.0,5.0,4.4,2.607681,2.0,2.0,4.0,6.0,8.0


### Transforming variables to categorical data type

In [29]:
# Add gear_group: the gear column converted to pandas' category dtype.
cars['gear_group'] = cars['gear'].astype('category')
cars.head()

Unnamed: 0_level_0,car_names,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb,gear_group
car_names,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Mazda RX4,Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4,4
Mazda RX4 Wag,Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4,4
Datsun 710,Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1,4
Hornet 4 Drive,Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1,3
Hornet Sportabout,Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2,3


In [37]:
# Confirm that the new column carries a categorical dtype.
cars['gear_group'].dtype

CategoricalDtype(categories=[3, 4, 5], ordered=False)

In [40]:
# Number of cars falling into each gear_group category.
cars.gear_group.value_counts()

3    15
4    12
5     5
Name: gear_group, dtype: int64

### Describing categorical data with crosstabs

In [43]:
# Contingency table: rows are am values, columns are gear values,
# each cell counts the cars with that combination.
pd.crosstab(index=cars['am'], columns=cars['gear'])

gear,3,4,5
am,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,15,4,0
1,0,8,5


In [44]:
# Same cross-tabulation applied to mpg (rows) and hp (columns); with this
# many distinct values on both axes, the resulting table is mostly zeros.
pd.crosstab(index=cars['mpg'], columns=cars['hp'])

hp,52,62,65,66,91,93,95,97,105,109,...,123,150,175,180,205,215,230,245,264,335
mpg,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10.4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1,0,0,0,0
13.3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
14.3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
14.7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
15.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
15.2,0,0,0,0,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,0
15.5,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
15.8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
16.4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
17.3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
