In [26]:
import pandas as pd
import numpy as np

In [27]:
# Letter grades from best to worst, each labelled (via the index) with a
# qualitative performance band.
letter_grades = ['A+', 'A', 'A-', 'B+', 'B', 'B-', 'C+', 'C', 'C-', 'D+', 'D']
performance_bands = (['Excellent'] * 3
                     + ['Good'] * 3
                     + ['Average'] * 3
                     + ['OK'] * 2)

# Name the single column at construction time instead of renaming column 0 afterwards.
df = pd.DataFrame({'Letter Grades': letter_grades}, index=performance_bands)
df

Unnamed: 0,Letter Grades
Excellent,A+
Excellent,A
Excellent,A-
Good,B+
Good,B
Good,B-
Average,C+
Average,C
Average,C-
OK,D+


In [28]:
# Nominal-scale example: cast the column to the 'category' dtype.
# The converted Series is only displayed here; df itself is left unchanged.
letter_column = df['Letter Grades']
letter_column.astype('category')

Excellent    A+
Excellent     A
Excellent    A-
Good         B+
Good          B
Good         B-
Average      C+
Average       C
Average      C-
OK           D+
OK            D
Name: Letter Grades, dtype: category
Categories (11, object): [A, A+, A-, B, ..., C+, C-, D, D+]

In [29]:
# Ordinal-scale example: give the categories an explicit order (worst -> best)
# so that individual grades can be compared with <, >, etc.
#
# The order is supplied through an ordered CategoricalDtype. Passing
# `categories=` / `ordered=` directly to astype('category', ...) is deprecated
# (it emits a FutureWarning on older pandas and raises on newer releases).
grade_order = ['D', 'D+', 'C-', 'C', 'C+', 'B-', 'B', 'B+', 'A-', 'A', 'A+']
grades = df['Letter Grades'].astype(
    pd.api.types.CategoricalDtype(categories=grade_order, ordered=True)
)

  This is separate from the ipykernel package so we can avoid doing imports until


In [30]:
# Because the categories are ordered, a comparison yields a boolean mask:
# True wherever the grade ranks strictly above 'C'.
above_c = grades > 'C'
above_c

Excellent     True
Excellent     True
Excellent     True
Good          True
Good          True
Good          True
Average       True
Average      False
Average      False
OK           False
OK           False
Name: Letter Grades, dtype: bool

In [35]:
# Interval / ratio-scale example: pd.cut splits a numeric column into
# equal-width bins.
# Keep only the rows with SUMLEV == 50 (presumably the county-level records -
# confirm against the census file's documentation).
cdf = pd.read_csv('census.csv').loc[lambda frame: frame['SUMLEV'] == 50]

# Average CENSUS2010POP per state name; the result is indexed by STNAME.
cut_cdf = cdf.groupby('STNAME').agg({'CENSUS2010POP': np.average})

# Assign each state's average to one of 10 equal-width bins.
pd.cut(cut_cdf['CENSUS2010POP'], bins=10)

STNAME
Alabama                   (11706.087, 75333.413]
Alaska                    (11706.087, 75333.413]
Arizona                 (390320.176, 453317.529]
Arkansas                  (11706.087, 75333.413]
California              (579312.234, 642309.586]
Colorado                 (75333.413, 138330.766]
Connecticut             (390320.176, 453317.529]
Delaware                (264325.471, 327322.823]
District of Columbia    (579312.234, 642309.586]
Florida                 (264325.471, 327322.823]
Georgia                   (11706.087, 75333.413]
Hawaii                  (264325.471, 327322.823]
Idaho                     (11706.087, 75333.413]
Illinois                 (75333.413, 138330.766]
Indiana                   (11706.087, 75333.413]
Iowa                      (11706.087, 75333.413]
Kansas                    (11706.087, 75333.413]
Kentucky                  (11706.087, 75333.413]
Louisiana                 (11706.087, 75333.413]
Maine                    (75333.413, 138330.766]
Maryland     

In [40]:
# Same idea with named bins: one equal-width bin per label, and each value is
# reported by its label instead of by its numeric interval.
bin_labels = ['Scarcely Populated', 'Moderately Populated', 'Densely Populated']
pop_comparison = pd.cut(cut_cdf['CENSUS2010POP'],
                        bins=len(bin_labels),
                        labels=bin_labels)
pop_comparison

STNAME
Alabama                   Scarcely Populated
Alaska                    Scarcely Populated
Arizona                 Moderately Populated
Arkansas                  Scarcely Populated
California                 Densely Populated
Colorado                  Scarcely Populated
Connecticut                Densely Populated
Delaware                Moderately Populated
District of Columbia       Densely Populated
Florida                 Moderately Populated
Georgia                   Scarcely Populated
Hawaii                  Moderately Populated
Idaho                     Scarcely Populated
Illinois                  Scarcely Populated
Indiana                   Scarcely Populated
Iowa                      Scarcely Populated
Kansas                    Scarcely Populated
Kentucky                  Scarcely Populated
Louisiana                 Scarcely Populated
Maine                     Scarcely Populated
Maryland                Moderately Populated
Massachusetts              Densely Populated
Mic

In [41]:
# Load the cars dataset used by the pivot-table examples and display it.
cars_csv = 'cars.csv'
cars = pd.read_csv(cars_csv)
cars

Unnamed: 0,YEAR,Make,Model,Size,(kW),Unnamed: 5,TYPE,CITY (kWh/100 km),HWY (kWh/100 km),COMB (kWh/100 km),CITY (Le/100 km),HWY (Le/100 km),COMB (Le/100 km),(g/km),RATING,(km),TIME (h)
0,2012,MITSUBISHI,i-MiEV,SUBCOMPACT,49,A1,B,16.9,21.4,18.7,1.9,2.4,2.1,0,,100,7
1,2012,NISSAN,LEAF,MID-SIZE,80,A1,B,19.3,23.0,21.1,2.2,2.6,2.4,0,,117,7
2,2013,FORD,FOCUS ELECTRIC,COMPACT,107,A1,B,19.0,21.1,20.0,2.1,2.4,2.2,0,,122,4
3,2013,MITSUBISHI,i-MiEV,SUBCOMPACT,49,A1,B,16.9,21.4,18.7,1.9,2.4,2.1,0,,100,7
4,2013,NISSAN,LEAF,MID-SIZE,80,A1,B,19.3,23.0,21.1,2.2,2.6,2.4,0,,117,7
5,2013,SMART,FORTWO ELECTRIC DRIVE CABRIOLET,TWO-SEATER,35,A1,B,17.2,22.5,19.6,1.9,2.5,2.2,0,,109,8
6,2013,SMART,FORTWO ELECTRIC DRIVE COUPE,TWO-SEATER,35,A1,B,17.2,22.5,19.6,1.9,2.5,2.2,0,,109,8
7,2013,TESLA,MODEL S (40 kWh battery),FULL-SIZE,270,A1,B,22.4,21.9,22.2,2.5,2.5,2.5,0,,224,6
8,2013,TESLA,MODEL S (60 kWh battery),FULL-SIZE,270,A1,B,22.2,21.7,21.9,2.5,2.4,2.5,0,,335,10
9,2013,TESLA,MODEL S (85 kWh battery),FULL-SIZE,270,A1,B,23.8,23.2,23.6,2.7,2.6,2.6,0,,426,12


In [43]:
# Mean of the '(kW)' column for each model year (rows) and manufacturer (columns).
#   index   -> column whose values become the pivot table's row labels
#   columns -> column whose values become the pivot table's column labels
#   values  -> column whose entries are aggregated into the cells
#   aggfunc -> aggregation applied to the values that fall into each cell
# NOTE(review): kW is a unit of power, so '(kW)' is presumably motor power
# rather than battery capacity - confirm against the dataset's documentation.
#
# The aggregation is named by its string alias: it computes the same mean as
# np.mean, and passing the numpy callable is deprecated in recent pandas.
cars.pivot_table(values='(kW)', index='YEAR', columns='Make', aggfunc='mean')

Make,BMW,CHEVROLET,FORD,KIA,MITSUBISHI,NISSAN,SMART,TESLA
YEAR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2012,,,,,49.0,80.0,,
2013,,,107.0,,49.0,80.0,35.0,280.0
2014,,104.0,107.0,,49.0,80.0,35.0,268.333333
2015,125.0,104.0,107.0,81.0,49.0,80.0,35.0,320.666667
2016,125.0,104.0,107.0,81.0,49.0,80.0,35.0,409.7


In [48]:
# Passing a list of aggregation functions produces one block of columns per
# function (here 'mean' and 'min'), each broken down by Make.
# The overall statistics - the extra 'All' row and the 'All' column inside each
# block - come from margins=True, not from passing a list.
#
# String aliases are used instead of np.mean / np.min: the values are the same,
# recent pandas deprecates passing the numpy callables, and the minimum block is
# labelled 'min' regardless of the installed numpy version (older numpy
# labelled it 'amin').
cars.pivot_table(values='(kW)',
                 index='YEAR',
                 columns='Make',
                 aggfunc=['mean', 'min'],
                 margins=True)

Unnamed: 0_level_0,mean,mean,mean,mean,mean,mean,mean,mean,mean,amin,amin,amin,amin,amin,amin,amin,amin,amin
Make,BMW,CHEVROLET,FORD,KIA,MITSUBISHI,NISSAN,SMART,TESLA,All,BMW,CHEVROLET,FORD,KIA,MITSUBISHI,NISSAN,SMART,TESLA,All
YEAR,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2
2012,,,,,49.0,80.0,,,64.5,,,,,49.0,80.0,,,49
2013,,,107.0,,49.0,80.0,35.0,280.0,158.444444,,,107.0,,49.0,80.0,35.0,270.0,35
2014,,104.0,107.0,,49.0,80.0,35.0,268.333333,135.0,,104.0,107.0,,49.0,80.0,35.0,225.0,35
2015,125.0,104.0,107.0,81.0,49.0,80.0,35.0,320.666667,181.428571,125.0,104.0,107.0,81.0,49.0,80.0,35.0,280.0,35
2016,125.0,104.0,107.0,81.0,49.0,80.0,35.0,409.7,252.263158,125.0,104.0,107.0,81.0,49.0,80.0,35.0,283.0,35
All,125.0,104.0,107.0,81.0,49.0,80.0,35.0,345.478261,190.622642,125.0,104.0,107.0,81.0,49.0,80.0,35.0,225.0,35
