In [26]:
import pandas as pd
import numpy as np

In [27]:
# Build the demo frame in a single step: the eleven letter grades are the
# values of the 'Letter Grades' column and the index carries the coarse quality
# label that goes with each grade. Naming the column at construction time means
# the default integer column label 0 never exists, so no in-place rename is
# needed afterwards.
df = pd.DataFrame(
    {'Letter Grades': ['A+', 'A', 'A-', 'B+', 'B', 'B-', 'C+', 'C', 'C-', 'D+', 'D']},
    index=['Excellent', 'Excellent', 'Excellent',
           'Good', 'Good', 'Good',
           'Average', 'Average', 'Average',
           'OK', 'OK'],
)
df

Unnamed: 0,Letter Grades
Excellent,A+
Excellent,A
Excellent,A-
Good,B+
Good,B
Good,B-
Average,C+
Average,C
Average,C-
OK,D+
OK,D


In [28]:
# Nominal scale example: cast the column to pandas' categorical dtype without
# declaring any order. A default CategoricalDtype() (no explicit categories,
# ordered=False) is what the 'category' dtype string stands for, so the grades
# become plain labels that cannot be ranked against each other.
df['Letter Grades'].astype(pd.api.types.CategoricalDtype())

Excellent    A+
Excellent     A
Excellent    A-
Good         B+
Good          B
Good         B-
Average      C+
Average       C
Average      C-
OK           D+
OK            D
Name: Letter Grades, dtype: category
Categories (11, object): [A, A+, A-, B, ..., C+, C-, D, D+]

In [29]:
# Ordinal scale example: give the categories an explicit order so that they
# can be compared. The categories are listed from the lowest grade ('D') to
# the highest ('A+'), and ordered=True makes that list order meaningful.
#
# The dtype is built explicitly because passing `categories=` / `ordered=`
# directly to .astype() is a deprecated call signature that newer pandas
# releases no longer accept; an ordered CategoricalDtype is the supported way
# to express the same cast.
grades = df['Letter Grades'].astype(
    pd.api.types.CategoricalDtype(
        categories=['D', 'D+', 'C-', 'C', 'C+', 'B-', 'B', 'B+', 'A-', 'A', 'A+'],
        ordered=True,
    )
)

  This is separate from the ipykernel package so we can avoid doing imports until


In [30]:
# Because `grades` is an ordered categorical, it can be compared against one of
# its categories. The comparison yields a boolean Series (True for every grade
# ranked above 'C') that can be used as a mask.
above_c = grades > 'C'
above_c

Excellent     True
Excellent     True
Excellent     True
Good          True
Good          True
Good          True
Average       True
Average      False
Average      False
OK           False
OK           False
Name: Letter Grades, dtype: bool

In [35]:
# Interval / ratio scale example: pd.cut with an integer bin count splits the
# range of a numeric column into that many equal-width bins.
cdf = pd.read_csv('census.csv')
# Keep only the rows whose SUMLEV is 50 (presumably the county-level records
# of the census file -- TODO confirm against the data dictionary).
sumlev_is_50 = cdf['SUMLEV'] == 50
cdf = cdf.loc[sumlev_is_50]
# One row per state name, holding the average CENSUS2010POP of that state's rows.
cut_cdf = cdf.groupby('STNAME').agg({'CENSUS2010POP': np.average})
# Assign every state's average to one of 10 bins.
pd.cut(cut_cdf['CENSUS2010POP'], 10)

STNAME
Alabama                   (11706.087, 75333.413]
Alaska                    (11706.087, 75333.413]
Arizona                 (390320.176, 453317.529]
Arkansas                  (11706.087, 75333.413]
California              (579312.234, 642309.586]
Colorado                 (75333.413, 138330.766]
Connecticut             (390320.176, 453317.529]
Delaware                (264325.471, 327322.823]
District of Columbia    (579312.234, 642309.586]
Florida                 (264325.471, 327322.823]
Georgia                   (11706.087, 75333.413]
Hawaii                  (264325.471, 327322.823]
Idaho                     (11706.087, 75333.413]
Illinois                 (75333.413, 138330.766]
Indiana                   (11706.087, 75333.413]
Iowa                      (11706.087, 75333.413]
Kansas                    (11706.087, 75333.413]
Kentucky                  (11706.087, 75333.413]
Louisiana                 (11706.087, 75333.413]
Maine                    (75333.413, 138330.766]
Maryland     

In [40]:
# Same binning idea with three bins, but each bin is given a readable label
# instead of its interval. Labels are listed from the lowest bin to the highest.
bin_labels = ['Scarcely Populated', 'Moderately Populated', 'Densely Populated']
pop_comparison = pd.cut(
    cut_cdf['CENSUS2010POP'],
    bins=3,
    labels=bin_labels,
)
pop_comparison

STNAME
Alabama                   Scarcely Populated
Alaska                    Scarcely Populated
Arizona                 Moderately Populated
Arkansas                  Scarcely Populated
California                 Densely Populated
Colorado                  Scarcely Populated
Connecticut                Densely Populated
Delaware                Moderately Populated
District of Columbia       Densely Populated
Florida                 Moderately Populated
Georgia                   Scarcely Populated
Hawaii                  Moderately Populated
Idaho                     Scarcely Populated
Illinois                  Scarcely Populated
Indiana                   Scarcely Populated
Iowa                      Scarcely Populated
Kansas                    Scarcely Populated
Kentucky                  Scarcely Populated
Louisiana                 Scarcely Populated
Maine                     Scarcely Populated
Maryland                Moderately Populated
Massachusetts              Densely Populated
Mic