In [None]:
### Scales
# (a,b)  (c,d): Scales
"""
- Ratio scale:
  - units are equally spaced
  - mathematical operations of +-*/ are all valid
  - E.g. height and weight
  
- Interval scale:
  - units are equally spaced, but there is no true zero

- Ordinal scale:
  - the order of the units are important, but not evenly spaced
  - Letter grades such as A+, A are a good example

- Nominal scale:
  - categories of data, but the categories have no order with respect to one another
  - E.g.Teams of a sport
"""

In [3]:
import pandas as pd
df = pd.DataFrame(['A+','A','A-','B+','B','B-','C+','C','C-','D+','D'],
                  index = ['excellent','excellent','excellent','good','good','good','ok','ok',
                           'ok','poor','poor'])
df.rename(columns = {0:'Grades'}, inplace = True)
df

Unnamed: 0,Grades
excellent,A+
excellent,A
excellent,A-
good,B+
good,B
good,B-
ok,C+
ok,C
ok,C-
poor,D+


In [4]:
# then we could use astype to transform thest data into categorical data
df['Grades'].astype('category')


excellent    A+
excellent     A
excellent    A-
good         B+
good          B
good         B-
ok           C+
ok            C
ok           C-
poor         D+
poor          D
Name: Grades, dtype: category
Categories (11, object): [A, A+, A-, B, ..., C+, C-, D, D+]

In [5]:
import pandas as pd
df = pd.DataFrame(['A+','A','A-','B+','B','B-','C+','C','C-','D+','D'],
                  index = ['excellent','excellent','excellent','good','good','good','ok','ok',
                           'ok','poor','poor'])
df.rename(columns = {0:'Grades'}, inplace = True)
# we could also make the grades in an ordinal order by using ordered
grades = df['Grades'].astype('category',categories = ['D','D+','C-','C','C+','B-','B','B+','A-',
                                                      'A','A+'],ordered = True)
grades

  


excellent    A+
excellent     A
excellent    A-
good         B+
good          B
good         B-
ok           C+
ok            C
ok           C-
poor         D+
poor          D
Name: Grades, dtype: category
Categories (11, object): [D < D+ < C- < C ... B+ < A- < A < A+]

In [7]:
# Well since we already have this ordinal data, we could using the order to make the boolean mask
df = df[df['Grades']>'C']
df

Unnamed: 0,Grades
ok,C+
ok,C-
poor,D+
poor,D


In [9]:
# Exercise 1
s = pd.Series(['Low','Low','High','Medium','Low','High','Low'])
level = s.astype('category',categories = ['Low','Medium','High'],ordered = True)
level
# the schema is df = df.astype('category', categories = [.. , .. , .., ..], ordered =True),
# then the actually order is the order within the list

  This is separate from the ipykernel package so we can avoid doing imports until


0       Low
1       Low
2      High
3    Medium
4       Low
5      High
6       Low
dtype: category
Categories (3, object): [Low < Medium < High]

In [14]:
import numpy as np
df = pd.read_csv('census.csv')
df = df[df['SUMLEV'] == 50]
df = df.set_index('STNAME').groupby(level = 0)['CENSUS2010POP'].agg({'avg':np.average})
# this will cut the df into n bins, each with equal length of interval,
# and the output will gives back the index of each data and which interval they belongs to
print(pd.cut(df['avg'],10))

STNAME
Alabama                   (11706.087, 75333.413]
Alaska                    (11706.087, 75333.413]
Arizona                 (390320.176, 453317.529]
Arkansas                  (11706.087, 75333.413]
California              (579312.234, 642309.586]
Colorado                 (75333.413, 138330.766]
Connecticut             (390320.176, 453317.529]
Delaware                (264325.471, 327322.823]
District of Columbia    (579312.234, 642309.586]
Florida                 (264325.471, 327322.823]
Georgia                   (11706.087, 75333.413]
Hawaii                  (264325.471, 327322.823]
Idaho                     (11706.087, 75333.413]
Illinois                 (75333.413, 138330.766]
Indiana                   (11706.087, 75333.413]
Iowa                      (11706.087, 75333.413]
Kansas                    (11706.087, 75333.413]
Kentucky                  (11706.087, 75333.413]
Louisiana                 (11706.087, 75333.413]
Maine                    (75333.413, 138330.766]
Maryland     

is deprecated and will be removed in a future version
  after removing the cwd from sys.path.


In [21]:
# Exercise 2
# Suppose we have a series that holds height data for jacket weares. Use pd.cut to bin this data
# into 3 bins
s = pd.Series([168,180,174,190,170,185,179,181,175,169,182,177,180,171])
pd.cut(s,3)

0     (167.978, 175.333]
1     (175.333, 182.667]
2     (167.978, 175.333]
3       (182.667, 190.0]
4     (167.978, 175.333]
5       (182.667, 190.0]
6     (175.333, 182.667]
7     (175.333, 182.667]
8     (167.978, 175.333]
9     (167.978, 175.333]
10    (175.333, 182.667]
11    (175.333, 182.667]
12    (175.333, 182.667]
13    (167.978, 175.333]
dtype: category
Categories (3, interval[float64]): [(167.978, 175.333] < (175.333, 182.667] < (182.667, 190.0]]