# Advanced Dataframes Notes

In [1]:
import pandas as pd

In [2]:
import numpy as np

np.random.seed(123)

In [3]:
# Create list of values for names column.

students = ['Sally', 'Jane', 'Suzie', 'Billy', 'Ada', 'John', 'Thomas',
            'Marie', 'Albert', 'Richard', 'Isaac', 'Alan']

# Randomly generate arrays of scores for each student for each subject.
# Note that all the values need to have the same length here.

math_grades = np.random.randint(low=60, high=100, size=len(students))
english_grades = np.random.randint(low=60, high=100, size=len(students))
reading_grades = np.random.randint(low=60, high=100, size=len(students))

In [37]:
# Construct the DataFrame using the above lists and arrays.

df = pd.DataFrame({'name': students,
                   'math': math_grades,
                   'english': english_grades,
                   'reading': reading_grades,
                   'classroom': np.random.choice(['A', 'B'], len(students))})
df

Unnamed: 0,name,math,english,reading,classroom
0,Sally,62,85,80,A
1,Jane,88,79,67,B
2,Suzie,94,74,95,B
3,Billy,98,96,88,B
4,Ada,77,92,98,B
5,John,79,76,93,A
6,Thomas,82,64,81,A
7,Marie,93,63,90,A
8,Albert,92,62,87,A
9,Richard,69,80,94,B


In [5]:
# can make dataframes out of dictionaries of lists
# can make dataframes out of lists of dictionaries (as long as dicts have same keys)
# can make dfs out of lists of lists
# can make dfs out of arrays of arrays

In [6]:
# dataframes are rectangular data
# can read .csv files into a df
# can read .json files/format into df
# can read results of SQL queries into df

In [7]:
# read .csv file from a webpage into a dataframe
url = "https://gist.githubusercontent.com/ryanorsinger/19bc7eccd6279661bd13307026628ace/raw/e4b5d6787015a4782f96cad6d1d62a8bdbac54c7/lemonade.csv"
lemonade = pd.read_csv(url)
lemonade.head()

Unnamed: 0,Date,Day,Temperature,Rainfall,Flyers,Price,Sales
0,1/1/17,Sunday,27.0,2.0,15,0.5,10
1,1/2/17,Monday,28.9,1.33,15,0.5,13
2,1/3/17,Tuesday,34.5,1.33,27,0.5,15
3,1/4/17,Wednesday,44.1,1.05,28,0.5,17
4,1/5/17,Thursday,42.4,1.0,33,0.5,18


In [8]:
# can also read local .csv files
# file = 'file_path/file_name.csv'
# df = pd.read_csv(file)
# df.head()

In [9]:
# JSON is short for JavaScript Object Notation
# json is valid python syntax for dictionaries or lists of dictionaries
quotes = pd.read_json('https://aphorisms.glitch.me/api/all')
quotes.head()

Unnamed: 0,quote,author,name
0,"To go fast, go alone. To go far, go together",African Proverb,
1,"In fact, the only way to manage stress is to b...",anomymous,
2,Predispose yourself to practice,anonymous,
3,Respect the specs,Dr. Linda F. Wilson,
4,What we're doing is paint along with me rather...,Zach Gulde,


# SQL/Python+Pandas Crossover

In [10]:
def get_db_url(db_name):
    from env import user, host, password
    return f'mysql+pymysql://{user}:{password}@{host}/{db_name}'

In [11]:
sql = """
    SELECT * from dept_emp limit 3
"""
url = get_db_url("employees")

df1 = pd.read_sql(sql, url)
df1

Unnamed: 0,emp_no,dept_no,from_date,to_date
0,10001,d005,1986-06-26,9999-01-01
1,10002,d007,1996-08-03,9999-01-01
2,10003,d004,1995-12-03,9999-01-01


# Section 2

In [12]:
# getting a certain set of columns from dataframe

In [13]:
# make variable holding list of column names as strings
columns = ['name','math', 'classroom']
df[columns].head()

Unnamed: 0,name,math,classroom
0,Sally,62,A
1,Jane,88,B
2,Suzie,94,A
3,Billy,98,B
4,Ada,77,A


In [14]:
# or use double brackets without a variable:
df[['name','math', 'classroom']]

Unnamed: 0,name,math,classroom
0,Sally,62,A
1,Jane,88,B
2,Suzie,94,A
3,Billy,98,B
4,Ada,77,A
5,John,79,B
6,Thomas,82,A
7,Marie,93,A
8,Albert,92,A
9,Richard,69,A


In [16]:
# boolean mask review
a_names = df.name.str.startswith('A')
a_names

0     False
1     False
2     False
3     False
4      True
5     False
6     False
7     False
8      True
9     False
10    False
11     True
Name: name, dtype: bool

In [17]:
df[a_names]

Unnamed: 0,name,math,english,reading,classroom
4,Ada,77,92,98,A
8,Albert,92,62,87,A
11,Alan,92,62,72,A


In [18]:
# .loc
# df.loc[row_start:row_end, column_start:column_end]
# .loc is INCLUSIVE
df.loc[0:2]

Unnamed: 0,name,math,english,reading,classroom
0,Sally,62,85,80,A
1,Jane,88,79,67,B
2,Suzie,94,74,95,A


In [19]:
# show rows 0:2 (inclusive)
# show columns math through reading (inclusive)
df.loc[0:2, 'math':'reading']

Unnamed: 0,math,english,reading
0,62,85,80
1,88,79,67
2,94,74,95


In [21]:
df.loc[a_names, 'math':'reading']

Unnamed: 0,math,english,reading
4,77,92,98
8,92,62,87
11,92,62,72


In [22]:
# .iloc
# .iloc is NON-INCLUSIVE
# df.iloc[row_start:row_end, column_start:column_end]
# can't use column names, must use integer locations
df.iloc[0:2, 0:4]

Unnamed: 0,name,math,english,reading
0,Sally,62,85,80
1,Jane,88,79,67


## Aggregate Functions
- sum, count, min, mean, max, median
- can run on a list, give you a single number
- can run on a whole column
- can run on a group

In [23]:
df.head()

Unnamed: 0,name,math,english,reading,classroom
0,Sally,62,85,80,A
1,Jane,88,79,67,B
2,Suzie,94,74,95,A
3,Billy,98,96,88,B
4,Ada,77,92,98,A


In [26]:
df.math.agg(['mean', 'min', 'max', 'count', 'median'])

mean      84.833333
min       62.000000
max       98.000000
count     12.000000
median    90.000000
Name: math, dtype: float64

In [29]:
df.describe()
# performs multiple agg functions

Unnamed: 0,math,english,reading
count,12.0,12.0,12.0
mean,84.833333,77.666667,86.5
std,11.134168,13.371158,9.643651
min,62.0,62.0,67.0
25%,78.5,63.75,80.75
50%,90.0,77.5,89.0
75%,92.25,86.75,93.25
max,98.0,99.0,98.0


In [33]:
# group by
# creates tuple of dataframes
# doesn't give any info on its own
# needs aggregate function(s) to give meaningful data
df.groupby('classroom').agg(['mean', 'median','max'])

Unnamed: 0_level_0,math,math,math,english,english,english,reading,reading,reading
Unnamed: 0_level_1,mean,median,max,mean,median,max,mean,median,max
classroom,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
A,82.625,87,94,72.75,69.0,92,87.125,88.5,98
B,89.25,90,98,87.5,87.5,99,85.25,90.5,93


In [34]:
# to transpose for readability
df.groupby('classroom').agg(['mean', 'median','max']).T

Unnamed: 0,classroom,A,B
math,mean,82.625,89.25
math,median,87.0,90.0
math,max,94.0,98.0
english,mean,72.75,87.5
english,median,69.0,87.5
english,max,92.0,99.0
reading,mean,87.125,85.25
reading,median,88.5,90.5
reading,max,98.0,93.0


In [36]:
df.groupby('classroom').describe()

Unnamed: 0_level_0,math,math,math,math,math,math,math,math,english,english,english,english,english,reading,reading,reading,reading,reading,reading,reading,reading
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
classroom,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
A,8.0,82.625,12.281665,62.0,75.0,87.0,92.25,94.0,8.0,72.75,...,81.25,92.0,8.0,87.125,8.88719,72.0,80.75,88.5,94.25,98.0
B,4.0,89.25,7.973916,79.0,85.75,90.0,93.5,98.0,4.0,87.5,...,96.75,99.0,4.0,85.25,12.392874,67.0,82.75,90.5,93.0,93.0


In [39]:
# can specify certain columns with groupby:
df.groupby('classroom')['english'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
classroom,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
A,6.0,68.666667,9.626353,62.0,62.25,63.5,73.0,85.0
B,6.0,86.666667,10.308572,74.0,79.25,86.0,95.0,99.0


In [40]:
# np.where
# np.where(condition, this_where_True, this_where_False)
# create new column based on existing column
df['passing_math'] = np.where(df.math < 70, 'failing', 'passing')
df

Unnamed: 0,name,math,english,reading,classroom,passing_math
0,Sally,62,85,80,A,failing
1,Jane,88,79,67,B,passing
2,Suzie,94,74,95,B,passing
3,Billy,98,96,88,B,passing
4,Ada,77,92,98,B,passing
5,John,79,76,93,A,passing
6,Thomas,82,64,81,A,passing
7,Marie,93,63,90,A,passing
8,Albert,92,62,87,A,passing
9,Richard,69,80,94,B,failing


In [41]:
# can group by multiple columns
# like a python pivot table
grade_groups = df.groupby(['passing_math', 'classroom']).reading.agg(['mean', 'count'])
grade_groups

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,count
passing_math,classroom,Unnamed: 2_level_1,Unnamed: 3_level_1
failing,A,80.0,1
failing,B,94.0,1
passing,A,84.6,5
passing,B,88.2,5


In [43]:
grade_groups.columns

Index(['mean', 'count'], dtype='object')

In [45]:
# can change names of columns
grade_groups.columns = ['avg_reading_grade', 'number_of_students']
grade_groups

Unnamed: 0_level_0,Unnamed: 1_level_0,avg_reading_grade,number_of_students
passing_math,classroom,Unnamed: 2_level_1,Unnamed: 3_level_1
failing,A,80.0,1
failing,B,94.0,1
passing,A,84.6,5
passing,B,88.2,5


## Concatenating Dataframes

In [46]:
# concat with list of two dfs
# pd.concat([df1,df2], axis = 0)

In [47]:
df.math.mean(axis = 0) # axis = 0 means columnwise
# gives mean of column

84.83333333333333

In [48]:
df['total_avg_grade'] = df[['math','reading','english']].mean(axis = 1) # axis = 1 means row-wise
df.head()
# gives mean of row

Unnamed: 0,name,math,english,reading,classroom,passing_math,total_avg_grade
0,Sally,62,85,80,A,failing,75.666667
1,Jane,88,79,67,B,passing,78.0
2,Suzie,94,74,95,B,passing,87.666667
3,Billy,98,96,88,B,passing,94.0
4,Ada,77,92,98,B,passing,89.0


In [52]:
df1 = pd.DataFrame({'a': [1,2,3]}) # yesterday's data
df2 = pd.DataFrame({'a': [4,5,6]}) # today's data

# pd.concat([df1,df2]) # defaults axis argument to 0
concat_df1 = pd.concat([df1,df2], axis = 0, ignore_index = True)
concat_df1

Unnamed: 0,a
0,1
1,2
2,3
3,4
4,5
5,6


In [55]:
concat_df2 = pd.DataFrame({'b':[10,20,30,40,50,60]})
concat_df2

Unnamed: 0,b
0,10
1,20
2,30
3,40
4,50
5,60


In [57]:
pd.concat([concat_df1,concat_df2], axis = 1)

Unnamed: 0,a,b
0,1,10
1,2,20
2,3,30
3,4,40
4,5,50
5,6,60


In [58]:
# .merge
# works like a SQL join
# left_df.merge(right_df, how='inner', on=None, left_on=None, right_on=None, 
#               left_index=False, right_index=False, indicator=False)
users = pd.DataFrame({
    'id': [1, 2, 3, 4, 5, 6],
    'name': ['bob', 'joe', 'sally', 'adam', 'jane', 'mike'],
    'role_id': [1, 2, 3, 3, np.nan, np.nan]
})
roles = pd.DataFrame({
    'id': [1, 2, 3, 4],
    'name': ['admin', 'author', 'reviewer', 'commenter']
})

In [59]:
# Perform an outer join specifying the left and right DataFrame keys.

users.merge(roles, left_on='role_id', right_on='id', how='outer', indicator=True)
# outer join stitches dataframes together, keeping nulls from both dataframes
# indicator argument creates _merge column, shows which table(s) data is from
# x and y in columns are auto-generated bc both dataframes had columns with those names
# can use .drop/.rename to tidy up output

Unnamed: 0,id_x,name_x,role_id,id_y,name_y,_merge
0,1.0,bob,1.0,1.0,admin,both
1,2.0,joe,2.0,2.0,author,both
2,3.0,sally,3.0,3.0,reviewer,both
3,4.0,adam,3.0,3.0,reviewer,both
4,5.0,jane,,,,left_only
5,6.0,mike,,,,left_only
6,,,,4.0,commenter,right_only


# Section 3

## Reshaping
- changing level of observation
- can focus on subgroups

In [60]:
# pd.crosstab

# count number of students passing math in each classroom
df

Unnamed: 0,name,math,english,reading,classroom,passing_math,total_avg_grade
0,Sally,62,85,80,A,failing,75.666667
1,Jane,88,79,67,B,passing,78.0
2,Suzie,94,74,95,B,passing,87.666667
3,Billy,98,96,88,B,passing,94.0
4,Ada,77,92,98,B,passing,89.0
5,John,79,76,93,A,passing,82.666667
6,Thomas,82,64,81,A,passing,75.666667
7,Marie,93,63,90,A,passing,82.0
8,Albert,92,62,87,A,passing,80.333333
9,Richard,69,80,94,B,failing,81.0


In [61]:
pd.crosstab(df.passing_math, df.classroom)

classroom,A,B
passing_math,Unnamed: 1_level_1,Unnamed: 2_level_1
failing,1,1
passing,5,5


In [62]:
# can view subtotals with margins set to True
pd.crosstab(df.passing_math, df.classroom, margins=True)

classroom,A,B,All
passing_math,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
failing,1,1,2
passing,5,5,10
All,6,6,12


In [65]:
# can get percentages too
pd.crosstab(df.passing_math, df.classroom, margins=True, normalize=True).round(3)

classroom,A,B,All
passing_math,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
failing,0.083,0.083,0.167
passing,0.417,0.417,0.833
All,0.5,0.5,1.0


In [66]:
# .pivot_table
# values that make up rows (index)
# values that make up columns
# values that we're aggregating
# aggregation method (defaults to mean)

df.pivot_table(index='classroom', columns='passing_math', values='math')
# shows avg math grade for all students, separated by classroom and pass/fail status

passing_math,failing,passing
classroom,Unnamed: 1_level_1,Unnamed: 2_level_1
A,62.0,87.6
B,69.0,89.8


In [69]:
# aggfunc lets you specify aggregate function
df.pivot_table(aggfunc='min', index='classroom', columns='passing_math', values='english')

passing_math,failing,passing
classroom,Unnamed: 1_level_1,Unnamed: 2_level_1
A,85,62
B,80,74


In [71]:
# can use aggfunc to show multiple aggregate functions
df.pivot_table(aggfunc=['min', 'median', 'mean'], index='classroom', columns='passing_math', values='english')

Unnamed: 0_level_0,min,min,median,median,mean,mean
passing_math,failing,passing,failing,passing,failing,passing
classroom,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
A,85,62,85,63,85.0,65.4
B,80,74,80,92,80.0,88.0


In [72]:
from pydataset import data
tips=data('tips')

In [73]:
tips.pivot_table(index='day', columns='time', values='tip')

time,Dinner,Lunch
day,Unnamed: 1_level_1,Unnamed: 2_level_1
Fri,2.94,2.382857
Sat,2.993103,
Sun,3.255132,
Thur,3.0,2.767705


In [67]:
n = 40

orders = pd.DataFrame({
    'drink': np.random.choice(['Tea', 'Water', 'Water'], n),
    'meal': np.random.choice(['Curry', 'Yakisoba Noodle', 'Pad Thai'], n),
})

orders.sample(10)

Unnamed: 0,drink,meal
0,Tea,Yakisoba Noodle
4,Water,Yakisoba Noodle
22,Water,Pad Thai
16,Water,Curry
11,Tea,Pad Thai
19,Water,Curry
32,Water,Yakisoba Noodle
18,Water,Yakisoba Noodle
26,Water,Pad Thai
21,Tea,Yakisoba Noodle


In [74]:
prices = {
    'Yakisoba Noodle': 9,
    'Curry': 11,
    'Pad Thai': 10,
    'Tea': 2,
    'Water': 0,
}

In [77]:
orders.meal.map(prices) + orders.drink.map(prices)

0     11
1     10
2     10
3     13
4      9
5      9
6     11
7      9
8     11
9      9
10    12
11    12
12    11
13    11
14     9
15     9
16    11
17    11
18     9
19    11
20    12
21    11
22    10
23    11
24     9
25    11
26    10
27    11
28    13
29    10
30    11
31    10
32     9
33    11
34    11
35    11
36    13
37    11
38    11
39     9
dtype: int64

In [79]:
orders['bill'] = orders.meal.map(prices) + orders.drink.map(prices)
orders.head()

Unnamed: 0,drink,meal,bill
0,Tea,Yakisoba Noodle,11
1,Water,Pad Thai,10
2,Water,Pad Thai,10
3,Tea,Curry,13
4,Water,Yakisoba Noodle,9


In [81]:
orders.pivot_table(index='meal', columns='drink', values='bill')

drink,Tea,Water
meal,Unnamed: 1_level_1,Unnamed: 2_level_1
Curry,13,11
Pad Thai,12,10
Yakisoba Noodle,11,9


In [82]:
pd.crosstab(orders.meal, orders.drink, normalize=True, margins=True)

drink,Tea,Water,All
meal,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Curry,0.075,0.35,0.425
Pad Thai,0.075,0.15,0.225
Yakisoba Noodle,0.1,0.25,0.35
All,0.25,0.75,1.0


In [85]:
# gives same info as pivot table above
orders.groupby(['drink','meal']).bill.mean()

drink  meal           
Tea    Curry              13
       Pad Thai           12
       Yakisoba Noodle    11
Water  Curry              11
       Pad Thai           10
       Yakisoba Noodle     9
Name: bill, dtype: int64