In [1]:
import pandas as pd

# Understanding merge operations on DF

In [13]:
# Creating products and invoices dataframes
products=pd.DataFrame([{'Product ID':4109,'Price':5.0,'Product':'Sushi Roll'},
                      {'Product ID':1412,'Price':0.5,'Product':'Egg'},
                     {'Product ID':8931,'Price':1.5,'Product':'Bagel'}])
products=products.set_index('Product ID')
products.head()

Unnamed: 0_level_0,Price,Product
Product ID,Unnamed: 1_level_1,Unnamed: 2_level_1
4109,5.0,Sushi Roll
1412,0.5,Egg
8931,1.5,Bagel


In [14]:
invoices=pd.DataFrame([{'Customer':'Ali','Product ID':4109,'Quantity':1},
                      {'Customer':'Eric','Product ID':1412,'Quantity':12},
                      {'Customer':'Ande','Product ID':8931,'Quantity':6},
                      {'Customer':'Sam','Product ID':4109,'Quantity':2}])
invoices.head()

Unnamed: 0,Customer,Product ID,Quantity
0,Ali,4109,1
1,Eric,1412,12
2,Ande,8931,6
3,Sam,4109,2


In [17]:
# OUTER JOIN PERFORMS THE FULL JOIN AND SELECT ALL THE ENTRIES
result=pd.merge(products,invoices,how='outer',left_on=products.index,right_on='Product ID')
result.head()

Unnamed: 0,Price,Product,Customer,Product ID,Quantity
0,5.0,Sushi Roll,Ali,4109,1
1,5.0,Sushi Roll,Sam,4109,2
2,0.5,Egg,Eric,1412,12
3,1.5,Bagel,Ande,8931,6


In [18]:
# Creating a DF and understanding only those students who are also staff members, i.e. inner join
staff_df=pd.DataFrame([{'First Name':'Kelly','Last Name':'Desjardine','Role':'Director of HR'},
                      {'First Name':'Sally','Last Name':'Brooks','Role':'Course liaison'},
                      {'First Name':'James','Last Name':'Wilde','Role':'Grader'}])

student_df=pd.DataFrame([{'First Name':'James','Last Name':'Hammond','School':'Business'},
                        {'First Name':'Mike','Last Name':'Smith','School':'Law'},
                        {'First Name':'Sally','Last Name':'Brooks','School':'Engineering'}])
print(staff_df)
print(student_df)

  First Name   Last Name            Role
0      Kelly  Desjardine  Director of HR
1      Sally      Brooks  Course liaison
2      James       Wilde          Grader
  First Name Last Name       School
0      James   Hammond     Business
1       Mike     Smith          Law
2      Sally    Brooks  Engineering


In [19]:
# Selecting the students who are also staff members
result=pd.merge(student_df,staff_df,how='inner',on=['First Name','Last Name'])
result

Unnamed: 0,First Name,Last Name,School,Role
0,Sally,Brooks,Engineering,Course liaison


# Idiomatic Pandas : Making Code Pandorable

In [32]:
# There are more than on ways to solve a particular problem in Python,
# Idiomatic Python solutions are those that has both high performance and high readability.
census=pd.read_csv('../dataset/census.csv')
census.head()

Unnamed: 0,SUMLEV,REGION,DIVISION,STATE,COUNTY,STNAME,CTYNAME,CENSUS2010POP,ESTIMATESBASE2010,POPESTIMATE2010,...,RDOMESTICMIG2011,RDOMESTICMIG2012,RDOMESTICMIG2013,RDOMESTICMIG2014,RDOMESTICMIG2015,RNETMIG2011,RNETMIG2012,RNETMIG2013,RNETMIG2014,RNETMIG2015
0,40,3,6,1,0,Alabama,Alabama,4779736,4780127,4785161,...,0.002295,-0.193196,0.381066,0.582002,-0.467369,1.030015,0.826644,1.383282,1.724718,0.712594
1,50,3,6,1,1,Alabama,Autauga County,54571,54571,54660,...,7.242091,-2.915927,-3.012349,2.265971,-2.530799,7.606016,-2.626146,-2.722002,2.59227,-2.187333
2,50,3,6,1,3,Alabama,Baldwin County,182265,182265,183193,...,14.83296,17.647293,21.845705,19.243287,17.197872,15.844176,18.559627,22.727626,20.317142,18.293499
3,50,3,6,1,5,Alabama,Barbour County,27457,27457,27341,...,-4.728132,-2.50069,-7.056824,-3.904217,-10.543299,-4.874741,-2.758113,-7.167664,-3.978583,-10.543299
4,50,3,6,1,7,Alabama,Bibb County,22915,22919,22861,...,-5.527043,-5.068871,-6.201001,-0.177537,0.177258,-5.088389,-4.363636,-5.403729,0.754533,1.107861


In [33]:
# Consider the following operations
census=census[census['SUMLEV']==50]
census.set_index(['STNAME','CTYNAME'],inplace=True)
census.rename(columns={'ESTIMATESBASE2010':'Estimate Base 2010'},inplace=True)
census.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,SUMLEV,REGION,DIVISION,STATE,COUNTY,CENSUS2010POP,Estimate Base 2010,POPESTIMATE2010,POPESTIMATE2011,POPESTIMATE2012,...,RDOMESTICMIG2011,RDOMESTICMIG2012,RDOMESTICMIG2013,RDOMESTICMIG2014,RDOMESTICMIG2015,RNETMIG2011,RNETMIG2012,RNETMIG2013,RNETMIG2014,RNETMIG2015
STNAME,CTYNAME,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Alabama,Autauga County,50,3,6,1,1,54571,54571,54660,55253,55175,...,7.242091,-2.915927,-3.012349,2.265971,-2.530799,7.606016,-2.626146,-2.722002,2.59227,-2.187333
Alabama,Baldwin County,50,3,6,1,3,182265,182265,183193,186659,190396,...,14.83296,17.647293,21.845705,19.243287,17.197872,15.844176,18.559627,22.727626,20.317142,18.293499
Alabama,Barbour County,50,3,6,1,5,27457,27457,27341,27226,27159,...,-4.728132,-2.50069,-7.056824,-3.904217,-10.543299,-4.874741,-2.758113,-7.167664,-3.978583,-10.543299
Alabama,Bibb County,50,3,6,1,7,22915,22919,22861,22733,22642,...,-5.527043,-5.068871,-6.201001,-0.177537,0.177258,-5.088389,-4.363636,-5.403729,0.754533,1.107861
Alabama,Blount County,50,3,6,1,9,57322,57322,57373,57711,57776,...,1.807375,-1.177622,-1.748766,-2.062535,-1.36997,1.859511,-0.84858,-1.402476,-1.577232,-0.884411


In [42]:
# The above operations can also be performed in one line,
# and hence makes the code more readable
census=pd.read_csv('../dataset/census.csv')
census=(census.where(census['SUMLEV']==50)
                .dropna()
                .set_index(['STNAME','CTYNAME'])
                .rename(columns={'ESTIMATESBASE2010':'Estimate Base 2010'}))
census.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,SUMLEV,REGION,DIVISION,STATE,COUNTY,CENSUS2010POP,Estimate Base 2010,POPESTIMATE2010,POPESTIMATE2011,POPESTIMATE2012,...,RDOMESTICMIG2011,RDOMESTICMIG2012,RDOMESTICMIG2013,RDOMESTICMIG2014,RDOMESTICMIG2015,RNETMIG2011,RNETMIG2012,RNETMIG2013,RNETMIG2014,RNETMIG2015
STNAME,CTYNAME,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Alabama,Autauga County,50.0,3.0,6.0,1.0,1.0,54571.0,54571.0,54660.0,55253.0,55175.0,...,7.242091,-2.915927,-3.012349,2.265971,-2.530799,7.606016,-2.626146,-2.722002,2.59227,-2.187333
Alabama,Baldwin County,50.0,3.0,6.0,1.0,3.0,182265.0,182265.0,183193.0,186659.0,190396.0,...,14.83296,17.647293,21.845705,19.243287,17.197872,15.844176,18.559627,22.727626,20.317142,18.293499
Alabama,Barbour County,50.0,3.0,6.0,1.0,5.0,27457.0,27457.0,27341.0,27226.0,27159.0,...,-4.728132,-2.50069,-7.056824,-3.904217,-10.543299,-4.874741,-2.758113,-7.167664,-3.978583,-10.543299
Alabama,Bibb County,50.0,3.0,6.0,1.0,7.0,22915.0,22919.0,22861.0,22733.0,22642.0,...,-5.527043,-5.068871,-6.201001,-0.177537,0.177258,-5.088389,-4.363636,-5.403729,0.754533,1.107861
Alabama,Blount County,50.0,3.0,6.0,1.0,9.0,57322.0,57322.0,57373.0,57711.0,57776.0,...,1.807375,-1.177622,-1.748766,-2.062535,-1.36997,1.859511,-0.84858,-1.402476,-1.577232,-0.884411
