In [34]:
import pandas as pd
import numpy as np

## Creating Series

In [2]:
# Build a Series from a plain Python list of strings.
# The None entry is kept as None, and the displayed dtype is object.
animals = ['Tiger', 'Lion', None, 'Giraffee']
pd.Series(data=animals)

0       Tiger
1        Lion
2        None
3    Giraffee
dtype: object

In [3]:
# Build a Series from a list of numbers containing a missing value.
# The None entry shows up as NaN, so the integers are displayed as float64.
numbers = [1, 2, None, 3, 4]
pd.Series(data=numbers)

0    1.0
1    2.0
2    NaN
3    3.0
4    4.0
dtype: float64

In [10]:
# Build a Series from a dict: the keys become the index labels and the
# values become the data.
sports = dict(Arhcery='Bhutan', Cricket='India', Swimming='USA')
s = pd.Series(sports)
s

Arhcery     Bhutan
Cricket      India
Swimming       USA
dtype: object

In [11]:
# The index holds the labels of the Series (here, the keys of the dict it was built from).
s.index

Index(['Arhcery', 'Cricket', 'Swimming'], dtype='object')

## Querying in Series

In [15]:
# Positional lookup: .iloc selects by integer position, whatever the index labels are.
# Plain `s[2]` is deliberately not used: on a Series with string labels an integer
# key only works through a positional fallback that is deprecated in pandas 2.x and
# is treated as a (missing) label in pandas 3.0.
s.iloc[2]

'USA'

In [16]:
# .loc looks a value up by its index label; assigning through .loc can also add a
# new entry when the label does not exist yet.
s.loc['Cricket']   # label-based lookup
s['Cricket']  # plain [] with a label returns the same value; only this last expression is displayed

'India'

## Operations on data

In [17]:
# A small numeric Series used to demonstrate aggregate operations.
s = pd.Series([100.0, 120.0, 101.0, 3.0])
s

0    100.0
1    120.0
2    101.0
3      3.0
dtype: float64

In [18]:
# Sum the Series with an explicit Python-level loop.
# This gives the right answer but is slow on a large Series.
total = 0
for value in s:
    total += value
print(total)

324.0


In [19]:
# Vectorised alternative to the manual loop: Series.sum() uses the underlying numpy functionality.
s.sum()

324.0

In [21]:
# Build a larger Series (1000 random integers in [0, 1000)) so the two
# summation approaches can be timed against each other.
s = pd.Series(np.random.randint(low=0, high=1000, size=1000))
s.head()

0    682
1    198
2    945
3    322
4    807
dtype: int64

In [24]:
%%timeit -n 100
# Baseline timing: accumulate the sum with an explicit Python loop over the Series.
total=0
for item in s:
    total+=item

155 µs ± 28.9 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [26]:
%%timeit -n 100
# Timing of the built-in Series.sum() on the same Series, for comparison with the loop.
summary=s.sum() 

102 µs ± 41.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


**Note:** the index of a Series does not need to contain unique values.

## Creating a DataFrame

In [2]:
# Each purchase is a Series keyed by field name. Stacking the three records
# produces one DataFrame row per purchase, labelled with the store it came from.
purchase1 = pd.Series(dict(Name='Ram', Item='Chocolate', Cost=12.00))
purchase2 = pd.Series(dict(Name='Sham', Item='Ice cream', Cost=12.00))
purchase3 = pd.Series(dict(Name='Sam', Item='Ice', Cost=5.00))

store_labels = ['Store1', 'Store2', 'Store3']
df = pd.DataFrame([purchase1, purchase2, purchase3], index=store_labels)
df

Unnamed: 0,Name,Item,Cost
Store1,Ram,Chocolate,12.0
Store2,Sham,Ice cream,12.0
Store3,Sam,Ice,5.0


In [6]:
# Two ways to pull out the first row: by index label (.loc) and by integer position (.iloc).
# Only the last expression of the cell is displayed.
df.loc['Store1']
df.iloc[0]

Name          Ram
Item    Chocolate
Cost           12
Name: Store1, dtype: object

In [9]:
# Select every row (':') but only the 'Name' and 'Cost' columns.
df.loc[:,['Name','Cost']]

Unnamed: 0,Name,Cost
Store1,Ram,12.0
Store2,Sham,12.0
Store3,Sam,5.0


## Querying a DataFrame

In [10]:
# Load the raw Olympics medal table with default options.
# As the output shows, the real header row ends up as data row 0 and the columns are just numbered.
df=pd.read_csv('dataset/olympics.csv')
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,,№ Summer,01 !,02 !,03 !,Total,№ Winter,01 !,02 !,03 !,Total,№ Games,01 !,02 !,03 !,Combined total
1,Afghanistan (AFG),13,0,0,2,2,0,0,0,0,0,13,0,0,2,2
2,Algeria (ALG),12,5,2,8,15,3,0,0,0,0,15,5,2,8,15
3,Argentina (ARG),23,18,24,28,70,18,0,0,0,0,41,18,24,28,70
4,Armenia (ARM),5,1,2,9,12,6,0,0,0,0,11,1,2,9,12


In [35]:
# Re-read the file, skipping its first line so the real header row supplies the
# column names, and use the first column (the country) as the index.
df=pd.read_csv('dataset/olympics.csv',skiprows=1,index_col=0)
df.head()

Unnamed: 0,№ Summer,01 !,02 !,03 !,Total,№ Winter,01 !.1,02 !.1,03 !.1,Total.1,№ Games,01 !.2,02 !.2,03 !.2,Combined total
Afghanistan (AFG),13,0,0,2,2,0,0,0,0,0,13,0,0,2,2
Algeria (ALG),12,5,2,8,15,3,0,0,0,0,15,5,2,8,15
Argentina (ARG),23,18,24,28,70,18,0,0,0,0,41,18,24,28,70
Armenia (ARM),5,1,2,9,12,6,0,0,0,0,11,1,2,9,12
Australasia (ANZ) [ANZ],2,3,4,5,12,0,0,0,0,0,2,3,4,5,12


In [36]:
# Rename the columns to something more meaningful.
# The raw header uses '01 !', '02 !' and '03 !' for gold, silver and bronze, and
# '№' for the number of Games; the repeated medal headers carry a '.1' / '.2' suffix.
# The full mapping is built first and applied with a single rename, rather than
# mutating the frame in place once per column while looping over df.columns.
medal_prefixes={'01':'GOLD','02':'SILVER','03':'BRONZE'}
renamed_columns={}
for col in df.columns:
    if col[:2] in medal_prefixes:
        # col[4:] drops the 4-character '0N !' header and keeps any '.1' / '.2' suffix
        renamed_columns[col]=medal_prefixes[col[:2]]+col[4:]
    elif col[:1]=='№':
        renamed_columns[col]="#"+col[1:]
df=df.rename(columns=renamed_columns)

df.head()

Unnamed: 0,# Summer,GOLD,SILVER,BRONZE,Total,# Winter,GOLD.1,SILVER.1,BRONZE.1,Total.1,# Games,GOLD.2,SILVER.2,BRONZE.2,Combined total
Afghanistan (AFG),13,0,0,2,2,0,0,0,0,0,13,0,0,2,2
Algeria (ALG),12,5,2,8,15,3,0,0,0,0,15,5,2,8,15
Argentina (ARG),23,18,24,28,70,18,0,0,0,0,41,18,24,28,70
Armenia (ARM),5,1,2,9,12,6,0,0,0,0,11,1,2,9,12
Australasia (ANZ) [ANZ],2,3,4,5,12,0,0,0,0,0,2,3,4,5,12


In [26]:
# Boolean masking: build a True/False Series and use it to filter the rows.
# Keep the countries with at least one gold medal in the 'GOLD' column.
has_gold = df['GOLD'] > 0
new_gold = df[has_gold]
new_gold.head()

Unnamed: 0,# Summer,GOLD,SILVER,BRONZE,Total,# Winter,GOLD.1,SILVER.1,BRONZE.1,Total.1,# Games,GOLD.2,SILVER.2,BRONZE.2,Combined total
Algeria (ALG),12,5,2,8,15,3,0,0,0,0,15,5,2,8,15
Argentina (ARG),23,18,24,28,70,18,0,0,0,0,41,18,24,28,70
Armenia (ARM),5,1,2,9,12,6,0,0,0,0,11,1,2,9,12
Australasia (ANZ) [ANZ],2,3,4,5,12,0,0,0,0,0,2,3,4,5,12
Australia (AUS) [AUS] [Z],25,139,152,177,468,18,5,3,4,12,43,144,155,181,480


In [30]:
# Masks can be combined with | (or) and & (and).
# Count the countries that have won gold in either the Summer or the Winter Games.
summer_gold = df['GOLD'] > 0
winter_gold = df['GOLD.1'] > 0
len(df[summer_gold | winter_gold])

101

In [37]:
# Countries that have won gold in the Winter Games but never in the Summer Games.
winter_gold = df['GOLD.1'] > 0
no_summer_gold = df['GOLD'] == 0
df[winter_gold & no_summer_gold]

Unnamed: 0,# Summer,GOLD,SILVER,BRONZE,Total,# Winter,GOLD.1,SILVER.1,BRONZE.1,Total.1,# Games,GOLD.2,SILVER.2,BRONZE.2,Combined total
Liechtenstein (LIE),16,0,0,0,0,18,2,2,5,9,34,2,2,5,9


## Indexing DataFrames

In [38]:
# Suppose we want to index by the GOLD column instead of by country.
# set_index replaces the current index, so first copy the country labels into a
# regular 'Country' column to keep them.
df['Country']=df.index
df=df.set_index('GOLD')

In [39]:
# Inspect the result: GOLD is now the index and the countries sit in the 'Country' column.
df.head()

Unnamed: 0_level_0,# Summer,SILVER,BRONZE,Total,# Winter,GOLD.1,SILVER.1,BRONZE.1,Total.1,# Games,GOLD.2,SILVER.2,BRONZE.2,Combined total,Country
GOLD,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,13,0,2,2,0,0,0,0,0,13,0,0,2,2,Afghanistan (AFG)
5,12,2,8,15,3,0,0,0,0,15,5,2,8,15,Algeria (ALG)
18,23,24,28,70,18,0,0,0,0,41,18,24,28,70,Argentina (ARG)
1,5,2,9,12,6,0,0,0,0,11,1,2,9,12,Armenia (ARM)
3,2,4,5,12,0,0,0,0,0,2,3,4,5,12,Australasia (ANZ) [ANZ]


In [40]:
# reset_index() moves the current index (GOLD) back into a regular column and
# restores the default integer index.
df=df.reset_index()
df.head()

Unnamed: 0,GOLD,# Summer,SILVER,BRONZE,Total,# Winter,GOLD.1,SILVER.1,BRONZE.1,Total.1,# Games,GOLD.2,SILVER.2,BRONZE.2,Combined total,Country
0,0,13,0,2,2,0,0,0,0,0,13,0,0,2,2,Afghanistan (AFG)
1,5,12,2,8,15,3,0,0,0,0,15,5,2,8,15,Algeria (ALG)
2,18,23,24,28,70,18,0,0,0,0,41,18,24,28,70,Argentina (ARG)
3,1,5,2,9,12,6,0,0,0,0,11,1,2,9,12,Armenia (ARM)
4,3,2,4,5,12,0,0,0,0,0,2,3,4,5,12,Australasia (ANZ) [ANZ]


## Composite indexing

In [44]:
# Load the census data: US Census Bureau figures with a population breakdown
# at the US county level.
df=pd.read_csv('dataset/census.csv')
df.head()

Unnamed: 0,SUMLEV,REGION,DIVISION,STATE,COUNTY,STNAME,CTYNAME,CENSUS2010POP,ESTIMATESBASE2010,POPESTIMATE2010,...,RDOMESTICMIG2011,RDOMESTICMIG2012,RDOMESTICMIG2013,RDOMESTICMIG2014,RDOMESTICMIG2015,RNETMIG2011,RNETMIG2012,RNETMIG2013,RNETMIG2014,RNETMIG2015
0,40,3,6,1,0,Alabama,Alabama,4779736,4780127,4785161,...,0.002295,-0.193196,0.381066,0.582002,-0.467369,1.030015,0.826644,1.383282,1.724718,0.712594
1,50,3,6,1,1,Alabama,Autauga County,54571,54571,54660,...,7.242091,-2.915927,-3.012349,2.265971,-2.530799,7.606016,-2.626146,-2.722002,2.59227,-2.187333
2,50,3,6,1,3,Alabama,Baldwin County,182265,182265,183193,...,14.83296,17.647293,21.845705,19.243287,17.197872,15.844176,18.559627,22.727626,20.317142,18.293499
3,50,3,6,1,5,Alabama,Barbour County,27457,27457,27341,...,-4.728132,-2.50069,-7.056824,-3.904217,-10.543299,-4.874741,-2.758113,-7.167664,-3.978583,-10.543299
4,50,3,6,1,7,Alabama,Bibb County,22915,22919,22861,...,-5.527043,-5.068871,-6.201001,-0.177537,0.177258,-5.088389,-4.363636,-5.403729,0.754533,1.107861


In [45]:
# SUMLEV takes two values, 40 and 50. The SUMLEV == 40 rows appear to be
# state-level summaries (e.g. the row where STNAME and CTYNAME are both "Alabama"),
# so keep only the SUMLEV == 50 county rows.
print(df['SUMLEV'].unique())
df=df[df['SUMLEV']==50]
df.head()

[40 50]


Unnamed: 0,SUMLEV,REGION,DIVISION,STATE,COUNTY,STNAME,CTYNAME,CENSUS2010POP,ESTIMATESBASE2010,POPESTIMATE2010,...,RDOMESTICMIG2011,RDOMESTICMIG2012,RDOMESTICMIG2013,RDOMESTICMIG2014,RDOMESTICMIG2015,RNETMIG2011,RNETMIG2012,RNETMIG2013,RNETMIG2014,RNETMIG2015
1,50,3,6,1,1,Alabama,Autauga County,54571,54571,54660,...,7.242091,-2.915927,-3.012349,2.265971,-2.530799,7.606016,-2.626146,-2.722002,2.59227,-2.187333
2,50,3,6,1,3,Alabama,Baldwin County,182265,182265,183193,...,14.83296,17.647293,21.845705,19.243287,17.197872,15.844176,18.559627,22.727626,20.317142,18.293499
3,50,3,6,1,5,Alabama,Barbour County,27457,27457,27341,...,-4.728132,-2.50069,-7.056824,-3.904217,-10.543299,-4.874741,-2.758113,-7.167664,-3.978583,-10.543299
4,50,3,6,1,7,Alabama,Bibb County,22915,22919,22861,...,-5.527043,-5.068871,-6.201001,-0.177537,0.177258,-5.088389,-4.363636,-5.403729,0.754533,1.107861
5,50,3,6,1,9,Alabama,Blount County,57322,57322,57373,...,1.807375,-1.177622,-1.748766,-2.062535,-1.36997,1.859511,-0.84858,-1.402476,-1.577232,-0.884411


In [47]:
# Trim the frame to the two identifying columns plus the yearly births and
# population estimates for 2010 through 2015.
years = range(2010, 2016)
cols_to_keep = (
    ['STNAME', 'CTYNAME']
    + [f'BIRTHS{year}' for year in years]
    + [f'POPESTIMATE{year}' for year in years]
)
df = df[cols_to_keep]
df.head()

Unnamed: 0,STNAME,CTYNAME,BIRTHS2010,BIRTHS2011,BIRTHS2012,BIRTHS2013,BIRTHS2014,BIRTHS2015,POPESTIMATE2010,POPESTIMATE2011,POPESTIMATE2012,POPESTIMATE2013,POPESTIMATE2014,POPESTIMATE2015
1,Alabama,Autauga County,151,636,615,574,623,600,54660,55253,55175,55038,55290,55347
2,Alabama,Baldwin County,517,2187,2092,2160,2186,2240,183193,186659,190396,195126,199713,203709
3,Alabama,Barbour County,70,335,300,283,260,269,27341,27226,27159,26973,26815,26489
4,Alabama,Bibb County,44,266,245,259,247,253,22861,22733,22642,22512,22549,22583
5,Alabama,Blount County,183,744,710,646,618,603,57373,57711,57776,57734,57658,57673


In [48]:
# Build a composite (two-level) index: state name first, then county name.
df=df.set_index(['STNAME','CTYNAME'])
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,BIRTHS2010,BIRTHS2011,BIRTHS2012,BIRTHS2013,BIRTHS2014,BIRTHS2015,POPESTIMATE2010,POPESTIMATE2011,POPESTIMATE2012,POPESTIMATE2013,POPESTIMATE2014,POPESTIMATE2015
STNAME,CTYNAME,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Alabama,Autauga County,151,636,615,574,623,600,54660,55253,55175,55038,55290,55347
Alabama,Baldwin County,517,2187,2092,2160,2186,2240,183193,186659,190396,195126,199713,203709
Alabama,Barbour County,70,335,300,283,260,269,27341,27226,27159,26973,26815,26489
Alabama,Bibb County,44,266,245,259,247,253,22861,22733,22642,22512,22549,22583
Alabama,Blount County,183,744,710,646,618,603,57373,57711,57776,57734,57658,57673


In [49]:
# Query with .loc by giving one label per index level, outermost first (state, then county).
df.loc['Michigan','Washtenaw County']

BIRTHS2010            977
BIRTHS2011           3826
BIRTHS2012           3780
BIRTHS2013           3662
BIRTHS2014           3683
BIRTHS2015           3709
POPESTIMATE2010    345563
POPESTIMATE2011    349048
POPESTIMATE2012    351213
POPESTIMATE2013    354289
POPESTIMATE2014    357029
POPESTIMATE2015    358880
Name: (Michigan, Washtenaw County), dtype: int64

In [52]:
# Compare two counties by passing a list of (state, county) tuples to .loc.
df.loc[[('Michigan','Washtenaw County'),('Michigan','Wayne County')]]

Unnamed: 0_level_0,Unnamed: 1_level_0,BIRTHS2010,BIRTHS2011,BIRTHS2012,BIRTHS2013,BIRTHS2014,BIRTHS2015,POPESTIMATE2010,POPESTIMATE2011,POPESTIMATE2012,POPESTIMATE2013,POPESTIMATE2014,POPESTIMATE2015
STNAME,CTYNAME,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Michigan,Washtenaw County,977,3826,3780,3662,3683,3709,345563,349048,351213,354289,357029,358880
Michigan,Wayne County,5918,23819,23270,23377,23607,23586,1815199,1801273,1792514,1775713,1766008,1759335
