## Agenda

- How to connect SQL and Python (pandas)
- How to set a column as index?
- Working with both rows & columns
- Handling duplicate records
- Pandas built-in operations
- Aggregate functions
- Sorting values
- Concatenating DataFrames
- Merging DataFrames

In [None]:
# Download the dataset from Google Drive and save it as mckinsey.csv in the working directory.
!wget "https://drive.google.com/uc?export=download&id=1E3bwvYGf1ig32RmcYiWc0IXPN-mD_bI_" -O mckinsey.csv

--2024-05-25 16:11:39--  https://drive.google.com/uc?export=download&id=1E3bwvYGf1ig32RmcYiWc0IXPN-mD_bI_
Resolving drive.google.com (drive.google.com)... 108.177.11.101, 108.177.11.139, 108.177.11.102, ...
Connecting to drive.google.com (drive.google.com)|108.177.11.101|:443... connected.
HTTP request sent, awaiting response... 303 See Other
Location: https://drive.usercontent.google.com/download?id=1E3bwvYGf1ig32RmcYiWc0IXPN-mD_bI_&export=download [following]
--2024-05-25 16:11:39--  https://drive.usercontent.google.com/download?id=1E3bwvYGf1ig32RmcYiWc0IXPN-mD_bI_&export=download
Resolving drive.usercontent.google.com (drive.usercontent.google.com)... 74.125.26.132, 2607:f8b0:400c:c04::84
Connecting to drive.usercontent.google.com (drive.usercontent.google.com)|74.125.26.132|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 83785 (82K) [application/octet-stream]
Saving to: ‘mckinsey.csv’


2024-05-25 16:11:39 (33.5 MB/s) - ‘mckinsey.csv’ saved [83785/83785]



In [None]:
import pandas as pd
import numpy as np

# Load the downloaded CSV into a DataFrame; the bare `df` on the last line displays it.
df = pd.read_csv('mckinsey.csv')
df


Unnamed: 0,country,year,population,continent,life_exp,gdp_cap
0,Afghanistan,1952,8425333,Asia,28.801,779.445314
1,Afghanistan,1957,9240934,Asia,30.332,820.853030
2,Afghanistan,1962,10267083,Asia,31.997,853.100710
3,Afghanistan,1967,11537966,Asia,34.020,836.197138
4,Afghanistan,1972,13079460,Asia,36.088,739.981106
...,...,...,...,...,...,...
1699,Zimbabwe,1987,9216418,Africa,62.351,706.157306
1700,Zimbabwe,1992,10704340,Africa,60.377,693.420786
1701,Zimbabwe,1997,11404948,Africa,46.809,792.449960
1702,Zimbabwe,2002,11926563,Africa,39.989,672.038623


In [None]:
df.head()

Unnamed: 0,country,year,population,continent,life_exp,gdp_cap
0,Afghanistan,1952,8425333,Asia,28.801,779.445314
1,Afghanistan,1957,9240934,Asia,30.332,820.85303
2,Afghanistan,1962,10267083,Asia,31.997,853.10071
3,Afghanistan,1967,11537966,Asia,34.02,836.197138
4,Afghanistan,1972,13079460,Asia,36.088,739.981106


In [None]:
# Make the country names the row labels (rebinds df to the re-indexed frame).
df = df.set_index("country")

In [None]:
df

Unnamed: 0_level_0,year,population,continent,life_exp,gdp_cap
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Afghanistan,1952,8425333,Asia,28.801,779.445314
Afghanistan,1957,9240934,Asia,30.332,820.853030
Afghanistan,1962,10267083,Asia,31.997,853.100710
Afghanistan,1967,11537966,Asia,34.020,836.197138
Afghanistan,1972,13079460,Asia,36.088,739.981106
...,...,...,...,...,...
Zimbabwe,1987,9216418,Africa,62.351,706.157306
Zimbabwe,1992,10704340,Africa,60.377,693.420786
Zimbabwe,1997,11404948,Africa,46.809,792.449960
Zimbabwe,2002,11926563,Africa,39.989,672.038623


In [None]:
df.loc["Afghanistan"]

Unnamed: 0_level_0,year,population,continent,life_exp,gdp_cap
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Afghanistan,1952,8425333,Asia,28.801,779.445314
Afghanistan,1957,9240934,Asia,30.332,820.85303
Afghanistan,1962,10267083,Asia,31.997,853.10071
Afghanistan,1967,11537966,Asia,34.02,836.197138
Afghanistan,1972,13079460,Asia,36.088,739.981106
Afghanistan,1977,14880372,Asia,38.438,786.11336
Afghanistan,1982,12881816,Asia,39.854,978.011439
Afghanistan,1987,13867957,Asia,40.822,852.395945
Afghanistan,1992,16317921,Asia,41.674,649.341395
Afghanistan,1997,22227415,Asia,41.763,635.341351


In [None]:
# Move `country` back to a regular column and restore the default 0..n-1 index.
df = df.reset_index()

In [None]:
df

Unnamed: 0,country,year,population,continent,life_exp,gdp_cap
0,Afghanistan,1952,8425333,Asia,28.801,779.445314
1,Afghanistan,1957,9240934,Asia,30.332,820.853030
2,Afghanistan,1962,10267083,Asia,31.997,853.100710
3,Afghanistan,1967,11537966,Asia,34.020,836.197138
4,Afghanistan,1972,13079460,Asia,36.088,739.981106
...,...,...,...,...,...,...
1699,Zimbabwe,1987,9216418,Africa,62.351,706.157306
1700,Zimbabwe,1992,10704340,Africa,60.377,693.420786
1701,Zimbabwe,1997,11404948,Africa,46.809,792.449960
1702,Zimbabwe,2002,11926563,Africa,39.989,672.038623


In [None]:
# Add a demo column holding each row's position (0 .. n-1).
# The length comes from the frame itself rather than a hard-coded 1704,
# so the cell keeps working if the number of rows changes.
df["new_col"] = range(len(df))

In [None]:
df

Unnamed: 0,country,year,population,continent,life_exp,gdp_cap,new_col
0,Afghanistan,1952,8425333,Asia,28.801,779.445314,0
1,Afghanistan,1957,9240934,Asia,30.332,820.853030,1
2,Afghanistan,1962,10267083,Asia,31.997,853.100710,2
3,Afghanistan,1967,11537966,Asia,34.020,836.197138,3
4,Afghanistan,1972,13079460,Asia,36.088,739.981106,4
...,...,...,...,...,...,...,...
1699,Zimbabwe,1987,9216418,Africa,62.351,706.157306,1699
1700,Zimbabwe,1992,10704340,Africa,60.377,693.420786,1700
1701,Zimbabwe,1997,11404948,Africa,46.809,792.449960,1701
1702,Zimbabwe,2002,11926563,Africa,39.989,672.038623,1702


In [None]:
# Remove the demo column again (rebinds df to the frame without it).
df = df.drop(columns='new_col')

In [None]:
df.head()

Unnamed: 0,country,year,population,continent,life_exp,gdp_cap
0,Afghanistan,1952,8425333,Asia,28.801,779.445314
1,Afghanistan,1957,9240934,Asia,30.332,820.85303
2,Afghanistan,1962,10267083,Asia,31.997,853.10071
3,Afghanistan,1967,11537966,Asia,34.02,836.197138
4,Afghanistan,1972,13079460,Asia,36.088,739.981106


In [None]:
# Ways to add a new row to df:
# 1) append (DataFrame.append; removed in pandas 2.0 -- pd.concat is the replacement)
# 2) loc with a new label (iloc can only overwrite an existing position; it cannot add a row)

In [None]:
# The row to add, as a column-name -> value mapping.
new_row = {'country': 'India', 'year': 2000,'population':13500000, 'continent': "Asia", 'life_exp':37.08, 'gdp_cap':900.23}
# Intentionally fails: appending a plain dict without ignore_index=True raises the TypeError shown below.
# NOTE: DataFrame.append was removed in pandas 2.0, where this line raises AttributeError instead.
df.append(new_row)

  df.append(new_row)


TypeError: Can only append a dict if ignore_index=True

In [None]:
# Add `new_row` as the last row of df.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is the
# supported replacement. Wrapping the dict in a one-element list makes a one-row frame,
# and ignore_index=True renumbers the result 0..n-1 exactly as append(..., ignore_index=True) did.
df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)
df

  df = df.append(new_row, ignore_index=True)


Unnamed: 0,country,year,population,continent,life_exp,gdp_cap
0,Afghanistan,1952,8425333,Asia,28.801,779.445314
1,Afghanistan,1957,9240934,Asia,30.332,820.853030
2,Afghanistan,1962,10267083,Asia,31.997,853.100710
3,Afghanistan,1967,11537966,Asia,34.020,836.197138
4,Afghanistan,1972,13079460,Asia,36.088,739.981106
...,...,...,...,...,...,...
1700,Zimbabwe,1992,10704340,Africa,60.377,693.420786
1701,Zimbabwe,1997,11404948,Africa,46.809,792.449960
1702,Zimbabwe,2002,11926563,Africa,39.989,672.038623
1703,Zimbabwe,2007,12311143,Africa,43.487,469.709298


In [None]:
# The row to add, keyed by column name (insertion order matches the column order of df).
new_row = {
    'country': 'India',
    'year': 2000,
    'population': 13500000,
    'continent': "Asia",
    'life_exp': 37.08,
    'gdp_cap': 900.23,
}
# Only the values, as a plain list, for positional assignment via loc.
new_row_val = [*new_row.values()]
new_row_val

['India', 2000, 13500000, 'Asia', 37.08, 900.23]

In [None]:
# loc can enlarge the frame: assigning to a label that does not exist yet adds a new row.
# With the default 0..n-1 index, len(df.index) is the next unused label.
df.loc[len(df.index)] = new_row_val
df

Unnamed: 0,country,year,population,continent,life_exp,gdp_cap
0,Afghanistan,1952,8425333,Asia,28.801,779.445314
1,Afghanistan,1957,9240934,Asia,30.332,820.853030
2,Afghanistan,1962,10267083,Asia,31.997,853.100710
3,Afghanistan,1967,11537966,Asia,34.020,836.197138
4,Afghanistan,1972,13079460,Asia,36.088,739.981106
...,...,...,...,...,...,...
1701,Zimbabwe,1997,11404948,Africa,46.809,792.449960
1702,Zimbabwe,2002,11926563,Africa,39.989,672.038623
1703,Zimbabwe,2007,12311143,Africa,43.487,469.709298
1704,India,2000,13500000,Asia,37.080,900.230000


In [None]:
# iloc is position-based: len(df.index)-1 is the last existing row, so this overwrites
# that row with the Japan values rather than adding a new one.
df.iloc[len(df.index)-1] = ['Japan', 1000, 1350000, 'Asia', 37.08, 100.23]
df

Unnamed: 0,country,year,population,continent,life_exp,gdp_cap
0,Afghanistan,1952,8425333,Asia,28.801,779.445314
1,Afghanistan,1957,9240934,Asia,30.332,820.853030
2,Afghanistan,1962,10267083,Asia,31.997,853.100710
3,Afghanistan,1967,11537966,Asia,34.020,836.197138
4,Afghanistan,1972,13079460,Asia,36.088,739.981106
...,...,...,...,...,...,...
1701,Zimbabwe,1997,11404948,Africa,46.809,792.449960
1702,Zimbabwe,2002,11926563,Africa,39.989,672.038623
1703,Zimbabwe,2007,12311143,Africa,43.487,469.709298
1704,India,2000,13500000,Asia,37.080,900.230000


In [None]:
len(df.index)

1706

In [None]:
# Intentionally fails: position len(df.index) is one past the end, and iloc cannot
# add rows -- see the IndexError below.
df.iloc[len(df.index)] = ['India', 2000, 13500000, 'Asia', 37.08, 900.23]

IndexError: iloc cannot enlarge its target object

In [None]:
df.loc[len(df.index)] = ['India', 2000, 13500000, 'Asia', 37.08, 900.23]

In [None]:
df

Unnamed: 0,country,year,population,continent,life_exp,gdp_cap
0,Afghanistan,1952,8425333,Asia,28.801,779.445314
1,Afghanistan,1957,9240934,Asia,30.332,820.853030
2,Afghanistan,1962,10267083,Asia,31.997,853.100710
3,Afghanistan,1967,11537966,Asia,34.020,836.197138
4,Afghanistan,1972,13079460,Asia,36.088,739.981106
...,...,...,...,...,...,...
1702,Zimbabwe,2002,11926563,Africa,39.989,672.038623
1703,Zimbabwe,2007,12311143,Africa,43.487,469.709298
1704,India,2000,13500000,Asia,37.080,900.230000
1705,Japan,1000,1350000,Asia,37.080,100.230000


In [None]:
# Small frame with string row labels, used to show how loc behaves on a non-numeric index.
df_test = pd.DataFrame(
    {'Name': ['Surya', 'Tejas', 'Amit', 'Lakshmi']},
    index=['a', 'b', 'c', 'd'],
)

df_test

Unnamed: 0,Name
a,Surya
b,Tejas
c,Amit
d,Lakshmi


In [None]:
len(df_test.index)

4

In [None]:
# Here the index labels are 'a'..'d', so len(df_test.index) == 4 is simply a new label:
# the row is added under the integer label 4 (see output below), mixing label types.
df_test.loc[len(df_test.index)] = ['Yash']

In [None]:
df_test

Unnamed: 0,Name
a,Surya
b,Tejas
c,Amit
d,Lakshmi
4,Yash


In [None]:
df

Unnamed: 0,country,year,population,continent,life_exp,gdp_cap
0,Afghanistan,1952,8425333,Asia,28.801,779.445314
1,Afghanistan,1957,9240934,Asia,30.332,820.853030
2,Afghanistan,1962,10267083,Asia,31.997,853.100710
3,Afghanistan,1967,11537966,Asia,34.020,836.197138
4,Afghanistan,1972,13079460,Asia,36.088,739.981106
...,...,...,...,...,...,...
1702,Zimbabwe,2002,11926563,Africa,39.989,672.038623
1703,Zimbabwe,2007,12311143,Africa,43.487,469.709298
1704,India,2000,13500000,Asia,37.080,900.230000
1705,Japan,1000,1350000,Asia,37.080,100.230000


In [None]:
df = df.drop(3, axis = 0) # 0 for rows and 1 for columns

In [None]:
df

Unnamed: 0,country,year,population,continent,life_exp,gdp_cap
0,Afghanistan,1952,8425333,Asia,28.801,779.445314
1,Afghanistan,1957,9240934,Asia,30.332,820.853030
2,Afghanistan,1962,10267083,Asia,31.997,853.100710
4,Afghanistan,1972,13079460,Asia,36.088,739.981106
5,Afghanistan,1977,14880372,Asia,38.438,786.113360
...,...,...,...,...,...,...
1702,Zimbabwe,2002,11926563,Africa,39.989,672.038623
1703,Zimbabwe,2007,12311143,Africa,43.487,469.709298
1704,India,2000,13500000,Asia,37.080,900.230000
1705,Japan,1000,1350000,Asia,37.080,100.230000


In [None]:
df.loc[4] # label-based: the row whose index label is 4 (now the 4th row, since label 3 was dropped)

country       Afghanistan
year                 1972
population       13079460
continent            Asia
life_exp           36.088
gdp_cap        739.981106
Name: 4, dtype: object

In [None]:
df.iloc[4] # position-based: the row at position 4, i.e. the 5th row (index label 5 here)

country       Afghanistan
year                 1977
population       14880372
continent            Asia
life_exp           38.438
gdp_cap         786.11336
Name: 5, dtype: object

In [None]:
df.drop([1, 2, 4], axis=0) # drops rows with labels 1, 2, 4

Unnamed: 0,country,year,population,continent,life_exp,gdp_cap
0,Afghanistan,1952,8425333,Asia,28.801,779.445314
5,Afghanistan,1977,14880372,Asia,38.438,786.113360
6,Afghanistan,1982,12881816,Asia,39.854,978.011439
7,Afghanistan,1987,13867957,Asia,40.822,852.395945
8,Afghanistan,1992,16317921,Asia,41.674,649.341395
...,...,...,...,...,...,...
1702,Zimbabwe,2002,11926563,Africa,39.989,672.038623
1703,Zimbabwe,2007,12311143,Africa,43.487,469.709298
1704,India,2000,13500000,Asia,37.080,900.230000
1705,Japan,1000,1350000,Asia,37.080,100.230000


In [None]:
# A row was removed earlier, so the labels have a gap; renumber them 0..n-1
# and discard the old labels instead of keeping them as a column.
df = df.reset_index(drop=True)
df

Unnamed: 0,country,year,population,continent,life_exp,gdp_cap
0,Afghanistan,1952,8425333,Asia,28.801,779.445314
1,Afghanistan,1957,9240934,Asia,30.332,820.853030
2,Afghanistan,1962,10267083,Asia,31.997,853.100710
3,Afghanistan,1972,13079460,Asia,36.088,739.981106
4,Afghanistan,1977,14880372,Asia,38.438,786.113360
...,...,...,...,...,...,...
1701,Zimbabwe,2002,11926563,Africa,39.989,672.038623
1702,Zimbabwe,2007,12311143,Africa,43.487,469.709298
1703,India,2000,13500000,Asia,37.080,900.230000
1704,Japan,1000,1350000,Asia,37.080,100.230000


In [None]:
# Append four demo rows so that the frame contains duplicates for the next cells.
rows_to_add = [
    ['India', 2000, 13500000, 'Asia', 37.08, 900.23],       # exact copy of the India row added earlier
    ['Sri Lanka', 2022, 130000000, 'Asia', 80.00, 500.00],  # new row
    ['Sri Lanka', 2022, 130000000, 'Asia', 80.00, 500.00],  # exact duplicate of the previous row
    ['India', 2000, 13500000, 'Asia', 80.00, 900.23],       # differs from the India rows in life_exp only
]
# len(df.index) is re-evaluated on every pass, so each row gets the next free label.
for row_values in rows_to_add:
    df.loc[len(df.index)] = row_values
df

Unnamed: 0,country,year,population,continent,life_exp,gdp_cap
0,Afghanistan,1952,8425333,Asia,28.801,779.445314
1,Afghanistan,1957,9240934,Asia,30.332,820.853030
2,Afghanistan,1962,10267083,Asia,31.997,853.100710
3,Afghanistan,1972,13079460,Asia,36.088,739.981106
4,Afghanistan,1977,14880372,Asia,38.438,786.113360
...,...,...,...,...,...,...
1705,India,2000,13500000,Asia,37.080,900.230000
1706,India,2000,13500000,Asia,37.080,900.230000
1707,Sri Lanka,2022,130000000,Asia,80.000,500.000000
1708,Sri Lanka,2022,130000000,Asia,80.000,500.000000


In [None]:
df.iloc[1:4]

Unnamed: 0,country,year,population,continent,life_exp,gdp_cap
1,Afghanistan,1957,9240934,Asia,30.332,820.85303
2,Afghanistan,1962,10267083,Asia,31.997,853.10071
3,Afghanistan,1972,13079460,Asia,36.088,739.981106


In [None]:
df.loc[1:4]

Unnamed: 0,country,year,population,continent,life_exp,gdp_cap
1,Afghanistan,1957,9240934,Asia,30.332,820.85303
2,Afghanistan,1962,10267083,Asia,31.997,853.10071
3,Afghanistan,1972,13079460,Asia,36.088,739.981106
4,Afghanistan,1977,14880372,Asia,38.438,786.11336


In [None]:
df.duplicated()

0       False
1       False
2       False
3       False
4       False
        ...  
1705     True
1706     True
1707    False
1708     True
1709    False
Length: 1710, dtype: bool

In [None]:
df.loc[df.duplicated()] # How to get the duplicate records

Unnamed: 0,country,year,population,continent,life_exp,gdp_cap
1705,India,2000,13500000,Asia,37.08,900.23
1706,India,2000,13500000,Asia,37.08,900.23
1708,Sri Lanka,2022,130000000,Asia,80.0,500.0


In [None]:
df.drop_duplicates(keep = 'first') # keep='first' (the default) retains the first occurrence of each repeated row; other options: 'last', False

Unnamed: 0,country,year,population,continent,life_exp,gdp_cap
0,Afghanistan,1952,8425333,Asia,28.801,779.445314
1,Afghanistan,1957,9240934,Asia,30.332,820.853030
2,Afghanistan,1962,10267083,Asia,31.997,853.100710
3,Afghanistan,1972,13079460,Asia,36.088,739.981106
4,Afghanistan,1977,14880372,Asia,38.438,786.113360
...,...,...,...,...,...,...
1702,Zimbabwe,2007,12311143,Africa,43.487,469.709298
1703,India,2000,13500000,Asia,37.080,900.230000
1704,Japan,1000,1350000,Asia,37.080,100.230000
1707,Sri Lanka,2022,130000000,Asia,80.000,500.000000


In [None]:
df.drop_duplicates(keep = 'last')

Unnamed: 0,country,year,population,continent,life_exp,gdp_cap
0,Afghanistan,1952,8425333,Asia,28.801,779.445314
1,Afghanistan,1957,9240934,Asia,30.332,820.853030
2,Afghanistan,1962,10267083,Asia,31.997,853.100710
3,Afghanistan,1972,13079460,Asia,36.088,739.981106
4,Afghanistan,1977,14880372,Asia,38.438,786.113360
...,...,...,...,...,...,...
1702,Zimbabwe,2007,12311143,Africa,43.487,469.709298
1704,Japan,1000,1350000,Asia,37.080,100.230000
1706,India,2000,13500000,Asia,37.080,900.230000
1708,Sri Lanka,2022,130000000,Asia,80.000,500.000000


In [None]:
df.drop_duplicates(keep=False) # keep=False drops every row that has a duplicate, so no copy of a repeated row is kept

Unnamed: 0,country,year,population,continent,life_exp,gdp_cap
0,Afghanistan,1952,8425333,Asia,28.801,779.445314
1,Afghanistan,1957,9240934,Asia,30.332,820.853030
2,Afghanistan,1962,10267083,Asia,31.997,853.100710
3,Afghanistan,1972,13079460,Asia,36.088,739.981106
4,Afghanistan,1977,14880372,Asia,38.438,786.113360
...,...,...,...,...,...,...
1700,Zimbabwe,1997,11404948,Africa,46.809,792.449960
1701,Zimbabwe,2002,11926563,Africa,39.989,672.038623
1702,Zimbabwe,2007,12311143,Africa,43.487,469.709298
1704,Japan,1000,1350000,Asia,37.080,100.230000


In [None]:
# NOTE(review): scratch illustration only -- this bare list is not connected to df.
# Presumably it stands for the labels of repeated rows, to discuss which one
# keep='first' / keep='last' retains; confirm or remove.
[1601, 1701, 1801, 1901] #Keep = First, last

In [None]:
df

Unnamed: 0,country,year,population,continent,life_exp,gdp_cap
0,Afghanistan,1952,8425333,Asia,28.801,779.445314
1,Afghanistan,1957,9240934,Asia,30.332,820.853030
2,Afghanistan,1962,10267083,Asia,31.997,853.100710
3,Afghanistan,1972,13079460,Asia,36.088,739.981106
4,Afghanistan,1977,14880372,Asia,38.438,786.113360
...,...,...,...,...,...,...
1705,India,2000,13500000,Asia,37.080,900.230000
1706,India,2000,13500000,Asia,37.080,900.230000
1707,Sri Lanka,2022,130000000,Asia,80.000,500.000000
1708,Sri Lanka,2022,130000000,Asia,80.000,500.000000


In [None]:
df.drop_duplicates(subset=['country'],keep='first')

Unnamed: 0,country,year,population,continent,life_exp,gdp_cap
0,Afghanistan,1952,8425333,Asia,28.801,779.445314
11,Albania,1952,1282697,Europe,55.230,1601.056136
23,Algeria,1952,9279525,Africa,43.077,2449.008185
35,Angola,1952,4232095,Africa,30.015,3520.610273
47,Argentina,1952,17876956,Americas,62.485,5911.315053
...,...,...,...,...,...,...
1643,Vietnam,1952,26246839,Asia,40.412,605.066492
1655,West Bank and Gaza,1952,1030585,Asia,43.160,1515.592329
1667,"Yemen, Rep.",1952,4963829,Asia,32.548,781.717576
1679,Zambia,1952,2672000,Africa,42.038,1147.388831


In [None]:
df

Unnamed: 0,country,year,population,continent,life_exp,gdp_cap
0,Afghanistan,1952,8425333,Asia,28.801,779.445314
1,Afghanistan,1957,9240934,Asia,30.332,820.853030
2,Afghanistan,1962,10267083,Asia,31.997,853.100710
3,Afghanistan,1972,13079460,Asia,36.088,739.981106
4,Afghanistan,1977,14880372,Asia,38.438,786.113360
...,...,...,...,...,...,...
1705,India,2000,13500000,Asia,37.080,900.230000
1706,India,2000,13500000,Asia,37.080,900.230000
1707,Sri Lanka,2022,130000000,Asia,80.000,500.000000
1708,Sri Lanka,2022,130000000,Asia,80.000,500.000000


## Quiz

In [None]:
# Quiz: how many rows are flagged as duplicates when only columns A and B are compared?
# NOTE(review): left commented out, presumably so that it does not overwrite the main `df` -- confirm.
# data = {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
#         'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
#         'C': ['small', 'large', 'large', 'small', 'small', 'large', 'large', 'small'],
#         'D': [1, 2, 2, 3, 3, 4, 5, 6]}

# df = pd.DataFrame(data)

# df

# print(sum(df.duplicated(subset=['A', 'B'])))

Unnamed: 0,A,B,C,D
0,foo,one,small,1
1,bar,one,large,2
2,foo,two,large,2
3,bar,three,small,3
4,foo,two,small,3
5,bar,two,large,4
6,foo,one,large,5
7,foo,three,small,6


In [None]:
#df.duplicated(subset=['A', 'B'])

0    False
1    False
2    False
3    False
4     True
5    False
6     True
7    False
dtype: bool

In [None]:
#sum(df.duplicated(subset=['A', 'B']))

2

## Slicing the DF

In [None]:
# iloc and loc

In [None]:
# Reload the original data before slicing: the cells above dropped a row and appended
# demo rows (India / Japan / Sri Lanka), while the outputs recorded from here on
# (e.g. row 3 = 1967, 1704 rows in total) only match the untouched CSV.
# Reloading makes this section reproducible under Restart & Run All.
df = pd.read_csv('mckinsey.csv')

# iloc is position-based and end-exclusive: rows 0-3 and the first four columns.
df.iloc[0:4, 0:4]

Unnamed: 0,country,year,population,continent
0,Afghanistan,1952,8425333,Asia
1,Afghanistan,1957,9240934,Asia
2,Afghanistan,1962,10267083,Asia
3,Afghanistan,1967,11537966,Asia


In [None]:
# loc is label-based and inclusive at both ends: rows 0 through 4, columns 'country' through 'year'.
df.loc[0:4, 'country':'year']

Unnamed: 0,country,year
0,Afghanistan,1952
1,Afghanistan,1957
2,Afghanistan,1962
3,Afghanistan,1967
4,Afghanistan,1972


In [None]:
df.loc[0:4, ['country','population']]

Unnamed: 0,country,population
0,Afghanistan,8425333
1,Afghanistan,9240934
2,Afghanistan,10267083
3,Afghanistan,11537966
4,Afghanistan,13079460


In [None]:
df.iloc[1:10:2]

Unnamed: 0,country,year,population,continent,life_exp,gdp_cap
1,Afghanistan,1957,9240934,Asia,30.332,820.85303
3,Afghanistan,1967,11537966,Asia,34.02,836.197138
5,Afghanistan,1977,14880372,Asia,38.438,786.11336
7,Afghanistan,1987,13867957,Asia,40.822,852.395945
9,Afghanistan,1997,22227415,Asia,41.763,635.341351


In [None]:
df.iloc[1:10:2, [4,3]]

Unnamed: 0,life_exp,continent
1,30.332,Asia
3,34.02,Asia
5,38.438,Asia
7,40.822,Asia
9,41.763,Asia


## Pandas built-in operations

In [None]:
le = df['life_exp']
le

0       28.801
1       30.332
2       31.997
3       34.020
4       36.088
         ...  
1699    62.351
1700    60.377
1701    46.809
1702    39.989
1703    43.487
Name: life_exp, Length: 1704, dtype: float64

In [None]:
le.mean()

59.474439366197174

In [None]:
le.sum()

101344.44467999999

In [None]:
le.max()

82.603

In [None]:
le.min()

23.599

In [None]:
le.count()

1704

In [None]:
le.median()

60.7125

In [None]:
df[['life_exp','gdp_cap']].mean()

life_exp      59.474439
gdp_cap     7215.327081
dtype: float64

In [None]:
le.mode()

0    69.39
Name: life_exp, dtype: float64

In [None]:
le.var()

166.851663976879

In [None]:
le.std()

12.917107415241192

In [None]:
le.sum() / le.count()  # mean computed by hand; matches le.mean() above

59.474439366197174

## Sorting Values

In [None]:
df.sort_values(by = 'life_exp')

Unnamed: 0,country,year,population,continent,life_exp,gdp_cap
1292,Rwanda,1992,7290203,Africa,23.599,737.068595
0,Afghanistan,1952,8425333,Asia,28.801,779.445314
552,Gambia,1952,284320,Africa,30.000,485.230659
36,Angola,1952,4232095,Africa,30.015,3520.610273
1344,Sierra Leone,1952,2143249,Africa,30.331,879.787736
...,...,...,...,...,...,...
1487,Switzerland,2007,7554661,Europe,81.701,37506.419070
695,Iceland,2007,301931,Europe,81.757,36180.789190
802,Japan,2002,127065841,Asia,82.000,28604.591900
671,"Hong Kong, China",2007,6980412,Asia,82.208,39724.978670


In [None]:
df.sort_values(by = 'life_exp', ascending = False)

Unnamed: 0,country,year,population,continent,life_exp,gdp_cap
803,Japan,2007,127467972,Asia,82.603,31656.068060
671,"Hong Kong, China",2007,6980412,Asia,82.208,39724.978670
802,Japan,2002,127065841,Asia,82.000,28604.591900
695,Iceland,2007,301931,Europe,81.757,36180.789190
1487,Switzerland,2007,7554661,Europe,81.701,37506.419070
...,...,...,...,...,...,...
1344,Sierra Leone,1952,2143249,Africa,30.331,879.787736
36,Angola,1952,4232095,Africa,30.015,3520.610273
552,Gambia,1952,284320,Africa,30.000,485.230659
0,Afghanistan,1952,8425333,Asia,28.801,779.445314


In [None]:
df.sort_values(['year', 'life_exp'])

Unnamed: 0,country,year,population,continent,life_exp,gdp_cap
0,Afghanistan,1952,8425333,Asia,28.801,779.445314
552,Gambia,1952,284320,Africa,30.000,485.230659
36,Angola,1952,4232095,Africa,30.015,3520.610273
1344,Sierra Leone,1952,2143249,Africa,30.331,879.787736
1032,Mozambique,1952,6446316,Africa,31.286,468.526038
...,...,...,...,...,...,...
71,Australia,2007,20434176,Oceania,81.235,34435.367440
1487,Switzerland,2007,7554661,Europe,81.701,37506.419070
695,Iceland,2007,301931,Europe,81.757,36180.789190
671,"Hong Kong, China",2007,6980412,Asia,82.208,39724.978670


In [None]:
df.sort_values(['year', 'life_exp'], ascending = [False, True])

Unnamed: 0,country,year,population,continent,life_exp,gdp_cap
1463,Swaziland,2007,1133066,Africa,39.613,4513.480643
1043,Mozambique,2007,19951656,Africa,42.082,823.685621
1691,Zambia,2007,11746035,Africa,42.384,1271.211593
1355,Sierra Leone,2007,6144562,Africa,42.568,862.540756
887,Lesotho,2007,2012649,Africa,42.592,1569.331442
...,...,...,...,...,...,...
408,Denmark,1952,4334000,Europe,70.780,9692.385245
1464,Sweden,1952,7124673,Europe,71.860,8527.844662
1080,Netherlands,1952,10381988,Europe,72.130,8941.571858
684,Iceland,1952,147962,Europe,72.490,7267.688428


## Quiz

In [None]:
# Sorts df in place by country, then by population within each country, both descending.
# With inplace=True the call returns None, so this cell displays nothing.
df.sort_values(['country','population'], inplace=True, ascending=False)

## Concatenating DataFrames

In [None]:
# Users table: one row per user, built from (userid, name) records.
users = pd.DataFrame(
    [(1, "sharadh"), (2, "shahid"), (3, "khusalli")],
    columns=["userid", "name"],
)
users

Unnamed: 0,userid,name
0,1,sharadh
1,2,shahid
2,3,khusalli


In [None]:
# Messages table: one row per message, built from (userid, msg) records.
# userid 4 has no matching row in `users`, and user 3 has no message.
msgs = pd.DataFrame(
    [(1, 'hmm'), (1, "acha"), (2, "theek hai"), (4, "nice")],
    columns=["userid", "msg"],
)
msgs

Unnamed: 0,userid,msg
0,1,hmm
1,1,acha
2,2,theek hai
3,4,nice


In [None]:
pd.concat([users, msgs], axis = 0)

Unnamed: 0,userid,name,msg
0,1,sharadh,
1,2,shahid,
2,3,khusalli,
0,1,,hmm
1,1,,acha
2,2,,theek hai
3,4,,nice


In [None]:
pd.concat([users, msgs], axis = 0, ignore_index = True)

Unnamed: 0,userid,name,msg
0,1,sharadh,
1,2,shahid,
2,3,khusalli,
3,1,,hmm
4,1,,acha
5,2,,theek hai
6,4,,nice


In [None]:
# axis=1 places the frames side by side, aligning rows on the index.
# users has no row for index 3, so its columns are NaN there (which is why userid shows as float).
pd.concat([users, msgs], axis = 1)

Unnamed: 0,userid,name,userid.1,msg
0,1.0,sharadh,1,hmm
1,2.0,shahid,1,acha
2,3.0,khusalli,2,theek hai
3,,,4,nice


## Merging

In [None]:
users

Unnamed: 0,userid,name
0,1,sharadh
1,2,shahid
2,3,khusalli


In [None]:
msgs

Unnamed: 0,userid,msg
0,1,hmm
1,1,acha
2,2,theek hai
3,4,nice


In [None]:
users.merge(msgs, on="userid") # Like INNER JOIN

Unnamed: 0,userid,name,msg
0,1,sharadh,hmm
1,1,sharadh,acha
2,2,shahid,theek hai


In [None]:
users.merge(msgs, on="userid", how="outer") # Like Full outer join

Unnamed: 0,userid,name,msg
0,1,sharadh,hmm
1,1,sharadh,acha
2,2,shahid,theek hai
3,3,khusalli,
4,4,,nice


In [None]:
users.merge(msgs, on="userid", how="left") # Like Left Join

Unnamed: 0,userid,name,msg
0,1,sharadh,hmm
1,1,sharadh,acha
2,2,shahid,theek hai
3,3,khusalli,


In [None]:
users.merge(msgs, on="userid", how="right") # like right join

Unnamed: 0,userid,name,msg
0,1,sharadh,hmm
1,1,sharadh,acha
2,2,shahid,theek hai
3,4,,nice


In [None]:
# Rename the key column so that the two frames no longer share a column name.
# Note: this changes `users` in place -- the on="userid" merges above will fail if re-run after this cell.
users.rename(columns = {"userid": "id"}, inplace=True)
users

Unnamed: 0,id,name
0,1,sharadh
1,2,shahid
2,3,khusalli


In [None]:
users.merge(msgs, left_on="id", right_on="userid") # Incase the common columns are of different names

Unnamed: 0,id,name,userid,msg
0,1,sharadh,1,hmm
1,1,sharadh,1,acha
2,2,shahid,2,theek hai


## Quiz

In [None]:
# Quiz frames, written row by row: df1 has columns A, B, C (3 rows); df2 has A, C (2 rows).
df1 = pd.DataFrame(
    [[10, 20, 30],
     [30, 40, 60],
     [21, 21, 21]],
    columns=['A', 'B', 'C'],
)
df2 = pd.DataFrame(
    [[10, 30],
     [30, 60]],
    columns=['A', 'C'],
)

In [None]:
df1

Unnamed: 0,A,B,C
0,10,20,30
1,30,40,60
2,21,21,21


In [None]:
df2

Unnamed: 0,A,C
0,10,30
1,30,60


In [None]:
df2.merge(df1, on = 'A', how = 'right')

Unnamed: 0,A,C_x,B,C_y
0,10,30.0,20,30
1,30,60.0,40,60
2,21,,21,21


In [None]:
df2.merge(df1, on = 'A', how = 'outer').shape

(3, 4)

In [None]:
df2.merge(df1, on = 'A')

Unnamed: 0,A,C_x,B,C_y
0,10,30,20,30
1,30,60,40,60
