## Pandas

In [1]:
import numpy as np
import pandas as pd

### You can create a Series object from a list, a NumPy array, or a dictionary

### Series with List

In [2]:
# Sample values 0-4 and one single-character label per value.
my_list = list(range(5))
my_index = list('risha')

In [3]:
my_list, my_index

([0, 1, 2, 3, 4], ['r', 'i', 's', 'h', 'a'])

In [4]:
pd.Series(data=my_list, index=my_index)

r    0
i    1
s    2
h    3
a    4
dtype: int64

In [5]:
pd.Series(my_list)

0    0
1    1
2    2
3    3
4    4
dtype: int64

In [6]:
# Same Series as pd.Series(data=my_list, index=my_index): the first positional
# argument is the data and the second is the index.
pd.Series(my_list,my_index)

r    0
i    1
s    2
h    3
a    4
dtype: int64

### Series with NumPy array

In [7]:
# One-dimensional integer ndarray holding 0..4.
array = np.arange(5)

In [8]:
array, my_index

(array([0, 1, 2, 3, 4]), ['r', 'i', 's', 'h', 'a'])

In [9]:
pd.Series(data = array)

0    0
1    1
2    2
3    3
4    4
dtype: int32

In [10]:
pd.Series(data = array, index = my_index)

r    0
i    1
s    2
h    3
a    4
dtype: int32

### Series with Dictionary

In [11]:
# Word -> number mapping, in insertion order one..four.
dict1 = dict(one=1, two=2, three=3, four=4)

In [12]:
dict1

{'one': 1, 'two': 2, 'three': 3, 'four': 4}

In [13]:
ser1 = pd.Series(dict1)
ser1

one      1
two      2
three    3
four     4
dtype: int64

## Series can hold a wide variety of data types

In [14]:
# A Series of strings: the values are letters, the index labels are the integers 1-4.
list1 = list('abcd')
list2 = list(range(1, 5))
ser2 = pd.Series(list1, index=list2)
ser2

1    a
2    b
3    c
4    d
dtype: object

In [15]:
# A Series can also store arbitrary Python objects, here built-in callables,
# each labelled with its own name.
list1 = [sum, min, max, type, range]
list2 = [fn.__name__ for fn in list1]
ser3 = pd.Series(list1, index=list2)
ser3

sum      <built-in function sum>
min      <built-in function min>
max      <built-in function max>
type              <class 'type'>
range            <class 'range'>
dtype: object

## Grabbing data from series

In [16]:
# ser2 is indexed by the integer labels 1-4, so this is a lookup by label
# (1 -> 'a'), not by position; .loc makes that explicit.
ser2.loc[1]

'a'

In [17]:
# City -> value data. ser2 leaves out Delhi and ser3 leaves out Chennai, so the
# three Series have only partly overlapping indexes.
dict1 = {'Mumbai': 100, 'Delhi': 200, 'Kolkata': 300, 'Chennai': 400, 'Bengaluru': 500}
ser1 = pd.Series(dict1)
ser2 = ser1.drop('Delhi')
ser3 = ser1.drop('Chennai')

In [18]:
ser1['Mumbai']

100

In [19]:
ser1

Mumbai       100
Delhi        200
Kolkata      300
Chennai      400
Bengaluru    500
dtype: int64

In [20]:
ser2

Mumbai       100
Kolkata      300
Chennai      400
Bengaluru    500
dtype: int64

In [21]:
ser3

Mumbai       100
Delhi        200
Kolkata      300
Bengaluru    500
dtype: int64

### Basic operations on Series

In [22]:
# Addition aligns the operands on their index labels: 'Delhi' is absent from ser2,
# so its sum is NaN and the result is float64.
ser1 +  ser2

Bengaluru    1000.0
Chennai       800.0
Delhi           NaN
Kolkata       600.0
Mumbai        200.0
dtype: float64

In [23]:
ser2 + ser3

Bengaluru    1000.0
Chennai         NaN
Delhi           NaN
Kolkata       600.0
Mumbai        200.0
dtype: float64

## Basic functions and attributes

In [24]:
ser = ser2 + ser3
ser

Bengaluru    1000.0
Chennai         NaN
Delhi           NaN
Kolkata       600.0
Mumbai        200.0
dtype: float64

In [25]:
ser.isnull()

Bengaluru    False
Chennai       True
Delhi         True
Kolkata      False
Mumbai       False
dtype: bool

In [26]:
ser.notnull()

Bengaluru     True
Chennai      False
Delhi        False
Kolkata       True
Mumbai        True
dtype: bool

In [27]:
ser.size

5

In [28]:
ser.axes

[Index(['Bengaluru', 'Chennai', 'Delhi', 'Kolkata', 'Mumbai'], dtype='object')]

In [29]:
ser.values

array([1000.,   nan,   nan,  600.,  200.])

In [30]:
ser.empty

False

In [31]:
ser.max()

1000.0

In [32]:
ser.min()

200.0

In [33]:
ser.std()

400.0

In [34]:
ser.var()

160000.0

In [35]:
ser.sum()

1800.0

In [36]:
ser.mean()

600.0

In [37]:
ser.head(2)

Bengaluru    1000.0
Chennai         NaN
dtype: float64

In [38]:
ser.tail(3)

Delhi        NaN
Kolkata    600.0
Mumbai     200.0
dtype: float64

## Dataframes

### Creating a dataframe

In [39]:
# Row labels (countries) and column labels (c1..c10) for the demo frame.
rows = 'India Australia Japan US Russia England France Germany Bhutan Fiji'.split()
cols = 'c1 c2 c3 c4 c5 c6 c7 c8 c9 c10'.split()

# Seed NumPy's global RNG so that "Restart & Run All" reproduces the same table.
SEED = 42
np.random.seed(SEED)

# One random integer in [0, 100) per (row, column) cell; the shape follows the labels.
data = np.random.randint(0, 100, len(rows) * len(cols)).reshape(len(rows), len(cols))

In [40]:
rows

['India',
 'Australia',
 'Japan',
 'US',
 'Russia',
 'England',
 'France',
 'Germany',
 'Bhutan',
 'Fiji']

In [41]:
cols

['c1', 'c2', 'c3', 'c4', 'c5', 'c6', 'c7', 'c8', 'c9', 'c10']

In [42]:
data

array([[96, 97, 99, 94,  3,  3, 90, 84, 30, 30],
       [22, 12, 98,  9, 30, 47,  0, 59, 59, 96],
       [34, 77, 24, 93, 59, 64, 56, 96, 91, 62],
       [12,  0, 44, 28,  9, 41, 69, 72, 30, 27],
       [ 5, 88, 58, 83, 80, 29, 31, 89, 28, 73],
       [ 4, 13,  5, 63, 53, 32, 89, 55, 78, 86],
       [12, 38, 59, 11,  8, 57, 91, 74, 44, 29],
       [ 5, 10, 63, 69, 31, 72, 33, 46, 33, 93],
       [82, 19, 56, 77, 22, 40, 31, 22, 58, 64],
       [37, 52, 54, 43, 31, 44, 80, 29, 13, 48]])

In [43]:
df = pd.DataFrame(data=data, index=rows, columns=cols)

In [44]:
df

Unnamed: 0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10
India,96,97,99,94,3,3,90,84,30,30
Australia,22,12,98,9,30,47,0,59,59,96
Japan,34,77,24,93,59,64,56,96,91,62
US,12,0,44,28,9,41,69,72,30,27
Russia,5,88,58,83,80,29,31,89,28,73
England,4,13,5,63,53,32,89,55,78,86
France,12,38,59,11,8,57,91,74,44,29
Germany,5,10,63,69,31,72,33,46,33,93
Bhutan,82,19,56,77,22,40,31,22,58,64
Fiji,37,52,54,43,31,44,80,29,13,48


 ## Getting data out of DataFrames

### Selecting columns

In [45]:
df['c1']

India        96
Australia    22
Japan        34
US           12
Russia        5
England       4
France       12
Germany       5
Bhutan       82
Fiji         37
Name: c1, dtype: int32

In [46]:
df[['c1','c2']]

Unnamed: 0,c1,c2
India,96,97
Australia,22,12
Japan,34,77
US,12,0
Russia,5,88
England,4,13
France,12,38
Germany,5,10
Bhutan,82,19
Fiji,37,52


### Selecting rows

In [47]:
df.loc['India']

c1     96
c2     97
c3     99
c4     94
c5      3
c6      3
c7     90
c8     84
c9     30
c10    30
Name: India, dtype: int32

In [48]:
df.iloc[3]

c1     12
c2      0
c3     44
c4     28
c5      9
c6     41
c7     69
c8     72
c9     30
c10    27
Name: US, dtype: int32

In [49]:
df.loc[['India','England','Russia']]

Unnamed: 0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10
India,96,97,99,94,3,3,90,84,30,30
England,4,13,5,63,53,32,89,55,78,86
Russia,5,88,58,83,80,29,31,89,28,73


### Selecting subset of a dataframe

In [50]:
df

Unnamed: 0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10
India,96,97,99,94,3,3,90,84,30,30
Australia,22,12,98,9,30,47,0,59,59,96
Japan,34,77,24,93,59,64,56,96,91,62
US,12,0,44,28,9,41,69,72,30,27
Russia,5,88,58,83,80,29,31,89,28,73
England,4,13,5,63,53,32,89,55,78,86
France,12,38,59,11,8,57,91,74,44,29
Germany,5,10,63,69,31,72,33,46,33,93
Bhutan,82,19,56,77,22,40,31,22,58,64
Fiji,37,52,54,43,31,44,80,29,13,48


In [51]:
df.loc[['Japan','US','Russia'],['c5','c6','c7']]

Unnamed: 0,c5,c6,c7
Japan,59,64,56
US,9,41,69
Russia,80,29,31


### Adding a new column

In [52]:
# Add a new column by assignment. Sizing the values from the frame (instead of a
# hard-coded 10) keeps the cell valid whatever the current number of rows is.
df['c11'] = np.arange(len(df))

In [53]:
df

Unnamed: 0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11
India,96,97,99,94,3,3,90,84,30,30,0
Australia,22,12,98,9,30,47,0,59,59,96,1
Japan,34,77,24,93,59,64,56,96,91,62,2
US,12,0,44,28,9,41,69,72,30,27,3
Russia,5,88,58,83,80,29,31,89,28,73,4
England,4,13,5,63,53,32,89,55,78,86,5
France,12,38,59,11,8,57,91,74,44,29,6
Germany,5,10,63,69,31,72,33,46,33,93,7
Bhutan,82,19,56,77,22,40,31,22,58,64,8
Fiji,37,52,54,43,31,44,80,29,13,48,9


### Adding a new row

In [54]:
# Add a new row by label, drawing exactly one random value in [0, 100) per existing column.
df.loc['Poland'] = np.random.randint(0, 100, df.shape[1])

In [55]:
df

Unnamed: 0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11
India,96,97,99,94,3,3,90,84,30,30,0
Australia,22,12,98,9,30,47,0,59,59,96,1
Japan,34,77,24,93,59,64,56,96,91,62,2
US,12,0,44,28,9,41,69,72,30,27,3
Russia,5,88,58,83,80,29,31,89,28,73,4
England,4,13,5,63,53,32,89,55,78,86,5
France,12,38,59,11,8,57,91,74,44,29,6
Germany,5,10,63,69,31,72,33,46,33,93,7
Bhutan,82,19,56,77,22,40,31,22,58,64,8
Fiji,37,52,54,43,31,44,80,29,13,48,9


### Dropping a column

In [56]:
# Returns a new frame without column 'c11'; df itself is left unchanged.
df.drop(columns='c11')

Unnamed: 0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10
India,96,97,99,94,3,3,90,84,30,30
Australia,22,12,98,9,30,47,0,59,59,96
Japan,34,77,24,93,59,64,56,96,91,62
US,12,0,44,28,9,41,69,72,30,27
Russia,5,88,58,83,80,29,31,89,28,73
England,4,13,5,63,53,32,89,55,78,86
France,12,38,59,11,8,57,91,74,44,29
Germany,5,10,63,69,31,72,33,46,33,93
Bhutan,82,19,56,77,22,40,31,22,58,64
Fiji,37,52,54,43,31,44,80,29,13,48


### Dropping a row

In [57]:
# Returns a new frame without row 'Poland'; df itself is left unchanged.
df.drop(index='Poland')

Unnamed: 0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11
India,96,97,99,94,3,3,90,84,30,30,0
Australia,22,12,98,9,30,47,0,59,59,96,1
Japan,34,77,24,93,59,64,56,96,91,62,2
US,12,0,44,28,9,41,69,72,30,27,3
Russia,5,88,58,83,80,29,31,89,28,73,4
England,4,13,5,63,53,32,89,55,78,86,5
France,12,38,59,11,8,57,91,74,44,29,6
Germany,5,10,63,69,31,72,33,46,33,93,7
Bhutan,82,19,56,77,22,40,31,22,58,64,8
Fiji,37,52,54,43,31,44,80,29,13,48,9


## Conditional Selection

In [58]:
df >=50

Unnamed: 0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11
India,True,True,True,True,False,False,True,True,False,False,False
Australia,False,False,True,False,False,False,False,True,True,True,False
Japan,False,True,False,True,True,True,True,True,True,True,False
US,False,False,False,False,False,False,True,True,False,False,False
Russia,False,True,True,True,True,False,False,True,False,True,False
England,False,False,False,True,True,False,True,True,True,True,False
France,False,False,True,False,False,True,True,True,False,False,False
Germany,False,False,True,True,False,True,False,False,False,True,False
Bhutan,True,False,True,True,False,False,False,False,True,True,False
Fiji,False,True,True,False,False,False,True,False,False,False,False


In [59]:
df[df>=50]

Unnamed: 0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11
India,96.0,97.0,99.0,94.0,,,90.0,84.0,,,
Australia,,,98.0,,,,,59.0,59.0,96.0,
Japan,,77.0,,93.0,59.0,64.0,56.0,96.0,91.0,62.0,
US,,,,,,,69.0,72.0,,,
Russia,,88.0,58.0,83.0,80.0,,,89.0,,73.0,
England,,,,63.0,53.0,,89.0,55.0,78.0,86.0,
France,,,59.0,,,57.0,91.0,74.0,,,
Germany,,,63.0,69.0,,72.0,,,,93.0,
Bhutan,82.0,,56.0,77.0,,,,,58.0,64.0,
Fiji,,52.0,54.0,,,,80.0,,,,


In [60]:
df[df['c1']>=50]

Unnamed: 0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11
India,96,97,99,94,3,3,90,84,30,30,0
Bhutan,82,19,56,77,22,40,31,22,58,64,8
Poland,69,75,64,74,52,25,57,11,59,60,52


In [61]:
df[df['c1']>=50].loc[['India','Bhutan']]

Unnamed: 0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11
India,96,97,99,94,3,3,90,84,30,30,0
Bhutan,82,19,56,77,22,40,31,22,58,64,8


In [62]:
 df[(df>50) | (df%20==0)]

Unnamed: 0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11
India,96.0,97.0,99.0,94.0,,,90.0,84.0,,,0.0
Australia,,,98.0,,,,0.0,59.0,59.0,96.0,
Japan,,77.0,,93.0,59.0,64.0,56.0,96.0,91.0,62.0,
US,,0.0,,,,,69.0,72.0,,,
Russia,,88.0,58.0,83.0,80.0,,,89.0,,73.0,
England,,,,63.0,53.0,,89.0,55.0,78.0,86.0,
France,,,59.0,,,57.0,91.0,74.0,,,
Germany,,,63.0,69.0,,72.0,,,,93.0,
Bhutan,82.0,,56.0,77.0,,40.0,,,58.0,64.0,
Fiji,,52.0,54.0,,,,80.0,,,,


### Useful Methods

In [63]:
### RESET INDEX
df.reset_index()

Unnamed: 0,index,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11
0,India,96,97,99,94,3,3,90,84,30,30,0
1,Australia,22,12,98,9,30,47,0,59,59,96,1
2,Japan,34,77,24,93,59,64,56,96,91,62,2
3,US,12,0,44,28,9,41,69,72,30,27,3
4,Russia,5,88,58,83,80,29,31,89,28,73,4
5,England,4,13,5,63,53,32,89,55,78,86,5
6,France,12,38,59,11,8,57,91,74,44,29,6
7,Germany,5,10,63,69,31,72,33,46,33,93,7
8,Bhutan,82,19,56,77,22,40,31,22,58,64,8
9,Fiji,37,52,54,43,31,44,80,29,13,48,9


In [64]:
df['new_ind'] = 'a b c d e f g h i j k'.split()

In [65]:
df

Unnamed: 0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,new_ind
India,96,97,99,94,3,3,90,84,30,30,0,a
Australia,22,12,98,9,30,47,0,59,59,96,1,b
Japan,34,77,24,93,59,64,56,96,91,62,2,c
US,12,0,44,28,9,41,69,72,30,27,3,d
Russia,5,88,58,83,80,29,31,89,28,73,4,e
England,4,13,5,63,53,32,89,55,78,86,5,f
France,12,38,59,11,8,57,91,74,44,29,6,g
Germany,5,10,63,69,31,72,33,46,33,93,7,h
Bhutan,82,19,56,77,22,40,31,22,58,64,8,i
Fiji,37,52,54,43,31,44,80,29,13,48,9,j


In [66]:
# Promote the 'new_ind' column to the index by rebinding df to the returned frame
# rather than mutating it with inplace=True.
df = df.set_index('new_ind')

In [67]:
df

Unnamed: 0_level_0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11
new_ind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
a,96,97,99,94,3,3,90,84,30,30,0
b,22,12,98,9,30,47,0,59,59,96,1
c,34,77,24,93,59,64,56,96,91,62,2
d,12,0,44,28,9,41,69,72,30,27,3
e,5,88,58,83,80,29,31,89,28,73,4
f,4,13,5,63,53,32,89,55,78,86,5
g,12,38,59,11,8,57,91,74,44,29,6
h,5,10,63,69,31,72,33,46,33,93,7
i,82,19,56,77,22,40,31,22,58,64,8
j,37,52,54,43,31,44,80,29,13,48,9


In [68]:
df.head()

Unnamed: 0_level_0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11
new_ind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
a,96,97,99,94,3,3,90,84,30,30,0
b,22,12,98,9,30,47,0,59,59,96,1
c,34,77,24,93,59,64,56,96,91,62,2
d,12,0,44,28,9,41,69,72,30,27,3
e,5,88,58,83,80,29,31,89,28,73,4


In [69]:
df.tail()

Unnamed: 0_level_0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11
new_ind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
g,12,38,59,11,8,57,91,74,44,29,6
h,5,10,63,69,31,72,33,46,33,93,7
i,82,19,56,77,22,40,31,22,58,64,8
j,37,52,54,43,31,44,80,29,13,48,9
k,69,75,64,74,52,25,57,11,59,60,52


In [70]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11 entries, a to k
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   c1      11 non-null     int32
 1   c2      11 non-null     int32
 2   c3      11 non-null     int32
 3   c4      11 non-null     int32
 4   c5      11 non-null     int32
 5   c6      11 non-null     int32
 6   c7      11 non-null     int32
 7   c8      11 non-null     int32
 8   c9      11 non-null     int32
 9   c10     11 non-null     int32
 10  c11     11 non-null     int32
dtypes: int32(11)
memory usage: 572.0+ bytes


In [71]:
df.describe()

Unnamed: 0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11
count,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0
mean,34.363636,43.727273,56.727273,58.545455,34.363636,41.272727,57.0,57.909091,47.545455,60.727273,8.818182
std,33.27844,35.4911,27.397412,31.049521,24.216824,19.277401,30.199338,28.342387,23.670081,25.175746,14.606972
min,4.0,0.0,5.0,9.0,3.0,3.0,0.0,11.0,13.0,27.0,0.0
25%,8.5,12.5,49.0,35.5,15.5,30.5,32.0,37.5,30.0,39.0,2.5
50%,22.0,38.0,58.0,69.0,31.0,41.0,57.0,59.0,44.0,62.0,5.0
75%,53.0,76.0,63.5,80.0,52.5,52.0,84.5,79.0,59.0,79.5,7.5
max,96.0,97.0,99.0,94.0,80.0,72.0,91.0,96.0,91.0,96.0,52.0


## Hierarchical Indexing

In [72]:
# Three parallel label lists, one per index level (region, parameter, year).
regions = ['Asia', 'Europe', 'Africa']
parameters = ['Population', 'GDP', 'Gini']
years = [2010, 2011, 2012]
index = [[region for region in regions for _ in parameters],
         parameters * len(regions),
         years * len(regions)]

# 9 x 3 table of random single-digit integers, one column per source.
data = np.random.randint(0, 10, 9 * 3).reshape(9, 3)
cols = 'UN,World Bank,IMF'.split(',')

In [73]:
df = pd.DataFrame(data=data,index=index,columns=cols)

In [74]:
df

Unnamed: 0,Unnamed: 1,Unnamed: 2,UN,World Bank,IMF
Asia,Population,2010,6,9,7
Asia,GDP,2011,5,1,8
Asia,Gini,2012,1,0,3
Europe,Population,2010,8,0,4
Europe,GDP,2011,8,1,8
Europe,Gini,2012,4,2,7
Africa,Population,2010,7,2,4
Africa,GDP,2011,1,7,2
Africa,Gini,2012,9,3,3


In [75]:
df.index.names

FrozenList([None, None, None])

In [76]:
df.index.names = ['Region','Parameter','Year']

In [77]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,UN,World Bank,IMF
Region,Parameter,Year,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Asia,Population,2010,6,9,7
Asia,GDP,2011,5,1,8
Asia,Gini,2012,1,0,3
Europe,Population,2010,8,0,4
Europe,GDP,2011,8,1,8
Europe,Gini,2012,4,2,7
Africa,Population,2010,7,2,4
Africa,GDP,2011,1,7,2
Africa,Gini,2012,9,3,3


In [78]:
df.loc['Asia'].loc[['GDP','Gini'],['UN','World Bank']]

Unnamed: 0_level_0,Unnamed: 1_level_0,UN,World Bank
Parameter,Year,Unnamed: 2_level_1,Unnamed: 3_level_1
GDP,2011,5,1
Gini,2012,1,0


In [79]:
df['UN']

Region  Parameter   Year
Asia    Population  2010    6
        GDP         2011    5
        Gini        2012    1
Europe  Population  2010    8
        GDP         2011    8
        Gini        2012    4
Africa  Population  2010    7
        GDP         2011    1
        Gini        2012    9
Name: UN, dtype: int32

In [80]:
# Sort the MultiIndex before the partial-key lookup: selecting on an unsorted
# MultiIndex can make pandas emit a PerformanceWarning (indexing past lexsort depth).
# The selected rows are the same either way.
df.sort_index().xs(('Asia','GDP'))

  return runner(coro)


Unnamed: 0_level_0,UN,World Bank,IMF
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2011,5,1,8


In [81]:
df.xs(key=('GDP',2011),level=['Parameter','Year'])

Unnamed: 0_level_0,UN,World Bank,IMF
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Asia,5,1,8
Europe,8,1,8
Africa,1,7,2


## Handling missing data

In [82]:
# Build a DataFrame from a dict of equal-length columns; NaN marks a missing value
# ('US' is entirely missing, 'Canada' is complete).
df = pd.DataFrame({
    'India': [0, np.nan, 2, 3, 4],
    'China': [10, np.nan, 12, 13, np.nan],
    'US': [np.nan] * 5,
    'Canada': list(range(20, 25)),
})

In [83]:
df

Unnamed: 0,India,China,US,Canada
0,0.0,10.0,,20
1,,,,21
2,2.0,12.0,,22
3,3.0,13.0,,23
4,4.0,,,24


In [84]:
df.isnull()

Unnamed: 0,India,China,US,Canada
0,False,False,True,False
1,True,True,True,False
2,False,False,True,False
3,False,False,True,False
4,False,True,True,False


In [85]:
df.notnull()

Unnamed: 0,India,China,US,Canada
0,True,True,False,True
1,False,False,False,True
2,True,True,False,True
3,True,True,False,True
4,True,False,False,True


In [86]:
df.dropna()

Unnamed: 0,India,China,US,Canada


In [87]:
df.dropna(axis=1)

Unnamed: 0,Canada
0,20
1,21
2,22
3,23
4,24


In [88]:
# thresh=2 keeps only the rows that have at least two non-missing values,
# so row 1 (only 'Canada' is present) is dropped.
df.dropna(thresh=2)

Unnamed: 0,India,China,US,Canada
0,0.0,10.0,,20
2,2.0,12.0,,22
3,3.0,13.0,,23
4,4.0,,,24


In [89]:
df.fillna(20)

Unnamed: 0,India,China,US,Canada
0,0.0,10.0,20.0,20
1,20.0,20.0,20.0,21
2,2.0,12.0,20.0,22
3,3.0,13.0,20.0,23
4,4.0,20.0,20.0,24


In [90]:
# Forward-fill: carry the last valid value down each column. DataFrame.ffill()
# replaces the deprecated fillna(method='ffill') spelling.
df.ffill()

Unnamed: 0,India,China,US,Canada
0,0.0,10.0,,20
1,0.0,10.0,,21
2,2.0,12.0,,22
3,3.0,13.0,,23
4,4.0,13.0,,24


In [91]:
# Backward-fill: pull the next valid value up each column. DataFrame.bfill()
# replaces the deprecated fillna(method='bfill') spelling.
df.bfill()

Unnamed: 0,India,China,US,Canada
0,0.0,10.0,,20
1,2.0,12.0,,21
2,2.0,12.0,,22
3,3.0,13.0,,23
4,4.0,,,24


In [92]:
# 'pad' was another name for forward fill; with the method= argument deprecated,
# the equivalent call is DataFrame.ffill().
df.ffill()

Unnamed: 0,India,China,US,Canada
0,0.0,10.0,,20
1,0.0,10.0,,21
2,2.0,12.0,,22
3,3.0,13.0,,23
4,4.0,13.0,,24


## Merging and Concatenating

### Merging

In [99]:
# Two frames sharing the join column 'key' (only a, b and c occur in both) and
# using the same value column names X1 / X2.
df1 = pd.DataFrame({'key': list('abcde'), 'X1': list(range(1, 6)), 'X2': range(6, 11)})
df2 = pd.DataFrame({'key': list('abcfg'), 'X1': [1, 2, 3, 11, 12], 'X2': range(4, 9)})

In [100]:
df1

Unnamed: 0,key,X1,X2
0,a,1,6
1,b,2,7
2,c,3,8
3,d,4,9
4,e,5,10


In [101]:
df2

Unnamed: 0,key,X1,X2
0,a,1,4
1,b,2,5
2,c,3,6
3,f,11,7
4,g,12,8


In [102]:
# Inner join on 'key': only keys present in both frames (a, b, c) survive, and the
# clashing X1/X2 columns get the suffixes _x (from df1) and _y (from df2).
pd.merge(df1,df2,how='inner',on='key')

Unnamed: 0,key,X1_x,X2_x,X1_y,X2_y
0,a,1,6,1,4
1,b,2,7,2,5
2,c,3,8,3,6


In [103]:
pd.merge(df1,df2,how='left',on='key')

Unnamed: 0,key,X1_x,X2_x,X1_y,X2_y
0,a,1,6,1.0,4.0
1,b,2,7,2.0,5.0
2,c,3,8,3.0,6.0
3,d,4,9,,
4,e,5,10,,


In [104]:
pd.merge(df1,df2,how='right',on='key')

Unnamed: 0,key,X1_x,X2_x,X1_y,X2_y
0,a,1.0,6.0,1,4
1,b,2.0,7.0,2,5
2,c,3.0,8.0,3,6
3,f,,,11,7
4,g,,,12,8


In [105]:
pd.merge(df1,df2,how='outer',on='key')

Unnamed: 0,key,X1_x,X2_x,X1_y,X2_y
0,a,1.0,6.0,1.0,4.0
1,b,2.0,7.0,2.0,5.0
2,c,3.0,8.0,3.0,6.0
3,d,4.0,9.0,,
4,e,5.0,10.0,,
5,f,,,11.0,7.0
6,g,,,12.0,8.0


In [106]:
# Two frames to be joined on the composite key (key1, key2), written row by row.
left = pd.DataFrame(
    [('a', 'a', 'A0', 'B0'),
     ('a', 'b', 'A1', 'B1'),
     ('b', 'a', 'A2', 'B2'),
     ('c', 'b', 'A3', 'B3')],
    columns=['key1', 'key2', 'A', 'B'],
)

right = pd.DataFrame(
    [('a', 'a', 'C0', 'D0'),
     ('b', 'b', 'C1', 'D1'),
     ('b', 'a', 'C2', 'D2'),
     ('c', 'a', 'C3', 'D3')],
    columns=['key1', 'key2', 'C', 'D'],
)

In [108]:
pd.merge(left,right,how='inner',on=['key1','key2'])

Unnamed: 0,key1,key2,A,B,C,D
0,a,a,A0,B0,C0,D0
1,b,a,A2,B2,C2,D2


In [110]:
pd.merge(left,right,how='right',on=['key1','key2'])

Unnamed: 0,key1,key2,A,B,C,D
0,a,a,A0,B0,C0,D0
1,b,a,A2,B2,C2,D2
2,b,b,,,C1,D1
3,c,a,,,C3,D3


In [111]:
pd.merge(left,right,how='left',on=['key1','key2'])

Unnamed: 0,key1,key2,A,B,C,D
0,a,a,A0,B0,C0,D0
1,a,b,A1,B1,,
2,b,a,A2,B2,C2,D2
3,c,b,A3,B3,,


In [112]:
pd.merge(left,right,how='outer',on=['key1','key2'])

Unnamed: 0,key1,key2,A,B,C,D
0,a,a,A0,B0,C0,D0
1,a,b,A1,B1,,
2,b,a,A2,B2,C2,D2
3,c,b,A3,B3,,
4,b,b,,,C1,D1
5,c,a,,,C3,D3


### Concatenation

In [113]:
# Frame with columns A-D whose cells read "<column><row>" for row labels 0-3.
df1 = pd.DataFrame({col: [col + str(row) for row in range(4)] for col in 'ABCD'},
                   index=[0, 1, 2, 3])

In [114]:
# Frame with columns A-D whose cells read "<column><row>" for row labels 4-7.
df2 = pd.DataFrame({col: [col + str(row) for row in range(4, 8)] for col in 'ABCD'},
                   index=[4, 5, 6, 7])

In [115]:
df1

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3


In [116]:
df2

Unnamed: 0,A,B,C,D
4,A4,B4,C4,D4
5,A5,B5,C5,D5
6,A6,B6,C6,D6
7,A7,B7,C7,D7


In [119]:
pd.concat([df1,df2],axis=1)

Unnamed: 0,A,B,C,D,A.1,B.1,C.1,D.1
0,A0,B0,C0,D0,,,,
1,A1,B1,C1,D1,,,,
2,A2,B2,C2,D2,,,,
3,A3,B3,C3,D3,,,,
4,,,,,A4,B4,C4,D4
5,,,,,A5,B5,C5,D5
6,,,,,A6,B6,C6,D6
7,,,,,A7,B7,C7,D7


In [120]:
pd.concat([df1,df2],axis=0)

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3
4,A4,B4,C4,D4
5,A5,B5,C5,D5
6,A6,B6,C6,D6
7,A7,B7,C7,D7
