# Agenda
* Introduction to Missing Data
* Check for Missing Values [ isnull(), notnull() ]
* Handling Missing Values [ dropna(), fillna(), replace() ]

## Introduction to Missing Data

In [243]:
import pandas as pd
import numpy as np
from IPython.display import display

# Hand-built sample of startup funding records used throughout the notebook.
# Missing entries are written as np.nan, and the last record (index 8) is
# missing in every column, so the table exercises each missing-data tool.
column_order = ['Date', 'StartupName', 'IndustryVertical', 'SubVerical',
                'CityLocation', 'AmountInUSD', 'InvestorsName']

# One entry per column; every list has nine values (rows 0..8).
column_values = {
    'Date': [
        '01/08/2017', '02/08/2017', '02/08/2017', '02/08/2017', '02/08/2017',
        '01/08/2017', '03/07/2017', '04/07/2017', np.nan,
    ],
    'StartupName': [
        'TouchKin', 'Ethinos', 'Leverage Edu', 'Zepo', 'Click2Clunic',
        'Billion Loans', 'Ecolobriumenergy', 'Droom', np.nan,
    ],
    'IndustryVertical': [
        'Technology', 'TEchnology', np.nan, 'ConsumeInternet', 'Consumer Internet',
        np.nan, np.nan, 'eCommerce', np.nan,
    ],
    'SubVerical': [
        'Predicitve Care Platform',
        'Digital Marketing Agency',
        'Online Platform for Higher Education Service',
        np.nan,
        np.nan,
        'Peer to Peer Lenading Platform',
        'Energy Management Solutions Provider',
        'Online Marketplace for Automobiles',
        np.nan,
    ],
    'CityLocation': [
        'Bangalore', 'Mumbai', 'New Delhi', 'Mumbai', 'Hyderabad',
        np.nan, 'Ahmedabad', np.nan, np.nan,
    ],
    'AmountInUSD': [
        1300000, np.nan, np.nan, 500000, 850000,
        1000000, 2600000, 20000000, np.nan,
    ],
    'InvestorsName': [
        'People Group (Shaadi.com)',
        'Hillhouse Capital, Tybourne Capital',
        'Kalaari Capital, Sequoia Capital',
        'Asia Pacific Internet Group',
        'KARSEMVEN Fund',
        'Exfinity Fund, GrowX Ventures',
        'MakeMyTrip',
        'UK based Group of Angel Investors',
        np.nan,
    ],
}

raw_data = pd.DataFrame(column_values, columns=column_order)

# Render the complete sample table so the missing entries are visible up front.
display(raw_data)

Unnamed: 0,Date,StartupName,IndustryVertical,SubVerical,CityLocation,AmountInUSD,InvestorsName
0,01/08/2017,TouchKin,Technology,Predicitve Care Platform,Bangalore,1300000.0,People Group (Shaadi.com)
1,02/08/2017,Ethinos,TEchnology,Digital Marketing Agency,Mumbai,,"Hillhouse Capital, Tybourne Capital"
2,02/08/2017,Leverage Edu,,Online Platform for Higher Education Service,New Delhi,,"Kalaari Capital, Sequoia Capital"
3,02/08/2017,Zepo,ConsumeInternet,,Mumbai,500000.0,Asia Pacific Internet Group
4,02/08/2017,Click2Clunic,Consumer Internet,,Hyderabad,850000.0,KARSEMVEN Fund
5,01/08/2017,Billion Loans,,Peer to Peer Lenading Platform,,1000000.0,"Exfinity Fund, GrowX Ventures"
6,03/07/2017,Ecolobriumenergy,,Energy Management Solutions Provider,Ahmedabad,2600000.0,MakeMyTrip
7,04/07/2017,Droom,eCommerce,Online Marketplace for Automobiles,,20000000.0,UK based Group of Angel Investors
8,,,,,,,


## Check for Missing Values [ isnull(), notnull() ]

### isnull() [ pd.Series.isnull(), pd.DataFrame.isnull() ]

In [189]:
# Element-wise mask over the whole frame: True where a value is missing, False otherwise.
raw_data.isnull()

Unnamed: 0,Date,StartupName,IndustryVertical,SubVerical,CityLocation,AmountInUSD,InvestorsName
0,False,False,False,False,False,False,False
1,False,False,False,False,False,True,False
2,False,False,True,False,False,True,False
3,False,False,False,True,False,False,False
4,False,False,False,True,False,False,False
5,False,False,True,False,True,False,False
6,False,False,True,False,False,False,False
7,False,False,False,False,True,False,False
8,True,True,True,True,True,True,True


In [190]:
# Summing the boolean mask column by column counts the missing values in each column.
raw_data.isnull().sum()

Date                1
StartupName         1
IndustryVertical    4
SubVerical          3
CityLocation        3
AmountInUSD         3
InvestorsName       1
dtype: int64

In [192]:
# The same check on a single column (a Series) gives a boolean Series.
raw_data['CityLocation'].isnull()

0    False
1    False
2    False
3    False
4    False
5     True
6    False
7     True
8     True
Name: CityLocation, dtype: bool

In [264]:
# Number of missing entries in this one column.
raw_data['CityLocation'].isnull().sum()

3

### notnull() [ pd.Series.notnull(), pd.DataFrame.notnull() ]

In [194]:
# Inverse of isnull(): True where a value is present, False where it is missing.
raw_data.notnull()

Unnamed: 0,Date,StartupName,IndustryVertical,SubVerical,CityLocation,AmountInUSD,InvestorsName
0,True,True,True,True,True,True,True
1,True,True,True,True,True,False,True
2,True,True,False,True,True,False,True
3,True,True,True,False,True,True,True
4,True,True,True,False,True,True,True
5,True,True,False,True,False,True,True
6,True,True,False,True,True,True,True
7,True,True,True,True,False,True,True
8,False,False,False,False,False,False,False


In [265]:
# Presence mask for a single column.
raw_data['AmountInUSD'].notnull()

0     True
1    False
2    False
3     True
4     True
5     True
6     True
7     True
8    False
Name: AmountInUSD, dtype: bool

## Handling Missing Values [ dropna(), fillna(), replace() ]

### dropna() [ pd.Series.dropna(), pd.DataFrame.dropna() ]

**Drop missing observations**

In [197]:
# dropna() with its defaults written out: rows are the candidates (axis=0) and a
# row is discarded as soon as any one of its values is missing (how='any').
# The result is a new frame; raw_data is left as it was.
no_missing_value_data = raw_data.dropna(axis=0, how='any')
display(no_missing_value_data)

Unnamed: 0,Date,StartupName,IndustryVertical,SubVerical,CityLocation,AmountInUSD,InvestorsName
0,01/08/2017,TouchKin,Technology,Predicitve Care Platform,Bangalore,1300000.0,People Group (Shaadi.com)


In [199]:
# Sanity check: every column of the dropna() result should report zero missing values.
no_missing_value_data.isnull().sum()

Date                0
StartupName         0
IndustryVertical    0
SubVerical          0
CityLocation        0
AmountInUSD         0
InvestorsName       0
dtype: int64

**Drop rows only if all of their values are missing**

In [200]:
# how='all' removes a row only when every value in it is missing (row 8 here).
# A new frame is returned; raw_data is not modified.
raw_data.dropna(how='all')

Unnamed: 0,Date,StartupName,IndustryVertical,SubVerical,CityLocation,AmountInUSD,InvestorsName
0,01/08/2017,TouchKin,Technology,Predicitve Care Platform,Bangalore,1300000.0,People Group (Shaadi.com)
1,02/08/2017,Ethinos,TEchnology,Digital Marketing Agency,Mumbai,,"Hillhouse Capital, Tybourne Capital"
2,02/08/2017,Leverage Edu,,Online Platform for Higher Education Service,New Delhi,,"Kalaari Capital, Sequoia Capital"
3,02/08/2017,Zepo,ConsumeInternet,,Mumbai,500000.0,Asia Pacific Internet Group
4,02/08/2017,Click2Clunic,Consumer Internet,,Hyderabad,850000.0,KARSEMVEN Fund
5,01/08/2017,Billion Loans,,Peer to Peer Lenading Platform,,1000000.0,"Exfinity Fund, GrowX Ventures"
6,03/07/2017,Ecolobriumenergy,,Energy Management Solutions Provider,Ahmedabad,2600000.0,MakeMyTrip
7,04/07/2017,Droom,eCommerce,Online Marketplace for Automobiles,,20000000.0,UK based Group of Angel Investors


In [201]:
# Same call with the axis spelled out: axis=0 means rows are the candidates for dropping.
raw_data.dropna(axis = 0, how = 'all')

Unnamed: 0,Date,StartupName,IndustryVertical,SubVerical,CityLocation,AmountInUSD,InvestorsName
0,01/08/2017,TouchKin,Technology,Predicitve Care Platform,Bangalore,1300000.0,People Group (Shaadi.com)
1,02/08/2017,Ethinos,TEchnology,Digital Marketing Agency,Mumbai,,"Hillhouse Capital, Tybourne Capital"
2,02/08/2017,Leverage Edu,,Online Platform for Higher Education Service,New Delhi,,"Kalaari Capital, Sequoia Capital"
3,02/08/2017,Zepo,ConsumeInternet,,Mumbai,500000.0,Asia Pacific Internet Group
4,02/08/2017,Click2Clunic,Consumer Internet,,Hyderabad,850000.0,KARSEMVEN Fund
5,01/08/2017,Billion Loans,,Peer to Peer Lenading Platform,,1000000.0,"Exfinity Fund, GrowX Ventures"
6,03/07/2017,Ecolobriumenergy,,Energy Management Solutions Provider,Ahmedabad,2600000.0,MakeMyTrip
7,04/07/2017,Droom,eCommerce,Online Marketplace for Automobiles,,20000000.0,UK based Group of Angel Investors


**Drop columns only if all of their values are missing**
  * Create a new column full of missing values

In [244]:
# Add a column that is NaN in every row so there is an all-missing column to drop.
# Note: this mutates raw_data, so every later cell sees the extra 'Remarks' column.
raw_data['Remarks'] = np.nan
display(raw_data)

Unnamed: 0,Date,StartupName,IndustryVertical,SubVerical,CityLocation,AmountInUSD,InvestorsName,Remarks
0,01/08/2017,TouchKin,Technology,Predicitve Care Platform,Bangalore,1300000.0,People Group (Shaadi.com),
1,02/08/2017,Ethinos,TEchnology,Digital Marketing Agency,Mumbai,,"Hillhouse Capital, Tybourne Capital",
2,02/08/2017,Leverage Edu,,Online Platform for Higher Education Service,New Delhi,,"Kalaari Capital, Sequoia Capital",
3,02/08/2017,Zepo,ConsumeInternet,,Mumbai,500000.0,Asia Pacific Internet Group,
4,02/08/2017,Click2Clunic,Consumer Internet,,Hyderabad,850000.0,KARSEMVEN Fund,
5,01/08/2017,Billion Loans,,Peer to Peer Lenading Platform,,1000000.0,"Exfinity Fund, GrowX Ventures",
6,03/07/2017,Ecolobriumenergy,,Energy Management Solutions Provider,Ahmedabad,2600000.0,MakeMyTrip,
7,04/07/2017,Droom,eCommerce,Online Marketplace for Automobiles,,20000000.0,UK based Group of Angel Investors,
8,,,,,,,,


 * Drop the columns in which every value is missing

In [203]:
# axis=1 targets columns: 'Remarks' is dropped because all of its values are missing.
# The all-NaN row 8 is kept, since rows are not examined here.
raw_data.dropna(axis=1,how = 'all')

Unnamed: 0,Date,StartupName,IndustryVertical,SubVerical,CityLocation,AmountInUSD,InvestorsName
0,01/08/2017,TouchKin,Technology,Predicitve Care Platform,Bangalore,1300000.0,People Group (Shaadi.com)
1,02/08/2017,Ethinos,TEchnology,Digital Marketing Agency,Mumbai,,"Hillhouse Capital, Tybourne Capital"
2,02/08/2017,Leverage Edu,,Online Platform for Higher Education Service,New Delhi,,"Kalaari Capital, Sequoia Capital"
3,02/08/2017,Zepo,ConsumeInternet,,Mumbai,500000.0,Asia Pacific Internet Group
4,02/08/2017,Click2Clunic,Consumer Internet,,Hyderabad,850000.0,KARSEMVEN Fund
5,01/08/2017,Billion Loans,,Peer to Peer Lenading Platform,,1000000.0,"Exfinity Fund, GrowX Ventures"
6,03/07/2017,Ecolobriumenergy,,Energy Management Solutions Provider,Ahmedabad,2600000.0,MakeMyTrip
7,04/07/2017,Droom,eCommerce,Online Marketplace for Automobiles,,20000000.0,UK based Group of Angel Investors
8,,,,,,,


**Drop rows and columns only if all of their values are missing**

In [204]:
# Show the current frame for reference before dropping all-missing rows and columns.
display(raw_data)

Unnamed: 0,Date,StartupName,IndustryVertical,SubVerical,CityLocation,AmountInUSD,InvestorsName,Remarks
0,01/08/2017,TouchKin,Technology,Predicitve Care Platform,Bangalore,1300000.0,People Group (Shaadi.com),
1,02/08/2017,Ethinos,TEchnology,Digital Marketing Agency,Mumbai,,"Hillhouse Capital, Tybourne Capital",
2,02/08/2017,Leverage Edu,,Online Platform for Higher Education Service,New Delhi,,"Kalaari Capital, Sequoia Capital",
3,02/08/2017,Zepo,ConsumeInternet,,Mumbai,500000.0,Asia Pacific Internet Group,
4,02/08/2017,Click2Clunic,Consumer Internet,,Hyderabad,850000.0,KARSEMVEN Fund,
5,01/08/2017,Billion Loans,,Peer to Peer Lenading Platform,,1000000.0,"Exfinity Fund, GrowX Ventures",
6,03/07/2017,Ecolobriumenergy,,Energy Management Solutions Provider,Ahmedabad,2600000.0,MakeMyTrip,
7,04/07/2017,Droom,eCommerce,Online Marketplace for Automobiles,,20000000.0,UK based Group of Angel Investors,
8,,,,,,,,


In [206]:
# Remove the rows AND the columns in which every value is missing.
# `axis` accepts a single axis only: passing a list such as [0, 1] was deprecated
# in pandas 0.23 and removed in pandas 1.0. Chain one call per axis instead:
# first drop the all-missing rows, then the all-missing columns.
# A new frame is returned; raw_data is not modified.
raw_data.dropna(axis=0, how='all').dropna(axis=1, how='all')

Unnamed: 0,Date,StartupName,IndustryVertical,SubVerical,CityLocation,AmountInUSD,InvestorsName
0,01/08/2017,TouchKin,Technology,Predicitve Care Platform,Bangalore,1300000.0,People Group (Shaadi.com)
1,02/08/2017,Ethinos,TEchnology,Digital Marketing Agency,Mumbai,,"Hillhouse Capital, Tybourne Capital"
2,02/08/2017,Leverage Edu,,Online Platform for Higher Education Service,New Delhi,,"Kalaari Capital, Sequoia Capital"
3,02/08/2017,Zepo,ConsumeInternet,,Mumbai,500000.0,Asia Pacific Internet Group
4,02/08/2017,Click2Clunic,Consumer Internet,,Hyderabad,850000.0,KARSEMVEN Fund
5,01/08/2017,Billion Loans,,Peer to Peer Lenading Platform,,1000000.0,"Exfinity Fund, GrowX Ventures"
6,03/07/2017,Ecolobriumenergy,,Energy Management Solutions Provider,Ahmedabad,2600000.0,MakeMyTrip
7,04/07/2017,Droom,eCommerce,Online Marketplace for Automobiles,,20000000.0,UK based Group of Angel Investors


**Drop rows that contain fewer than 6 non-missing values**

In [207]:
# Show the current frame for reference before applying the threshold.
display(raw_data)

Unnamed: 0,Date,StartupName,IndustryVertical,SubVerical,CityLocation,AmountInUSD,InvestorsName,Remarks
0,01/08/2017,TouchKin,Technology,Predicitve Care Platform,Bangalore,1300000.0,People Group (Shaadi.com),
1,02/08/2017,Ethinos,TEchnology,Digital Marketing Agency,Mumbai,,"Hillhouse Capital, Tybourne Capital",
2,02/08/2017,Leverage Edu,,Online Platform for Higher Education Service,New Delhi,,"Kalaari Capital, Sequoia Capital",
3,02/08/2017,Zepo,ConsumeInternet,,Mumbai,500000.0,Asia Pacific Internet Group,
4,02/08/2017,Click2Clunic,Consumer Internet,,Hyderabad,850000.0,KARSEMVEN Fund,
5,01/08/2017,Billion Loans,,Peer to Peer Lenading Platform,,1000000.0,"Exfinity Fund, GrowX Ventures",
6,03/07/2017,Ecolobriumenergy,,Energy Management Solutions Provider,Ahmedabad,2600000.0,MakeMyTrip,
7,04/07/2017,Droom,eCommerce,Online Marketplace for Automobiles,,20000000.0,UK based Group of Angel Investors,
8,,,,,,,,


In [208]:
# thresh=6 keeps only the rows that have at least 6 non-missing values;
# rows 2, 5 and 8 fall short of that and are dropped.
raw_data.dropna(thresh=6)

Unnamed: 0,Date,StartupName,IndustryVertical,SubVerical,CityLocation,AmountInUSD,InvestorsName,Remarks
0,01/08/2017,TouchKin,Technology,Predicitve Care Platform,Bangalore,1300000.0,People Group (Shaadi.com),
1,02/08/2017,Ethinos,TEchnology,Digital Marketing Agency,Mumbai,,"Hillhouse Capital, Tybourne Capital",
3,02/08/2017,Zepo,ConsumeInternet,,Mumbai,500000.0,Asia Pacific Internet Group,
4,02/08/2017,Click2Clunic,Consumer Internet,,Hyderabad,850000.0,KARSEMVEN Fund,
6,03/07/2017,Ecolobriumenergy,,Energy Management Solutions Provider,Ahmedabad,2600000.0,MakeMyTrip,
7,04/07/2017,Droom,eCommerce,Online Marketplace for Automobiles,,20000000.0,UK based Group of Angel Investors,


### fillna() [ pd.Series.fillna(), pd.DataFrame.fillna() ]

**Fill in missing data with zeros**

In [209]:
# Replace every missing value with 0 in the returned frame, including the text columns.
raw_data.fillna(0)

Unnamed: 0,Date,StartupName,IndustryVertical,SubVerical,CityLocation,AmountInUSD,InvestorsName,Remarks
0,01/08/2017,TouchKin,Technology,Predicitve Care Platform,Bangalore,1300000.0,People Group (Shaadi.com),0.0
1,02/08/2017,Ethinos,TEchnology,Digital Marketing Agency,Mumbai,0.0,"Hillhouse Capital, Tybourne Capital",0.0
2,02/08/2017,Leverage Edu,0,Online Platform for Higher Education Service,New Delhi,0.0,"Kalaari Capital, Sequoia Capital",0.0
3,02/08/2017,Zepo,ConsumeInternet,0,Mumbai,500000.0,Asia Pacific Internet Group,0.0
4,02/08/2017,Click2Clunic,Consumer Internet,0,Hyderabad,850000.0,KARSEMVEN Fund,0.0
5,01/08/2017,Billion Loans,0,Peer to Peer Lenading Platform,0,1000000.0,"Exfinity Fund, GrowX Ventures",0.0
6,03/07/2017,Ecolobriumenergy,0,Energy Management Solutions Provider,Ahmedabad,2600000.0,MakeMyTrip,0.0
7,04/07/2017,Droom,eCommerce,Online Marketplace for Automobiles,0,20000000.0,UK based Group of Angel Investors,0.0
8,0,0,0,0,0,0.0,0,0.0


**Fill in missing values with a chosen value**

In [250]:
# Fill the missing 'Remarks' entries with a placeholder and store the result back.
# Assign the filled column explicitly instead of calling
# raw_data['Remarks'].fillna(..., inplace=True): that form operates on a column
# selection (chained assignment), which warns on pandas 2.x and does not update
# raw_data once Copy-on-Write is enabled.
# Note: this mutates raw_data; later cells see the filled 'Remarks' column.
raw_data['Remarks'] = raw_data['Remarks'].fillna('Missed Remark')
display(raw_data)

Unnamed: 0,Date,StartupName,IndustryVertical,SubVerical,CityLocation,AmountInUSD,InvestorsName,Remarks
0,01/08/2017,TouchKin,Technology,Predicitve Care Platform,Bangalore,1300000.0,People Group (Shaadi.com),Missed Remark
1,02/08/2017,Ethinos,TEchnology,Digital Marketing Agency,Mumbai,,"Hillhouse Capital, Tybourne Capital",Missed Remark
2,02/08/2017,Leverage Edu,,Online Platform for Higher Education Service,New Delhi,,"Kalaari Capital, Sequoia Capital",Missed Remark
3,02/08/2017,Zepo,ConsumeInternet,,Mumbai,500000.0,Asia Pacific Internet Group,Missed Remark
4,02/08/2017,Click2Clunic,Consumer Internet,,Hyderabad,850000.0,KARSEMVEN Fund,Missed Remark
5,01/08/2017,Billion Loans,,Peer to Peer Lenading Platform,,1000000.0,"Exfinity Fund, GrowX Ventures",Missed Remark
6,03/07/2017,Ecolobriumenergy,,Energy Management Solutions Provider,Ahmedabad,2600000.0,MakeMyTrip,Missed Remark
7,04/07/2017,Droom,eCommerce,Online Marketplace for Automobiles,,20000000.0,UK based Group of Angel Investors,Missed Remark
8,,,,,,,,Missed Remark


**Fill in missing values with the previous valid value (forward fill: ffill/pad)**

In [232]:
# Forward fill: each missing amount takes the last valid value above it.
# Series.ffill() is the supported spelling; fillna(method='pad') does the same
# thing but the `method` keyword is deprecated since pandas 2.1.
raw_data['AmountInUSD'].ffill()

0     1300000.0
1     1300000.0
2     1300000.0
3      500000.0
4      850000.0
5     1000000.0
6     2600000.0
7    20000000.0
8    20000000.0
Name: AmountInUSD, dtype: float64

**Fill in missing values with the next valid value (backward fill: bfill/backfill)**

In [236]:
# Backward fill: each missing amount takes the next valid value below it.
# The last row stays NaN because nothing follows it.
# Series.bfill() replaces fillna(method='bfill'), whose `method` keyword is
# deprecated since pandas 2.1.
raw_data['AmountInUSD'].bfill()

0     1300000.0
1      500000.0
2      500000.0
3      500000.0
4      850000.0
5     1000000.0
6     2600000.0
7    20000000.0
8           NaN
Name: AmountInUSD, dtype: float64

**Fill in missing values with ffill/bfill, limiting how many consecutive gaps are filled**

In [241]:
# Forward fill, but fill at most one consecutive missing value per gap (limit=1):
# row 1 is filled from row 0, while row 2 (the second NaN in that run) stays NaN.
# Series.ffill(limit=...) replaces fillna(method='ffill', limit=...), whose
# `method` keyword is deprecated since pandas 2.1.
raw_data['AmountInUSD'].ffill(limit=1)

0     1300000.0
1     1300000.0
2           NaN
3      500000.0
4      850000.0
5     1000000.0
6     2600000.0
7    20000000.0
8    20000000.0
Name: AmountInUSD, dtype: float64

**Select rows, ignoring those with missing data in the chosen columns**

In [253]:
# Keep only the rows where StartupName, AmountInUSD and InvestorsName are all present.
# Each operand of `&` must be a boolean mask. The original cell omitted .notnull()
# on 'AmountInUSD', which AND-ed the raw float column with the masks; that relies
# on legacy coercion and fails on current pandas.
has_required_fields = (
    raw_data['StartupName'].notnull()
    & raw_data['AmountInUSD'].notnull()
    & raw_data['InvestorsName'].notnull()
)
raw_data[has_required_fields]

Unnamed: 0,Date,StartupName,IndustryVertical,SubVerical,CityLocation,AmountInUSD,InvestorsName,Remarks
0,01/08/2017,TouchKin,Technology,Predicitve Care Platform,Bangalore,1300000.0,People Group (Shaadi.com),Missed Remark
3,02/08/2017,Zepo,ConsumeInternet,,Mumbai,500000.0,Asia Pacific Internet Group,Missed Remark
4,02/08/2017,Click2Clunic,Consumer Internet,,Hyderabad,850000.0,KARSEMVEN Fund,Missed Remark
5,01/08/2017,Billion Loans,,Peer to Peer Lenading Platform,,1000000.0,"Exfinity Fund, GrowX Ventures",Missed Remark
6,03/07/2017,Ecolobriumenergy,,Energy Management Solutions Provider,Ahmedabad,2600000.0,MakeMyTrip,Missed Remark
7,04/07/2017,Droom,eCommerce,Online Marketplace for Automobiles,,20000000.0,UK based Group of Angel Investors,Missed Remark


### replace() [ pd.Series.replace(), pd.DataFrame.replace() ]

**Replace missing values with a chosen value**

In [269]:
# Swap every NaN in the column for the label 'Technology'. The arguments are
# passed positionally: the value to look for, then its replacement.
# A new Series is returned; raw_data is not modified.
raw_data['IndustryVertical'].replace(np.nan, 'Technology')

0           Technology
1           TEchnology
2           Technology
3      ConsumeInternet
4    Consumer Internet
5           Technology
6           Technology
7            eCommerce
8           Technology
Name: IndustryVertical, dtype: object