#### Topics Covered
- Missing Values
- Handling missing values
- Deleting a Column

In [2]:
#import pandas library and provide alias name as pd
import pandas as pd
import numpy as np

# Read the search-trend CSV (the copy that contains missing values) into a DataFrame
csv_path = 'Data/SearchTrendData_WithMissing.csv'
trend_data = pd.read_csv(csv_path)

# Preview the first 20 rows
trend_data.head(n=20)

Unnamed: 0,Week,eLearning,DataScience,MachineLearning,ArtificialIntelligence,DeepLearning
0,20/09/2015,32.0,9.0,11.0,11.0,4.0
1,27/09/2015,35.0,10.0,11.0,11.0,4.0
2,4/10/2015,38.0,10.0,12.0,11.0,4.0
3,11/10/2015,37.0,9.0,12.0,10.0,4.0
4,18/10/2015,38.0,9.0,12.0,10.0,4.0
5,25/10/2015,38.0,10.0,,11.0,4.0
6,1/11/2015,37.0,9.0,13.0,11.0,3.0
7,8/11/2015,35.0,8.0,14.0,12.0,5.0
8,15/11/2015,35.0,9.0,13.0,12.0,4.0
9,22/11/2015,31.0,8.0,12.0,10.0,4.0


In [3]:
# Work on the first 20 rows only; reset_index(drop=True) renumbers the rows
# from 0 and discards the old index instead of keeping it as a column.
t_data = trend_data[:20].reset_index(drop=True)
print(t_data)

          Week  eLearning  DataScience  MachineLearning  \
0   20/09/2015       32.0          9.0             11.0   
1   27/09/2015       35.0         10.0             11.0   
2    4/10/2015       38.0         10.0             12.0   
3   11/10/2015       37.0          9.0             12.0   
4   18/10/2015       38.0          9.0             12.0   
5   25/10/2015       38.0         10.0              NaN   
6    1/11/2015       37.0          9.0             13.0   
7    8/11/2015       35.0          8.0             14.0   
8   15/11/2015       35.0          9.0             13.0   
9   22/11/2015       31.0          8.0             12.0   
10  29/11/2015        NaN          9.0             13.0   
11   6/12/2015       33.0          9.0             13.0   
12  13/12/2015       30.0          9.0             14.0   
13  20/12/2015       17.0          7.0             11.0   
14  27/12/2015       17.0          7.0             10.0   
15   3/01/2016       24.0         10.0             12.0 

1a. Finding Missing values

In [7]:
# isnull() marks every missing cell with True; notnull() is its inverse.
# Both also work on the whole frame, e.g. t_data.isnull() / t_data.notnull().
# Here only the MachineLearning column is checked.
ml_is_missing = t_data['MachineLearning'].isnull()
print(ml_is_missing)

0     False
1     False
2     False
3     False
4     False
5      True
6     False
7     False
8     False
9     False
10    False
11    False
12    False
13    False
14    False
15    False
16    False
17    False
18    False
19    False
Name: MachineLearning, dtype: bool


1b. Summing a column that has missing values

In [8]:
# sum() skips NaN by default (skipna=True), so a missing value adds nothing
# to the total -- the same result as if it had been counted as zero.
ml_total = t_data['MachineLearning'].sum(skipna=True)
print(ml_total)

237.0


1c. Fetching all the records with a missing value in a column

In [9]:
# Boolean mask that is True on the rows where eLearning has no value;
# indexing the frame with it keeps only those rows.
elearning_is_missing = t_data['eLearning'].isnull()
t_data[elearning_is_missing]

Unnamed: 0,Week,eLearning,DataScience,MachineLearning,ArtificialIntelligence,DeepLearning
10,29/11/2015,,9.0,13.0,11.0,4.0


In [10]:
# Fetch every row that has a missing value in at least one column:
# any(axis=1) collapses the cell-level mask to one True/False per row.
row_has_gap = t_data.isnull().any(axis=1)
t_data[row_has_gap]

Unnamed: 0,Week,eLearning,DataScience,MachineLearning,ArtificialIntelligence,DeepLearning
5,25/10/2015,38.0,10.0,,11.0,4.0
10,29/11/2015,,9.0,13.0,11.0,4.0
12,13/12/2015,30.0,9.0,14.0,11.0,


2a. Replacing missing values with 0

In [11]:
# fillna returns a new frame with every NaN replaced by 0; t_data itself is unchanged
zero_filled = t_data.fillna(value=0)
print(zero_filled)

          Week  eLearning  DataScience  MachineLearning  \
0   20/09/2015       32.0          9.0             11.0   
1   27/09/2015       35.0         10.0             11.0   
2    4/10/2015       38.0         10.0             12.0   
3   11/10/2015       37.0          9.0             12.0   
4   18/10/2015       38.0          9.0             12.0   
5   25/10/2015       38.0         10.0              0.0   
6    1/11/2015       37.0          9.0             13.0   
7    8/11/2015       35.0          8.0             14.0   
8   15/11/2015       35.0          9.0             13.0   
9   22/11/2015       31.0          8.0             12.0   
10  29/11/2015        0.0          9.0             13.0   
11   6/12/2015       33.0          9.0             13.0   
12  13/12/2015       30.0          9.0             14.0   
13  20/12/2015       17.0          7.0             11.0   
14  27/12/2015       17.0          7.0             10.0   
15   3/01/2016       24.0         10.0             12.0 

2b. Replacing missing values with the column mean

In [12]:
# Impute the gaps in eLearning with that column's own average.
# mean() ignores NaN, so the average is taken over the observed values only.
elearning = t_data['eLearning']
elearning_mean = elearning.mean()
print(elearning.fillna(elearning_mean))

0     32.000000
1     35.000000
2     38.000000
3     37.000000
4     38.000000
5     38.000000
6     37.000000
7     35.000000
8     35.000000
9     31.000000
10    30.789474
11    33.000000
12    30.000000
13    17.000000
14    17.000000
15    24.000000
16    26.000000
17    26.000000
18    28.000000
19    28.000000
Name: eLearning, dtype: float64


2c. Use pad/forward fill to replace missing values

In [13]:
# Forward fill ("pad"): each missing value is replaced by the last valid value
# above it in the same column. ffill() is the dedicated method for this;
# fillna(method='pad') does the same thing but the `method` keyword of fillna
# is deprecated in recent pandas releases and emits a FutureWarning.
print(t_data.ffill())

          Week  eLearning  DataScience  MachineLearning  \
0   20/09/2015       32.0          9.0             11.0   
1   27/09/2015       35.0         10.0             11.0   
2    4/10/2015       38.0         10.0             12.0   
3   11/10/2015       37.0          9.0             12.0   
4   18/10/2015       38.0          9.0             12.0   
5   25/10/2015       38.0         10.0             12.0   
6    1/11/2015       37.0          9.0             13.0   
7    8/11/2015       35.0          8.0             14.0   
8   15/11/2015       35.0          9.0             13.0   
9   22/11/2015       31.0          8.0             12.0   
10  29/11/2015       31.0          9.0             13.0   
11   6/12/2015       33.0          9.0             13.0   
12  13/12/2015       30.0          9.0             14.0   
13  20/12/2015       17.0          7.0             11.0   
14  27/12/2015       17.0          7.0             10.0   
15   3/01/2016       24.0         10.0             12.0 

2d. Use backfill to replace missing values

In [14]:
# Backfill: each missing value is replaced by the next valid value below it
# in the same column. bfill() is the dedicated method for this;
# fillna(method='backfill') does the same thing but the `method` keyword of
# fillna is deprecated in recent pandas releases and emits a FutureWarning.
print(t_data.bfill())

          Week  eLearning  DataScience  MachineLearning  \
0   20/09/2015       32.0          9.0             11.0   
1   27/09/2015       35.0         10.0             11.0   
2    4/10/2015       38.0         10.0             12.0   
3   11/10/2015       37.0          9.0             12.0   
4   18/10/2015       38.0          9.0             12.0   
5   25/10/2015       38.0         10.0             13.0   
6    1/11/2015       37.0          9.0             13.0   
7    8/11/2015       35.0          8.0             14.0   
8   15/11/2015       35.0          9.0             13.0   
9   22/11/2015       31.0          8.0             12.0   
10  29/11/2015       33.0          9.0             13.0   
11   6/12/2015       33.0          9.0             13.0   
12  13/12/2015       30.0          9.0             14.0   
13  20/12/2015       17.0          7.0             11.0   
14  27/12/2015       17.0          7.0             10.0   
15   3/01/2016       24.0         10.0             12.0 

2e. Drop all rows that contain a missing value

In [15]:
# dropna() works row-wise by default (axis=0) and removes a row as soon as
# any of its cells is missing (how='any'). The defaults are spelled out here
# for clarity; t_data itself is left unchanged.
complete_rows = t_data.dropna(axis=0, how='any')
print(complete_rows)

          Week  eLearning  DataScience  MachineLearning  \
0   20/09/2015       32.0          9.0             11.0   
1   27/09/2015       35.0         10.0             11.0   
2    4/10/2015       38.0         10.0             12.0   
3   11/10/2015       37.0          9.0             12.0   
4   18/10/2015       38.0          9.0             12.0   
6    1/11/2015       37.0          9.0             13.0   
7    8/11/2015       35.0          8.0             14.0   
8   15/11/2015       35.0          9.0             13.0   
9   22/11/2015       31.0          8.0             12.0   
11   6/12/2015       33.0          9.0             13.0   
13  20/12/2015       17.0          7.0             11.0   
14  27/12/2015       17.0          7.0             10.0   
15   3/01/2016       24.0         10.0             12.0   
16  10/01/2016       26.0          9.0             12.0   
17  17/01/2016       26.0         11.0             13.0   
18  24/01/2016       28.0         11.0             15.0 

In [16]:
# Deleting a column.
# drop(columns=...) returns a new frame without 'DeepLearning', which is then
# bound back to t_data. Unlike `del t_data['DeepLearning']`, this cell can be
# re-run safely: errors='ignore' turns the second run into a no-op instead of
# raising KeyError because the column is already gone.
t_data = t_data.drop(columns=['DeepLearning'], errors='ignore')
print(t_data)

          Week  eLearning  DataScience  MachineLearning  \
0   20/09/2015       32.0          9.0             11.0   
1   27/09/2015       35.0         10.0             11.0   
2    4/10/2015       38.0         10.0             12.0   
3   11/10/2015       37.0          9.0             12.0   
4   18/10/2015       38.0          9.0             12.0   
5   25/10/2015       38.0         10.0              NaN   
6    1/11/2015       37.0          9.0             13.0   
7    8/11/2015       35.0          8.0             14.0   
8   15/11/2015       35.0          9.0             13.0   
9   22/11/2015       31.0          8.0             12.0   
10  29/11/2015        NaN          9.0             13.0   
11   6/12/2015       33.0          9.0             13.0   
12  13/12/2015       30.0          9.0             14.0   
13  20/12/2015       17.0          7.0             11.0   
14  27/12/2015       17.0          7.0             10.0   
15   3/01/2016       24.0         10.0             12.0 

To Try on the complete dataset
- Replace the missing values in all the columns with the mean, and then with the pad and backfill methods, and check how each approach affects the overall mean of the column. Which method gives a result closest to the actual value (use the original dataset from the previous day's folder)?
- Remove all the rows with a missing value and check how this affects the overall mean of each column compared with the original values.