In [7]:
import pandas as pd
import numpy as np

In [8]:
# Minimal dictionary used to demonstrate key-based lookup.
dict_a = {'key1': 1, 'key2': 2}

In [9]:
# Look up the value stored under 'key1'
dict_a['key1']

1

In [22]:
# Toy batting records; 'Matches' carries two missing entries (np.nan).
dummy_dict = {
    'Name': ['Sachin', 'Dhoni', 'Yuvraj', 'Ganguly', 'Kholi'],
    'Runs': [12234, 435, 5454, 2342, 545],
    'Matches': [123, np.nan, 546, 41, np.nan],
}

# Each dict key becomes a column and each list supplies that column's values.
new_dataframe = pd.DataFrame(dummy_dict)
new_dataframe

Unnamed: 0,Matches,Name,Runs
0,123.0,Sachin,12234
1,,Dhoni,435
2,546.0,Yuvraj,5454
3,41.0,Ganguly,2342
4,,Kholi,545


## Treating Missing Values

In [23]:
# Boolean mask with the same shape as the frame: True where a value is missing
new_dataframe.isna()

Unnamed: 0,Matches,Name,Runs
0,False,False,False
1,True,False,False
2,False,False,False
3,False,False,False
4,True,False,False


In [24]:
# Summing the boolean mask column-wise counts the missing values in each column
new_dataframe.isna().sum()

Matches    2
Name       0
Runs       0
dtype: int64

In [25]:
# Remove the 'Matches' column by name (it is the only column with missing values here).
# drop() selects by label, not by emptiness, and returns a new frame.
new_dataframe.drop(columns=['Matches'])

Unnamed: 0,Name,Runs
0,Sachin,12234
1,Dhoni,435
2,Yuvraj,5454
3,Ganguly,2342
4,Kholi,545


In [26]:
# Same toy records, but now 'Runs' also has a missing value in the last row,
# so missing data is spread over more than one column.
dummy_dict = {
    'Name': ['Sachin', 'Dhoni', 'Yuvraj', 'Ganguly', 'Kholi'],
    'Runs': [12234, 435, 5454, 2342, np.nan],
    'Matches': [123, np.nan, 546, 41, np.nan],
}
new_dataframe = pd.DataFrame(dummy_dict)
new_dataframe

Unnamed: 0,Matches,Name,Runs
0,123.0,Sachin,12234.0
1,,Dhoni,435.0
2,546.0,Yuvraj,5454.0
3,41.0,Ganguly,2342.0
4,,Kholi,


In [28]:
# With default arguments, dropna() returns a new frame without any row that has at least one missing value
new_dataframe.dropna() # Drops all the rows that contain empty values

Unnamed: 0,Matches,Name,Runs
0,123.0,Sachin,12234.0
2,546.0,Yuvraj,5454.0
3,41.0,Ganguly,2342.0


When to use dropping
1) You can drop rows when you have a lot of data and only a few rows have missing values
2) You can drop a column when it has a lot of missing values

In [80]:
# Rebuild the frame so that only 'Matches' has missing values before imputing.
dummy_dict = {
    'Name': ['Sachin', 'Dhoni', 'Yuvraj', 'Ganguly', 'Kholi'],
    'Runs': [12234, 435, 5454, 2342, 545],
    'Matches': [123, np.nan, 546, 41, np.nan],
}

new_dataframe = pd.DataFrame(dummy_dict)
new_dataframe

Unnamed: 0,Matches,Name,Runs
0,123.0,Sachin,12234
1,,Dhoni,435
2,546.0,Yuvraj,5454
3,41.0,Ganguly,2342
4,,Kholi,545


## Imputing data

In [81]:
## forward filling technique
# Each missing value is replaced by the last valid value above it in the same column.
# DataFrame.ffill() is the supported spelling; fillna(method='ffill') is deprecated
# in recent pandas releases.

new_dataframe.ffill()

Unnamed: 0,Matches,Name,Runs
0,123.0,Sachin,12234
1,123.0,Dhoni,435
2,546.0,Yuvraj,5454
3,41.0,Ganguly,2342
4,41.0,Kholi,545


In [82]:
## backward filling technique
# Each missing value is replaced by the next valid value below it in the same column;
# a gap in the last row stays missing because nothing follows it.
# DataFrame.bfill() is the supported spelling; fillna(method='bfill') is deprecated
# in recent pandas releases.

new_dataframe.bfill()

Unnamed: 0,Matches,Name,Runs
0,123.0,Sachin,12234
1,546.0,Dhoni,435
2,546.0,Yuvraj,5454
3,41.0,Ganguly,2342
4,,Kholi,545


In [83]:
## Filling with mean
# The column mean skips NaN entries, so only the known values are averaged.
mean = new_dataframe['Matches'].mean()
# Preview only: fillna returns a new Series, the frame itself still has its gaps.
new_dataframe['Matches'].fillna(value=mean)

0    123.000000
1    236.666667
2    546.000000
3     41.000000
4    236.666667
Name: Matches, dtype: float64

In [84]:
# Show the mean value used for imputation
mean

236.66666666666666

In [85]:
# Keep the mean-imputed 'Matches' column in its own Series.
new_matches = new_dataframe['Matches'].fillna(value=mean)

In [86]:
# Overwrite the 'Matches' column with the imputed Series
new_dataframe['Matches']=new_matches

In [87]:
# Show the frame after the 'Matches' gaps were replaced by the mean
new_dataframe

Unnamed: 0,Matches,Name,Runs
0,123.0,Sachin,12234
1,236.666667,Dhoni,435
2,546.0,Yuvraj,5454
3,41.0,Ganguly,2342
4,236.666667,Kholi,545


In [91]:
## Filling with mode

# 123 appears twice among the known 'Matches' values, which makes it the mode.
dummy_dict = {
    'Name': ['Sachin', 'Dhoni', 'Yuvraj', 'Ganguly', 'Kholi'],
    'Runs': [12234, 435, 5454, 2342 ,545],
    'Matches': [123, np.nan, 123, 41,np.nan ]
}

new_dataframe = pd.DataFrame.from_dict(dummy_dict)

# Series.mode() returns a Series (ties give several values); take the first entry.
mode = new_dataframe['Matches'].mode()[0]

# Fill the gaps with the mode and write the column back into the frame.
# (The bare `new_dataframe` / `mode` expressions were dropped: only the last
# expression of a cell is displayed, so mid-cell they did nothing.)
new_matches = new_dataframe['Matches'].fillna(mode)
new_dataframe['Matches'] = new_matches

In [92]:
# Show the frame after the 'Matches' gaps were replaced by the mode
new_dataframe

Unnamed: 0,Matches,Name,Runs
0,123.0,Sachin,12234
1,123.0,Dhoni,435
2,123.0,Yuvraj,5454
3,41.0,Ganguly,2342
4,123.0,Kholi,545


In [93]:
## Filling with median

In [94]:
# Fresh copy of the toy records (two gaps in 'Matches') for the median example.
dummy_dict = {
    'Name': ['Sachin', 'Dhoni', 'Yuvraj', 'Ganguly', 'Kholi'],
    'Runs': [12234, 435, 5454, 2342, 545],
    'Matches': [123, np.nan, 546, 41, np.nan],
}

new_dataframe = pd.DataFrame(dummy_dict)
new_dataframe

Unnamed: 0,Matches,Name,Runs
0,123.0,Sachin,12234
1,,Dhoni,435
2,546.0,Yuvraj,5454
3,41.0,Ganguly,2342
4,,Kholi,545


In [95]:
## Filling with median
# Median of the known 'Matches' values (NaN entries are skipped).
median = new_dataframe['Matches'].median()

# Fill the gaps with the median, then write the column back into the frame.
new_matches = new_dataframe['Matches'].fillna(value=median)
new_dataframe['Matches'] = new_matches
new_dataframe

Unnamed: 0,Matches,Name,Runs
0,123.0,Sachin,12234
1,123.0,Dhoni,435
2,546.0,Yuvraj,5454
3,41.0,Ganguly,2342
4,123.0,Kholi,545


In [96]:
## Assignment
## Modify the iris dataset to contain null values:
#
# (Every line of this cell is a comment so that running it no longer raises a SyntaxError.)
#
# 1) Train a decision tree classifier by removing all the rows with missing values
# 2) Train a decision tree classifier by removing all the columns that have missing values
# 3) Train a decision tree by:
#     i) Filling missing values with mean
#     ii) Filling missing values with ffill method
#     iii) Filling missing values with bfill method
#     iv) Filling missing values with median
#     And check accuracy in all cases.
#
# 4) Create a csv file that has each method and its corresponding accuracy