# Simple Imputation

In [7]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer  # successor of preprocessing.Imputer

try:
    # Legacy class: deprecated in scikit-learn 0.20 and removed in 0.22.
    from sklearn.preprocessing import Imputer # Imputer object from Sklearn
except ImportError:
    pass  # modern scikit-learn: only SimpleImputer is available

# Read the air-quality CSV and drop the 'Unnamed: 0' column (presumably the
# row index written out when the file was exported -- confirm against the CSV).
# The column is dropped by keyword because pandas 2.0 removed the positional
# `axis` argument of DataFrame.drop. Chaining instead of inplace=True means
# the cell rebuilds `df` from the file each time it is run.
df = pd.read_csv('airquality.csv').drop(columns='Unnamed: 0')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 153 entries, 0 to 152
Data columns (total 6 columns):
Ozone      116 non-null float64
Solar.R    146 non-null float64
Wind       153 non-null float64
Temp       153 non-null int64
Month      153 non-null int64
Day        153 non-null int64
dtypes: float64(3), int64(3)
memory usage: 7.2 KB


# Make imputer object

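Look into the scikit-learn imputer documentation (`Imputer` in older releases, `SimpleImputer` in `sklearn.impute` from version 0.20 onward) to find all of the imputation strategies that this object can handle.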

In [8]:
# Columns that df.info() reports with fewer non-null values than rows.
IMPUTE_COLS = ['Ozone', 'Solar.R']

# Mean imputation with SimpleImputer, the replacement for the removed
# sklearn.preprocessing.Imputer. Missing values are identified with np.nan;
# the old string sentinel 'NaN' is rejected by SimpleImputer on numeric data.
imp = SimpleImputer(missing_values=np.nan, strategy='mean')

# The imputer computes one mean per column, so a single fit_transform over
# both columns gives the same result as fitting each column separately.
df[IMPUTE_COLS] = imp.fit_transform(df[IMPUTE_COLS])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 153 entries, 0 to 152
Data columns (total 6 columns):
Ozone      153 non-null float64
Solar.R    153 non-null float64
Wind       153 non-null float64
Temp       153 non-null int64
Month      153 non-null int64
Day        153 non-null int64
dtypes: float64(3), int64(3)
memory usage: 7.2 KB


# Feature Importances

In [9]:
# Fixed seed so the noise column is identical on every Restart & Run All.
SEED = 42
rng = np.random.default_rng(SEED)

# Uniform noise in [0, 100); presumably a reference feature with no real
# relationship to Ozone, to compare the real features' importances against.
df['RANDOM'] = rng.random(len(df)) * 100

# Target is Ozone; every other column (including RANDOM) is a predictor.
y = df['Ozone']
x = df.drop(columns='Ozone')

# Preview only the first rows instead of displaying the whole frame.
df.head(10)

Unnamed: 0,Ozone,Solar.R,Wind,Temp,Month,Day,RANDOM
0,41.00000,190.000000,7.4,67,5,1,99.518548
1,36.00000,118.000000,8.0,72,5,2,10.096782
2,12.00000,149.000000,12.6,74,5,3,55.418138
3,18.00000,313.000000,11.5,62,5,4,78.716309
4,42.12931,185.931507,14.3,56,5,5,19.693314
5,28.00000,185.931507,14.9,66,5,6,91.285385
6,23.00000,299.000000,8.6,65,5,7,81.740462
7,19.00000,99.000000,13.8,59,5,8,79.904547
8,8.00000,19.000000,20.1,61,5,9,10.320575
9,42.12931,194.000000,8.6,69,5,10,82.571569


In [10]:
# model -- random_state fixes the forest's internal randomness so the
# importances below are the same on every run
forest = RandomForestRegressor(n_estimators=100, random_state=42)

# train
forest.fit(x, y)

# feature importances, largest first, labelled by predictor name
(
    pd.Series(forest.feature_importances_, index=x.columns, name='Importance')
    .sort_values(ascending=False)
    .to_frame()
)

Unnamed: 0,Importance
Temp,0.459255
Wind,0.289587
Solar.R,0.091045
Day,0.076959
RANDOM,0.05595
Month,0.027204
