# Project 2 - Data cleaning
###  Dealing with missing data
by: Alissa Stover, Sophia Skowronski, Ying Hua

## Importing data

In [1]:
''' importing basic data analysis packages'''
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import os,random, math, psutil, pickle 
import missingno as msno

''' For ML'''
from sklearn import metrics, svm
from sklearn.linear_model  import LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn import preprocessing
from sklearn import utils
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

In [2]:
'''Reading in data'''
building_df = pd.read_csv('building_metadata.csv')
train_df = pd.read_csv('train.csv')

##  Initial filtering and merging

In [3]:
# only focusing on electricity meter:
train_df= train_df.loc[train_df['meter']==0]

# merging with buidling data
train_df = train_df.merge(building_df, on=['building_id'], how='left')

# Converting timestamp to right data type
train_df['timestamp'] = pd.to_datetime(train_df['timestamp'])


## Conver time stamp to time index

In [4]:
# adding time index varialbe that counts difference in time vs. beginning date 
train_df['time_index']= train_df['timestamp']- train_df['timestamp'].min()

# Coverting time difference into hours
train_df['time_index']= train_df['time_index'].apply (lambda x: x.days*24+x.seconds//3600) 
train_df['time_index'] = train_df ['time_index'].astype(int)

# Adding day of the week and hour of day
train_df['day_of_week'] = train_df['timestamp'].dt.dayofweek
train_df['hour_of_day'] = train_df['time_index'] % 24
train_df['hour_of_day'] = train_df['hour_of_day'].astype(int)

## Taking a look at missing data - Explanatory variable(Meter Reading)

In [5]:
# Examining how much data is missing
missing_count = len(train_df.loc[train_df['meter_reading'].isnull()])
missing_percent = missing_count / len(train_df)
print ("{0:.0%}".format(missing_percent), "of data is missing.")

# Exmaing how much data has 0 reading
zero_value = len(train_df.loc[train_df['meter_reading'] ==0])
zero_percent = zero_value / len(train_df)
print ("{0:.0%}".format(zero_percent), "of meter readings are 0. ")

0% of data is missing.
4% of meter readings are 0. 


> We note that 4% of the meter readings are zero.  We theorize that some of the 0 values are not actual meter readings but some sort of erroneous mistake.  

> One of the things we have noticed is that sometimes a 0 reading will be followed by a huge reading and we suspect those are "catch up" reading when one misses a meter reading.  We note that while 4% is not a big number (especially considering some of these data could actually be 0 readings, we decided that we should take two approaches.  One is the simple appraoch to get rid of these 0 readings and treat all as mistakes.  The other is to use ML techniques to impute these missing number. 

> We plan on using both set when we come up with the model to forecast meter reading and see which set perform better. 

### The simple approach - treating all 0 readings as erroneous and deleting all

To more accurately impute missing data, we first need a set of clean meter readings that will be representative (i.e. not erroneous.)  We begin our process by hilighting which data could be erroneous. 

In [6]:
# adding index as a column 
train_df['index'] = train_df.index 

In [7]:
# for data quality purpose, we decided to also delete the spike reading immediately post the 0 readings

# we first define spike reading to be readings that are >2x standard deviation away from mean for each buildign

building_meter_avg = pd.DataFrame(train_df.groupby('building_id')['meter_reading'].mean())
building_meter_std = pd.DataFrame(train_df.groupby('building_id')['meter_reading'].std())
building_meter_outlier = building_meter_avg.merge(building_meter_std, on='building_id')
building_meter_outlier= building_meter_outlier.rename(columns={"meter_reading_x": "avg", "meter_reading_y": "std"})
building_meter_outlier['outlier'] = building_meter_outlier['avg']+2*building_meter_outlier['std']

# Adding the outlier into the original dataframe
train_df= train_df.merge(building_meter_outlier, on='building_id', how='left')

In [8]:
# Determing spike readings that are immediately followed by 0 readings 

spike_index = []
grouped = train_df.groupby('building_id')
for key, group in grouped:
    group.sort_values(by = 'time_index')
    spike_index += list(group.loc[((group.meter_reading > group.outlier) & \
                                   (group.meter_reading.shift(1) == 0))].index)

In [9]:
# filtering out 0s and spike readings
clean_data_naive = train_df.loc[(train_df.meter_reading != 0 & \
                                 ~train_df.index.isin(spike_index))]

In [10]:
print ('With this simple way, we are capturing', "{0:.0%}".format(len(clean_data_naive)/ \
                                                                 len(train_df)), 'of the meter reading data.')

With this simple way, we are capturing 96% of the meter reading data.


In [11]:
clean_data_naive.isna().sum()

building_id            0
meter                  0
timestamp              0
meter_reading          0
site_id                0
primary_use            0
square_feet            0
year_built       6329280
floor_count      8617528
time_index             0
day_of_week            0
hour_of_day            0
index                  0
avg                    0
std                    0
outlier                0
dtype: int64

## The advanced approach 
> With this approach, we want to come up with a ML model to impute missing meter reading data.  To do so, we first need to distinguish which among the 0 readings are missing data and which are actual 0 readings.  We define such as 0 readings that are not followed by spikes and does not last longer than 7 consecutive days. 

In [12]:
# We first separate out all the zero readings as well as spikes, which is the naive clean dataset we calcualted above. 
# since we suspect these spikes are erroneous, we will put them in the to impute category
clean_data_advanced = clean_data_naive
data_to_impute = train_df.loc[~train_df.index.isin (clean_data_advanced.index)]



In [13]:
data_to_impute

Unnamed: 0,building_id,meter,timestamp,meter_reading,site_id,primary_use,square_feet,year_built,floor_count,time_index,day_of_week,hour_of_day,index,avg,std,outlier
0,0,0,2016-01-01 00:00:00,0.0,0,Education,7432,2008.0,,0,4,0,0,146.454978,121.897171,390.249320
1,1,0,2016-01-01 00:00:00,0.0,0,Education,2720,2004.0,,0,4,0,1,74.865393,61.765389,198.396171
2,2,0,2016-01-01 00:00:00,0.0,0,Education,5376,1991.0,,0,4,0,2,14.551385,16.063792,46.678969
3,3,0,2016-01-01 00:00:00,0.0,0,Education,23685,2002.0,,0,4,0,3,235.549966,205.985852,647.521671
4,4,0,2016-01-01 00:00:00,0.0,0,Education,116607,1975.0,,0,4,0,4,976.556746,779.694100,2535.944947
5,5,0,2016-01-01 00:00:00,0.0,0,Education,8000,2000.0,,0,4,0,5,17.894445,18.802042,55.498529
6,6,0,2016-01-01 00:00:00,0.0,0,Lodging/residential,27926,1981.0,,0,4,0,6,64.066607,57.406403,178.879412
7,7,0,2016-01-01 00:00:00,0.0,0,Education,121074,1989.0,,0,4,0,7,350.334167,287.787910,925.909987
8,8,0,2016-01-01 00:00:00,0.0,0,Education,60809,2003.0,,0,4,0,8,256.927804,208.758750,674.445304
9,9,0,2016-01-01 00:00:00,0.0,0,Office,27000,2010.0,,0,4,0,9,72.714742,66.862532,206.439806


In [14]:
# We then add back 0 values that are likely to be real zero readings- i.e. those that are not followed by spikes and does not last 7 consecutive days

grouped = train_df.groupby('building_id')
correct_zero_readings_index = []
for key, group in grouped:
    group = group.sort_values(by = 'time_index')
    correct_zero_readings_index += list(group.loc[(((group.meter_reading == 0) & \
                                                   (group.meter_reading.shift(-1)<=group.outlier)) & \
                                                   (group.meter_reading.groupby((group.meter_reading != \
                                                                                 group.meter_reading.shift()).cumsum()).\
                                                                                 transform('count').lt(8)))].index)
    

In [15]:
# we add these back to our clean data and exclude them from data to impute
clean_data_advanced = train_df.loc[((train_df.index.isin(list(clean_data_advanced.index))) | \
                                  (train_df.index.isin(correct_zero_readings_index)))]
data_to_impute = data_to_impute.drop (correct_zero_readings_index)

> From our visual charts, we can also see that in site 0, there are a large chunks of data that are missing at the beginning of the period.  There are some small readings here and there during this period at certain buildings but for the most part, the readings were non existent. We decided not to include these data. 

In [16]:
# Figuring out which part of the site 0 history to ignore
site_zero_issues = train_df.loc[train_df.site_id == 0].groupby('time_index')['meter_reading'].sum()
max_dff = (site_zero_issues - site_zero_issues.shift()).max()
inflection_time = site_zero_issues[site_zero_issues >=  max_dff].index[0]
size_zero_missing_data_index = list(clean_data_advanced.loc[((clean_data_advanced.site_id ==0) & \
                                                  (clean_data_advanced.time_index < inflection_time))].index)

In [17]:
# moving these data from clean_data_advanced to data_to_impute
clean_data_advanced = clean_data_advanced.drop (size_zero_missing_data_index)
data_to_impute =  train_df.loc[((train_df.index.isin(list(data_to_impute.index))) | \
                                  (train_df.index.isin(size_zero_missing_data_index)))]

> We now have a clean dataset (clean_data_advanced) to run ML algorithmn to impute missing data.  

In [18]:
print ('We are using', "{0:.0%}".format(len(clean_data_advanced)/ len(train_df)), \
       'of the meter reading data as clean data to run ML to impute', \
       "{0:0}".format(len(data_to_impute)), ', or', "{0:.0%}".format(len(data_to_impute)/ len(train_df)), "of the data." )

We are using 96% of the meter reading data as clean data to run ML to impute 517297 , or 4% of the data.


### Using ML to impute missing data

We decided to try different ML methods to impute missing data (in our case 0 readings).  The three mehtods we want to try is KNN, linear regression and naive bayes. 

Before we run different methodology and compare results, we first want to split our clean data into training and test set and define features that will be used to run the test. Since meter_reading is our explanatory variable that we want to use more features later to predict, we want to keep this part of feature engineering simpler.  

We picked 4 variables as features - site id, buidling id, time of the day and day of the week. The first 2 variables we think will indirectly give us some information about buildng specific as well as weather related inforamtion as well. 

In [19]:
clean_data_advanced_x = clean_data_advanced.loc[:,clean_data_advanced.columns.isin(['building_id', 'site_id','hour_of_day', 'day_of_week'])]
clean_data_advanced_y = clean_data_advanced.loc[:,clean_data_advanced.columns.isin(['meter_reading'])].values

In [20]:
# converting & encoding lable to avoid valueerror
# lab_enc = preprocessing.LabelEncoder()
# training_scores_encoded = lab_enc.fit_transform(clean_data_advanced_y)
training_scores_encoded= np.ravel(clean_data_advanced_y).astype('int')

In [21]:
# spliting the clean dataset into 70/30 for training/ test
X_train, X_test, y_train, y_test = train_test_split(clean_data_advanced_x, training_scores_encoded, test_size=0.3, random_state=1)

> METHOD: Liner regression

In [22]:
# Training model using linear regression
regressor = LinearRegression()  
regressor.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [23]:
# predicting y 
y_pred = regressor.predict(X_test)
r2_score(y_test, y_pred)

0.0028021190057201384

## Advanced approach, attempt 2

> Given a extremely low success score (especially even on the training set) we suspected that the model is flawed.  One of the issues we see is that we have essentially all categorical variables even though they are in numeric values.  So we decide to experiementing on reconfiguring these variables into more numeric values.  The way we do this is by taking the average that fits each catergory and use that train the dataset. 

In [24]:
# Creating a copy of the x, y variables to test our new method
clean_data_advanced_new_x = clean_data_advanced.loc[:,clean_data_advanced.columns.isin(['building_id', 'site_id','meter_reading','hour_of_day', 'day_of_week'])].copy()
clean_data_advanced_y = clean_data_advanced.loc[:,clean_data_advanced.columns.isin(['meter_reading'])].copy()
clean_data_advanced_y= np.ravel(clean_data_advanced_y).astype('int')

# lab_enc = preprocessing.LabelEncoder()
# training_scores_encoded = lab_enc.fit_transform(clean_data_advanced_y)


In [25]:
# spliting the clean dataset into 70/30 for training/ test
X_train, X_test, y_train, y_test = train_test_split(clean_data_advanced_new_x, clean_data_advanced_y, test_size=0.3, random_state=1)

In [26]:
# adding in avearage meter reading for each variable for training_set
X_train = X_train.copy()
X_train ['avg_building'] = X_train.groupby('building_id')['meter_reading'].transform('mean')
X_train ['avg_site'] = X_train.groupby('site_id')['meter_reading'].transform('mean')
X_train ['avg_dow'] = X_train.groupby('day_of_week')['meter_reading'].transform('mean')
X_train ['avg_hod'] = X_train.groupby('hour_of_day')['meter_reading'].transform('mean')

In [27]:
#creating a dictionary

grouped = X_train.groupby('building_id')
avg_building_dict= {key:group['avg_building'].mean() for key, group in grouped}

grouped = X_train.groupby('site_id')
avg_site_dict= {key:group['avg_site'].mean() for key, group in grouped}

grouped = X_train.groupby('day_of_week')
avg_dow_dict= {key:group['avg_dow'].mean() for key, group in grouped}

grouped = X_train.groupby('hour_of_day')
avg_hod_dict= {key:group['avg_hod'].mean() for key, group in grouped}


In [28]:
# adding in avearage meter reading for each variable for testing_set
X_test = X_test.copy()
X_test ['avg_building'] = X_test['building_id'].map(avg_building_dict)
X_test ['avg_site'] = X_test['site_id'].map(avg_site_dict)
X_test ['avg_dow'] = X_test['day_of_week'].map(avg_dow_dict)
X_test ['avg_hod'] = X_test['hour_of_day'].map(avg_hod_dict)

In [29]:
# dropping categorical columns
X_train = X_train.drop(columns =['building_id','meter_reading','site_id','day_of_week', 'hour_of_day'])
X_test = X_test.drop(columns=['building_id','meter_reading','site_id','day_of_week', 'hour_of_day'])

> **METHOD: Linear_ regression- ATTEMPT 2**

In [30]:
# Training model using linear regression
regressor = LinearRegression()  
regressor.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [31]:
# predicting y 
y_pred = regressor.predict(X_test)

In [32]:
# evaluating results 
df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
df.head(5)

Unnamed: 0,Actual,Predicted
0,140,112.800352
1,84,157.356328
2,13,-22.008835
3,60,102.87045
4,1353,1568.928718


In [33]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))  
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

Mean Absolute Error: 41.331974002685754
Mean Squared Error: 18871.93305711458
Root Mean Squared Error: 137.37515443891076


In [34]:
r2_score(y_test, y_pred)

0.8732870352002534

## Using linear regression to impute missing value
> With a 87% R-square, we feel pretty good about our model and will go ahead and use the model to forecast the missing data

In [35]:
data_to_impute.head

<bound method NDFrame.head of           building_id  meter           timestamp  meter_reading  site_id  \
0                   0      0 2016-01-01 00:00:00            0.0        0   
1                   1      0 2016-01-01 00:00:00            0.0        0   
2                   2      0 2016-01-01 00:00:00            0.0        0   
3                   3      0 2016-01-01 00:00:00            0.0        0   
4                   4      0 2016-01-01 00:00:00            0.0        0   
5                   5      0 2016-01-01 00:00:00            0.0        0   
6                   6      0 2016-01-01 00:00:00            0.0        0   
7                   7      0 2016-01-01 00:00:00            0.0        0   
8                   8      0 2016-01-01 00:00:00            0.0        0   
9                   9      0 2016-01-01 00:00:00            0.0        0   
10                 10      0 2016-01-01 00:00:00            0.0        0   
11                 11      0 2016-01-01 00:00:00          

In [36]:
data_to_impute_cal = data_to_impute.loc[:,clean_data_advanced.columns.isin(['building_id', 'site_id','hour_of_day', 'day_of_week'])]

In [37]:
# Translating categorical data
data_to_impute_cal = data_to_impute_cal.copy()
data_to_impute_cal ['avg_building'] = data_to_impute_cal['building_id'].map(avg_building_dict)
data_to_impute_cal ['avg_site'] = data_to_impute_cal['site_id'].map(avg_site_dict)
data_to_impute_cal ['avg_dow'] = data_to_impute_cal['day_of_week'].map(avg_dow_dict)
data_to_impute_cal ['avg_hod'] = data_to_impute_cal['hour_of_day'].map(avg_hod_dict)

In [38]:
# dropping categorical columns
data_to_impute_cal = data_to_impute_cal.drop(columns =['building_id','site_id','day_of_week', 'hour_of_day'])

In [39]:
# predicting data_to_impute
data_to_impute_cal ['meter_reading'] = regressor.predict(data_to_impute_cal)

In [40]:
full_meter_reading = pd.DataFrame (clean_data_advanced['meter_reading'].append(data_to_impute_cal['meter_reading']))
full_meter_reading = full_meter_reading.sort_index()

In [41]:
train_df_imputed = train_df.copy()
train_df_imputed['meter_reading'] = full_meter_reading['meter_reading']

## Taking a look at missing data - Independent variables

In [42]:
missing_data = pd.DataFrame(train_df_imputed.isna().sum())
missing_data.columns = ['missing_count']
missing_data['missing_percent'] = missing_data['missing_count']/ len(train_df)

In [43]:
missing_data

Unnamed: 0,missing_count,missing_percent
building_id,0,0.0
meter,0,0.0
timestamp,0,0.0
meter_reading,0,0.0
site_id,0,0.0
primary_use,0,0.0
square_feet,0,0.0
year_built,6470035,0.536447
floor_count,9096083,0.754179
time_index,0,0.0


>  The two missing data are year_built and floor_count. We note that both of them have fairly high percent of missing data.  While we will use ML techniques to impute these missing data, we will be more inclined to use other non-missing data if possible. 

### Imputing year_built variable 

In [44]:
train_df_imputed.head(5)

Unnamed: 0,building_id,meter,timestamp,meter_reading,site_id,primary_use,square_feet,year_built,floor_count,time_index,day_of_week,hour_of_day,index,avg,std,outlier
0,0,0,2016-01-01,220.046471,0,Education,7432,2008.0,,0,4,0,0,146.454978,121.897171,390.24932
1,1,0,2016-01-01,101.917963,0,Education,2720,2004.0,,0,4,0,1,74.865393,61.765389,198.396171
2,2,0,2016-01-01,5.634698,0,Education,5376,1991.0,,0,4,0,2,14.551385,16.063792,46.678969
3,3,0,2016-01-01,366.496399,0,Education,23685,2002.0,,0,4,0,3,235.549966,205.985852,647.521671
4,4,0,2016-01-01,1568.406545,0,Education,116607,1975.0,,0,4,0,4,976.556746,779.6941,2535.944947


In [45]:
# Are there any buildings that are not missing year_built consistently?
grouped = train_df_imputed.groupby('building_id')
inconsistent_index = []
for key, group in grouped:
    if group['year_built'].isna().count() not in [len(group), 0]:
        inconsistent_index += [key]
    


In [46]:
# We confirm no building is missing partial data
len(inconsistent_index)

0

In [47]:
# we then separate the missing data form non-missing data 
yb_missing_data = train_df_imputed.loc[train_df_imputed['year_built'].isna()]
yb_clean_data = train_df_imputed.loc[~train_df_imputed['year_built'].isna()]

> For ML algorithm, we will split the non-missing data to 80/20 for train/test.  We will pick the followingvariable as features: meter_reading, day_of_week, hour_of_day, square_feet, primary_use, site_id

In [48]:
# Setting x-variables 
yb_clean_data_x = yb_clean_data.loc[:, yb_clean_data.columns.isin(['meter_reading', 'day_of_week', 'hour_of_day',\
                                                  'squre_feet', 'primary_use', 'site_id'])]
# Setting y-variables
yb_clean_data_y = yb_clean_data.loc[:, yb_clean_data.columns.isin(['year_built'])]

In [49]:
# converting primary_use into a numeric varialbe
lab_enc = preprocessing.LabelEncoder()
yb_clean_data_x = yb_clean_data_x.copy()
yb_clean_data_x['primary_use'] = lab_enc.fit_transform(yb_clean_data_x['primary_use'])

In [50]:
# splitting dataset into 70/30 for train/test
X_train, X_test, y_train, y_test = train_test_split(yb_clean_data_x, yb_clean_data_y, test_size=0.3, random_state=1)
y_train = np.ravel(y_train)

> ML Method: KNN

In [51]:
# Create KNN classifier
knn = KNeighborsClassifier(n_neighbors = 5)
# Fit the classifier to the data
knn.fit(X_train,y_train)


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [52]:
# Predicting on test data
y_predict = knn.predict(X_test)

In [53]:
#check accuracy of our model on the test data
print("Accuracy:",metrics.accuracy_score(y_test, y_predict))

Accuracy: 0.5039478006728819


> Linear regression

In [54]:
# Training model using linear regression
regressor = LinearRegression()  
regressor.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [55]:
# predicting y 
y_pred = regressor.predict(X_test)

In [56]:
r2_score(y_test, y_pred)

0.07435593085058112

 > Naive Bayes


In [57]:
gnb = GaussianNB()
# Train classifier
gnb.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [58]:
# Predict values

y_pred = gnb.predict(X_test)

In [59]:
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.06018495608619519


> Of these methods, KNN gave us the best results.  We now explore accuracy with different n_neighbor


In [60]:
# Create KNN classifier
knn = KNeighborsClassifier(n_neighbors = 15)
# Fit the classifier to the data
knn.fit(X_train,y_train)

# Predicting on test data
y_predict = knn.predict(X_test)

#check accuracy of our model on the test data
print("Accuracy:",metrics.accuracy_score(y_test, y_predict))

Accuracy: 0.5106951026762052


> We think using KNN and 15 n-neighbor values gives us highest  accuracy. We will use this to impute missing year_built data

In [61]:
# predicting year_built data in  missing_data 
yb_missing_data_x= yb_missing_data.loc[:,yb_missing_data.columns.isin(['meter_reading', 'day_of_week', 'hour_of_day',\
                                                  'squre_feet', 'primary_use', 'site_id'])]
yb_missing_data_x = yb_missing_data_x.copy()
yb_missing_data_x['primary_use'] = lab_enc.fit_transform(yb_missing_data_x['primary_use'])
yb_missing_data = yb_missing_data.copy()
yb_missing_data ['year_built'] = knn.predict(yb_missing_data_x)

In [62]:
full_yb = pd.DataFrame (yb_clean_data['year_built'].append(yb_missing_data['year_built']))
full_yb = full_yb.sort_index()

In [63]:
train_df_imputed = train_df_imputed.copy()
train_df_imputed['year_built'] = full_yb['year_built']

### Imputing floor_count variable 

In [64]:
# we then separate the missing data form non-missing data 
fc_missing_data = train_df_imputed.loc[train_df_imputed['floor_count'].isna()]
fc_clean_data = train_df_imputed.loc[~train_df_imputed['floor_count'].isna()]

> For ML algorithm, we will split the non-missing data to 70/30 for train/test.  We will pick the followingvariable as features: meter_reading, day_of_week, hour_of_day, square_feet, primary_use, site_id, year_built

In [65]:
# Setting x-variables 
fc_clean_data_x = fc_clean_data.loc[:, fc_clean_data.columns.isin(['meter_reading', 'day_of_week', 'hour_of_day',\
                                                  'squre_feet', 'primary_use', 'site_id', 'year_built'])]
# Setting y-variables
fc_clean_data_y = fc_clean_data.loc[:, fc_clean_data.columns.isin(['floor_count'])]

In [66]:
# converting primary_use into a numeric varialbe
lab_enc = preprocessing.LabelEncoder()
fc_clean_data_x = fc_clean_data_x.copy()
fc_clean_data_x['primary_use'] = lab_enc.fit_transform(fc_clean_data_x['primary_use'])

In [67]:
# splitting dataset into 70/30 for train/test
X_train, X_test, y_train, y_test = train_test_split(fc_clean_data_x, fc_clean_data_y, test_size=0.3, random_state=1)
y_train = np.ravel(y_train)

> ML Method: KNN

In [68]:
# Create KNN classifier
knn = KNeighborsClassifier(n_neighbors = 5)
# Fit the classifier to the data
knn.fit(X_train,y_train)


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [69]:
# Predicting on test data
y_predict = knn.predict(X_test)

In [70]:
#check accuracy of our model on the test data
print("Accuracy:",metrics.accuracy_score(y_test, y_predict))

Accuracy: 0.8685174754257973


> 87% accuracy is pretty good. we will use this to impute missing floor_count data

In [71]:
# predicting floor_count data in  missing_data 
fc_missing_data_x= fc_missing_data.loc[:,fc_missing_data.columns.isin(['meter_reading', 'day_of_week', 'hour_of_day',\
                                                  'squre_feet', 'primary_use', 'site_id', 'year_built'])]
fc_missing_data_x = fc_missing_data_x.copy()
fc_missing_data_x['primary_use'] = lab_enc.fit_transform(fc_missing_data_x['primary_use'])
fc_missing_data = fc_missing_data.copy()
fc_missing_data ['floor_count'] = knn.predict(fc_missing_data_x)

In [72]:
full_fc = pd.DataFrame (fc_clean_data['floor_count'].append(fc_missing_data['floor_count']))
full_fc = full_fc.sort_index()

In [73]:
train_df_imputed = train_df_imputed.copy()
train_df_imputed['floor_count'] = full_fc['floor_count']

In [74]:
train_df_imputed.isna().sum()

building_id      0
meter            0
timestamp        0
meter_reading    0
site_id          0
primary_use      0
square_feet      0
year_built       0
floor_count      0
time_index       0
day_of_week      0
hour_of_day      0
index            0
avg              0
std              0
outlier          0
dtype: int64

In [75]:
# This is the final data set with imputed missing values for building data and meter reading
train_df_imputed.to_pickle('train_df_imputed.pkl')