In [28]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
import random

In [29]:
# Point TRAIN_CSV_PATH at the location of train.csv on your local drive

TRAIN_CSV_PATH = "/Users/sally/Data MSDS/CS 5010 Programming and Systems for Data Science/CS-5010-Missing-Data/train.csv"
df = pd.read_csv(TRAIN_CSV_PATH)

In [30]:
# Clean input dataframe

# Parse both timestamp columns with the same explicit format
DATETIME_FORMAT = '%Y-%m-%d %H:%M:%S'
for stamp_col in ('pickup_datetime', 'dropoff_datetime'):
    df[stamp_col] = pd.to_datetime(df[stamp_col], format = DATETIME_FORMAT)

# (pickup column, dropoff column, datetime accessor attribute)
# The order of this table is the order in which the columns are added to df.
DERIVED_TIME_COLUMNS = [
    ('pickup_minute', 'dropoff_minutes', 'minute'),   # minute of the hour
    ('pickup_hour',   'dropoff_hour',    'hour'),     # hour of the day
    ('pickup_dow',    'dropoff_dow',     'weekday'),  # day of week
    ('pickup_dom',    'dropoff_dom',     'day'),      # day of month
    ('pickup_month',  'dropoff_month',   'month'),    # month
]
for pickup_name, dropoff_name, dt_attr in DERIVED_TIME_COLUMNS:
    df[pickup_name] = getattr(df['pickup_datetime'].dt, dt_attr)
    df[dropoff_name] = getattr(df['dropoff_datetime'].dt, dt_attr)

# Map 'Y' and 'N' in 'store_and_fwd_flag' as 1 and 0
store_and_fwd_map = {'Y':1,'N':0}
df = df.copy()
# Only map while the column still holds the raw letters: mapping an already
# numeric column would find no matching keys and replace every value with NaN,
# which is what happened whenever this cell was run a second time.
if not pd.api.types.is_numeric_dtype(df['store_and_fwd_flag']):
    df['store_and_fwd_flag'] = df['store_and_fwd_flag'].map(store_and_fwd_map)

In [31]:
# Create missing data

MISSING_SEED = 123
random.seed(MISSING_SEED)

# pandas' sample() draws from NumPy's random state, not from the stdlib
# `random` module, so seeding `random` alone does not make the selection
# reproducible. One seeded generator is shared by both draws so the two
# columns still get independently chosen rows.
missing_rng = np.random.RandomState(MISSING_SEED)

# Blank out one tenth of the rows in each column
n_missing = round(df.shape[0]/10)

# Insert nan values in pickup_hour
nan_pickup_hours = df['pickup_hour'].sample(n_missing, random_state=missing_rng).index
df.loc[nan_pickup_hours,'pickup_hour']=np.nan

# Insert nan values in dropoff_longitude
nan_dropoff_longitude = df['dropoff_longitude'].sample(n_missing, random_state=missing_rng).index
df.loc[nan_dropoff_longitude,'dropoff_longitude']=np.nan

In [32]:
# Approach 1 - Delete rows with missing values
# Pass only the dataframe as argument

def approach1_rem_msg(messy_df):
    """Return a new dataframe holding only the rows of messy_df without any missing value."""
    return messy_df.dropna()

#------------------------------------------------------------------------------------------------------------------------

# Approach 2 - Impute missing values
# The following function imputes the missing values with mean/median/mode according to arguments passed
# User also has to pass as list the names of columns which have missing values
# Call function  - approach2_impute_metric(<df>,<"mean">/<"median">/<"mode">,[<'missingcolname1'>,<'missingcolname2'>])

def approach2_impute_metric(messy_df, metric, colnames):
    """Fill the missing values of the given columns with a per-column statistic.

    Parameters
    ----------
    messy_df : pandas.DataFrame
        Input frame; it is not modified.
    metric : str
        "mean", "median" or "mode". Any other value leaves the data untouched
        and an unmodified copy is returned.
    colnames : list of str
        Names of the columns whose missing values should be filled.

    Returns
    -------
    pandas.DataFrame
        Copy of messy_df in which each listed column has its NaNs replaced by
        that column's mean / median / first mode.
    """
    clean_df = messy_df.copy()

    if metric not in ("mean", "median", "mode"):
        return clean_df

    for col in colnames:
        column = messy_df[col]
        if metric == "mean":
            imputenum = column.mean()
        elif metric == "median":
            imputenum = column.median()
        else:
            # mode() returns a Series (there may be ties). Passing that Series
            # to fillna() would align on the index and fill only rows whose
            # label happens to match, so reduce it to a single scalar first.
            modes = column.mode()
            if modes.empty:
                # Column has no observed values at all: nothing to impute with.
                continue
            imputenum = modes.iloc[0]
        clean_df[col] = column.fillna(imputenum)

    return clean_df
 
#-------------------------------------------------------------------------------------------------------------------------

# Approach 3 - Predict missing values
# Each column listed in colnames is predicted with a random forest fitted on the remaining columns

def approach3_predict_msg(messy_df, metric, colnames):
    """Fill the missing values of the given columns with random-forest predictions.

    For every column in colnames a RandomForestRegressor is fitted on the rows
    where that column is observed, using all columns NOT listed in colnames as
    features, and then used to predict the rows where the column is missing.

    Parameters
    ----------
    messy_df : pandas.DataFrame
        Input frame; it is not modified.
    metric : any
        Unused. Kept so the signature matches approach2_impute_metric.
    colnames : list of str (a single name is also accepted)
        Names of the columns whose missing values should be predicted.

    Returns
    -------
    pandas.DataFrame
        Copy of messy_df with the missing entries of the listed columns
        replaced by predictions. A column with no missing values, or with no
        observed values to train on, is left unchanged.

    Notes
    -----
    NOTE(review): the feature columns are presumably numeric and free of NaN,
    which the regressor is expected to require - confirm for other datasets.
    """
    if isinstance(colnames, str):
        colnames = [colnames]

    clean_df = messy_df.copy()
    # Predictors: every column that is not itself being imputed
    features = messy_df.drop(colnames, axis = 1)

    for col in colnames:
        is_missing = messy_df[col].isnull()
        if not is_missing.any() or is_missing.all():
            # Nothing to predict, or nothing to learn from
            continue

        X_train = features[~is_missing]
        Y_train = messy_df.loc[~is_missing, col]
        model = RandomForestRegressor(n_estimators = 10 , oob_score = True)
        model.fit(X_train,Y_train)

        # Predict only the rows where the value is actually missing
        X_test = features[is_missing]
        clean_df.loc[is_missing, col] = model.predict(X_test)

    return clean_df

In [33]:
# Test the approaches to missing data

# Drop the identifier, raw timestamp and trip duration columns, then keep the first 1000 rows
df_test = df.drop(['id','pickup_datetime','dropoff_datetime','trip_duration'], axis = 1).head(1000)

# Call function to clean missing data

test_pred1 = approach1_rem_msg(df_test)
test_pred2, test_pred3, test_pred4 = (
    approach2_impute_metric(df_test, impute_metric, ['dropoff_longitude','pickup_hour'])
    for impute_metric in ("mean", "median", "mode")
)

In [None]:
# Import only the widget classes this cell uses instead of `import *`,
# so it is clear where Button and Layout come from.
from ipywidgets import Button, Layout
from IPython.display import display, clear_output


def _make_button(description):
    """Build a button with the given label, 50% wide and 50px tall."""
    return Button(description=description,
                  layout=Layout(width='50%', height='50px'))


# make 4 buttons
button1 = _make_button("Delete rows with missing values")
button2 = _make_button("Impute metric: Mean")
button3 = _make_button("Impute metric: Median")
button4 = _make_button("Impute metric: Mode")
display(button1, button2, button3, button4)

# define button calls
# Each handler clears the current output and shows the matching result frame.
# The frames are looked up when the button is clicked, so re-running the cell
# that builds test_pred1..4 is picked up without re-creating the buttons.

def button1Clicked(b):
    clear_output()
    display(test_pred1)

def button2Clicked(b):
    clear_output()
    display(test_pred2)

def button3Clicked(b):
    clear_output()
    display(test_pred3)

def button4Clicked(b):
    clear_output()
    display(test_pred4)

# specify which functions to be called when buttons are clicked
button1.on_click(button1Clicked)
button2.on_click(button2Clicked)
button3.on_click(button3Clicked)
button4.on_click(button4Clicked)

In [None]:
# Test code below - Do not run (scratch cells: they use X and Y, which are not defined in the visible part of this notebook)

In [None]:
from sklearn.preprocessing import LabelEncoder
# Replace each text column of X with integer codes, refitting the one shared encoder per column
le = LabelEncoder()
character_cols = ['Origin', 'UniqueCarrier', 'Dest']

for col in character_cols:
    X[col] = le.fit(X[col].values).transform(X[col])

In [None]:
# Sum of the comparison Y==1; presumably Y is a pandas/NumPy array-like, in which case this counts entries equal to 1 - TODO confirm (Y is not defined in the visible notebook)
sum(Y==1)

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Fit a 10-tree random forest with out-of-bag scoring on X / Y
forest_settings = {"n_estimators": 10, "oob_score": True}
model = RandomForestRegressor(**forest_settings)
model.fit(X, Y)

In [None]:
# Pair each importance score with its feature name, then order by importance.
# sort_values() must be called and its result kept: it returns a new Series
# and does not sort in place, so the bare attribute reference did nothing.
model_features = pd.Series(model.feature_importances_,X.columns).sort_values()
model_features.plot(kind="barh",figsize=(7,6))

In [None]:
# Echo the fitted model's feature_importances_ attribute as the cell output
model.feature_importances_