In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
import random

In [None]:
# Point this at the location of train.csv on your local drive

train_csv_path = "C://Users//Rohan Bapat//Documents//Classes//CS 5010//Project//train//train.csv"
df = pd.read_csv(train_csv_path)

In [2]:
# Clean input dataframe

# Parse both timestamp columns using the fixed 'YYYY-MM-DD HH:MM:SS' layout
for stamp_col in ('pickup_datetime', 'dropoff_datetime'):
    df[stamp_col] = pd.to_datetime(df[stamp_col], format = '%Y-%m-%d %H:%M:%S')

# Derived calendar features, listed as (new column, source timestamp, datetime component).
# The order below is the order in which the columns are appended to df.
# Note: 'dropoff_minutes' is plural while 'pickup_minute' is not; the names are kept as-is.
derived_columns = (
    ('pickup_minute',   'pickup_datetime',  'minute'),
    ('dropoff_minutes', 'dropoff_datetime', 'minute'),
    ('pickup_hour',     'pickup_datetime',  'hour'),
    ('dropoff_hour',    'dropoff_datetime', 'hour'),
    ('pickup_dow',      'pickup_datetime',  'weekday'),
    ('dropoff_dow',     'dropoff_datetime', 'weekday'),
    ('pickup_dom',      'pickup_datetime',  'day'),
    ('dropoff_dom',     'dropoff_datetime', 'day'),
    ('pickup_month',    'pickup_datetime',  'month'),
    ('dropoff_month',   'dropoff_datetime', 'month'),
)
for new_col, stamp_col, component in derived_columns:
    df[new_col] = getattr(df[stamp_col].dt, component)

# Encode 'store_and_fwd_flag': 'Y' -> 1, 'N' -> 0 (any other value becomes NaN via .map)
store_and_fwd_map = {'Y':1,'N':0}
df = df.copy()
df['store_and_fwd_flag'] = df['store_and_fwd_flag'].map(store_and_fwd_map)

          id  vendor_id     pickup_datetime    dropoff_datetime  \
0  id2875421          2 2016-03-14 17:24:55 2016-03-14 17:32:30   
1  id2377394          1 2016-06-12 00:43:35 2016-06-12 00:54:38   
2  id3858529          2 2016-01-19 11:35:24 2016-01-19 12:10:48   
3  id3504673          2 2016-04-06 19:32:31 2016-04-06 19:39:40   
4  id2181028          2 2016-03-26 13:30:55 2016-03-26 13:38:10   

   passenger_count  pickup_longitude  pickup_latitude  dropoff_longitude  \
0                1        -73.982155        40.767937         -73.964630   
1                1        -73.980415        40.738564         -73.999481   
2                1        -73.979027        40.763939         -74.005333   
3                1        -74.010040        40.719971         -74.012268   
4                1        -73.973053        40.793209         -73.972923   

   dropoff_latitude  store_and_fwd_flag      ...        pickup_minute  \
0         40.765602                   0      ...                   

In [7]:
# Create missing data

# Seed Python's `random` module (kept from the original cell). pandas' `sample`
# does not draw from that module, so a seeded NumPy generator is passed to it
# explicitly below; this is what makes the injected NaNs reproducible.
random.seed(123)
missing_rng = np.random.RandomState(123)

# Number of values to blank out in each column: one tenth of the rows
n_missing = round(df.shape[0]/10)

# Insert nan values in pickup_hour
nan_pickup_hours = df['pickup_hour'].sample(n_missing, random_state=missing_rng).index
df.loc[nan_pickup_hours,'pickup_hour']=np.nan

# Insert nan values in dropoff_longitude
# (the shared generator has advanced, so this draws a different set of rows)
nan_dropoff_longitude = df['dropoff_longitude'].sample(n_missing, random_state=missing_rng).index
df.loc[nan_dropoff_longitude,'dropoff_longitude']=np.nan

In [30]:
# Approach 1 - Delete rows with missing values
# Pass only the dataframe as argument

def approach1_rem_msg(messy_df):
    """Return a new dataframe containing only the rows of `messy_df` with no missing values.

    The input dataframe is not modified.
    """
    return messy_df.dropna()

#------------------------------------------------------------------------------------------------------------------------

# Approach 2 - Impute missing values
# The following function imputes the missing values with mean/median/mode according to arguments passed
# The caller must also pass, as a list, the names of the columns that have missing values
# Call function  - approach2_impute_metric(<df>,<"mean">/<"median">/<"mode">,[<'missingcolname1'>,<'missingcolname2'>])

def approach2_impute_metric(messy_df, metric, colnames):
    """Fill missing values in the given columns with a per-column summary statistic.

    Parameters
    ----------
    messy_df : pandas.DataFrame
        Dataframe containing missing values. It is not modified.
    metric : str
        One of "mean", "median" or "mode".
    colnames : list of str
        Names of the columns whose missing values should be imputed.

    Returns
    -------
    pandas.DataFrame
        A copy of `messy_df` with the listed columns imputed.

    Raises
    ------
    ValueError
        If `metric` is not one of the supported names.
    """
    if metric not in ("mean", "median", "mode"):
        raise ValueError(
            'metric must be "mean", "median" or "mode", got {!r}'.format(metric)
        )

    clean_df = messy_df.copy()
    for col in colnames:
        column = messy_df[col]
        if metric == "mean":
            imputenum = column.mean()
        elif metric == "median":
            imputenum = column.median()
        else:
            # Series.mode() returns a Series (there can be ties), not a scalar.
            # Passing that Series to fillna would align on index labels and
            # leave most gaps unfilled, so take the first mode as a scalar.
            modes = column.mode()
            if modes.empty:
                # Column has no observed values: nothing to impute with
                continue
            imputenum = modes.iloc[0]
        clean_df[col] = column.fillna(imputenum)

    return clean_df
 
#-------------------------------------------------------------------------------------------------------------------------

# Approach 3 - Predict missing values
# Each column with missing values is predicted from the columns that are not listed in colnames

def approach3_predict_msg(messy_df, metric, colnames):
    """Fill missing values in the given columns with random-forest predictions.

    For every column in `colnames`, a RandomForestRegressor is fitted on the rows
    where that column is present, using all columns NOT in `colnames` as features,
    and is then used to predict the rows where the column is missing.

    Parameters
    ----------
    messy_df : pandas.DataFrame
        Dataframe containing missing values. It is not modified.
        The feature columns (everything outside `colnames`) are passed to the
        regressor as-is, so they are assumed to be numeric and free of NaN.
    metric : object
        Unused; kept so the signature stays the same as before.
    colnames : list of str
        Names of the columns whose missing values should be predicted.

    Returns
    -------
    pandas.DataFrame
        A copy of `messy_df` with the listed columns imputed. Predictions come
        from a regressor, so imputed values are floats even for integer-like columns.
    """
    clean_df = messy_df.copy()

    # Features are the same for every target: all columns that are not being imputed
    features = messy_df.drop(colnames, axis = 1)

    for col in colnames:
        missing_mask = messy_df[col].isnull()

        # Nothing to predict, or nothing to learn from: leave the column unchanged
        if not missing_mask.any() or missing_mask.all():
            continue

        X_train = features[~missing_mask]
        Y_train = messy_df.loc[~missing_mask, col]
        X_test = features[missing_mask]

        # The out-of-bag score was never read, so it is no longer requested
        model = RandomForestRegressor(n_estimators = 10)
        model.fit(X_train, Y_train)

        # Write predictions back only into the rows that were missing
        clean_df.loc[missing_mask, col] = model.predict(X_test)

    return clean_df

In [31]:
# Try each missing-data approach on a small slice of the data

# Drop the identifier, raw timestamp and target columns, then keep the first 1000 rows
df_test = df.drop(columns=['id','pickup_datetime','dropoff_datetime','trip_duration']).head(1000)

# Approach 1: remove rows that contain missing values
test_pred1 = approach1_rem_msg(df_test)

# Approach 2: impute the two columns with mean, median and mode respectively
test_pred2 = approach2_impute_metric(df_test, metric="mean", colnames=['dropoff_longitude','pickup_hour'])
test_pred3 = approach2_impute_metric(df_test, metric="median", colnames=['dropoff_longitude','pickup_hour'])
test_pred4 = approach2_impute_metric(df_test, metric="mode", colnames=['dropoff_longitude','pickup_hour'])

In [None]:
# Test code below - Do not run

In [41]:
from sklearn.preprocessing import LabelEncoder

# Replace each string-valued column of X with integer codes
le=LabelEncoder()
character_cols = ['Origin','UniqueCarrier','Dest']

# The recorded run of this cell raised pandas' SettingWithCopyWarning, which
# indicates X may be a slice of another DataFrame. Take an explicit copy so the
# assignments below are guaranteed to land on X itself.
X = X.copy()

for col in character_cols:
    # The encoder is re-fitted per column; after the loop it holds the classes of the last one
    le.fit(X[col].values)
    X[col]=le.transform(X[col])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [48]:
# Count how many entries of Y are equal to 1 (Y is defined outside the cells shown here)
sum(Y==1)

0

In [43]:
from sklearn.ensemble import RandomForestRegressor

# Fit a small random forest of 10 trees on X / Y.
# random_state is pinned so the fitted forest is reproducible across runs;
# 42 matches the random_state shown in this cell's recorded output.
model = RandomForestRegressor(n_estimators = 10 , oob_score = True, random_state = 42)
model.fit(X,Y)

  warn("Some inputs do not have OOB scores. "


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=True, random_state=42,
           verbose=0, warm_start=False)

In [27]:
# Label each importance with its feature name
model_features = pd.Series(model.feature_importances_, index=X.columns)
# sort_values must be called and its result kept: it returns a new Series.
# Ascending order puts the most important feature at the top of the horizontal bar chart.
model_features = model_features.sort_values()
model_features.plot(kind="barh",figsize=(7,6))

In [44]:
# Display the fitted model's raw feature importance array
model.feature_importances_

array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.])