In [59]:
import pandas as pd

In [60]:
import numpy as np

# Read and view data

In [61]:
# Load the training set from the notebook's working directory.
train_csv_path = 'train.csv'
df_train = pd.read_csv(train_csv_path)

In [62]:
# Preview the first five rows (five is the default for head()).
df_train.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435


In [63]:
# Number of rows and columns

In [64]:
# (row count, column count) of the training frame.
df_train.shape

(1458644, 11)

In [65]:
# read test data

In [66]:
# Load the test set from the notebook's working directory.
test_csv_path = 'test.csv'
df_test = pd.read_csv(test_csv_path)

In [68]:
# Brainstorming ideas:
# 3rd party data
# Google API real-time traffic
# Traffic accident (one side factor 0.5)
# Weather (snow probability)
# Attributes of fastest routes: traffic lights, left turns, right turns, # of lanes (allocate based on time proportion to 
# fastest route, slowest route)
# Holiday, special events
# Open Source Routing Machine (OSRM)
# Multicollinearity
# Day to categorical
# Time to categorical

# Feature extraction (time)

In [69]:
# Check total # of Nulls in data

In [70]:
# Replace all empty cells with np.nan
# Count the total number of nulls

In [76]:
# Treat blank cells -- empty strings and whitespace-only strings -- as missing.
# pandas does not apply regex matching to an empty pattern, so replacing ''
# with regex=True only catches exactly-empty strings and leaves cells such as
# ' ' untouched. Anchoring on ^\s*$ covers both cases.
BLANK_CELL = r'^\s*$'
df_train = df_train.replace(BLANK_CELL, np.nan, regex=True)
df_test = df_test.replace(BLANK_CELL, np.nan, regex=True)

In [77]:
# Grand total of missing cells across all columns, for train and then test.
# One public kernel reports 3 missing values in train, but this run finds 0:
# https://www.kaggle.com/maheshdadhich/strength-of-visualization-python-visuals-tutorial
# Single-argument print(...) behaves identically on Python 2 and Python 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print(df_train.isnull().sum().sum())
print(df_test.isnull().sum().sum())

0
0


In [73]:
# Convert the pickup and dropoff timestamp columns of the training data to datetime dtype

In [78]:
# Parse both trip timestamp columns of the training frame into datetime values.
for ts_col in ('pickup_datetime', 'dropoff_datetime'):
    df_train[ts_col] = pd.to_datetime(df_train[ts_col])

In [79]:
# Derive calendar features from the pickup timestamp: each entry maps a new
# column name to the .dt accessor attribute that supplies its values.
# NOTE(review): newer pandas releases deprecate .dt.weekofyear in favour of
# .dt.isocalendar().week -- confirm against the pandas version in use.
pickup_parts = df_train['pickup_datetime'].dt
for feature_name, dt_attr in [
    ('pick_month', 'month'),
    ('hour', 'hour'),
    ('week_of_year', 'weekofyear'),
    ('day_of_year', 'dayofyear'),
    ('day_of_week', 'dayofweek'),
]:
    df_train.loc[:, feature_name] = getattr(pickup_parts, dt_attr)

In [80]:
# Preview the first rows to confirm the new calendar feature columns were added.
df_train.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,pick_month,hour,week_of_year,day_of_year,day_of_week
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455,3,17,11,74,0
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663,6,0,23,164,6
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124,1,11,3,19,1
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429,4,19,14,97,2
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435,3,13,12,86,5


In [81]:
# Convert pick_month, hour, day_of_week to categorical (dummy) variables 

In [82]:
# First convert those cols to string

In [83]:
# Cast the calendar features that will be one-hot encoded to strings so that
# they are handled as categories rather than as numbers.
categorical_cols = ['pick_month', 'hour', 'day_of_week']
for col_name in categorical_cols:
    as_text = df_train[col_name].astype(str)
    df_train[col_name] = as_text

In [84]:
# One-hot encode the three categorical calendar features and append the
# indicator columns to the training frame (the source columns are kept).
# NOTE: this cell is not re-runnable as-is -- joining a second time collides
# with the indicator columns that are already present.
onehot_source = df_train[['pick_month','hour','day_of_week']]
onehot_cols = pd.get_dummies(onehot_source, prefix=['pick_month', 'hour','day_of_week'])
df_train = df_train.join(onehot_cols)

In [85]:
# Preview the first rows to inspect the appended one-hot indicator columns.
df_train.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,...,hour_7,hour_8,hour_9,day_of_week_0,day_of_week_1,day_of_week_2,day_of_week_3,day_of_week_4,day_of_week_5,day_of_week_6
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [4]:

from sklearn.decomposition import PCA
# from sklearn.cluster import MiniBatchKMeans
# train.loc[:, 'pickup_cluster'] = kmeans.predict(train[['pickup_latitude', 'pickup_longitude']])

ImportError: cannot import name '__check_build'