# What is the best location to pick up customers on each day of the week in a given month?


## Preamble

In [1]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import pandas as pd

## Load the data

In [2]:
# Read the trip records from the January 2013 yellow_tripdata CSV into a DataFrame.
trip_csv_path = "datasets/yellow_tripdata_2013-01.csv"
df = pd.read_csv(trip_csv_path)

## Preprocess data

In [3]:
# Create a new column 'weekday' holding the English day name of each pickup.
# Series.dt.weekday_name was deprecated in pandas 0.23 and removed in 1.0;
# Series.dt.day_name() is the supported replacement and yields the same
# strings ("Monday", "Tuesday", ...).
timestamp = pd.to_datetime(df['pickup_datetime'])
df['weekday'] = timestamp.dt.day_name()
df.head()

Unnamed: 0,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,rate_code,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,surcharge,mta_tax,tip_amount,tolls_amount,total_amount,weekday
0,CMT,2013-01-01 15:11:48,2013-01-01 15:18:10,4,1.0,-73.978165,40.757977,1,N,-73.98984,40.751173,CSH,6.5,0.0,0.5,0.0,0.0,7.0,Tuesday
1,CMT,2013-01-06 00:18:35,2013-01-06 00:22:54,1,1.5,-74.00668,40.731781,1,N,-73.994499,40.750659,CSH,6.0,0.5,0.5,0.0,0.0,7.0,Sunday
2,CMT,2013-01-05 18:49:41,2013-01-05 18:54:23,1,1.1,-74.004711,40.73777,1,N,-74.009831,40.726,CSH,5.5,1.0,0.5,0.0,0.0,7.0,Saturday
3,CMT,2013-01-07 23:54:15,2013-01-07 23:58:20,2,0.7,-73.9746,40.759945,1,N,-73.984737,40.759388,CSH,5.0,0.5,0.5,0.0,0.0,6.0,Monday
4,CMT,2013-01-07 23:25:03,2013-01-07 23:34:24,1,2.1,-73.976252,40.748528,1,N,-74.002583,40.747867,CSH,9.5,0.5,0.5,0.0,0.0,10.5,Monday


In [None]:
# Remove the columns that the pickup-location analysis does not use.
unused_columns = [
    'vendor_id', 'passenger_count', 'trip_distance', 'rate_code',
    'store_and_fwd_flag', 'payment_type', 'fare_amount', 'surcharge',
    'mta_tax', 'tip_amount', 'tolls_amount', 'total_amount',
    'dropoff_datetime', 'dropoff_longitude', 'dropoff_latitude',
]
df = df.drop(columns=unused_columns)

# Discard garbage rows: keep only pickups strictly inside this
# latitude/longitude bounding box (presumably the New York City area --
# TODO confirm the source of these bounds).
lat_low, lat_high = 40.492083, 40.944536
lon_low, lon_high = -74.267880, -73.662022
latitude_ok = (df['pickup_latitude'] > lat_low) & (df['pickup_latitude'] < lat_high)
longitude_ok = (df['pickup_longitude'] > lon_low) & (df['pickup_longitude'] < lon_high)
df = df[latitude_ok & longitude_ok]

df.head()

Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,weekday
0,2013-01-01 15:11:48,-73.978165,40.757977,Tuesday
1,2013-01-06 00:18:35,-74.00668,40.731781,Sunday
2,2013-01-05 18:49:41,-74.004711,40.73777,Saturday
3,2013-01-07 23:54:15,-73.9746,40.759945,Monday
4,2013-01-07 23:25:03,-73.976252,40.748528,Monday


In [None]:
# Select every pickup that happened on the chosen weekday within the chosen month.
my_weekday = "Tuesday"
my_year = 2013
my_montn = 1  # month number (1-12); variable name kept as-is for compatibility

# Half-open window [first day of the month, first day of the next month).
# pd.Timestamp replaces pd.datetime (removed in pandas 2.0), and DateOffset
# rolls over the year boundary, so month 12 no longer produces "month 13".
month_start = pd.Timestamp(my_year, my_montn, 1)
month_end = month_start + pd.DateOffset(months=1)

# Parse the pickup timestamps once instead of once per comparison.
pickup_time = pd.to_datetime(df['pickup_datetime'])
# ">=" keeps pickups at exactly 00:00:00 on the 1st, which a strict ">" dropped.
in_month = (pickup_time >= month_start) & (pickup_time < month_end)

# Cap the sample at the first 50,000 matching rows and take an explicit copy so
# that columns added later are written to an independent frame, not a slice of df.
df_select = df[(df['weekday'] == my_weekday) & in_month].iloc[:50000].copy()
df_select.head()

## K-means Clustering

In [None]:
%%time
#use Kmean to group data by longitude and latitude
my_cluster=100
from sklearn.cluster import KMeans
lon=df_select['pickup_longitude'].values
lat=df_select['pickup_latitude'].values
coodinate_array=np.array([[lon[i],lat[i]] for i in range(len(lon))])

kmeans_n = KMeans(n_clusters=my_cluster,  n_init=1, random_state=1000)
kmeans_n.fit(coodinate_array)
labels = kmeans_n.labels_
print(labels)

In [None]:
# Add a new column 'Cluster' holding the k-means label of each pickup.
# assign() returns a new frame, so nothing is written into a slice of the
# original DataFrame (the in-place form triggers SettingWithCopyWarning).
df_select = df_select.assign(Cluster=labels)
df_select.head()

In [None]:
# Prepare the regression inputs: one row per cluster.
Cluster_center = kmeans_n.cluster_centers_

# Number of pickups per cluster as an (n_clusters, 1) column, aligned row-for-row
# with Cluster_center. Reindexing over every cluster id yields a count of 0 for
# an id that has no rows in df_select, instead of a KeyError or a shorter,
# misaligned array.
Cluster_size = (
    df_select.groupby('Cluster').size()
    .reindex(range(len(Cluster_center)), fill_value=0)
    .values.reshape(-1, 1)
)

## Training data and testing data

In [None]:
# Split the per-cluster rows 80/20 into training and testing sets.
# Feature = cluster size, response = cluster centre (longitude, latitude).
train_size = int(len(Cluster_size) * 0.8)
test_size = len(Cluster_size) - train_size

train_feature = Cluster_size[:train_size]
train_response = Cluster_center[:train_size]

# The test rows start where the training rows end. Slicing from test_size
# (the 20% mark) made the test set overlap most of the training data.
test_feature = Cluster_size[train_size:]
test_response = Cluster_center[train_size:]

## Validation - coefficient of determination (R^2)

In [None]:
#coefficient of determination (R^2)
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline


def fit_model(X, y):
    """Fit a degree-3 polynomial regression of y on X and return the fitted pipeline.

    The pipeline first expands X with PolynomialFeatures(degree=3) and then
    fits LinearRegression(fit_intercept=False) on the expanded features.
    """
    steps = [
        ('poly', PolynomialFeatures(degree=3)),
        ('linear', LinearRegression(fit_intercept=False)),
    ]
    return Pipeline(steps).fit(X, y)

def score_model(model, X, y, Xv, yv):
    """Return (training score, validation score) for an already fitted model.

    Each score is whatever model.score returns for the given data/response
    pair; the training pair (X, y) is scored first, then (Xv, yv).
    """
    training_score = model.score(X, y)
    validation_score = model.score(Xv, yv)
    return (training_score, validation_score)

def fit_model_and_score(data, response, validation, val_response):
    """Fit a model on (data, response) and score it on both data sets.

    Returns the tuple produced by score_model: the score on the training
    pair followed by the score on (validation, val_response).
    """
    fitted = fit_model(data, response)
    scores = score_model(fitted, data, response, validation, val_response)
    return scores

# Print the (training score, testing score) pair for the train/test split.
split_scores = fit_model_and_score(train_feature, train_response,
                                   test_feature, test_response)
print(split_scores)


## MSE

In [None]:
# Evaluate the model with the mean squared error on the testing data.
from sklearn.metrics import mean_squared_error

# Same degree-3 polynomial regression pipeline, fitted on the training rows.
mse_steps = [
    ('poly', PolynomialFeatures(degree=3)),
    ('linear', LinearRegression(fit_intercept=False)),
]
MSE_model = Pipeline(mse_steps).fit(train_feature, train_response)

X_MSE = test_feature
y_MSE = MSE_model.predict(X_MSE)
mean_squared_error(test_response, y_MSE)


## Prediction

In [None]:
# Predict the best location: fit on every cluster, then query the model with
# the largest cluster size.
X = Cluster_size
y = Cluster_center

prediction_steps = [
    ('poly', PolynomialFeatures(degree=3)),
    ('linear', LinearRegression(fit_intercept=False)),
]
prediction_model = Pipeline(prediction_steps)
prediction_model.fit(X, y)

# max() over the rows of the (n, 1) size array gives the one-element row that
# holds the largest size; wrapping it in a list makes a single query row.
largest_size_row = max(Cluster_size)
X_predict = [largest_size_row]
y_predict = prediction_model.predict(X_predict)
y_predict


## Visualization

In [None]:
# Prepare the coordinates used by the plots below.

# Predicted location, taken from the last row of y_predict ([longitude, latitude]).
# The coordinates stay length-1 arrays, as before, so the plotting cells that do
# arithmetic on them behave unchanged.
predicted_point = y_predict[-1]
visual_x = predicted_point[[0]]
visual_y = predicted_point[[1]]

# Index of the largest cluster, found with one vectorised argmax instead of a
# Python loop that recomputed Cluster_size.max() on every iteration. Scanning
# the reversed array keeps the previous tie-breaking: the last largest cluster wins.
sizes = Cluster_size.ravel()
max_size_cluster = len(sizes) - 1 - int(np.argmax(sizes[::-1]))

# Actual centre of the largest cluster, for comparison with the prediction.
actual_value = kmeans_n.cluster_centers_[max_size_cluster]
actual_x = actual_value[0]
actual_y = actual_value[1]

In [None]:
# Scatter plot of the k-means result: each cluster drawn in its own random colour.
from random import randint

# One random '#RRGGBB' colour per cluster.
colors = ['#%06X' % randint(0, 0xFFFFFF) for _ in range(my_cluster)]

plt.figure(figsize=(18,9))
for cluster_id, cluster_color in enumerate(colors):
    members = df_select[df_select['Cluster'] == cluster_id]
    plt.scatter(members.pickup_longitude.values,
                members.pickup_latitude.values,
                alpha=0.2, s=100, c=cluster_color)

# Zoom to a window of +/- 0.1 degrees around the predicted location.
x_min, x_max = visual_x - 0.1, visual_x + 0.1
y_min, y_max = visual_y - 0.1, visual_y + 0.1
plt.axis([x_min, x_max, y_min, y_max])
plt.title("visualization for kmean")
plt.show()


In [None]:
# Scatter plot of every pickup for the selected weekday, with the predicted
# best location in red and the centre of the largest cluster in yellow.
x_points = lon
y_points = lat

plt.figure(figsize=(18,9))
plt.scatter(x_points, y_points, alpha=0.2, s=100)
plt.scatter(visual_x, visual_y, c='r', s=100)
plt.scatter(actual_x, actual_y, c='y', s=100)

# Zoom to a window of +/- 0.05 degrees around the predicted location.
half_width = 0.05
plt.axis([visual_x - half_width, visual_x + half_width,
          visual_y - half_width, visual_y + half_width])
plt.show()


Use the link below to view the visualization on CARTO if the code in the next cell does not display the map:
https://shenghuayou.carto.com/builder/ac7e268c-b4ee-11e6-b213-0e98b61680bf/embed

In [None]:
# Embed the CARTO map (data for 2013/01/01, a Tuesday) in the notebook as an iframe.
from IPython.display import HTML

carto_iframe = '<iframe width=100% height=520 frameborder=0 src=https://shenghuayou.carto.com/builder/ac7e268c-b4ee-11e6-b213-0e98b61680bf/embed allowfullscreen webkitallowfullscreen mozallowfullscreen oallowfullscreen msallowfullscreen></iframe>'
HTML(carto_iframe)