In [1]:
# import libraries

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import datetime as dt

from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering

from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.impute import SimpleImputer
from sklearn import preprocessing
from sklearn import metrics

In [2]:
# Load the trip records from the CSV file into a DataFrame and preview the first rows.
df = pd.read_csv('../input/nyc-taxi-trip-duration/NYC.csv')
df.head()

In [None]:
# Summary statistics of the DataFrame columns.
df.describe()

In [None]:
# Correlation heatmap of the numeric columns.
# The frame still contains text columns at this point (ids, timestamps stored
# as strings, the Y/N flag); calling df.corr() on them raises in pandas >= 2.0,
# so restrict the computation to numeric dtypes explicitly.
plt.figure(figsize=(18,12))
sns.heatmap(df.select_dtypes(include='number').corr(),annot=True)
plt.show()

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape 

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [3]:
# Remove the id and vendor_id columns from the frame (in place).

df.drop(columns=['id', 'vendor_id'], inplace=True)

In [4]:
# Parse the pickup/dropoff timestamp columns into datetime64 values
for ts_col in ('pickup_datetime', 'dropoff_datetime'):
    df[ts_col] = pd.to_datetime(df[ts_col])

In [5]:
# Day-of-week number for pickup and dropoff (0 = Monday ... 6 = Sunday).
df['pickup_d_num'] = df['pickup_datetime'].dt.dayofweek
df['droff_d_num'] = df['dropoff_datetime'].dt.dayofweek

In [6]:
# Hour-of-day for both ends of the trip, plus the calendar month of the pickup.
pickup_ts = df['pickup_datetime'].dt
dropoff_ts = df['dropoff_datetime'].dt

df['pickup_hour'] = pickup_ts.hour
df['dropoff_hour'] = dropoff_ts.hour
df['pickup_month'] = pickup_ts.month

In [None]:
# Show five randomly chosen rows (no seed is set, so the rows differ between runs).
df.sample(5)

In [None]:
# Data Visualisation
# Number of pickups (left) and dropoffs (right) per day of the week

weekday_labels = ['Mon','Tue','Wed','Thu','Fri','Sat','Sun']
fig, axes = plt.subplots(1,2, figsize=(18,6))

# Same plot for both day-number columns, one per panel.
for ax, day_col in zip(axes, ("pickup_d_num", "droff_d_num")):
    sns.countplot(x=day_col, data=df, ax=ax)
    ax.set_xticks([0,1,2,3,4,5,6], weekday_labels)

plt.show()

In [None]:
# Number of pickups (left) and dropoffs (right) per hour of the day

fig, axes = plt.subplots(1,2, figsize=(18,6))

# (column to count, x-axis label) for each panel, left to right
panels = (
    ("pickup_hour", 'Pickup hours'),
    ("dropoff_hour", 'Dropoff_hour hours'),
)
for ax, (hour_col, x_label) in zip(axes, panels):
    sns.countplot(x=hour_col, data=df, ax=ax)
    ax.set_xlabel(x_label)
    ax.set_ylabel('Number of rides')
    ax.grid()
plt.show()

In [None]:
# Number of trips per pickup month.
df['pickup_month'].value_counts()

In [None]:
# Number of rides per month

plt.figure(figsize=(10,6))
plt.title('Number of rides per month in the first half year')
sns.countplot(x='pickup_month',data=df)
# The former sns.color_palette("crest", as_cmap=True) call was removed: it only
# returned a colormap object that was discarded and had no effect on the figure.
# countplot puts the categories at positions 0..5; the labels below assume the
# data covers exactly January-June - TODO confirm against value_counts().
plt.xticks([0,1,2,3,4,5],['Jan','Feb','Mar','Apr','May','Jun'])

plt.grid()
plt.show()

In [7]:
# Cast passenger_count to an integer dtype.
df['passenger_count'] = df['passenger_count'].astype(int)

In [8]:
# The number of passengers in the vehicle

pie = df['passenger_count'].value_counts()
# One explode offset per slice, derived from the data instead of a fixed
# 10-element list: plt.pie raises if len(explode) != len(x). value_counts()
# sorts by frequency, so the two most common counts get the larger offset.
myexplode = [0.05 if rank < 2 else 0.03 for rank in range(len(pie))]
plt.figure(figsize=(20,10))
plt.pie(x=pie,explode=myexplode,shadow=True,autopct='%.1f%%')
plt.legend(pie.index)
plt.title('The number of passengers in the vehicle')
plt.show()

In [None]:
# Trip distribution over the city shown on the map
# Create the base folium map at the given centre coordinates; markers are added to it afterwards.
import folium

mymap = folium.Map(location=[40.730610, -73.935242], zoom_start=8)

In [None]:
# Morning trips: pickups with 7 < hour < 10 (hours 8 and 9).
# The original second condition was `> 10`, which made the `> 7` test redundant.
office_togo = df[(df['pickup_hour'] > 7) & (df['pickup_hour'] < 10)]

# Draw the first 500 of those pickups. Rows are taken positionally with head();
# the previous loop fed index *labels* of the filtered frame into .iloc, which
# selects the wrong rows or raises IndexError.
pickup_points = office_togo[['pickup_latitude', 'pickup_longitude']].head(500)
for lat, lon in pickup_points.itertuples(index=False):
    # folium expects [latitude, longitude] (same order as folium.Map's location).
    folium.Circle(location=[lat, lon]).add_to(mymap)

# Last expression of the cell so the notebook actually renders the map.
mymap

In [None]:
# Total trip_duration share per hour

# Share of the overall trip time that starts in each pickup hour, in percent.
# The previous formula divided the hourly sums by the number of rows, which is
# not a percentage even though the y-axis is labelled as one.
hourly_duration = df.groupby(['pickup_hour'])['trip_duration'].sum()
trip_duration_during_day = 100 * hourly_duration / hourly_duration.sum()
plt.figure(figsize=(15,8))
plt.plot(trip_duration_during_day,color='red',linestyle='dashed',marker='o')
plt.xticks(range(24))
plt.xlabel('Hours')
plt.ylabel('Trip share in %')
plt.grid()
plt.show()

In [9]:
# To eliminate outliers, keep only rows whose passenger_count is non-zero and at most 6
valid_passengers = (df["passenger_count"] != 0) & (df["passenger_count"] <= 6)
df_new = df.loc[valid_passengers].copy()

In [10]:
# Encode the store-and-forward flag as 0 ('N') / 1 ('Y').
# The identity entries (0 -> 0, 1 -> 1) make the cell safe to re-run: without
# them a second execution maps the already-encoded values to NaN.
df_new['store_and_fwd_flag'] = df_new['store_and_fwd_flag'].map({'N':0,'Y':1,0:0,1:1})

In [11]:
# Remove the two datetime columns from df_new.
df_new = df_new.drop(columns=['pickup_datetime', 'dropoff_datetime'])

In [24]:
# Display the filtered frame that the feature matrices are built from.
df_new

In [26]:
# Build the three standardised feature matrices used by the K-Means runs.
location_cols = ['pickup_longitude','pickup_latitude','dropoff_longitude','dropoff_latitude']
time_cols = ['pickup_d_num','droff_d_num','pickup_hour','dropoff_hour','pickup_month']

# Cluster labels are written back into df_new as kmeans_* columns. Exclude them
# so that re-running this cell after clustering does not feed earlier cluster
# assignments into X_total as if they were features.
label_cols = [col for col in df_new.columns if str(col).startswith('kmeans_')]

X_location = df_new[location_cols].astype(float)
X_total = df_new.drop(columns=label_cols).astype(float)
X_pickup_dropoff = df_new[time_cols].astype(float)

# Each matrix gets its own scaler (zero mean, unit variance per column).
X_location = preprocessing.StandardScaler().fit_transform(X_location)
X_total = preprocessing.StandardScaler().fit_transform(X_total)
X_pickup_dropoff = preprocessing.StandardScaler().fit_transform(X_pickup_dropoff)

In [14]:
# K-Means on all standardised features (X_total)

clusterNum = 5
# random_state fixes the centroid initialisation so the cluster ids are
# reproducible between runs; they are stored as a column and exported.
k_means_total = KMeans(init = "k-means++", n_clusters = clusterNum, n_init = 12, random_state = 42)
k_means_total.fit(X_total)
labels_total = k_means_total.labels_
print(labels_total)

In [15]:
# Store the cluster labels as a column of df_new and show the size of each cluster.
df_new['kmeans_total'] = labels_total
df_new['kmeans_total'].value_counts()

In [None]:
# Per-cluster mean of every column, grouped by the kmeans_total labels.
df_new.groupby('kmeans_total').mean()

In [27]:
# K-Means on the standardised pickup/dropoff time features (X_pickup_dropoff)

clusterNum = 5
# random_state fixes the centroid initialisation so the cluster ids are
# reproducible; a later cell selects trips by a specific kmeans_pick id.
k_means_pick = KMeans(init = "k-means++", n_clusters = clusterNum, n_init = 12, random_state = 42)
k_means_pick.fit(X_pickup_dropoff)
labels_pick = k_means_pick.labels_
print(labels_pick)

In [28]:
# Store the time-based cluster labels as a column of df_new and show the size of each cluster.
df_new['kmeans_pick'] = labels_pick
df_new['kmeans_pick'].value_counts()

In [30]:
# Per-cluster mean of every column, grouped by the kmeans_pick labels.
df_new.groupby('kmeans_pick').mean()

In [32]:
# K-Means on the standardised pickup/dropoff coordinates (X_location)

clusterNum = 5
# random_state fixes the centroid initialisation so the cluster ids are
# reproducible between runs; they are stored as a column and exported.
k_means_location = KMeans(init = "k-means++", n_clusters = clusterNum, n_init = 12, random_state = 42)
k_means_location.fit(X_location)
labels_location = k_means_location.labels_
print(labels_location)

In [34]:
# Store the location-based cluster labels as a column of df_new and show the size of each cluster.
df_new['kmeans_location'] = labels_location
df_new['kmeans_location'].value_counts()

In [35]:
# Descriptive statistics of every column within each kmeans_location cluster.
df_new.groupby('kmeans_location').describe()

In [None]:
# Elbow method: fit K-Means on X_total for k = 1..9 and record the inertia
# (within-cluster sum of squared distances) of each fit.
Sum_of_squared_distances = []
K = range(1,10)
for num_clusters in K:
    # Same init / n_init / random_state as the fitted models, so the curve is
    # reproducible and comparable with them (n_init was previously left to the
    # library default, which differs between scikit-learn versions).
    kmeans = KMeans(init="k-means++", n_clusters=num_clusters, n_init=12, random_state=42)
    kmeans.fit(X_total)
    Sum_of_squared_distances.append(kmeans.inertia_)

plt.plot(K,Sum_of_squared_distances)
plt.xlabel("Values of K") 
plt.ylabel("Sum of squared distances Inertia") 
plt.title("Elbow Method For Optimal k")
plt.show()

In [None]:
# Histogram of every column for the rows labelled kmeans_pick == 1.
cluster_one = df_new.loc[df_new['kmeans_pick'] == 1]
cluster_one.hist(figsize=(25,15))
plt.show()

In [None]:
# DBSCAN

# dbscan = DBSCAN(eps=0.15, min_samples=5).fit(X_total)
# labels = dbscan.labels_
# labels

In [None]:
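# Hierarchical
# NOTE(review): disabled. As written, this fills a dense leng x leng distance
# matrix (one entry per pair of rows of X_total) with a Python double loop;
# presumably too slow and too large for the full dataset - confirm, and try it
# on a sample before re-enabling.

# import scipy
# leng = X_total.shape[0]
# D = scipy.zeros([leng,leng])
# for i in range(leng):
#     for j in range(leng):
#         D[i,j] = scipy.spatial.distance.euclidean(X_total[i], X_total[j])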

In [39]:
# save to csv file

# Apply the same passenger_count filter that produced df_new, so the rows line
# up one-to-one with the label arrays. The explicit .copy() makes the result an
# independent frame; assigning columns to the bare filtered slice triggers
# pandas' SettingWithCopyWarning.
df = df[(df["passenger_count"]!=0) & (df["passenger_count"]<=6)].copy()
df['kmeans_total'] = labels_total
df['kmeans_pick'] = labels_pick
df['kmeans_location'] = labels_location

df.to_csv('clustering_NYC_taxi.csv')