# **Install scikit-fuzzy**

In [None]:
pip install -U scikit-fuzzy

# I. Original data

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sb
import folium #visualize map
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar #calendar
from sklearn.cluster import KMeans #k-means clustering
from yellowbrick.cluster import KElbowVisualizer #Elbow visualize K-means
from skfuzzy.cluster import cmeans, cmeans_predict #fuzzy clustering
from sklearn import mixture #Gaussian Mixture Modelling
from sklearn.mixture import GaussianMixture

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file under the Kaggle input directory, printing its full path.
for folder, _, files_in_folder in os.walk('/kaggle/input'):
    for file_name in files_in_folder:
        print(os.path.join(folder, file_name))



**#Read dataset**

In [None]:
# Load the six monthly pickup files, one DataFrame per file (df1..df6 in the listed order).
monthly_csv_paths = [
    "/kaggle/input/uber-pickups-in-new-york-city/uber-raw-data-apr14.csv",
    "/kaggle/input/uber-pickups-in-new-york-city/uber-raw-data-aug14.csv",
    "/kaggle/input/uber-pickups-in-new-york-city/uber-raw-data-jul14.csv",
    "/kaggle/input/uber-pickups-in-new-york-city/uber-raw-data-jun14.csv",
    "/kaggle/input/uber-pickups-in-new-york-city/uber-raw-data-may14.csv",
    "/kaggle/input/uber-pickups-in-new-york-city/uber-raw-data-sep14.csv",
]
df1, df2, df3, df4, df5, df6 = (pd.read_csv(csv_path) for csv_path in monthly_csv_paths)

**#Row bind using pandas concatenate**

In [None]:
# Stack the six monthly frames row-wise into one frame.
# ignore_index=True gives the result a fresh 0..n-1 index; without it each month keeps its
# own 0-based labels and the combined frame ends up with duplicate index values.
df_ori = pd.concat([df1, df2, df3, df4, df5, df6], ignore_index = True)

In [None]:
# Dimensions of the combined frame as (rows, columns).
df_ori.shape

In [None]:
# Preview the first five rows of the combined frame.
df_ori.head()

# 1. K-means clustering

#Selecting Feature

In [None]:
# Clustering features for this section: the pickup latitude and longitude columns only.
clus_k_ori = df_ori.loc[:, ['Lat', 'Lon']]
clus_k_ori.dtypes

#Plot the number of cluster

In [None]:
# Elbow analysis for K-means on the raw coordinates.
# plt, KMeans and KElbowVisualizer are already imported in the notebook's first code cell,
# so they are not re-imported here.
# random_state is fixed (the same seed the final model uses) so the elbow curve is reproducible.
model_ori = KMeans(random_state = 0)
visualizer = KElbowVisualizer(model_ori, k = (1, 18)) #k = 1 to 17
visualizer.fit(clus_k_ori)
visualizer.show()

#Assign number of cluster K-Means Algorithm

In [None]:
# Fit the final K-means model (k = 5, fixed seed). fit() returns the estimator itself,
# so the fitted model is bound directly and then shown as the cell output.
kmeans_ori = KMeans(n_clusters=5, random_state=0).fit(clus_k_ori)
kmeans_ori

#Storing the Centroids

In [None]:
# Cluster centres found by the fitted K-means model: one row per cluster,
# columns in the order of the features the model was fitted on (Lat, Lon).
centroids_k_ori = kmeans_ori.cluster_centers_
centroids_k_ori

In [None]:
# Put the centroid coordinates into a labelled table (column 0 = latitude, column 1 = longitude).
clocation_k_ori = pd.DataFrame(data=centroids_k_ori, columns=['Latitude', 'Longitude'])

In [None]:
# Show the table of K-means centroid coordinates.
clocation_k_ori

#Visualize Centroids using map

In [None]:
# Scatter the K-means centroids (x-axis: latitude, y-axis: longitude).
# Matplotlib's single-letter colour codes are case-sensitive, so red must be 'r', not 'R'.
plt.scatter(clocation_k_ori['Latitude'], clocation_k_ori['Longitude'], marker = "x", color = 'r', s = 200)
plt.xlabel('Latitude')
plt.ylabel('Longitude');

In [None]:
# Centroids as a plain list of [latitude, longitude] pairs for folium.
centroid_k_ori = clocation_k_ori.values.tolist()

# Base map, then one marker per centroid whose popup shows the coordinates.
map_k_ori = folium.Map(location=[40.71600413400166, -73.98971408426613], zoom_start=10)
for lat_lon in centroid_k_ori:
    folium.Marker(lat_lon, popup=lat_lon).add_to(map_k_ori)

map_k_ori

#Grouping and visualizing the total number of cluster

In [None]:
# Cluster label assigned to each training row by the fitted K-means model.
label_k_ori = kmeans_ori.labels_
label_k_ori

In [None]:
# New frame = original data plus a 'Clusters' column holding the K-means labels
# (assign returns a copy, so df_ori itself is left untouched).
df_new_k = df_ori.assign(Clusters=label_k_ori)
df_new_k

In [None]:
# Bar chart of the number of pickups per K-means cluster.
# catplot/height replace factorplot/size, which are no longer available in current seaborn.
sb.catplot(data = df_new_k, x = "Clusters", kind = "count", height = 7, aspect = 2)

#Compare the cluster

In [None]:
# Sizes of clusters 0 and 3, taken from a single vectorised value_counts()
# instead of a Python-level loop over every row.
cluster_sizes = df_new_k['Clusters'].value_counts()
count_0 = int(cluster_sizes.get(0, 0))  # 0 if the label never occurs
count_3 = int(cluster_sizes.get(3, 0))
print(count_0, count_3)

#Predict cluster for new location

In [None]:
# Predict the cluster of a new (latitude, longitude) point.
# The model was fitted on a DataFrame with columns 'Lat' and 'Lon'; passing the new point
# with the same column names avoids scikit-learn's "X does not have valid feature names" warning.
new_location_ori = [(40.86, -75.56)]
kmeans_ori.predict(pd.DataFrame(new_location_ori, columns = ['Lat', 'Lon']))

# 2. Fuzzy C-means (scikit-fuzzy)

#Selecting Feature

In [None]:
# Features for fuzzy c-means: pickup latitude and longitude.
clus_fuz_ori = df_ori.loc[:, ['Lat', 'Lon']]

#For loop to determine the number of cluster

In [None]:
# Fuzzy partition coefficient (FPC) for each candidate number of clusters.
fpcs_ori = []

# Features as a NumPy array in the transposed layout passed to cmeans;
# built once because it does not change between iterations.
data_fuz_ori = clus_fuz_ori.to_numpy().T

# n_cluster = 1 to 10 (range() excludes the upper bound)
for n_cluster in range(1, 11):
    # seed fixes the random initial membership matrix so the FPC curve is reproducible.
    cntr, u, u0, d, jm, p, fpc_ori = cmeans(data_fuz_ori, n_cluster, 2, error=0.01, maxiter=1000, seed=0)
    fpcs_ori.append(fpc_ori)

#Plot the number of centers

In [None]:
# FPC against the number of centres tried (1..10).
fig, ax = plt.subplots()
n_centers_tried = np.arange(1, 11)
ax.plot(n_centers_tried, fpcs_ori)
ax.set_xlabel("Number of centers")
ax.set_ylabel("Fuzzy partition coefficient")

In [None]:
# Fit fuzzy c-means with 5 clusters and fuzzifier m = 2; keep the centres and the membership matrix.
# seed pins the random initial memberships so the fitted centres are reproducible.
cntr_ori, u_orig, _, _, _, _, _ = cmeans(clus_fuz_ori.T, 5, 2 , error=0.01, maxiter=1000, seed=0)

In [None]:
# Membership of every point with respect to the trained centres.
# cmeans_predict also starts from a random membership matrix, so it is seeded for reproducibility.
u_ori, u0, d, jm, p, pc = cmeans_predict(clus_fuz_ori.T, cntr_ori, 2 , error=0.01, maxiter=1000, seed=0)

#Storing the Centroid

In [None]:
# Alias the fuzzy c-means centres under the naming used in the K-means section, then display them.
centroids_fuz_ori = cntr_ori
centroids_fuz_ori

In [None]:
# Labelled table of the fuzzy c-means centres (latitude, longitude).
centroid_columns_fuz_ori = ['Latitude', 'Longitude']
clocation_fuz_ori = pd.DataFrame(data=centroids_fuz_ori, columns=centroid_columns_fuz_ori)
clocation_fuz_ori

#Visualize Centroids

In [None]:
# Scatter the fuzzy c-means centres (x-axis: latitude, y-axis: longitude).
# Matplotlib's single-letter colour codes are case-sensitive, so red must be 'r', not 'R'.
plt.scatter(clocation_fuz_ori['Latitude'], clocation_fuz_ori['Longitude'], marker = "x", color = 'r', s = 200)
plt.xlabel('Latitude')
plt.ylabel('Longitude');

In [None]:
# Fuzzy c-means centres as [latitude, longitude] pairs.
centroid_fuz_ori = clocation_fuz_ori.values.tolist()

# Draw each centre as a marker on a folium map; the popup shows the coordinates.
map_fuz_ori = folium.Map(location=[40.71600413400166, -73.98971408426613], zoom_start=10)
for center in centroid_fuz_ori:
    marker = folium.Marker(center, popup=center)
    marker.add_to(map_fuz_ori)

map_fuz_ori

#Grouping and visualizing the total number of clusters

In [None]:
# Hard label per point: the index along axis 0 of the membership matrix with the largest value.
label_fuz_ori = u_ori.argmax(axis=0)
label_fuz_ori

In [None]:
# Original data plus a 'Clusters' column with the fuzzy c-means hard labels (df_ori is not modified).
df_new_fuz = df_ori.assign(Clusters=label_fuz_ori)
df_new_fuz

In [None]:
# Bar chart of the number of pickups per fuzzy c-means cluster.
# catplot/height replace factorplot/size, which are no longer available in current seaborn.
sb.catplot(data = df_new_fuz, x = "Clusters", kind = "count", height = 7, aspect = 2)

#Visualize datapoint

In [None]:
# Every pickup coloured by its fuzzy c-means cluster (x: latitude, y: longitude).
plt.scatter(
    x=df_new_fuz['Lat'],
    y=df_new_fuz['Lon'],
    c=label_fuz_ori,
    cmap='viridis',
);

# 3. Gaussian Mixture Modelling

#Selecting Feature

In [None]:
# Features for the Gaussian mixture model: pickup latitude and longitude.
clus_gmm_ori = df_ori.loc[:, ['Lat', 'Lon']]

In [None]:
# Fit one full-covariance GMM per candidate component count (1..14), then compare BIC and AIC.
n_components = np.arange(1, 15)
models = []
for n_comp in n_components:
    models.append(GaussianMixture(n_comp, covariance_type='full', random_state=0).fit(clus_gmm_ori))

bic_scores = [gmm.bic(clus_gmm_ori) for gmm in models]
aic_scores = [gmm.aic(clus_gmm_ori) for gmm in models]
plt.plot(n_components, bic_scores, label='BIC')
plt.plot(n_components, aic_scores, label='AIC')
plt.legend(loc='best')
plt.xlabel('n_components');

In [None]:
# Final 5-component, full-covariance Gaussian mixture.
# random_state matches the seed used in the AIC/BIC model-selection cell, so the fitted
# components (and therefore the labels) are reproducible across runs.
clf = mixture.GaussianMixture(n_components=5, covariance_type='full', random_state=0)
clf.fit(clus_gmm_ori)

In [None]:
# Component label predicted by the fitted mixture for every training row.
label_gmm_ori = clf.predict(clus_gmm_ori)

In [None]:
# Original data plus a 'Clusters' column with the GMM labels (df_ori is not modified).
df_new_gmm = df_ori.assign(Clusters=label_gmm_ori)
df_new_gmm

In [None]:
# Distinct cluster labels that actually occur in the GMM assignment.
df_new_gmm['Clusters'].unique()

In [None]:
# Every pickup coloured by its GMM component (x: latitude, y: longitude).
plt.scatter(
    x=df_new_gmm['Lat'],
    y=df_new_gmm['Lon'],
    c=label_gmm_ori,
    cmap='viridis',
);

#Predict new location

In [None]:
# Predict the mixture component of a new (latitude, longitude) point.
# The model was fitted on a DataFrame with columns 'Lat' and 'Lon'; passing the new point
# with the same column names avoids scikit-learn's "X does not have valid feature names" warning.
new_location_gmm = [(43.86, -71.56)]
clf.predict(pd.DataFrame(new_location_gmm, columns = ['Lat', 'Lon']))

# II. Pre-processed data & Analysis

#Copy the combined data for pre-processing

In [None]:
# Work on a copy so the pre-processing below does not modify df_ori.
df_prep = df_ori.copy()

In [None]:
# Rename 'Date/Time' to 'Date_Time' so the column can be used with attribute access (df_prep.Date_Time).
df_prep = df_prep.rename({'Date/Time': 'Date_Time'}, axis='columns')

In [None]:
# Dimensions of the working frame as (rows, columns).
df_prep.shape

In [None]:
# Preview the first five rows after the rename.
df_prep.head()

#Split Date/Time

In [None]:
# Parse the timestamp column (a no-op if it is already datetime64).
df_prep['Date_Time'] = pd.to_datetime(df_prep['Date_Time']) #convert to datetime

# Calendar parts via the vectorised .dt accessor rather than a per-row Python lambda.
df_prep['Month'] = df_prep['Date_Time'].dt.month #Month
df_prep['Day'] = df_prep['Date_Time'].dt.day #Day
df_prep['Hour'] = df_prep['Date_Time'].dt.hour #hour
df_prep['Minute'] = df_prep['Date_Time'].dt.minute #minute
df_prep['Weekday'] = df_prep['Date_Time'].dt.weekday #weekday, Monday = 0 ... Sunday = 6

# Holiday flag: the calendar returns holidays as midnight timestamps, so the comparison has
# to be made on the calendar date. Matching the raw timestamps would flag only the pickups
# that happened at exactly 00:00:00 on a holiday.
pickup_date = df_prep['Date_Time'].dt.normalize()
holidays = calendar().holidays(start = pickup_date.min(), end = pickup_date.max()) #get the holiday in US calendar
df_prep['Holiday'] = pickup_date.isin(holidays).astype('int') #1 on a US federal holiday, else 0

In [None]:
# Display the working frame, now including the derived Month/Day/Hour/Minute/Weekday/Holiday columns.
df_prep

# 1. Kmeans clustering

#Selecting Feature

In [None]:
# Clustering features: coordinates plus the derived calendar/time columns.
# NOTE(review): these columns are on very different scales (e.g. Minute vs. Lat/Lon) and are
# used unscaled — confirm whether standardising before clustering is intended.
clus_k_prep = df_prep.loc[:, ['Lat', 'Lon', 'Day', 'Month', 'Hour', 'Minute', 'Weekday', 'Holiday']]
clus_k_prep.dtypes

#Plot the number of clusters

In [None]:
# Elbow analysis for K-means on the pre-processed features.
# random_state is fixed (the same seed the final model uses) so the elbow curve is reproducible.
model_prep = KMeans(random_state = 0)
visualizer_prep = KElbowVisualizer(model_prep, k = (1, 18)) #k = 1 to 17
visualizer_prep.fit(clus_k_prep)
visualizer_prep.show()

#Assign number of cluster in K-Means Algorithm

In [None]:
# Fit the final K-means model on the pre-processed features (k = 3, fixed seed).
# fit() returns the estimator itself, which is then shown as the cell output.
kmeans_prep = KMeans(n_clusters=3, random_state=0).fit(clus_k_prep)
kmeans_prep

#Storing the Centroid

In [None]:
# Cluster centres of the fitted model: one row per cluster, one column per fitted feature.
centroids_k_prep = kmeans_prep.cluster_centers_
centroids_k_prep

In [None]:
# Labelled table of the centroids; column order follows the features the model was fitted on.
centroid_columns_k_prep = ['Latitude', 'Longitude', 'Day', 'Month', 'Hour', 'Minute', 'Weekday', 'Holiday']
clocation_k_prep = pd.DataFrame(data=centroids_k_prep, columns=centroid_columns_k_prep)

#Get the Lat and Lon to visualize

In [None]:
# Only the geographic part of each centroid is needed for plotting and mapping.
clocation_k_prep_map = clocation_k_prep.loc[:, ['Latitude', 'Longitude']]

#Visualize Centroids

In [None]:
# Scatter the K-means centroids (x-axis: latitude, y-axis: longitude).
# Matplotlib's single-letter colour codes are case-sensitive, so red must be 'r', not 'R'.
plt.scatter(clocation_k_prep_map['Latitude'], clocation_k_prep_map['Longitude'], marker = "x", color = 'r', s = 200)
plt.xlabel('Latitude')
plt.ylabel('Longitude');

#Visualize Latitude and Longitude

In [None]:
# Centroids as a plain list of [latitude, longitude] pairs for folium.
centroid_k_prep = clocation_k_prep_map.values.tolist()

# zoom_start = 10 matches the other maps in this notebook. The previous value of 20 is
# beyond the default tile layer's maximum zoom and far too close to show several centroids.
map_k_prep = folium.Map(location = [40.7392, -73.973], zoom_start = 10)
for point in range(0, len(centroid_k_prep)):
    # One marker per centroid; the popup shows its coordinates.
    folium.Marker(centroid_k_prep[point], popup = centroid_k_prep[point]).add_to(map_k_prep)

map_k_prep

#Grouping and visualizing the total number of clusters

In [None]:
# Cluster label assigned to each training row by the fitted K-means model.
label_k_prep = kmeans_prep.labels_
label_k_prep

In [None]:
# Pre-processed data plus a 'Clusters' column with the K-means labels (df_prep is not modified).
df_prep_new_k = df_prep.assign(Clusters=label_k_prep)
df_prep_new_k

In [None]:
# Bar chart of the number of pickups per K-means cluster.
# catplot/height replace factorplot/size, which are no longer available in current seaborn.
sb.catplot(data = df_prep_new_k, x = "Clusters", kind = "count", height = 7, aspect = 2)

#Predict cluster

In [None]:
# Predict the cluster of a new observation given as
# (Lat, Lon, Day, Month, Hour, Minute, Weekday, Holiday).
# The model was fitted on a DataFrame, so the new point is passed with the same column
# names to avoid scikit-learn's "X does not have valid feature names" warning.
new_location_prep = [(40.86, -75.56, 20, 8, 5, 37, 0, 0)]
kmeans_prep.predict(pd.DataFrame(
    new_location_prep,
    columns = ['Lat', 'Lon', 'Day', 'Month', 'Hour', 'Minute', 'Weekday', 'Holiday'],
))

# 2. Fuzzy C-means (scikit-fuzzy)

#Selecting feature

In [None]:
# Features for fuzzy c-means: coordinates plus the derived calendar/time columns.
clus_fuz_prep = df_prep.loc[:, ['Lat', 'Lon', 'Day', 'Month', 'Hour', 'Minute', 'Weekday', 'Holiday']]

#For loop to determine the number of cluster

In [None]:
# Fuzzy partition coefficient (FPC) for each candidate number of clusters.
fpcs_prep = []

# Features as a NumPy array in the transposed layout passed to cmeans;
# built once because it does not change between iterations.
data_fuz_prep = clus_fuz_prep.to_numpy().T

# n_cluster = 1 to 10 (range() excludes the upper bound)
for n_cluster in range(1, 11):
    # seed fixes the random initial membership matrix so the FPC curve is reproducible.
    cntr_prep, u, u0, d, jm, p, fpc_prep = cmeans(data_fuz_prep, n_cluster, 2, error=0.01, maxiter=1000, seed=0)
    fpcs_prep.append(fpc_prep)

#Plot the number of cluster

In [None]:
# FPC against the number of centres tried (1..10).
fig, ax = plt.subplots()
n_centers_tried = np.arange(1, 11)
ax.plot(n_centers_tried, fpcs_prep)
ax.set_xlabel("Number of centers")
ax.set_ylabel("Fuzzy partition coefficient")

#C-mean algorithm

In [None]:
# Fit fuzzy c-means with 5 clusters and fuzzifier m = 2; keep the centres and the membership matrix.
# seed pins the random initial memberships so the fitted centres are reproducible.
cntr_prep, u_orig, _, _, _, _, _ = cmeans(clus_fuz_prep.T, 5, 2 , error=0.01, maxiter=1000, seed=0)

#Predict the label

In [None]:
# Membership of every point with respect to the trained centres.
# cmeans_predict also starts from a random membership matrix, so it is seeded for reproducibility.
u_prep, u0, d, jm, p, pc = cmeans_predict(clus_fuz_prep.T, cntr_prep, 2 , error=0.01, maxiter=1000, seed=0)

#Storing the centroid

In [None]:
# Alias the fuzzy c-means centres under the naming used in the K-means section, then display them.
centroids_fuz_prep = cntr_prep
centroids_fuz_prep

#Get the Lat and Lon to visualize

In [None]:
# Labelled table of the fuzzy c-means centres; column order follows the features passed to cmeans.
centroid_columns_fuz_prep = ['Latitude', 'Longitude', 'Day', 'Month', 'Hour', 'Minute', 'Weekday', 'Holiday']
clocation_fuz_prep = pd.DataFrame(data=centroids_fuz_prep, columns=centroid_columns_fuz_prep)

In [None]:
# Only the geographic part of each centre is needed for plotting and mapping.
clocation_fuz_prep_map = clocation_fuz_prep.loc[:, ['Latitude', 'Longitude']]

#Visualize Centroids

In [None]:
# Scatter the fuzzy c-means centres (x-axis: latitude, y-axis: longitude).
# Matplotlib's single-letter colour codes are case-sensitive, so red must be 'r', not 'R'.
plt.scatter(clocation_fuz_prep_map['Latitude'], clocation_fuz_prep_map['Longitude'], marker = "x", color = 'r', s = 200)
plt.xlabel('Latitude')
plt.ylabel('Longitude');

#Visualize Latitude and Longitude

In [None]:
# Fuzzy c-means centres as [latitude, longitude] pairs.
centroid_fuz_prep = clocation_fuz_prep_map.values.tolist()

# Draw each centre as a marker on a folium map; the popup shows the coordinates.
map_fuz_prep = folium.Map(location=[40.71600413400166, -73.98971408426613], zoom_start=10)
for center in centroid_fuz_prep:
    marker = folium.Marker(center, popup=center)
    marker.add_to(map_fuz_prep)

map_fuz_prep

#Grouping and visualizing the total number of clusters

In [None]:
# Hard label per point: the index along axis 0 of the membership matrix with the largest value.
label_fuz_prep = u_prep.argmax(axis=0)
label_fuz_prep

In [None]:
# Pre-processed data plus a 'Clusters' column with the fuzzy c-means hard labels (df_prep is not modified).
df_prep_new_fuz = df_prep.assign(Clusters=label_fuz_prep)
df_prep_new_fuz

In [None]:
# Bar chart of the number of pickups per fuzzy c-means cluster.
# catplot/height replace factorplot/size, which are no longer available in current seaborn.
sb.catplot(data = df_prep_new_fuz, x = "Clusters", kind = "count", height = 7, aspect = 2)

#Visualize datapoint

In [None]:
# Every pickup coloured by its fuzzy c-means cluster (x: latitude, y: longitude).
plt.scatter(
    x=df_prep_new_fuz['Lat'],
    y=df_prep_new_fuz['Lon'],
    c=label_fuz_prep,
    cmap='viridis',
);

# 3. Gaussian Mixture Modelling

#Selecting Feature

In [None]:
# Features for the Gaussian mixture model: coordinates plus the derived calendar/time columns.
clus_gmm_prep = df_prep.loc[:, ['Lat', 'Lon', 'Day', 'Month', 'Hour', 'Minute', 'Weekday', 'Holiday']]

#For loop to determine the number of cluster

In [None]:
# Fit one full-covariance GMM per candidate component count (1..14), then compare BIC and AIC.
n_components = np.arange(1, 15)
models_gmm_prep = []
for n_comp in n_components:
    models_gmm_prep.append(GaussianMixture(n_comp, covariance_type='full', random_state=0).fit(clus_gmm_prep))

bic_scores_prep = [gmm.bic(clus_gmm_prep) for gmm in models_gmm_prep]
aic_scores_prep = [gmm.aic(clus_gmm_prep) for gmm in models_gmm_prep]
plt.plot(n_components, bic_scores_prep, label='BIC')
plt.plot(n_components, aic_scores_prep, label='AIC')
plt.legend(loc='best')
plt.xlabel('n_components');

In [None]:
# Final 5-component, full-covariance Gaussian mixture on the pre-processed features.
# random_state matches the seed used in the AIC/BIC model-selection cell, so the fitted
# components (and therefore the labels) are reproducible across runs.
clf_prep = mixture.GaussianMixture(n_components=5, covariance_type='full', random_state=0)
clf_prep.fit(clus_gmm_prep)


In [None]:
# Component label predicted by the fitted mixture for every training row.
label_gmm_prep = clf_prep.predict(clus_gmm_prep)

In [None]:
# Attach the GMM labels to the pre-processed frame, i.e. the data the model was actually
# fitted on. Copying df_ori here would drop the derived time columns and is inconsistent
# with the K-means and fuzzy c-means result frames in this section.
df_new_prep_gmm = df_prep.copy()
df_new_prep_gmm['Clusters'] = label_gmm_prep
df_new_prep_gmm

In [None]:
# Distinct cluster labels that actually occur in the GMM assignment.
df_new_prep_gmm['Clusters'].unique()

In [None]:
# Every pickup coloured by its GMM component (x: latitude, y: longitude).
plt.scatter(
    x=df_new_prep_gmm['Lat'],
    y=df_new_prep_gmm['Lon'],
    c=label_gmm_prep,
    cmap='viridis',
);

#Predict new location

In [None]:
# Predict the mixture component of a new observation given as
# (Lat, Lon, Day, Month, Hour, Minute, Weekday, Holiday).
# The model was fitted on a DataFrame, so the new point is passed with the same column
# names to avoid scikit-learn's "X does not have valid feature names" warning.
new_location_prep_gmm = [(40.86, -75.56, 20, 8, 5, 37, 0, 0)]
clf_prep.predict(pd.DataFrame(
    new_location_prep_gmm,
    columns = ['Lat', 'Lon', 'Day', 'Month', 'Hour', 'Minute', 'Weekday', 'Holiday'],
))