<a href="https://colab.research.google.com/github/DBhugwandas/Uber-Nairobi-Ambulance-Perambulation-Challenge/blob/main/Uber_Nairobi_Ambulance_Challenge_Clustering_Approach.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Importing the Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
!pip install -q geopandas
import geopandas as gdp

!apt install libspatialindex-dev
!pip install  rtree
from rtree import index
from rtree.index import Rtree 

In [2]:
#File Paths: every competition file sits in the same folder
_DATA_DIR = "/content/drive/My Drive/Uber Nairobi Challenge"
Train = _DATA_DIR + "/Train.csv"
Segment_Info = _DATA_DIR + "/Segment_info.csv"
Weather = _DATA_DIR + "/Weather_Nairobi_Daily_GFS.csv"
Segments_geometry = _DATA_DIR + "/segments_geometry.geojson"
submission = _DATA_DIR + "/SampleSubmission.csv"

In [3]:
#Load the training incidents and the sample submission file
df_train, df_submit = pd.read_csv(Train), pd.read_csv(submission)

#Derive the calendar month and day-of-month of every incident timestamp
incident_times = pd.DatetimeIndex(df_train['datetime'])
df_train['Month'] = incident_times.month
df_train['Day'] = incident_times.day

In [4]:
#Preview the first rows of the training data, including the derived Month and Day columns
df_train.head()

Unnamed: 0,uid,datetime,latitude,longitude,Month,Day
0,1,2018-01-01 00:25:46,-1.18885,36.931382,1,1
1,2,2018-01-01 02:02:39,-0.662939,37.20873,1,1
2,3,2018-01-01 02:31:49,-0.662939,37.20873,1,1
3,4,2018-01-01 03:04:01,-1.288087,36.826583,1,1
4,5,2018-01-01 03:58:49,-1.18885,36.931382,1,1


In [5]:
def GetCentroids(input_df,cluster_object):
  '''
  Fit a clustering estimator separately on each calendar month's incidents
  and collect the resulting cluster centres (one static set per month).

  Parameters
  ----------
  input_df : DataFrame with 'Month', 'latitude' and 'longitude' columns.
  cluster_object : estimator exposing fit_predict(X) and, after fitting,
      a cluster_centers_ attribute (e.g. a K-Means estimator).
      It is refitted in place once per month.

  Returns
  -------
  list of 12 dicts, one per month (January first). Each dict maps a
  1-based cluster number to that cluster's [latitude, longitude] centre.

  Notes
  -----
  Months 1..12 are always iterated; a month with no rows is passed to the
  estimator as-is, so any error it raises on empty input propagates.
  '''

  loc_list = []

  for month in range(1,13):
    #Incident coordinates recorded in this month
    month_coords = input_df.loc[input_df['Month'] == month, ['latitude','longitude']]

    #Refit the estimator that was passed in (not a module-level global);
    #only the fitted centres are needed, so the returned labels are discarded
    cluster_object.fit_predict(month_coords)

    #Key the centres 1..k, where k is however many the estimator produced
    centers = cluster_object.cluster_centers_
    loc_list.append(dict(zip(range(1,len(centers)+1),centers)))

  return loc_list


def CreateLocationDF(loc_list):
  '''
  Build a DataFrame of ambulance locations, one row per month.

  Parameters
  ----------
  loc_list : list of dicts, one per month in calendar order starting with
      January. Each dict maps a cluster key to a [latitude, longitude]
      pair (the structure returned by GetCentroids).

  Returns
  -------
  DataFrame with columns A0_Latitude, A0_Longitude, A1_Latitude, ...
  followed by 'Month' (1-based position in loc_list). Ambulance numbers
  follow the sorted order of the cluster keys. Every row keeps index
  label 0, as in the previous append-based implementation. An empty
  loc_list yields an empty DataFrame.
  '''

  monthly_frames = []

  for month, centroids in enumerate(loc_list, start=1):
    row = {}
    #Lowest cluster key becomes ambulance A0, the next A1, and so on
    for slot, key in enumerate(sorted(centroids)):
      row['A'+str(slot)+"_Latitude"] = centroids[key][0]
      row['A'+str(slot)+"_Longitude"] = centroids[key][1]
    row['Month'] = month
    monthly_frames.append(pd.DataFrame([row]))

  #pd.concat raises on an empty list, so handle "no months" explicitly
  if not monthly_frames:
    return pd.DataFrame()

  #DataFrame.append was removed in pandas 2.0; concatenate once instead
  return pd.concat(monthly_frames)


In [6]:
#Fit K-Means per month and tabulate the resulting ambulance locations by month

from sklearn.cluster import KMeans

#Six clusters (one per ambulance), fixed seed, many restarts
kmeans_settings = dict(
    n_clusters=6,
    random_state=100,
    max_iter=10000,
    n_init=500,
)
cluster = KMeans(**kmeans_settings)

centroids = GetCentroids(df_train, cluster)
df_locations = CreateLocationDF(centroids)

In [7]:
#Display the per-month ambulance coordinates produced by CreateLocationDF
df_locations

Unnamed: 0,A0_Latitude,A0_Longitude,A1_Latitude,A1_Longitude,A2_Latitude,A2_Longitude,A3_Latitude,A3_Longitude,A4_Latitude,A4_Longitude,A5_Latitude,A5_Longitude,Month
0,-1.484737,37.065708,-1.296374,36.843034,-1.196782,36.917833,-2.249014,37.481256,-0.921517,37.08779,-1.25315,36.736794,1
0,-1.281177,36.818029,-1.207509,36.904438,-0.962757,37.111391,-1.490665,37.073884,-1.219742,36.677337,-1.328581,36.891185,2
0,-2.498674,37.545756,-1.340356,36.920867,-1.204205,36.906091,-1.026579,37.139265,-1.048782,36.61572,-1.288858,36.797852,3
0,-1.276896,36.752946,-1.153345,36.972121,-2.352517,37.197529,-0.836367,36.811116,-1.290287,36.867374,-1.491277,37.230982,4
0,-1.246428,36.725311,-1.483452,37.014532,-1.142919,36.975362,-2.5967,37.404011,-0.804638,37.167677,-1.288678,36.848498,5
0,-1.104912,37.010306,-1.29285,36.796234,-1.554406,37.121514,-0.997045,36.632616,-1.082427,37.7248,-1.263257,36.891229,6
0,-1.258284,36.830831,-2.1,36.75,-2.020349,37.471734,-0.984838,37.049175,-1.340999,36.885082,-1.183745,36.668363,7
0,-1.280205,36.843561,-1.096181,37.002599,-0.992283,36.620845,-1.360536,36.9274,-1.272715,36.743906,-1.42951,37.64657,8
0,-1.305786,36.830811,-2.1,36.75,-2.079875,37.473019,-0.975937,37.072411,-1.205519,36.909934,-1.257813,36.741323,9
0,-1.282233,36.843954,-1.47719,37.456489,-1.069271,36.633681,-1.27615,36.752462,-1.373646,36.990199,-1.142691,36.961778,10


In [8]:
#Haversine distance (km) from every incident to each ambulance centroid of its month
from sklearn.metrics.pairwise import haversine_distances
from math import radians

#Attach the month's ambulance coordinates to every training incident
df_train_clust = df_train.merge(df_locations,on='Month')

#[latitude column, longitude column] name pairs, one pair per ambulance
coord_cols = list(df_locations.columns)[:12]
loc_pairs = [[coord_cols[k], coord_cols[k+1]] for k in range(0,11,2)]

#One inner list per incident, holding its distance to each ambulance in loc_pairs order
all_distance = []
for row_label in range(len(df_train_clust)):
  incident_deg = np.array(df_train_clust.loc[row_label,['latitude','longitude']])
  incident_rad = np.array([radians(deg) for deg in incident_deg])

  row_km = []
  for pair_cols in loc_pairs:
    ambulance_deg = np.array(df_train_clust.loc[row_label,pair_cols])
    ambulance_rad = np.array([radians(deg) for deg in ambulance_deg])
    #2x2 pairwise matrix: the diagonal is zero, so max() is the incident-to-ambulance angle
    pairwise = haversine_distances([incident_rad, ambulance_rad])
    #Scale by the Earth radius in metres, then convert metres to kilometres
    row_km.append(pairwise.max()* 6371000/1000 )

  all_distance.append(row_km)

In [9]:
#Statistics of Distance to Incident location
distance_array = np.array(all_distance)
pd.Series(np.min(distance_array,axis=1)).describe()

count    6318.000000
mean        6.067041
std         5.901887
min         0.000000
25%         2.981949
50%         4.814183
75%         7.217481
max        84.419741
dtype: float64

In [None]:
#Give every submission date the ambulance locations of its calendar month
submit_months = pd.DatetimeIndex(df_submit['date']).month
df_submit = (
    df_submit.assign(Month=submit_months)[['date','Month']]
    .merge(df_locations,on='Month')   #Month is only the join key
    .drop(columns='Month')
)


In [None]:
#Write the submission table to disk
#NOTE(review): to_csv also writes the DataFrame index as an unnamed first column by default; confirm the submission format expects that
output_csv = "/content/drive/My Drive/Uber Nairobi Challenge/Submission9-MonthMoveKNN.csv"
df_submit.to_csv(output_csv)

In [None]:
#Display the final submission table: the date column plus the ambulance coordinate columns
df_submit

Unnamed: 0,date,A0_Latitude,A0_Longitude,A1_Latitude,A1_Longitude,A2_Latitude,A2_Longitude,A3_Latitude,A3_Longitude,A4_Latitude,A4_Longitude,A5_Latitude,A5_Longitude
0,7/1/2019 0:00,-1.258284,36.830831,-2.100000,36.750000,-2.020349,37.471734,-0.984838,37.049175,-1.340999,36.885082,-1.183745,36.668363
1,7/1/2019 3:00,-1.258284,36.830831,-2.100000,36.750000,-2.020349,37.471734,-0.984838,37.049175,-1.340999,36.885082,-1.183745,36.668363
2,7/1/2019 6:00,-1.258284,36.830831,-2.100000,36.750000,-2.020349,37.471734,-0.984838,37.049175,-1.340999,36.885082,-1.183745,36.668363
3,7/1/2019 9:00,-1.258284,36.830831,-2.100000,36.750000,-2.020349,37.471734,-0.984838,37.049175,-1.340999,36.885082,-1.183745,36.668363
4,7/1/2019 12:00,-1.258284,36.830831,-2.100000,36.750000,-2.020349,37.471734,-0.984838,37.049175,-1.340999,36.885082,-1.183745,36.668363
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1467,12/31/2019 9:00,-1.225084,36.890526,-1.275202,36.743709,-0.984675,37.070044,-1.523628,37.116011,-0.994851,36.615976,-1.306865,36.844689
1468,12/31/2019 12:00,-1.225084,36.890526,-1.275202,36.743709,-0.984675,37.070044,-1.523628,37.116011,-0.994851,36.615976,-1.306865,36.844689
1469,12/31/2019 15:00,-1.225084,36.890526,-1.275202,36.743709,-0.984675,37.070044,-1.523628,37.116011,-0.994851,36.615976,-1.306865,36.844689
1470,12/31/2019 18:00,-1.225084,36.890526,-1.275202,36.743709,-0.984675,37.070044,-1.523628,37.116011,-0.994851,36.615976,-1.306865,36.844689
