<a href="https://colab.research.google.com/github/rkp74/Smart_Transportation_System/blob/main/ML_mini_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install gpxpy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gpxpy
  Downloading gpxpy-1.5.0.tar.gz (111 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m111.6/111.6 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: gpxpy
  Building wheel for gpxpy (setup.py) ... [?25l[?25hdone
  Created wheel for gpxpy: filename=gpxpy-1.5.0-py3-none-any.whl size=42898 sha256=a5a739cfa4e34572b66d8d3f7828c1855d74539ef77fd67a51cba995b0eeb601
  Stored in directory: /root/.cache/pip/wheels/7e/9b/8d/b4812540cd01add3ca698dc5903c53b99d15ffbd61f23fdf0a
Successfully built gpxpy
Installing collected packages: gpxpy
Successfully installed gpxpy-1.5.0


In [None]:
import time
from datetime import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import MiniBatchKMeans, KMeans
import gpxpy.geo # Get the haversine distance
from sklearn.linear_model import LinearRegression
from sklearn import tree
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
import math
from prettytable import PrettyTable

In [None]:
# Trip-record column names used by the cleaning steps in this notebook.
columns = [
    'tpep_pickup_datetime',
    'tpep_dropoff_datetime',
    'trip_distance',
    'pickup_longitude',
    'pickup_latitude',
    'dropoff_longitude',
    'dropoff_latitude',
    'total_amount',
]

In [None]:
# Connecting Google Drive with Google Colab
# The trip CSV that is read later in this notebook lives under this mount point.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
# Load only the fields listed in `columns` (defined above). The raw file has
# roughly 12.7 million rows, so skipping the unused fields saves memory.
# Note: clean_data() drops rows with a NaN in *any* loaded column, so this also
# limits that check to the listed fields.
data = pd.read_csv(
    "/content/drive/My Drive/Smart_Transportation_System/taxi_demand_prediction/yellow_tripdata_2015-01.csv",
    usecols=columns,
)

In [None]:
# Number of rows in the raw frame, before any cleaning step runs.
original_data_len = len(data)
original_data_len

12748986

In [None]:
def clean_data(df, test=False, predict=False):
    """Return the rows of ``df`` that pass the basic sanity filters.

    A row is kept only when all of the following hold:

    * it has no missing value in any column;
    * its drop-off latitude and longitude are not both 0;
    * its pick-up latitude and longitude are not both 0;
    * ``total_amount`` lies in [5, 45] (only checked when that column exists).

    The input frame is not modified; the filtered rows are returned as a new
    frame that keeps the original index labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``pickup_latitude``, ``pickup_longitude``,
        ``dropoff_latitude`` and ``dropoff_longitude``.
    test, predict : bool
        Not used by the function body; kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
    """
    # Build one boolean mask and filter once, instead of materialising a new
    # filtered copy of the (multi-million-row) frame after every condition.
    keep = df.notna().all(axis=1)
    keep &= (df.dropoff_latitude != 0) | (df.dropoff_longitude != 0)
    keep &= (df.pickup_latitude != 0) | (df.pickup_longitude != 0)

    if "total_amount" in df.columns:
        keep &= df.total_amount.between(5, 45)

    return df[keep]

# Apply the basic filters; `data` itself is left untouched and the result is a new frame.
data_cleaned = clean_data(data)

# Data Cleaning

In [None]:
# to decide where to start removing outliers
def remove_outliers(data, start=0, end=100):
    """Print ten evenly spaced percentile values of ``data`` from ``start`` to ``end``.

    Despite its name this helper removes nothing: it only prints values, so a
    cut-off for outlier removal can be chosen by eye. After the ten percentile
    lines it prints the maximum of ``data`` as the 100th percentile.

    Parameters
    ----------
    data : array-like of numbers
        Values to inspect; a sorted copy is used, the input is not modified.
    start, end : float
        First and last percentile (0-100) to report.
    """
    data = np.sort(data)
    n = len(data)
    for i in np.linspace(start, end, 10):
        i = round(i, 6)
        # Clamp at 0: for the 0th percentile the raw position is -1, which
        # would wrap around and report the maximum instead of the minimum.
        pos = max(int(n * (float(i) / 100)) - 1, 0)
        print(str(i).zfill(5) + " percentile value is " + str(round(data[pos], 1)))
    # data[-1] is the maximum, i.e. the 100th percentile whatever `end` is, so
    # label it as such instead of with `end`.
    print("100.0 percentile value is " + str(data[-1]))

In [None]:
def clean_coordinates(df):
    """Drop, in place, every row whose pick-up point lies outside a fixed bounding box.

    A row survives only if ``pickup_latitude`` is within
    [40.496115395170364, 40.91553277700258] and ``pickup_longitude`` is within
    [-74.25559136315209, -73.7000090639354], both ends inclusive. The number of
    dropped rows is printed; nothing is returned.
    """
    rows_before = df.shape[0]

    lat_ok = df['pickup_latitude'].between(40.496115395170364, 40.91553277700258)
    lon_ok = df['pickup_longitude'].between(-74.25559136315209, -73.7000090639354)
    outside = df.index[~(lat_ok & lon_ok)]

    df.drop(outside, inplace=True)
    removed = rows_before - df.shape[0]
    print("Number of rows removed due to wrong coordinates is {}".format(removed))
    
# Mutates data_cleaned in place (rows are dropped; nothing is returned).
clean_coordinates(data_cleaned)

Number of rows removed due to wrong coordinates is 825


In [None]:
#2 trip duration
def clean_trip_duration(df, max_minutes=160):
    """Add a ``trip_duration`` column (minutes) and drop implausible trips, in place.

    The pick-up and drop-off columns are converted to datetimes, their
    difference is stored in ``trip_duration`` as fractional minutes, and rows
    whose duration is not in the interval (0, ``max_minutes``] are dropped.
    The number of dropped rows is printed; nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``tpep_pickup_datetime`` and ``tpep_dropoff_datetime``.
        It is modified in place.
    max_minutes : float
        Longest trip duration, in minutes, that is kept (default 160).
    """
    # convert from object to datetime
    df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])
    df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'])

    # Duration in fractional minutes. total_seconds() is used instead of
    # dividing the raw timedelta64 array by 1e9 and 60: timedelta64 divided by
    # an integer stays an integer timedelta, which truncated every duration to
    # whole minutes (trips under a minute became 0 and were dropped) and also
    # assumed the values are stored in nanoseconds.
    elapsed = df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
    df['trip_duration'] = elapsed.dt.total_seconds() / 60.0

    # drop all records that have trip_duration > max_minutes
    #                            trip_duration <= 0 (or missing)
    nrows = df.shape[0]
    valid = (df['trip_duration'] > 0) & (df['trip_duration'] <= max_minutes)
    df.drop(df.index[~valid], inplace=True)
    print("Number of rows removed due to wrong trip_duration {}".format(nrows - df.shape[0]))
    
    
# Adds the trip_duration column to data_cleaned and drops rows in place.
clean_trip_duration(data_cleaned)

Number of rows removed due to wrong trip_duration 20229


In [None]:
#3 pickup_time
def clean_pickuptime(df):
    """Return a new frame in which ``tpep_pickup_datetime`` is named ``pickup_time``.

    All other columns keep their names; ``df`` itself is not renamed.
    """
    new_names = {'tpep_pickup_datetime': 'pickup_time'}
    renamed = df.rename(columns=new_names)
    return renamed

# From here on the pick-up timestamp column of data_cleaned is called 'pickup_time'.
data_cleaned = clean_pickuptime(data_cleaned)

In [None]:
#4 trip_distance
def clean_trip_distance(df, max_distance=77.5):
    """Drop, in place, rows whose ``trip_distance`` is <= 0 or > ``max_distance``.

    The number of dropped rows is printed; nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``trip_distance``. It is modified in place.
    max_distance : float
        Largest ``trip_distance`` that is kept (default 77.5, in the same unit
        as the column).
    """
    nrows = df.shape[0]
    out_of_range = (df['trip_distance'] <= 0) | (df['trip_distance'] > max_distance)
    df.drop(df.index[out_of_range], inplace=True)
    # This filter is on trip_distance; the message previously said "speed
    # outliers", which is the wording of clean_speed's message.
    print("Number of rows removed due to trip_distance outliers {}".format(nrows - df.shape[0]))
    
# Drops rows with an out-of-range trip_distance from data_cleaned in place.
clean_trip_distance(data_cleaned)

Number of rows removed due to speed outliers 6089


In [None]:
def compute_speed(df):
    """Add a ``speed`` column to ``df`` in place: ``trip_distance / trip_duration * 60``.

    Nothing is returned.
    """
    # Average taxi speed (mile/hour); assumes trip_duration is in minutes,
    # hence the factor 60 -- TODO confirm units against the data dictionary.
    per_minute = df['trip_distance'] / df['trip_duration']
    df['speed'] = per_minute * 60
    
def clean_speed(df):
    """Drop, in place, rows whose ``speed`` is <= 0 or greater than 63.0.

    The number of dropped rows is printed; nothing is returned.
    """
    rows_before = df.shape[0]

    # Speed anomalies / outliers: non-positive or above the 63.0 cut-off
    speed = df['speed']
    outliers = df.index[(speed <= 0) | (speed > 63.0)]
    df.drop(outliers, inplace=True)

    print("Number of rows removed due to speed outliers {}".format(rows_before - df.shape[0]))


# compute_speed must run first: clean_speed filters on the 'speed' column it adds.
compute_speed(data_cleaned)

clean_speed(data_cleaned)

Number of rows removed due to speed outliers 340


In [None]:
!pip install pandarallel

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pandarallel
  Downloading pandarallel-1.6.4.tar.gz (12 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dill>=0.3.1
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: pandarallel
  Building wheel for pandarallel (setup.py) ... [?25l[?25hdone
  Created wheel for pandarallel: filename=pandarallel-1.6.4-py3-none-any.whl size=16677 sha256=3fad0f65411cd8e3fb874949017d15ecbbea26859e629fec59decb3fcb499145
  Stored in directory: /root/.cache/pip/wheels/41/01/29/deaa71fe596f8d857e57c4fb388db8861e23e6ed0b03204dcb
Successfully built pandarallel
Installing collected packages: dill, pandarallel
Successfully installed dill-0.3.6 pandarallel-1.6.4


In [None]:
# clustering using K-Means with respect to longitude and latitude
from datetime import datetime, timedelta
from sklearn.cluster import MiniBatchKMeans, KMeans
from pandarallel import pandarallel


#Clustering pickups
print("Getting clusters")
coord = data_cleaned[["pickup_latitude", "pickup_longitude"]].values
# Fix the seed: the cluster id becomes a model feature further down, so the
# clustering has to be reproducible from run to run.
regions = MiniBatchKMeans(n_clusters = 30, batch_size = 10000, random_state = 42).fit(coord)

print("Predicting clusters")
# Predict on the same plain array the model was fitted on, rather than
# re-selecting the columns as a DataFrame (a different input type than fit saw).
cluster_column = regions.predict(coord)
data_cleaned["pickup_cluster"] = cluster_column

Getting clusters




Predicting clusters




In [None]:
# Replacing mins and sec with 0
print("Removing Minutes and seconds")
# Kept so that pandarallel stays initialised for any later parallel_apply call.
pandarallel.initialize()
# Vectorised: floor every timestamp to the start of its hour, then add one hour,
# so each pickup is labelled with the end of the hour it falls in. This replaces
# a per-row Python lambda over millions of rows. floor() also clears any
# sub-second part, which replace(minute=0, second=0) left in place.
data_cleaned['pickup_time'] = pd.to_datetime(data_cleaned['pickup_time']).dt.floor('h') + timedelta(hours=1)

Removing Minutes and seconds
INFO: Pandarallel will run on 1 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [None]:
print("Group by Cluster and time")
# One row per (hour bucket, cluster) with the number of pickups in it.
df1 = data_cleaned.groupby(['pickup_time','pickup_cluster']).size().reset_index(name='count')

print("Converting counts to demand percentage")

# Scale every count by the largest count, giving a value in (0, 1]. Plain
# vectorised division: the previous per-row lambda recomputed the column
# maximum for every single row.
df1['count'] = df1['count'] / df1['count'].max()


# Calendar features of the hour bucket; build the DatetimeIndex once and reuse it.
pickup_index = pd.DatetimeIndex(df1['pickup_time'])
df1['month'] = pickup_index.month
df1['day'] = pickup_index.day
df1['dayofweek'] = pickup_index.dayofweek
df1['hour'] = pickup_index.hour

Group by Cluster and time
Converting counts to demand percentage


Split data into train and test, X and y

In [None]:
# Features (cluster id plus calendar fields) and target (the scaled pickup count).
X, y = (
    df1[['pickup_cluster', 'month', 'day', 'hour', 'dayofweek']],
    df1['count'],
)

In [None]:
# Row counts of features and target, one per line; they should match.
print(len(X), len(y), sep='\n')

22260
22260


In [None]:
from sklearn.model_selection import train_test_split
# Shuffled 75% / 25% train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    shuffle=True,
    random_state=42,
)

Model Training

In [None]:


# Linear regression: create, fit on the training split, predict the test split.
# Every progress line ends with '\r' so the next one overwrites it.
print('model training 0/3 (creating model)', end='\r')
LReg = LinearRegression()

print('model training 1/3 (fitting model)', end='\r')
LReg.fit(X_train, y_train)

# This step predicts, it does not train; the label is padded to the width of
# the other messages so no stale characters remain after the overwrite.
print('model training 2/3 (predicting)    ', end='\r')
LReg_y_pred = LReg.predict(X_test)

print('model training 3/3 done!           ', end='\r')



model training 0/3 (creating model)model training 1/3 (fitting model)model training 2/3 (training model)model training 3/3 done!           

In [None]:
# Random forest: create, fit on the training split, predict the test split.
# Every progress line ends with '\r' so the next one overwrites it.
print('model training 0/3 (creating model)', end='\r')
# Fixed seed so the forest, and therefore the reported metrics, are reproducible.
RFRegr = RandomForestRegressor(random_state=42)

print('model training 1/3 (fitting model)', end='\r')
RFRegr.fit(X_train, y_train)

# This step predicts, it does not train; the label is padded to the width of
# the other messages so no stale characters remain after the overwrite.
print('model training 2/3 (predicting)    ', end='\r')
RFRegr_y_pred = RFRegr.predict(X_test)

print('model training 3/3 done!           ', end='\r')



In [None]:
# Gradient boosting (XGBoost): create, fit on the training split, predict the test split.
# Every progress line ends with '\r' so the next one overwrites it.
print('model training 0/3 (creating model)', end='\r')
GBRegr = XGBRegressor(n_estimators=1000, max_depth=7, eta=0.1, subsample=0.7, colsample_bytree=0.8)

print('model training 1/3 (fitting model)', end='\r')
GBRegr.fit(X_train, y_train)

# This step predicts, it does not train; the label is padded to the width of
# the other messages so no stale characters remain after the overwrite.
print('model training 2/3 (predicting)    ', end='\r')
GBRegr_y_pred = GBRegr.predict(X_test)

print('model training 3/3 done!           ', end='\r')



In [None]:
def model_evaluation(algorithem_name, X_Test, y_pred, y_true):
    """Print R2, adjusted R2, MSE and RMSE of a set of predictions as a table.

    Parameters
    ----------
    algorithem_name : str
        Used as the title of the printed table.
    X_Test : pandas.DataFrame or 2-D array
        The feature matrix the predictions were made for. Only its number of
        rows and columns are used (for the adjusted R2), so it must be the
        same matrix that produced ``y_pred``.
    y_pred : array-like
        Predicted target values.
    y_true : array-like
        Observed target values.

    Nothing is returned; the table is printed.
    """
    # R2 and adjusted R2
    r2 = r2_score(y_true, y_pred)
    n_samples = len(X_Test)
    n_features = X_Test.shape[1]
    dof = n_samples - n_features - 1
    if dof > 0:
        adj_r2 = 1 - (1 - r2) * ((n_samples - 1) / dof)
    else:
        # Adjusted R2 is undefined without at least one residual degree of
        # freedom; report NaN instead of dividing by zero.
        adj_r2 = float('nan')

    # MSE and RMSE
    mse = mean_squared_error(y_true, y_pred)
    rmse = math.sqrt(mse)

    # print in table
    table = PrettyTable()
    table.title = algorithem_name
    # Explicit headers; without them PrettyTable prints "Field 1" / "Field 2".
    table.field_names = ['Metric', 'Value']
    table.add_row(['R2', r2])
    table.add_row(['Adjusted R2', adj_r2])
    table.add_row(['MSE', mse])
    table.add_row(['RMSE', rmse])
    print(table)

In [None]:
# Sanity check of the metric helper: y_test is scored against itself.
model_evaluation('y True',X_Test=X_test, y_pred=y_test, y_true=y_test)

+-------------+---------+
|   Field 1   | Field 2 |
+-------------+---------+
|      R2     |   1.0   |
| Adjusted R2 |   1.0   |
|     MSE     |   0.0   |
|     RMSE    |   0.0   |
+-------------+---------+


In [None]:
# Test-set metrics of the linear regression model.
model_evaluation(
    'Linear Regression',
    X_Test=X_test,
    y_pred=LReg_y_pred,
    y_true=y_test,
)

+-------------+----------------------+
|   Field 1   |       Field 2        |
+-------------+----------------------+
|      R2     |  0.0950983319699581  |
| Adjusted R2 |  0.0942844250909961  |
|     MSE     | 0.016586335870444738 |
|     RMSE    | 0.12878794924388204  |
+-------------+----------------------+


In [None]:
# Test-set metrics of the random forest model.
model_evaluation(
    'Random Forest',
    X_Test=X_test,
    y_pred=RFRegr_y_pred,
    y_true=y_test,
)

+-------------+-----------------------+
|   Field 1   |        Field 2        |
+-------------+-----------------------+
|      R2     |   0.9572284350793755  |
| Adjusted R2 |   0.9571899645226921  |
|     MSE     | 0.0007839785984950337 |
|     RMSE    |  0.02799961782766032  |
+-------------+-----------------------+


In [None]:
# The predictions were made on X_test, so X_test (not X_train) must supply the
# sample count used for the adjusted R2.
model_evaluation('Gradient Boosting',X_Test=X_test, y_pred=GBRegr_y_pred, y_true=y_test)

+-------------+----------------------+
|   Field 1   |       Field 2        |
+-------------+----------------------+
|      R2     |  0.9727217283871309  |
| Adjusted R2 |  0.9727135558568376  |
|     MSE     | 0.00049999529332423  |
|     RMSE    | 0.022360574530280522 |
+-------------+----------------------+
