In [1]:
import pandas as pd
import numpy as np
import ast
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import os

## I will be using only a quarter of the dataset, as that is all my computer can handle

In [2]:
import os
import pandas as pd

# Root folder of the extracted travel-time matrix. Built with os.path.join so the
# literal contains no backslash escape ('\H') and the path works on every OS.
data_path = os.path.join('HelsinkiRegion_TravelTimeMatrix2015', 'HelsinkiRegion_TravelTimeMatrix2015')

# Create a list of all the folders containing data.
# os.listdir returns entries in arbitrary order, so sort to make the selection repeatable.
folders = sorted(
    os.path.join(data_path, name)
    for name in os.listdir(data_path)
    if os.path.isdir(os.path.join(data_path, name))
)

# Load a quarter of the files from each folder and collect the frames in a list
data_list = []
for folder in folders:
    files = sorted(
        os.path.join(folder, name)
        for name in os.listdir(folder)
        if os.path.isfile(os.path.join(folder, name))
    )
    files = files[:len(files)//4]  # Keep only the first quarter of the files

    for file in files:
        # The files are semicolon-separated
        data = pd.read_csv(file, sep=';')
        data_list.append(data)

# Concatenate all the data into a single DataFrame
# (each file's own row index is kept, so index labels repeat across files)
df = pd.concat(data_list, axis=0)

In [3]:
# Preview the combined frame (the notebook shows only the first and last rows)
df

Unnamed: 0,from_id,to_id,walk_t,walk_d,pt_r_tt,pt_r_t,pt_r_d,pt_m_tt,pt_m_t,pt_m_d,car_r_t,car_r_d,car_m_t,car_m_d
0,5785640,5785640,0,0,0,0,0,0,0,0,0,0,0,0
1,5785641,5785640,48,3353,48,48,3353,48,48,3353,10,985,10,985
2,5785642,5785640,50,3471,50,50,3471,50,50,3471,33,12167,31,12167
3,5785643,5785640,54,3764,54,54,3764,54,54,3764,30,10372,29,10370
4,5787544,5785640,38,2658,38,38,2658,38,38,2658,12,2183,11,2183
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13226,6016698,6016691,-1,-1,-1,-1,-1,-1,-1,-1,16,3769,15,3769
13227,6016699,6016691,-1,-1,-1,-1,-1,-1,-1,-1,17,3456,17,3456
13228,6018252,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
13229,6018253,6016691,-1,-1,-1,-1,-1,-1,-1,-1,16,3946,16,3946


In [6]:
# Flag every row that repeats an earlier row; first occurrences are not flagged
is_repeat = df.duplicated()
duplicate_rows = df.loc[is_repeat]

# Show the repeated rows
duplicate_rows

Unnamed: 0,from_id,to_id,walk_t,walk_d,pt_r_tt,pt_r_t,pt_r_d,pt_m_tt,pt_m_t,pt_m_d,car_r_t,car_r_d,car_m_t,car_m_d
58,5797069,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
419,5818069,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
420,5818070,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
421,5818071,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
986,5831306,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12352,5978631,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
12815,5988357,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
13012,5996387,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
13224,6016696,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1


### The dataset contained duplicated rows; these were removed

In [8]:
# Keep only the first occurrence of each fully identical row
df = df.drop_duplicates()

In [9]:
# Re-run the duplicate check; the result should now be an empty frame
is_repeat = df.duplicated()
duplicate_rows = df.loc[is_repeat]

duplicate_rows

Unnamed: 0,from_id,to_id,walk_t,walk_d,pt_r_tt,pt_r_t,pt_r_d,pt_m_tt,pt_m_t,pt_m_d,car_r_t,car_r_d,car_m_t,car_m_d


## The dataset was split into five parts, as seen below, so that a separate model can be trained on each part and used by the API

In [12]:
# Origin/destination columns shared by every per-mode frame
id_cols = ['from_id', 'to_id']

# Walking
walking_df = df[id_cols + ['walk_t', 'walk_d']]

# Public transportation, rush hour
public_transit_r_df = df[id_cols + ['pt_r_tt', 'pt_r_t', 'pt_r_d']]

# Public transportation, midday
public_transit_m_df = df[id_cols + ['pt_m_tt', 'pt_m_t', 'pt_m_d']]

# Car, rush hour
car_r_df = df[id_cols + ['car_r_t', 'car_r_d']]

# Car, midday
car_m_df = df[id_cols + ['car_m_t', 'car_m_d']]

In [3]:
# Drop the reference to the combined frame; the cells shown below use only the per-mode frames
del df

# Public Transit rush hour

In [6]:
# Preview the rush-hour public-transport frame before cleaning
public_transit_r_df

Unnamed: 0,from_id,to_id,pt_r_tt,pt_r_t,pt_r_d
0,5785640,5785640,0,0,0
1,5785641,5785640,48,48,3353
2,5785642,5785640,50,50,3471
3,5785643,5785640,54,54,3764
4,5787544,5785640,38,38,2658
...,...,...,...,...,...
13226,6016698,6016691,-1,-1,-1
13227,6016699,6016691,-1,-1,-1
13228,6018252,-1,-1,-1,-1
13229,6018253,6016691,-1,-1,-1


### The dataset contained some rows where certain columns had a value of -1. This could have posed a problem when training the model. After considering different approaches, we decided that deleting these rows was the best course of action. Imputation wasn't possible in this case because we didn't have enough information to infer what the missing values should be, and because the number of affected rows was relatively small. While deleting data can potentially reduce the amount of available training data, in this case the impact on the overall dataset was negligible.

In [7]:
# Count how many entries in each column are equal to -1
public_transit_r_df.eq(-1).sum()

from_id          0
to_id       518766
pt_r_tt    2038899
pt_r_t     2038899
pt_r_d     2038899
dtype: int64

In [13]:
# Keep a row only if none of its columns holds the value -1
mask = ~public_transit_r_df.eq(-1).any(axis=1)

# Boolean-index the frame to obtain the cleaned rows
ptr_filtered = public_transit_r_df[mask]

# The unfiltered frame is not needed any more
del public_transit_r_df

In [10]:
# Re-count -1 entries after filtering; every column should report 0
ptr_filtered.eq(-1).sum()

from_id    0
to_id      0
pt_r_tt    0
pt_r_t     0
pt_r_d     0
dtype: int64

### The correlation between distance and time was understandably high. However, since the API would only be taking in origin, destination, mode, and time of day as inputs, we decided not to include distance as an input feature in our model. Instead, we will only use 'from_id' and 'to_id' as input features. 

### While the linear correlation between 'from_id', 'to_id', and time is low, we can still use machine learning models such as decision trees, random forests, or neural networks, which can capture nonlinear relationships between input features and the output variable. Therefore, we can still develop a reliable model without relying on a linear correlation between the input features and the target.

In [11]:
# Pairwise correlation between all columns of the filtered rush-hour frame
corr_matrix = ptr_filtered.corr()

# Display the matrix
corr_matrix

Unnamed: 0,from_id,to_id,pt_r_tt,pt_r_t,pt_r_d
from_id,1.0,2e-06,-0.277959,-0.271549,-0.306382
to_id,2e-06,1.0,-0.347669,-0.26654,-0.135342
pt_r_tt,-0.277959,-0.347669,1.0,0.979909,0.704874
pt_r_t,-0.271549,-0.26654,0.979909,1.0,0.706359
pt_r_d,-0.306382,-0.135342,0.704874,0.706359,1.0


In [14]:
# Features: every column except the time/distance ones; target: the 'pt_r_t' column
X = ptr_filtered.drop(columns=['pt_r_tt', 'pt_r_t', 'pt_r_d'])
y = ptr_filtered.loc[:, 'pt_r_t']

In [14]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a decision tree with default hyper-parameters on the training rows
model = DecisionTreeRegressor(random_state=42)
model.fit(X_train, y_train)

# Predict once and reuse the predictions for both metrics
y_pred = model.predict(X_test)

# Calculate the mean squared error
mse = mean_squared_error(y_test, y_pred)
print("Mean squared error:", mse)

# Same value as model.score(X_test, y_test), but without a second prediction
# pass over the whole test set
r2 = r2_score(y_test, y_pred)
print("R^2 score:", r2)

Mean squared error: 13.563783750882754
R^2 score: 0.989330382957821


### The Decision Tree Regressor model performed well on the test set, as evidenced by a low Mean Squared Error (MSE) of 13.56. This corresponds to a root-mean-square error (RMSE) of approximately 3.68 units (√13.56), which indicates the typical size of a prediction error. Additionally, the model achieved a high R-squared score of 0.9893, which indicates that it can explain approximately 98.93% of the variability in the target variable based on the input features. The combination of a low MSE and high R-squared score is a strong indication of the model's ability to make accurate predictions on the test set. 

In [15]:
from joblib import dump

# Persist the estimator currently bound to `model` (compression level 3).
# `model` is reassigned by every training cell, so run this right after the
# rush-hour public-transport training cell.
dump(model, 'ptr_decision_tree_model.joblib', compress=3)

['ptr_decision_tree_model.joblib']

## We applied the same method used for modeling public transport rush hour to all the other models.

# Public Transport Midday

In [5]:
# Preview the midday public-transport frame before cleaning
public_transit_m_df

Unnamed: 0,from_id,to_id,pt_m_tt,pt_m_t,pt_m_d
0,5785640,5785640,0,0,0
1,5785641,5785640,48,48,3353
2,5785642,5785640,50,50,3471
3,5785643,5785640,54,54,3764
4,5787544,5785640,38,38,2658
...,...,...,...,...,...
13226,6016698,6016691,-1,-1,-1
13227,6016699,6016691,-1,-1,-1
13228,6018252,-1,-1,-1,-1
13229,6018253,6016691,-1,-1,-1


In [6]:
# Count how many entries in each column are equal to -1
public_transit_m_df.eq(-1).sum()

from_id          0
to_id       518766
pt_m_tt    2038903
pt_m_t     2038903
pt_m_d     2038903
dtype: int64

In [7]:
# Keep a row only if none of its columns holds the value -1
mask = ~public_transit_m_df.eq(-1).any(axis=1)

# Boolean-index the frame to obtain the cleaned rows
ptm_filtered = public_transit_m_df[mask]

# The unfiltered frame is not needed any more
del public_transit_m_df

In [8]:
# Re-count -1 entries after filtering; every column should report 0
ptm_filtered.eq(-1).sum()

from_id    0
to_id      0
pt_m_tt    0
pt_m_t     0
pt_m_d     0
dtype: int64

In [10]:
# Pairwise correlation between all columns of the filtered midday frame
corr_matrix = ptm_filtered.corr()

# Display the matrix
corr_matrix

Unnamed: 0,from_id,to_id,pt_m_tt,pt_m_t,pt_m_d
from_id,1.0,2e-06,-0.304708,-0.275406,-0.320709
to_id,2e-06,1.0,-0.326296,-0.256591,-0.121022
pt_m_tt,-0.304708,-0.326296,1.0,0.978853,0.697812
pt_m_t,-0.275406,-0.256591,0.978853,1.0,0.698786
pt_m_d,-0.320709,-0.121022,0.697812,0.698786,1.0


In [12]:
# Features: every column except the time/distance ones; target: the 'pt_m_t' column
X = ptm_filtered.drop(columns=['pt_m_tt', 'pt_m_t', 'pt_m_d'])
y = ptm_filtered.loc[:, 'pt_m_t']

In [13]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a decision tree with default hyper-parameters on the training rows
model = DecisionTreeRegressor(random_state=42)
model.fit(X_train, y_train)

# Predict once and reuse the predictions for both metrics
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
print("Mean squared error:", mse)

# Same value as model.score(X_test, y_test), but without a second prediction
# pass over the whole test set
r2 = r2_score(y_test, y_pred)
print("R^2 score:", r2)

Mean squared error: 13.37076750366821
R^2 score: 0.9889836973254116


In [14]:
from joblib import dump

# Persist the estimator currently bound to `model` (compression level 3).
# `model` is reassigned by every training cell, so run this right after the
# midday public-transport training cell.
dump(model, 'ptm_decision_tree_model.joblib', compress=3)

['ptm_decision_tree_model.joblib']

# Car rush hour

In [6]:
# Preview the rush-hour car frame before cleaning
car_r_df

Unnamed: 0,from_id,to_id,car_r_t,car_r_d
0,5785640,5785640,0,0
1,5785641,5785640,10,985
2,5785642,5785640,33,12167
3,5785643,5785640,30,10372
4,5787544,5785640,12,2183
...,...,...,...,...
13226,6016698,6016691,16,3769
13227,6016699,6016691,17,3456
13228,6018252,-1,-1,-1
13229,6018253,6016691,16,3946


In [7]:
# Count how many entries in each column are equal to -1
car_r_df.eq(-1).sum()

from_id         0
to_id      518766
car_r_t    518766
car_r_d    518766
dtype: int64

In [8]:
# Keep a row only if none of its columns holds the value -1
mask = ~car_r_df.eq(-1).any(axis=1)

# Boolean-index the frame to obtain the cleaned rows
carr_filtered = car_r_df[mask]

# The unfiltered frame is not needed any more
del car_r_df

In [10]:
# Re-count -1 entries after filtering; every column should report 0
carr_filtered.eq(-1).sum()

from_id    0
to_id      0
car_r_t    0
car_r_d    0
dtype: int64

In [11]:
# Pairwise correlation between all columns of the filtered rush-hour car frame
corr_matrix = carr_filtered.corr()

# Display the matrix
corr_matrix

Unnamed: 0,from_id,to_id,car_r_t,car_r_d
from_id,1.0,-1.9479e-11,-0.088013,-0.108536
to_id,-1.9479e-11,1.0,-0.223731,-0.071458
car_r_t,-0.08801334,-0.2237308,1.0,0.887255
car_r_d,-0.1085357,-0.07145772,0.887255,1.0


In [16]:
# Features: every column except the time/distance ones; target: the 'car_r_t' column
X = carr_filtered.drop(columns=['car_r_t', 'car_r_d'])
y = carr_filtered.loc[:, 'car_r_t']

In [17]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a decision tree with default hyper-parameters on the training rows
model = DecisionTreeRegressor(random_state=42)
model.fit(X_train, y_train)

# Predict once and reuse the predictions for both metrics
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
print("Mean squared error:", mse)

# Same value as model.score(X_test, y_test), but without a second prediction
# pass over the whole test set
r2 = r2_score(y_test, y_pred)
print("R^2 score:", r2)

Mean squared error: 2.563602398921871
R^2 score: 0.9884924636112575


In [18]:
from joblib import dump

# Persist the estimator currently bound to `model` (compression level 3).
# `model` is reassigned by every training cell, so run this right after the
# rush-hour car training cell.
dump(model, 'carr_decision_tree_model.joblib', compress=3)

['carr_decision_tree_model.joblib']

# Car midday

In [5]:
# Preview the midday car frame before cleaning
car_m_df

Unnamed: 0,from_id,to_id,car_m_t,car_m_d
0,5785640,5785640,0,0
1,5785641,5785640,10,985
2,5785642,5785640,31,12167
3,5785643,5785640,29,10370
4,5787544,5785640,11,2183
...,...,...,...,...
13226,6016698,6016691,15,3769
13227,6016699,6016691,17,3456
13228,6018252,-1,-1,-1
13229,6018253,6016691,16,3946


In [6]:
# Count how many entries in each column are equal to -1
car_m_df.eq(-1).sum()

from_id         0
to_id      518766
car_m_t    518768
car_m_d    518766
dtype: int64

In [7]:
# Keep a row only if none of its columns holds the value -1
mask = ~car_m_df.eq(-1).any(axis=1)

# Boolean-index the frame to obtain the cleaned rows
carm_filtered = car_m_df[mask]

# The unfiltered frame is not needed any more
del car_m_df

In [8]:
# Re-count -1 entries after filtering; every column should report 0
carm_filtered.eq(-1).sum()

from_id    0
to_id      0
car_m_t    0
car_m_d    0
dtype: int64

In [10]:
# Pairwise correlation between all columns of the filtered midday car frame
corr_matrix = carm_filtered.corr()

# Display the matrix
corr_matrix

Unnamed: 0,from_id,to_id,car_m_t,car_m_d
from_id,1.0,1.178597e-08,-0.105902,-0.105896
to_id,1.178597e-08,1.0,-0.234061,-0.064912
car_m_t,-0.1059021,-0.234061,1.0,0.863605
car_m_d,-0.1058955,-0.06491189,0.863605,1.0


In [11]:
# Features: every column except the time/distance ones; target: the 'car_m_t' column
X = carm_filtered.drop(columns=['car_m_t', 'car_m_d'])
y = carm_filtered.loc[:, 'car_m_t']

In [12]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a decision tree with default hyper-parameters on the training rows
model = DecisionTreeRegressor(random_state=42)
model.fit(X_train, y_train)

# Predict once and reuse the predictions for both metrics
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
print("Mean squared error:", mse)

# Same value as model.score(X_test, y_test), but without a second prediction
# pass over the whole test set
r2 = r2_score(y_test, y_pred)
print("R^2 score:", r2)

Mean squared error: 2.5628366501638262
R^2 score: 0.9848926126016945


In [13]:
from joblib import dump

# Persist the estimator currently bound to `model` (compression level 3).
# `model` is reassigned by every training cell, so run this right after the
# midday car training cell.
dump(model, 'carm_decision_tree_model.joblib', compress=3)

['carm_decision_tree_model.joblib']

# Walk

In [6]:
# Keep a row only if none of its columns holds the value -1
mask = ~walking_df.eq(-1).any(axis=1)

# Boolean-index the frame to obtain the cleaned rows
walking_df_filtered = walking_df[mask]

# The unfiltered frame is not needed any more
del walking_df

In [14]:
# Preview the walking frame after the -1 rows have been removed
walking_df_filtered

Unnamed: 0,from_id,to_id,walk_t,walk_d
0,5785640,5785640,0,0
1,5785641,5785640,48,3353
2,5785642,5785640,50,3471
3,5785643,5785640,54,3764
4,5787544,5785640,38,2658
...,...,...,...,...
13226,6016698,6015136,37,2565
13227,6016699,6015136,38,2692
13229,6018253,6015136,42,2922
13230,6018254,6015136,40,2792


In [7]:
# Re-count -1 entries after filtering; every column should report 0
walking_df_filtered.eq(-1).sum()

from_id    0
to_id      0
walk_t     0
walk_d     0
dtype: int64

In [8]:
# Pairwise correlation between all columns of the filtered walking frame
corr_matrix = walking_df_filtered.corr()

# Display the matrix
corr_matrix

Unnamed: 0,from_id,to_id,walk_t,walk_d
from_id,1.0,0.000877,-0.122191,-0.122192
to_id,0.000877,1.0,-0.048208,-0.048208
walk_t,-0.122191,-0.048208,1.0,0.999998
walk_d,-0.122192,-0.048208,0.999998,1.0


In [9]:
# Features: every column except the time/distance ones; target: the 'walk_t' column
X = walking_df_filtered.drop(columns=['walk_t', 'walk_d'])
y = walking_df_filtered.loc[:, 'walk_t']

In [10]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a decision tree with default hyper-parameters on the training rows
model = DecisionTreeRegressor(random_state=42)
model.fit(X_train, y_train)

# Predict once and reuse the predictions for both metrics
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
print("Mean squared error:", mse)

# Same value as model.score(X_test, y_test), but without a second prediction
# pass over the whole test set
r2 = r2_score(y_test, y_pred)
print("R^2 score:", r2)

Mean squared error: 11.378245508242772
R^2 score: 0.9994410178529


In [30]:
from joblib import dump

# Persist the estimator currently bound to `model` (compression level 3).
# `model` is reassigned by every training cell, so run this right after the
# walking training cell.
# NOTE(review): this file name ends in "model2", unlike the other four model
# files — confirm that whatever loads the models expects this name.
dump(model, 'walk_decision_tree_model2.joblib', compress=3)

['walk_decision_tree_model2.joblib']