In [2]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

import numpy as np
from datetime import datetime, timedelta
import random
import pandas as pd


In [3]:
# Define the start and end date
start_date = pd.to_datetime('2023-06-01')
end_date = pd.to_datetime('2023-09-30')

date_range = pd.date_range(start_date, end_date, freq='D')

In [4]:

# Create a DataFrame with the date_time column
data = pd.DataFrame({'date_time': pd.date_range(start_date, end_date, freq='H')})
data['Date'] = data['date_time'].dt.date
data['Time'] = data['date_time'].dt.strftime('%H:%M')
data['Day_of_Week'] = data['date_time'].dt.day_name()


In [5]:
data

Unnamed: 0,date_time,Date,Time,Day_of_Week
0,2023-06-01 00:00:00,2023-06-01,00:00,Thursday
1,2023-06-01 01:00:00,2023-06-01,01:00,Thursday
2,2023-06-01 02:00:00,2023-06-01,02:00,Thursday
3,2023-06-01 03:00:00,2023-06-01,03:00,Thursday
4,2023-06-01 04:00:00,2023-06-01,04:00,Thursday
...,...,...,...,...
2900,2023-09-29 20:00:00,2023-09-29,20:00,Friday
2901,2023-09-29 21:00:00,2023-09-29,21:00,Friday
2902,2023-09-29 22:00:00,2023-09-29,22:00,Friday
2903,2023-09-29 23:00:00,2023-09-29,23:00,Friday


In [6]:
data['school'] = 'lite'

weekdays = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday']
morning_rush_hours = range(6, 10)  # 6am to 9am
afternoon_rush_hours = range(13, 16)  # 1pm to 3pm

data.loc[
    (data['Day_of_Week'].isin(weekdays) & data['date_time'].dt.hour.isin(morning_rush_hours)) |
    (data['Day_of_Week'].isin(weekdays) & data['date_time'].dt.hour.isin(afternoon_rush_hours)),
    'school'
] = 'heavy'




In [7]:
print(data[(data['school'] == 'heavy') & (data['Day_of_Week'] == 'Monday')])

               date_time        Date   Time Day_of_Week school
102  2023-06-05 06:00:00  2023-06-05  06:00      Monday  heavy
103  2023-06-05 07:00:00  2023-06-05  07:00      Monday  heavy
104  2023-06-05 08:00:00  2023-06-05  08:00      Monday  heavy
105  2023-06-05 09:00:00  2023-06-05  09:00      Monday  heavy
109  2023-06-05 13:00:00  2023-06-05  13:00      Monday  heavy
...                  ...         ...    ...         ...    ...
2792 2023-09-25 08:00:00  2023-09-25  08:00      Monday  heavy
2793 2023-09-25 09:00:00  2023-09-25  09:00      Monday  heavy
2797 2023-09-25 13:00:00  2023-09-25  13:00      Monday  heavy
2798 2023-09-25 14:00:00  2023-09-25  14:00      Monday  heavy
2799 2023-09-25 15:00:00  2023-09-25  15:00      Monday  heavy

[119 rows x 5 columns]


In [8]:
data['company'] = 'lite'

In [9]:
morning_rush_hours = range(6, 10)  # 6am to 9am
afternoon_rush_hours = range(16, 21)  # 1pm to 3pm

data.loc[
    (data['Day_of_Week'].isin(weekdays) & data['date_time'].dt.hour.isin(morning_rush_hours)) |
    (data['Day_of_Week'].isin(weekdays) & data['date_time'].dt.hour.isin(afternoon_rush_hours)),
    'company'
] = 'heavy'

In [10]:
# Define the months for which it's raining (September, November, April)
rainy_months = [9, 11, 4]

data['weather'] = 'sunny'
data.loc[data['date_time'].dt.month.isin(rainy_months), 'weather'] = [random.choice(['rain', 'sunny']) for _ in range(data['date_time'].dt.month.isin(rainy_months).sum())]

In [11]:
data['traffic'] = 'lite'
moderate_time = range(9, 16)

data.loc[
    (data['Day_of_Week'].isin(weekdays) & data['date_time'].dt.hour.isin(morning_rush_hours)) |
    (data['Day_of_Week'].isin(weekdays) & data['date_time'].dt.hour.isin(afternoon_rush_hours)),
    'school'
] = 'heavy'

data.loc[
    (data['Day_of_Week'].isin(['Saturday', 'Sunday'])),
    'traffic'
] = 'lite'

data.loc[
    (data['Day_of_Week'].isin(weekdays) & data['date_time'].dt.hour.isin(moderate_time)),
    'traffic'
] = 'moderate'

data.loc[
    (data['Day_of_Week'].isin(weekdays) & (data['school']=='heavy') & (data['company']=='heavy')),
    'traffic'
] = 'heavy'


In [12]:
data.head(25)

Unnamed: 0,date_time,Date,Time,Day_of_Week,school,company,weather,traffic
0,2023-06-01 00:00:00,2023-06-01,00:00,Thursday,lite,lite,sunny,lite
1,2023-06-01 01:00:00,2023-06-01,01:00,Thursday,lite,lite,sunny,lite
2,2023-06-01 02:00:00,2023-06-01,02:00,Thursday,lite,lite,sunny,lite
3,2023-06-01 03:00:00,2023-06-01,03:00,Thursday,lite,lite,sunny,lite
4,2023-06-01 04:00:00,2023-06-01,04:00,Thursday,lite,lite,sunny,lite
5,2023-06-01 05:00:00,2023-06-01,05:00,Thursday,lite,lite,sunny,lite
6,2023-06-01 06:00:00,2023-06-01,06:00,Thursday,heavy,heavy,sunny,heavy
7,2023-06-01 07:00:00,2023-06-01,07:00,Thursday,heavy,heavy,sunny,heavy
8,2023-06-01 08:00:00,2023-06-01,08:00,Thursday,heavy,heavy,sunny,heavy
9,2023-06-01 09:00:00,2023-06-01,09:00,Thursday,heavy,heavy,sunny,heavy


In [13]:
data['travel_time'] = ''

In [14]:
for index, row in data.iterrows():
    if row['traffic'] == 'lite':
        start_time = datetime.strptime('03:00', '%H:%M')
        end_time = datetime.strptime('03:15', '%H:%M')
    elif row['traffic'] == 'moderate':
        start_time = datetime.strptime('03:16', '%H:%M')
        end_time = datetime.strptime('03:45', '%H:%M')
    elif row['traffic'] == 'heavy':
        start_time = datetime.strptime('03:46', '%H:%M')
        end_time = datetime.strptime('04:15', '%H:%M')

    if row['weather'] == 'rain':
        end_time += timedelta(minutes=15)

    random_time = start_time + timedelta(minutes=random.randint(0, int((end_time - start_time).total_seconds() / 60)))

    data.at[index, 'travel_time'] = random_time.strftime('%H:%M')


In [15]:
data.head(25)

Unnamed: 0,date_time,Date,Time,Day_of_Week,school,company,weather,traffic,travel_time
0,2023-06-01 00:00:00,2023-06-01,00:00,Thursday,lite,lite,sunny,lite,03:13
1,2023-06-01 01:00:00,2023-06-01,01:00,Thursday,lite,lite,sunny,lite,03:07
2,2023-06-01 02:00:00,2023-06-01,02:00,Thursday,lite,lite,sunny,lite,03:01
3,2023-06-01 03:00:00,2023-06-01,03:00,Thursday,lite,lite,sunny,lite,03:01
4,2023-06-01 04:00:00,2023-06-01,04:00,Thursday,lite,lite,sunny,lite,03:08
5,2023-06-01 05:00:00,2023-06-01,05:00,Thursday,lite,lite,sunny,lite,03:15
6,2023-06-01 06:00:00,2023-06-01,06:00,Thursday,heavy,heavy,sunny,heavy,03:54
7,2023-06-01 07:00:00,2023-06-01,07:00,Thursday,heavy,heavy,sunny,heavy,04:03
8,2023-06-01 08:00:00,2023-06-01,08:00,Thursday,heavy,heavy,sunny,heavy,03:57
9,2023-06-01 09:00:00,2023-06-01,09:00,Thursday,heavy,heavy,sunny,heavy,03:55


In [16]:
data['day'] = data['date_time'].dt.day
data['hour'] = data['date_time'].dt.hour

In [17]:
data

Unnamed: 0,date_time,Date,Time,Day_of_Week,school,company,weather,traffic,travel_time,day,hour
0,2023-06-01 00:00:00,2023-06-01,00:00,Thursday,lite,lite,sunny,lite,03:13,1,0
1,2023-06-01 01:00:00,2023-06-01,01:00,Thursday,lite,lite,sunny,lite,03:07,1,1
2,2023-06-01 02:00:00,2023-06-01,02:00,Thursday,lite,lite,sunny,lite,03:01,1,2
3,2023-06-01 03:00:00,2023-06-01,03:00,Thursday,lite,lite,sunny,lite,03:01,1,3
4,2023-06-01 04:00:00,2023-06-01,04:00,Thursday,lite,lite,sunny,lite,03:08,1,4
...,...,...,...,...,...,...,...,...,...,...,...
2900,2023-09-29 20:00:00,2023-09-29,20:00,Friday,heavy,heavy,rain,heavy,04:16,29,20
2901,2023-09-29 21:00:00,2023-09-29,21:00,Friday,lite,lite,sunny,lite,03:13,29,21
2902,2023-09-29 22:00:00,2023-09-29,22:00,Friday,lite,lite,sunny,lite,03:10,29,22
2903,2023-09-29 23:00:00,2023-09-29,23:00,Friday,lite,lite,sunny,lite,03:11,29,23


In [18]:
data['travel_time_minutes'] = data['travel_time'].apply(lambda x: int(x.split(':')[0]) * 60 + int(x.split(':')[1]))

In [19]:
print(data[(data['traffic'] == 'heavy') & (data['weather'] == 'rain')])

               date_time        Date   Time Day_of_Week school company  \
2214 2023-09-01 06:00:00  2023-09-01  06:00      Friday  heavy   heavy   
2215 2023-09-01 07:00:00  2023-09-01  07:00      Friday  heavy   heavy   
2217 2023-09-01 09:00:00  2023-09-01  09:00      Friday  heavy   heavy   
2224 2023-09-01 16:00:00  2023-09-01  16:00      Friday  heavy   heavy   
2225 2023-09-01 17:00:00  2023-09-01  17:00      Friday  heavy   heavy   
...                  ...         ...    ...         ...    ...     ...   
2887 2023-09-29 07:00:00  2023-09-29  07:00      Friday  heavy   heavy   
2889 2023-09-29 09:00:00  2023-09-29  09:00      Friday  heavy   heavy   
2896 2023-09-29 16:00:00  2023-09-29  16:00      Friday  heavy   heavy   
2899 2023-09-29 19:00:00  2023-09-29  19:00      Friday  heavy   heavy   
2900 2023-09-29 20:00:00  2023-09-29  20:00      Friday  heavy   heavy   

     weather traffic travel_time  day  hour  travel_time_minutes  
2214    rain   heavy       03:50    1     6 

In [20]:
#data.to_csv('travel_time_v2.csv')

In [3]:
data = pd.read_csv('travel_time_v2.csv')

In [4]:
categorical_columns = ['school', 'company', 'traffic','weather']

In [5]:
label_encoder = LabelEncoder()

for col in categorical_columns:
    data[col + '_encoded'] = label_encoder.fit_transform(data[col])

In [6]:
data.head(25)

Unnamed: 0,date_time,Date,Time,Day_of_Week,school,company,weather,traffic,travel_time,day,hour,travel_time_minutes,school_encoded,company_encoded,traffic_encoded,weather_encoded
0,6/1/2023 0:00,6/1/2023,0:00,Thursday,lite,lite,sunny,lite,3:07,1,0,187,1,1,1,1
1,6/1/2023 1:00,6/1/2023,1:00,Thursday,lite,lite,sunny,lite,3:13,1,1,193,1,1,1,1
2,6/1/2023 2:00,6/1/2023,2:00,Thursday,lite,lite,sunny,lite,3:04,1,2,184,1,1,1,1
3,6/1/2023 3:00,6/1/2023,3:00,Thursday,lite,lite,sunny,lite,3:04,1,3,184,1,1,1,1
4,6/1/2023 4:00,6/1/2023,4:00,Thursday,lite,lite,sunny,lite,3:01,1,4,181,1,1,1,1
5,6/1/2023 5:00,6/1/2023,5:00,Thursday,lite,lite,sunny,lite,3:10,1,5,190,1,1,1,1
6,6/1/2023 6:00,6/1/2023,6:00,Thursday,heavy,heavy,sunny,heavy,3:50,1,6,230,0,0,0,1
7,6/1/2023 7:00,6/1/2023,7:00,Thursday,heavy,heavy,sunny,heavy,3:56,1,7,236,0,0,0,1
8,6/1/2023 8:00,6/1/2023,8:00,Thursday,heavy,heavy,sunny,heavy,4:10,1,8,250,0,0,0,1
9,6/1/2023 9:00,6/1/2023,9:00,Thursday,heavy,heavy,sunny,heavy,4:13,1,9,253,0,0,0,1


In [7]:
X = data[['school_encoded', 'company_encoded', 'traffic_encoded','weather_encoded', 'day', 'hour']]
y = data['travel_time_minutes']

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [9]:
# Create a KNN model
knn_model = KNeighborsRegressor(n_neighbors=5)

# Train the model
knn_model.fit(X_train, y_train)



In [10]:
y_pred = knn_model.predict(X_test)
# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(f"Root Mean Squared Error: {rmse}")

Root Mean Squared Error: 9.146694394622783


In [11]:
# list of k values 
k_values = [1, 3, 5, 7, 9]
cv_scores = []

for k in k_values:
    knn_model = KNeighborsRegressor(n_neighbors=k)
    
    # Perform cross-validation and calculate the RMSE
    scores = cross_val_score(knn_model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
    
    cv_scores.append(np.sqrt(-scores.mean()))

# Find the optimal k with the lowest RMSE
optimal_k = k_values[cv_scores.index(min(cv_scores))]
print(f"Optimal k: {optimal_k}")


Optimal k: 3


In [12]:
X = data[['school_encoded', 'company_encoded', 'traffic_encoded','weather_encoded', 'day', 'hour']]
y = data['travel_time_minutes']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=16)

knn_model = KNeighborsRegressor(n_neighbors=3) 

knn_model.fit(X_train, y_train)

In [13]:
y_pred = knn_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(f"Root Mean Squared Error: {rmse}")

Root Mean Squared Error: 8.889742568588801


In [14]:
def categorize_classes(date_time):
    # Extract day, hour, and day_of_week
    input_day = date_time.day
    input_hour = date_time.hour
    input_day_of_week = date_time.day_name()

    print(input_day_of_week)
 
    school = 'lite'
    company = 'lite'
    traffic = 'lite'
    rainy_months = [9, 11, 4]

    if input_day_of_week in ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday']:
        if 6 <= input_hour < 10 or 13 <= input_hour < 16:
            school = 'heavy'

    if input_day_of_week in ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday']:
        if 6 <= input_hour < 10 or 16 <= input_hour < 21:
            company = 'heavy'

    if date_time.month in rainy_months:
        weather = random.choice(['rain', 'sunny'])
    else:
        weather = 'sunny' 

    if input_day_of_week in ['Saturday', 'Sunday']:
        traffic = 'lite'
    elif input_hour in range(9, 16):
        traffic = 'moderate'
    elif input_day_of_week in ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday'] and school == 'heavy' and company == 'heavy':
        traffic = 'heavy'

    return school, company, traffic, weather

In [15]:
all_categories = ['lite', 'heavy', 'rain', 'sunny', 'moderate']
label_encoder.fit(all_categories)

In [16]:
input_date_time = pd.to_datetime('2023-09-1 7:00:00')
input_school, input_company, input_traffic, input_weather = categorize_classes(input_date_time)

Friday


In [17]:
print(input_school, input_company, input_traffic, input_weather)

heavy heavy heavy sunny


In [18]:
input_day = input_date_time.day
input_hour = input_date_time.hour
input_school_encoded = label_encoder.transform([input_school])[0]
input_company_encoded = label_encoder.transform([input_company])[0]
input_traffic_encoded = label_encoder.transform([input_traffic])[0]
input_weather_encoded = label_encoder.transform([input_weather])[0]

input_data = pd.DataFrame({
    'school_encoded': [input_school_encoded],
    'company_encoded': [input_company_encoded],
    'traffic_encoded': [input_traffic_encoded],
    'weather_encoded':[input_weather_encoded],
    'day': [input_day],
    'hour': [input_hour]
})

# predict travel time
predicted_travel_time = knn_model.predict(input_data)

print(f"Predicted Travel Time (minutes): {predicted_travel_time[0]}")


Predicted Travel Time (minutes): 230.66666666666666


In [21]:

import joblib

# Save the model to a file
filename = 'time_pred_model.sav'
joblib.dump(knn_model, filename)


['time_pred_model.sav']

In [23]:
from keras.models import load_model

filename = 'C:/Users/shana/OneDrive/Desktop/bus time pred/time_pred_model.sav'
# Load the model from the file
loaded_model = joblib.load(filename)

In [24]:
type(loaded_model)

sklearn.neighbors._regression.KNeighborsRegressor