In [None]:
# Import all required libraries
# Read the CSV file into a dataframe
# Inspect the dataframe and clean if required
# Make new dataframe with selected feature columns
# Scale the features
# Train/test split (random seed, shuffle, stratify)
# Create KNN model and fit the training data
# Predict the test data
# Evaluate the model using recall and precision. Present a confusion matrix.
# Perform cross validation with 5 folds and report average recall and precision
# Perform hyperparameter tuning using GridSearchCV for n_neighbors from 1 to 20
# Create a pipeline including scaling and KNN model
# Add visualisations


In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import recall_score, precision_score

In [25]:
# Load the prepared TTC delays dataset into a dataframe.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run on a machine with this exact directory layout. Consider a path
# relative to the project root.
ttc_delays_csv_path = "C:/Users/sonus/OneDrive/Desktop/DSI/Rohan Kulkarni/Final Project/ml13-ttc-delays-project/src_files/ttc_delays_final_df.csv"
ttc_initial = pd.read_csv(ttc_delays_csv_path)

In [9]:
# Preview the first five rows of the raw dataframe
ttc_initial.head(n=5)

Unnamed: 0,date,year,month,day_of_month,weekday_num,time,hour,minute,day,day_of_week,...,delay_category,min_delay,location_bound,line,major_delay_flag,rush_hour_flag,controllable_delay_flag,season,major_event_flag,major_event_desc
0,2014-01-01,2014,1,1,3,00:21,0,21,Wednesday,3,...,Passenger,55,W,BD,1,0,0,Winter,0,
1,2014-01-01,2014,1,1,3,02:06,2,6,Wednesday,3,...,Passenger,3,W,BD,0,0,0,Winter,0,
2,2014-01-01,2014,1,1,3,03:10,3,10,Wednesday,3,...,Passenger,3,W,BD,0,0,0,Winter,0,
3,2014-01-01,2014,1,1,3,03:20,3,20,Wednesday,3,...,Passenger,5,S,YU,0,0,1,Winter,0,
4,2014-01-01,2014,1,1,3,08:48,8,48,Wednesday,3,...,Staff,5,E,BD,0,1,1,Winter,0,


In [26]:
# Map the target variable major_delay_flag to descriptive labels.
# NOTE(review): the keys below are the strings "0" and "1". The dataframe
# previews shown after this cell still display 0/1 in major_delay_flag, which
# suggests the column holds integers and this mapping matches nothing.
# TODO confirm the column dtype; if labels are really wanted, integer keys are
# needed and any downstream metric code must use the matching positive label.
major_delay_labels = {"0": "no_major_delay", "1": "major_delay"}
ttc_initial["major_delay_flag"] = ttc_initial["major_delay_flag"].replace(major_delay_labels)

In [27]:
# Build the modelling dataframe from the selected feature columns plus the
# target variable (major_delay_flag, listed last).
# NOTE(review): weekday_num and day_of_week show identical values in the
# previews; if they are duplicates, the same signal is counted twice in a
# distance-based model. TODO confirm and drop one if so.
selected_columns = [
    'month',
    'day_of_month',
    'weekday_num',
    'hour',
    'day_of_week',
    'delay_category',
    'line',
    'rush_hour_flag',
    'season',
    'major_delay_flag',
]

ttc = ttc_initial[selected_columns]

In [28]:
# Preview the first five rows of the selected-feature dataframe
ttc.head(n=5)

Unnamed: 0,month,day_of_month,weekday_num,hour,day_of_week,delay_category,line,rush_hour_flag,season,major_delay_flag
0,1,1,3,0,3,Passenger,BD,0,Winter,1
1,1,1,3,2,3,Passenger,BD,0,Winter,0
2,1,1,3,3,3,Passenger,BD,0,Winter,0
3,1,1,3,3,3,Passenger,YU,0,Winter,0
4,1,1,3,8,3,Staff,BD,1,Winter,0


In [32]:
# One-hot encode categorical variables
# (OneHotEncoder is imported in the import cell at the top of the notebook.)
categorical_features = ['delay_category', 'line', 'season']

# drop='first' omits the first category of each feature so the dummy columns
# are not perfectly collinear; sparse_output=False returns a dense array that
# can be wrapped directly in a DataFrame.
encoder = OneHotEncoder(drop='first', sparse_output=False)
encoded_features = encoder.fit_transform(ttc[categorical_features])
encoded_feature_names = encoder.get_feature_names_out(categorical_features)

# Reuse ttc's index so the column-wise concat below aligns row-for-row.
# With a fresh default index, any non-default index on ttc (e.g. after
# filtering rows) would silently misalign the frames and introduce NaNs.
encoded_df = pd.DataFrame(
    encoded_features,
    columns=encoded_feature_names,
    index=ttc.index,
)

# Replace the raw categorical columns with their one-hot encoded counterparts
ttc_encoded = pd.concat(
    [ttc.drop(columns=categorical_features), encoded_df],
    axis=1,
)
ttc_encoded

Unnamed: 0,month,day_of_month,weekday_num,hour,day_of_week,rush_hour_flag,major_delay_flag,delay_category_Passenger,delay_category_Staff,delay_category_Technical,delay_category_Weather,line_SHP,line_YU,season_Spring,season_Summer,season_Winter
0,1,1,3,0,3,0,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1,1,3,2,3,0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,1,1,3,3,3,0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,1,1,3,3,3,0,0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,1,1,3,8,3,1,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75499,9,30,2,20,2,0,0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
75500,9,30,2,20,2,0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75501,9,30,2,20,2,0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75502,9,30,2,21,2,0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
# Feature Scaling

# Work on a copy of the encoded dataframe so 'ttc_encoded' itself is left unmodified
standardized_ttc = ttc_encoded.copy()

# Columns that must NOT be scaled: the rush-hour flag, the one-hot encoded
# dummy columns, and the target variable
columns_to_exclude = [
    'rush_hour_flag',
    'delay_category_Passenger',
    'delay_category_Staff',
    'delay_category_Technical',
    'delay_category_Weather',
    'line_SHP',
    'line_YU',
    'season_Spring',
    'season_Summer',
    'season_Winter',
    'major_delay_flag',
]

# Every remaining column is a numeric feature that needs scaling
columns_to_scale = standardized_ttc.columns.difference(columns_to_exclude)

# Initialize the StandardScaler to standardize the selected numeric columns
scaler = StandardScaler()

# Standardize each selected column to mean 0 and standard deviation 1 so that
# larger-scale features do not dominate distance-based algorithms like KNN.
# The scaler reads from the same dataframe it writes to, so the result does not
# depend on 'ttc' and 'ttc_encoded' sharing the same row order.
# NOTE(review): the scaler is fitted on all rows here, and the plan cell lists
# the train/test split after scaling. If so, test-set statistics influence the
# training features; consider fitting on the training split only, e.g. inside
# a Pipeline.
standardized_ttc[columns_to_scale] = scaler.fit_transform(standardized_ttc[columns_to_scale])

# Output the standardized dataframe with the scaled numeric columns
standardized_ttc


Unnamed: 0,month,day_of_month,weekday_num,hour,day_of_week,rush_hour_flag,major_delay_flag,delay_category_Passenger,delay_category_Staff,delay_category_Technical,delay_category_Weather,line_SHP,line_YU,season_Spring,season_Summer,season_Winter
0,-1.519599,-1.683939,-0.031463,-2.142370,-0.031463,0,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,-1.519599,-1.683939,-0.031463,-1.813028,-0.031463,0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,-1.519599,-1.683939,-0.031463,-1.648357,-0.031463,0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,-1.519599,-1.683939,-0.031463,-1.648357,-0.031463,0,0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,-1.519599,-1.683939,-0.031463,-0.825001,-0.031463,1,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75499,0.781520,1.607590,-0.562198,1.151054,-0.562198,0,0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
75500,0.781520,1.607590,-0.562198,1.151054,-0.562198,0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75501,0.781520,1.607590,-0.562198,1.151054,-0.562198,0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75502,0.781520,1.607590,-0.562198,1.315725,-0.562198,0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
