In [2]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split  
import xgboost as xgb
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_squared_error 
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import Conv1D, MaxPooling1D, LSTM, Dense, Dropout, Flatten
from catboost import CatBoostRegressor

# Load Data

In [3]:
data  =  pd.read_csv('train delay data.csv')

In [4]:
data

Unnamed: 0,Distance Between Stations (km),Weather Conditions,Day of the Week,Time of Day,Train Type,Historical Delay (min),Route Congestion
0,100,Clear,Monday,Morning,Express,5,Low
1,150,Rainy,Tuesday,Afternoon,Superfast,10,Medium
2,200,Foggy,Wednesday,Evening,Local,15,High
3,50,Clear,Thursday,Night,Express,2,Low
4,75,Rainy,Friday,Morning,Superfast,8,Medium
...,...,...,...,...,...,...,...
2873,945,Clear,Tuesday,Night,Local,1210,Medium
2874,925,Rainy,Wednesday,Morning,Express,1215,High
2875,950,Foggy,Thursday,Afternoon,Superfast,1220,Low
2876,930,Clear,Friday,Evening,Local,1225,Medium


# Data Exploration

In [5]:
data.shape

(2878, 7)

In [6]:
data.dtypes

Distance Between Stations (km)     int64
Weather Conditions                object
Day of the Week                   object
Time of Day                       object
Train Type                        object
Historical Delay (min)             int64
Route Congestion                  object
dtype: object

In [7]:
data.describe()

Unnamed: 0,Distance Between Stations (km),Historical Delay (min)
count,2878.0,2878.0
mean,184.423211,93.324878
std,169.394783,195.923817
min,0.0,0.0
25%,70.0,13.0
50%,155.0,35.0
75%,225.0,74.0
max,955.0,1230.0


In [8]:
data.isnull().sum()

Distance Between Stations (km)    0
Weather Conditions                0
Day of the Week                   0
Time of Day                       0
Train Type                        0
Historical Delay (min)            0
Route Congestion                  0
dtype: int64

There are no null values in this dataset

# Feature Engineering

In [9]:
# Define a function to encode binary categorical variable
def encode_binary_category(day):
    return 0 if day in ['Saturday', 'Sunday'] else 1

# Encode 'Day of the Week' into binary format
data['Weekday'] = data['Day of the Week'].apply(encode_binary_category)

In [10]:
data.head()

Unnamed: 0,Distance Between Stations (km),Weather Conditions,Day of the Week,Time of Day,Train Type,Historical Delay (min),Route Congestion,Weekday
0,100,Clear,Monday,Morning,Express,5,Low,1
1,150,Rainy,Tuesday,Afternoon,Superfast,10,Medium,1
2,200,Foggy,Wednesday,Evening,Local,15,High,1
3,50,Clear,Thursday,Night,Express,2,Low,1
4,75,Rainy,Friday,Morning,Superfast,8,Medium,1


New column named Weekday has been added.

In [11]:
# Define categorical columns for one-hot encoding
categorical_cols = ['Time of Day', 'Train Type', 'Route Congestion', 'Weather Conditions']

# Perform one-hot encoding for each categorical column
for col in categorical_cols:
    one_hot_encoded = pd.get_dummies(data[col], prefix=col)
    data = pd.concat([data, one_hot_encoded], axis=1)

In [12]:
data.head()

Unnamed: 0,Distance Between Stations (km),Weather Conditions,Day of the Week,Time of Day,Train Type,Historical Delay (min),Route Congestion,Weekday,Time of Day_Afternoon,Time of Day_Evening,...,Time of Day_Night,Train Type_Express,Train Type_Local,Train Type_Superfast,Route Congestion_High,Route Congestion_Low,Route Congestion_Medium,Weather Conditions_Clear,Weather Conditions_Foggy,Weather Conditions_Rainy
0,100,Clear,Monday,Morning,Express,5,Low,1,False,False,...,False,True,False,False,False,True,False,True,False,False
1,150,Rainy,Tuesday,Afternoon,Superfast,10,Medium,1,True,False,...,False,False,False,True,False,False,True,False,False,True
2,200,Foggy,Wednesday,Evening,Local,15,High,1,False,True,...,False,False,True,False,True,False,False,False,True,False
3,50,Clear,Thursday,Night,Express,2,Low,1,False,False,...,True,True,False,False,False,True,False,True,False,False
4,75,Rainy,Friday,Morning,Superfast,8,Medium,1,False,False,...,False,False,False,True,False,False,True,False,False,True


We implemeted one-hot encoding on all the categorical features except "Day of the week".

In [13]:
data.dtypes

Distance Between Stations (km)     int64
Weather Conditions                object
Day of the Week                   object
Time of Day                       object
Train Type                        object
Historical Delay (min)             int64
Route Congestion                  object
Weekday                            int64
Time of Day_Afternoon               bool
Time of Day_Evening                 bool
Time of Day_Morning                 bool
Time of Day_Night                   bool
Train Type_Express                  bool
Train Type_Local                    bool
Train Type_Superfast                bool
Route Congestion_High               bool
Route Congestion_Low                bool
Route Congestion_Medium             bool
Weather Conditions_Clear            bool
Weather Conditions_Foggy            bool
Weather Conditions_Rainy            bool
dtype: object

# Data Split

In [14]:
# Split the data into features and target variable
X = data.drop(columns=['Historical Delay (min)','Weather Conditions','Day of the Week','Time of Day','Train Type','Route Congestion'])
y = data['Historical Delay (min)']

We set our target variable to Historical Delay (min).

In [15]:
# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model

# XG BOOST

In [16]:
# Convert data to DMatrix format for XGBoost
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Define XGBoost parameters
params = {
    'objective': 'reg:squarederror',  # Use squared error for regression
    'eval_metric': 'rmse'  # Root Mean Squared Error as evaluation metric
}

# Train the XGBoost model
num_rounds = 100  # Number of boosting rounds
model = xgb.train(params, dtrain, num_rounds)

# Make predictions on the test set
y_pred = model.predict(dtest)

# Evaluate the model
rmse = mean_squared_error(y_test, y_pred, squared=False)
print("XGBoost Root Mean Squared Error (RMSE):", rmse)

r2_gb = r2_score(y_test, y_pred)
print(f"XGBoost R-squared Score: {r2_gb}")

XGBoost Root Mean Squared Error (RMSE): 48.49686009180093
XGBoost R-squared Score: 0.9458438008717384


# Gradient Boosting Machine (GBM)

In [17]:
# Instantiate the Gradient Boosting Machine (GBM) model
gbm_model = GradientBoostingRegressor()

# Fit the model to the training data
gbm_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = gbm_model.predict(X_test)

# Evaluate the model
rmse = mean_squared_error(y_test, y_pred, squared=False)
print("GBM Root Mean Squared Error (RMSE):", rmse)

r2_gb = r2_score(y_test, y_pred)
print(f"GBM R-squared Score: {r2_gb}")

GBM Root Mean Squared Error (RMSE): 47.88772460393422
GBM R-squared Score: 0.9471956941733888


# Cat Boost

In [18]:
pip install catboost

Note: you may need to restart the kernel to use updated packages.


In [19]:
 #Instantiate the CatBoost model
catboost_model = CatBoostRegressor()

# Fit the model to the training data
catboost_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred_catboost = catboost_model.predict(X_test)

# Evaluate the model
rmse_catboost = mean_squared_error(y_test, y_pred_catboost, squared=False)
print("CatBoost Root Mean Squared Error (RMSE):", rmse_catboost)

r2_catboost = r2_score(y_test, y_pred_catboost)
print(f"CatBoost R-squared Score: {r2_catboost}")

Learning rate set to 0.046708
0:	learn: 185.7390163	total: 150ms	remaining: 2m 30s
1:	learn: 179.2675838	total: 152ms	remaining: 1m 15s
2:	learn: 172.5062440	total: 153ms	remaining: 50.9s
3:	learn: 166.6457837	total: 155ms	remaining: 38.6s
4:	learn: 161.0350508	total: 157ms	remaining: 31.1s
5:	learn: 155.1505041	total: 158ms	remaining: 26.2s
6:	learn: 149.7307670	total: 159ms	remaining: 22.6s
7:	learn: 144.2213315	total: 161ms	remaining: 19.9s
8:	learn: 139.0999267	total: 162ms	remaining: 17.8s
9:	learn: 134.5521401	total: 163ms	remaining: 16.2s
10:	learn: 130.5137110	total: 165ms	remaining: 14.8s
11:	learn: 126.0765267	total: 166ms	remaining: 13.7s
12:	learn: 121.7929818	total: 167ms	remaining: 12.7s
13:	learn: 117.4691821	total: 169ms	remaining: 11.9s
14:	learn: 113.5821793	total: 170ms	remaining: 11.2s
15:	learn: 110.1740616	total: 172ms	remaining: 10.6s
16:	learn: 106.9720142	total: 173ms	remaining: 9.98s
17:	learn: 103.5192123	total: 174ms	remaining: 9.5s
18:	learn: 100.3843197	to

# Linear Regression

In [20]:
# Instantiate the Linear Regression model
linear_model = LinearRegression()

# Fit the model to the training data
linear_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = linear_model.predict(X_test)

# Evaluate the model
rmse = mean_squared_error(y_test, y_pred, squared=False)
print("Linear Regression Root Mean Squared Error (RMSE):", rmse)

r2_gb = r2_score(y_test, y_pred)
print(f"Linear Regression R-squared Score: {r2_gb}")

Linear Regression Root Mean Squared Error (RMSE): 112.00324776630505
Linear Regression R-squared Score: 0.7111434857905125


# Support Vector Regression (SVR) 

In [21]:
from sklearn.svm import SVR

# Instantiate the SVR model
svr_model = SVR()

# Fit the SVR model to the training data
svr_model.fit(X_train, y_train)

# Make predictions on the test set using SVR
y_pred_svr = svr_model.predict(X_test)

# Evaluate the model
rmse_svr = mean_squared_error(y_test, y_pred_svr, squared=False)
print("SVR Root Mean Squared Error (RMSE):", rmse_svr)

r2_svr = r2_score(y_test, y_pred_svr)
print(f"SVR R-squared Score: {r2_svr}")

SVR Root Mean Squared Error (RMSE): 173.1320024591218
SVR R-squared Score: 0.30979891428879347


# Region-based Convolutional Neural Network (RCNN) 

In [22]:
pip install keras

Note: you may need to restart the kernel to use updated packages.


In [23]:
pip install tensorflow

Note: you may need to restart the kernel to use updated packages.


In [24]:
# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Reshape the features for compatibility with Conv1D layer
X_train_reshaped = np.expand_dims(X_train_scaled, axis=2)
X_test_reshaped = np.expand_dims(X_test_scaled, axis=2)

# Define the RCNN model architecture
model_rcnn = Sequential()
model_rcnn.add(Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(X_train_reshaped.shape[1], 1)))
model_rcnn.add(MaxPooling1D(pool_size=2))
model_rcnn.add(LSTM(50, activation='relu'))
model_rcnn.add(Dense(1))

# Compile the RCNN model
model_rcnn.compile(optimizer='adam', loss='mse')

# Train the RCNN model
model_rcnn.fit(X_train_reshaped, y_train, epochs=50, batch_size=32, verbose=1)

# Make predictions using RCNN model
y_pred_rcnn = model_rcnn.predict(X_test_reshaped)

# Evaluate the RCNN model
mse_rcnn = mean_squared_error(y_test, y_pred_rcnn, squared=False)
r2_rcnn = r2_score(y_test, y_pred_rcnn)
print("RCNN Root Mean Squared Error (RMSE):", mse_rcnn)
print("RCNN R-squared Score:", r2_rcnn)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 44082.4922
Epoch 2/50
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 18246.6250
Epoch 3/50
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 3467.7920
Epoch 4/50
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 2893.7051
Epoch 5/50
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 2470.0693
Epoch 6/50
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 2594.6970
Epoch 7/50
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 2438.4900
Epoch 8/50
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 2306.1824
Epoch 9/50
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 2257.6440
Epoch 10/50
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/

# Conclusion

| Model             | RMSE       | R-squared |
|-------------------|------------|-----------|
| XG BOOST          | 48.496     | 0.945     |
| GBM               | 47.887     | 0.947     |
| Linear Regression | 112.003    | 0.711     |
| SVR               | 173.132    | 0.309     |
| Cat Boost         | 40.523     | 0.962     |
| RCNN              | 50.526     | 0.941     |

After analyzing the performance of various models, it is evident that CatBoost outperformed other models. While the R-squared values were comparable across models, CatBoost demonstrated superior predictive accuracy, as evidenced by its lower RMSE. This suggests that CatBoost is better suited for the task at hand, delivering more precise predictions and capturing the underlying patterns in the data more effectively.