In [None]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [None]:
# Load the voyage table. DepartureTime and ArrivalTime are parsed as datetimes
# while reading so later cells can do datetime arithmetic on them.
# `infer_datetime_format` is intentionally not passed: pandas 2.0 deprecated
# it (it only emits a warning now) because a consistent format is inferred by
# default.
data = pd.read_csv('/content/FeatureEngineering - Sheet1.csv',
                   parse_dates=['DepartureTime', 'ArrivalTime'])

# Inspect the data: first rows, schema, summary statistics, missing counts.
print(data.head())
# info() prints its report itself and returns None, so wrapping it in print()
# would add a stray "None" line to the output.
data.info()
print(data.describe())
print(data.isna().sum())

  data = pd.read_csv('/content/FeatureEngineering - Sheet1.csv',


        MMSI       DepartureTime      LATd      LONd         ArrivalTime  \
0  205366000 2017-11-26 19:51:00  57.40660  21.53463 2017-11-26 19:51:00   
1  205366000 2017-11-27 13:18:00  57.40660  21.53437 2017-12-28 13:57:00   
2  205366000 2018-06-02 07:38:00  54.16003  12.14130 2018-06-03 05:43:00   
3  205366000 2018-06-03 12:16:00  57.59602  11.69184 2018-06-08 05:43:00   
4  205366000 2018-09-15 08:27:00  59.42259  27.72447 2018-10-01 15:16:00   

       LATa      LONa  AVGSPDkmph  DistanceKm  VesselType  Length  Width  \
0  57.40660  21.53463       28.65   723.43028        80.0   143.0   21.0   
1  54.16002  12.14124        2.73  2030.43443        80.0   143.0   21.0   
2  57.59724  11.69437       21.09   465.67234        80.0   143.0   21.0   
3  54.16005  12.14124        5.49   622.78374        80.0   143.0   21.0   
4  54.16008  12.14128        6.03  2356.37863        80.0   143.0   21.0   

   Draft  
0    8.2  
1    8.2  
2    8.2  
3    8.2  
4    8.2  
<class 'pandas.core.

In [None]:
# Handle missing values by carrying the previous row's value forward.
# DataFrame.ffill() replaces fillna(method='ffill'), whose `method` argument
# is deprecated since pandas 2.1; reassigning instead of using inplace=True
# avoids mutating the loaded frame in place.
# NOTE(review): forward fill cannot fill a gap in the very first row, and it
# copies values from whatever row happens to precede the gap — confirm this
# is the intended strategy for this dataset.
data = data.ffill()

# Make sure both timestamp columns are datetimes before doing arithmetic on
# them (presumably a no-op when read_csv already parsed them — verify).
data['DepartureTime'] = pd.to_datetime(data['DepartureTime'], format="%Y-%m-%d %H:%M:%S")
data['ArrivalTime'] = pd.to_datetime(data['ArrivalTime'], format="%Y-%m-%d %H:%M:%S")

# Voyage duration as a Timedelta, then as plain seconds for use as a numeric
# column.
data['TravelTime'] = data['ArrivalTime'] - data['DepartureTime']
data['TravelTime_seconds'] = data['TravelTime'].dt.total_seconds()

# Hour of day (0-23) at departure, extracted as an extra feature.
data['DepartureHour'] = data['DepartureTime'].dt.hour

In [None]:
# Predictor columns fed to the model: departure/arrival coordinates, average
# speed, distance, departure hour and the vessel's type and dimensions.
feature_columns = [
    'LATd', 'LONd',
    'LATa', 'LONa',
    'AVGSPDkmph', 'DistanceKm',
    'DepartureHour',
    'VesselType', 'Length', 'Width', 'Draft',
]
X = data[feature_columns]

# Regression target: travel time expressed in seconds.
y = data['TravelTime_seconds']

In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Standardise the features. The scaler learns its statistics from the
# training rows only and the same transformation is then applied to both
# splits, so no test-set information is used when fitting it.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Hyperparameter candidates: neighbour counts 3..14 and both weighting
# schemes (equal weights vs. weights based on distance).
param_grid = {
    'n_neighbors': range(3, 15),
    'weights': ['uniform', 'distance'],
}

# Untuned KNN regressor used as the base estimator for the search.
knn_model = KNeighborsRegressor()

# Exhaustive 5-fold search, ranking candidates by negated mean squared error.
# Note: the scaler was fitted on the whole training set before this search,
# so each validation fold shares scaling statistics with its training folds.
grid_search = GridSearchCV(
    estimator=knn_model,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
)
grid_search.fit(X_train_scaled, y_train)

print(grid_search.best_params_)

{'n_neighbors': 3, 'weights': 'distance'}


In [None]:
# Rebuild the KNN regressor with the hyperparameters chosen by the grid
# search and fit it on the full scaled training set.
best_params = grid_search.best_params_
knn_model = KNeighborsRegressor(**best_params)
knn_model.fit(X_train_scaled, y_train)

# Predict on the held-out test split and score the predictions.
y_pred = knn_model.predict(X_test_scaled)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# The target is TravelTime_seconds (a duration), so MAE is in seconds and MSE
# in seconds squared; the labels previously called them "Unix timestamps",
# which mislabelled the units.
print("Mean Absolute Error (seconds):", mae)
print("Mean Squared Error (seconds^2):", mse)
print("R-squared:", r2)

Mean Absolute Error (Unix timestamps): 34391.86742770948
Mean Squared Error (Unix timestamps): 11127013476.355192
R-squared: 0.9446468127346249


In [None]:
from sklearn.model_selection import cross_validate

# 5-fold cross-validation of the current KNN model on the scaled training
# set, collecting MSE, MAE and R2 for every fold in a single pass.
cv_scores = cross_validate(
    knn_model,
    X_train_scaled,
    y_train,
    cv=5,
    scoring=['neg_mean_squared_error', 'neg_mean_absolute_error', 'r2'],
)

# The error metrics are returned negated (see the 'neg_' scorer names), so
# the sign is flipped back after averaging over the folds; R2 is used as-is.
cv_mse = -np.mean(cv_scores['test_neg_mean_squared_error'])
cv_mae = -np.mean(cv_scores['test_neg_mean_absolute_error'])
cv_r2 = np.mean(cv_scores['test_r2'])

print("Cross-Validation Mean Squared Error:", cv_mse)
print("Cross-Validation Mean Absolute Error:", cv_mae)
print("Cross-Validation R-squared (R2):", cv_r2)

Cross-Validation Mean Squared Error: 6214187918.301867
Cross-Validation Mean Absolute Error: 27689.1196616195
Cross-Validation R-squared (R2): 0.9617156972057694
