In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [None]:
# Load the dataset from the CSV export
data = pd.read_csv('/content/FeatureEngineering - Sheet1.csv')

# Inspect the data: first rows, column dtypes / non-null counts,
# summary statistics and per-column missing-value counts.
print(data.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
data.info()
print(data.describe())
print(data.isna().sum())

        MMSI        DepartureTime      LATd      LONd          ArrivalTime  \
0  205366000  2017-11-26 19:51:00  57.40660  21.53463  2017-11-26 19:51:00   
1  205366000  2017-11-27 13:18:00  57.40660  21.53437  2017-12-28 13:57:00   
2  205366000   2018-06-02 7:38:00  54.16003  12.14130   2018-06-03 5:43:00   
3  205366000  2018-06-03 12:16:00  57.59602  11.69184   2018-06-08 5:43:00   
4  205366000   2018-09-15 8:27:00  59.42259  27.72447  2018-10-01 15:16:00   

       LATa      LONa  AVGSPDkmph  DistanceKm  VesselType  Length  Width  \
0  57.40660  21.53463       28.65   723.43028        80.0   143.0   21.0   
1  54.16002  12.14124        2.73  2030.43443        80.0   143.0   21.0   
2  57.59724  11.69437       21.09   465.67234        80.0   143.0   21.0   
3  54.16005  12.14124        5.49   622.78374        80.0   143.0   21.0   
4  54.16008  12.14128        6.03  2356.37863        80.0   143.0   21.0   

   Draft  
0    8.2  
1    8.2  
2    8.2  
3    8.2  
4    8.2  
<class '

In [None]:
# Parse both timestamp columns from text into datetime64 values, using one
# shared format so the two columns are interpreted identically.
timestamp_format = "%Y-%m-%d %H:%M:%S"
for time_col in ('DepartureTime', 'ArrivalTime'):
    data[time_col] = pd.to_datetime(data[time_col], format=timestamp_format)

# Remove every row that still contains a missing value (modifies `data` in place)
data.dropna(inplace=True)

In [None]:
# Columns used as model inputs
feature_columns = [
    'LATd', 'LONd', 'LATa', 'LONa',
    'AVGSPDkmph', 'DistanceKm',
    'VesselType', 'Length', 'Width', 'Draft',
]
X = data[feature_columns]

# Target column. NOTE(review): 'ArrivalTime' is a datetime64 column at this
# point, so y (and y_train / y_test) hold timestamps, not plain numbers.
y = data['ArrivalTime']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Initialize the RandomForestRegressor model. The seed makes the forest (and
# therefore the grid-search ranking) reproducible across runs.
rf = RandomForestRegressor(random_state=42)

# Define a grid of hyperparameters to explore
param_grid = {'n_estimators': [50, 100, 200],
              'max_depth': [5, 10, None]}

# y_train is a datetime64[ns] column. Passing it to fit() as-is lets the model
# train, but the default R^2 scorer then subtracts float64 predictions from
# datetime64 labels and raises ("ufunc 'subtract' cannot use operands with
# types dtype('<M8[ns]') and dtype('float64')"), so the cross-validation
# scores are unusable. Casting the target to integer nanoseconds since the
# epoch up front keeps fitting and scoring on one numeric scale.
y_train_ns = y_train.astype(np.int64)

# Perform grid search cross-validation (5 folds, default R^2 scoring)
grid_search = GridSearchCV(rf, param_grid, cv=5)
grid_search.fit(X_train, y_train_ns)

# Get the best model; its predictions are epoch timestamps in nanoseconds
rf = grid_search.best_estimator_

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_scorer.py", line 444, in _passthrough_scorer
    return estimator.score(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 723, in score
    return r2_score(y, y_pred, sample_weight=sample_weight)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_regression.py", line 927, in r2_score
    numerator = (weight * (y_true - y_pred) ** 2).sum(axis=0, dtype=np.float64)
numpy.core._exceptions._UFuncBinaryResolutionError: ufunc 'subtract' cannot use operands with types dtype('<M8[ns]') and dtype('float64')

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_tes

In [None]:
# Make predictions using the best model. The model was fitted on a
# datetime64[ns] target, so these values are epoch timestamps in nanoseconds.
y_pred = rf.predict(X_test)

# Bring actual and predicted arrival times onto the same scale (Unix seconds)
# before scoring. Previously only y_test was converted, so seconds were being
# compared with nanoseconds and every metric was off by a factor of ~1e9.
ns_per_second = 10**9
y_test_unix = y_test.astype(np.int64) // ns_per_second  # Integer division for seconds
y_pred_unix = y_pred / ns_per_second

# Calculate evaluation metrics on the matching second-resolution values
# (MAE is in seconds, MSE in seconds squared)
mae = mean_absolute_error(y_test_unix, y_pred_unix)
mse = mean_squared_error(y_test_unix, y_pred_unix)
r2 = r2_score(y_test_unix, y_pred_unix)

print("MAE:", mae)
print("MSE:", mse)
print("R-squared:", r2)

MAE: 1.529023619717593e+18
MSE: 2.3379327628802244e+36
R-squared: -1.951317328460732e+22
