In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

In [None]:
# Load your CSV data
data = pd.read_csv('/content/FeatureEngineering - Sheet1.csv',
                   parse_dates=['DepartureTime', 'ArrivalTime'],
                   infer_datetime_format=True)

# Inspect your data (Column names will reflect the new ones)
print(data.head())  # Print first few rows
print(data.info())  # Information about columns, data types, etc.
print(data.describe())  # Summary statistics
print(data.isna().sum())  # Count of missing values per column

        MMSI       DepartureTime      LATd      LONd         ArrivalTime  \
0  205366000 2017-11-26 19:51:00  57.40660  21.53463 2017-11-26 19:51:00   
1  205366000 2017-11-27 13:18:00  57.40660  21.53437 2017-12-28 13:57:00   
2  205366000 2018-06-02 07:38:00  54.16003  12.14130 2018-06-03 05:43:00   
3  205366000 2018-06-03 12:16:00  57.59602  11.69184 2018-06-08 05:43:00   
4  205366000 2018-09-15 08:27:00  59.42259  27.72447 2018-10-01 15:16:00   

       LATa      LONa  AVGSPDkmph  DistanceKm  VesselType  Length  Width  \
0  57.40660  21.53463       28.65   723.43028        80.0   143.0   21.0   
1  54.16002  12.14124        2.73  2030.43443        80.0   143.0   21.0   
2  57.59724  11.69437       21.09   465.67234        80.0   143.0   21.0   
3  54.16005  12.14124        5.49   622.78374        80.0   143.0   21.0   
4  54.16008  12.14128        6.03  2356.37863        80.0   143.0   21.0   

   Draft  
0    8.2  
1    8.2  
2    8.2  
3    8.2  
4    8.2  
<class 'pandas.core.

In [None]:
# Handle missing values
data.fillna(method='ffill', inplace=True)

# Extract features
data['DepartureTime'] = pd.to_datetime(data['DepartureTime'], format="%Y-%m-%d %H:%M:%S")
data['ArrivalTime'] = pd.to_datetime(data['ArrivalTime'], format="%Y-%m-%d %H:%M:%S")
data['DepartureHour'] = data['DepartureTime'].dt.hour

# Convert ArrivalTime to Unix timestamps (for target variable)
data['ArrivalTime_unix'] = data['ArrivalTime'].astype(np.int64) // 10**9

In [None]:
# Select features and target variable
X = data[['MMSI', 'LATd', 'LONd', 'LATa', 'LONa', 'AVGSPDkmph', 'DistanceKm', 'DepartureHour', 'VesselType', 'Length', 'Width', 'Draft']]
y = data['ArrivalTime_unix']

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Model definition (for regression)
model = Sequential()
model.add(Dense(units=32, activation='relu', input_shape=(X_train_scaled.shape[1],)))
model.add(Dense(units=16, activation='relu'))
model.add(Dense(units=1, activation='linear'))  # Output layer with 1 neuron for ArrivalTime prediction (in Unix timestamps)

In [None]:
# Model training
model.compile(optimizer='adam', loss='mean_squared_error')  # Compile the model
history = model.fit(X_train_scaled, y_train, epochs=50, batch_size=32, validation_data=(X_test_scaled, y_test))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [None]:
# Evaluation
y_pred = model.predict(X_test_scaled)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean Absolute Error (seconds):", mae)
print("Mean Squared Error (seconds^2):", mse)
print("R-squared:", r2)

Mean Absolute Error (seconds): 1529279978.754047
Mean Squared Error (seconds^2): 2.338817060127203e+18
R-squared: -19519.553926894965
