# MODEL 1


## Første Utkast AutoGluon

#### 1. Environment Setup

In [33]:
# Autogluon import

from autogluon.tabular import TabularPredictor, TabularDataset



#### 2. Loading and Preparing the Data


In [None]:
# Execute the preprocessing notebook inside this session. The cells below use
# ais_train_enriched / ais_test_enriched, which that notebook is expected to define.
%run "lakkenNoteBook.ipynb"



In [37]:
import pandas as pd

# Bind the enriched frames coming from lakkenNoteBook to the shorter names
# used by the rest of this notebook (plain aliases, no copy is made).
train_data, test_data = ais_train_enriched, ais_test_enriched

# Display first few rows of the data to verify loading
# train_data



In [None]:
# Show the column labels of the enriched training frame so the available
# features can be checked before modelling.
print(ais_train_enriched.columns)

#### Splitting training data into training, validation and holdout sets

In [38]:
def train_val_split(to_model_df, split_date="2024-03-15", holdout_frac=0.2, version="date_split", seed=42, val_frac=0.2):
    """
    Split a time-stamped dataframe into training, validation and holdout sets.

    The frame is first sorted by its 'time' column and re-indexed, so the
    returned frames carry the positional index of the sorted data.

    Params:
    - to_model_df: DataFrame containing all feature and target columns. Must have a 'time' column.
    - split_date: Cutoff for the date-based split. Rows with time < split_date go to
      training, rows with time >= split_date go to validation.
    - holdout_frac: Fraction of the validation rows that is moved into the holdout set.
    - version: "date_split" or "random_sampled". Any other value prints a warning and
      falls back to the date-based split.
    - seed: Random seed used for every random draw, for reproducibility.
    - val_frac: Fraction of all rows drawn for validation when version="random_sampled".

    Returns:
    - training_data: DataFrame of training rows.
    - validation_data: DataFrame of validation rows (holdout rows removed).
    - holdout: DataFrame of rows randomly sampled out of the validation set.
    """

    print(f"Splitting dataset based on {version}...")

    # Sort by 'time' to respect the temporal order. The fresh unique index is
    # what makes the index-based drops below safe.
    to_model_df = to_model_df.sort_values(by="time").reset_index(drop=True)

    if version == "random_sampled":
        # Use pandas sampling instead of sklearn's train_test_split, which this
        # file never imports. Row counts follow pandas rounding of val_frac.
        validation_data = to_model_df.sample(frac=val_frac, random_state=seed)
        training_data = to_model_df.drop(validation_data.index)
    else:
        if version != "date_split":
            print("Invalid splitting type, defaulting to date-based split")
        # Two explicit comparisons: rows with a missing time match neither side.
        training_data = to_model_df[to_model_df["time"] < split_date]
        validation_data = to_model_df[to_model_df["time"] >= split_date]

    print(f"Training data from {training_data['time'].min()} to {training_data['time'].max()}")
    print(f"Validation data from {validation_data['time'].min()} to {validation_data['time'].max()}")

    # Carve the holdout set out of validation with a seeded random sample, then
    # remove those rows so validation and holdout stay disjoint.
    holdout = validation_data.sample(frac=holdout_frac, random_state=seed)
    validation_data = validation_data.drop(holdout.index)

    return training_data, validation_data, holdout


#### Training the model

In [None]:
import pandas as pd
from autogluon.tabular import TabularPredictor, TabularDataset

# Split the data
split_date = "2024-03-15"  # The date chosen for splitting
train_data, val_data, holdout_data = train_val_split(ais_train_enriched, split_date=split_date, holdout_frac=0.2)

# Prepare the data for AutoGluon (remove columns that should not be used for training)
drop_columns = ['time', 'vesselId']  # Adjust this list based on features to exclude
train_data = TabularDataset(train_data.drop(columns=drop_columns))
val_data = TabularDataset(val_data.drop(columns=drop_columns))
holdout_data = TabularDataset(holdout_data.drop(columns=drop_columns))

# Define the targets separately
latitude_target = 'latitude'
longitude_target = 'longitude'


def _fit_coordinate_predictor(label, other_target, model_path, time_limit=60 * 60):
    """Train and return one AutoGluon regressor for a single coordinate.

    Params:
    - label: Name of the target column this model predicts.
    - other_target: Name of the other coordinate column, removed from the features.
    - model_path: Directory in which AutoGluon stores the trained model.
    - time_limit: Training budget in seconds (defaults to 1 hour).

    Returns:
    - The fitted TabularPredictor.
    """
    predictor = TabularPredictor(
        problem_type="regression",
        label=label,
        eval_metric="mean_absolute_error",  # You can adjust this based on your preference
        path=model_path,
        verbosity=2,
    )

    print(f"Training the model for {label} prediction...")
    predictor.fit(
        # Drop the other coordinate: keeping it would let the model read the
        # true position of the very row it predicts (target leakage), and the
        # test set presumably does not carry that column at inference time.
        train_data.drop(columns=[other_target]),
        tuning_data=val_data.drop(columns=[other_target]),  # Use validation data for tuning
        time_limit=time_limit,
        presets="best_quality",  # Prioritize best quality
        excluded_model_types=["KNN"],  # Exclude unsuitable models for trajectory prediction
        # "best_quality" turns on bagging; AutoGluon only accepts tuning_data in
        # bagged mode when it is explicitly used as a holdout set.
        use_bag_holdout=True,
    )
    print(f"{label.capitalize()} model training complete!")
    return predictor


# Train the first model for latitude prediction
latitude_predictor = _fit_coordinate_predictor(
    latitude_target, longitude_target, "autogluon_latitude_model"
)

# Train the second model for longitude prediction
longitude_predictor = _fit_coordinate_predictor(
    longitude_target, latitude_target, "autogluon_longitude_model"
)

# Once both models are trained, you can evaluate them on the holdout data or test data


#### Make Predictions

In [None]:
# Run both coordinate models on the test frame; each entry is keyed by the
# column name it gets in the exported file.
predicted_columns = {
    'longitude_predicted': longitude_predictor.predict(test_data),
    'latitude_predicted': latitude_predictor.predict(test_data),
}

# Assemble the output frame: row ID first, then longitude, then latitude
output = pd.DataFrame({'ID': test_data['ID'], **predicted_columns})

# Quick visual check of the first rows
preview = output.head()
print(preview)

# Persist the predictions as CSV without the dataframe index
output.to_csv('ais_predictions.csv', index=False)
