In [12]:
# I dont know if this is something typically done, I assume so though
# in this file I will experiment with how I can use ice mask to predict ice velocity
# since the ice mask prediction is quite good, I thought I could predict ice mask, then add that predicted
# column back in as a feature to be used in the ice velocity prediction. Since velocity is not being used
# as a predictor of ice mask, i believe there should be no data leak from this. 

import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.neural_network import MLPClassifier
from sklearn.inspection import permutation_importance
import matplotlib.pyplot as plt

EF = pd.read_csv('../data/AIS_data/EF_full_df.csv')

velocity = EF[['ice_velocity']]
thickness = EF[['ice_thickness']]

#shifting the columns so the target is the one furthest to the right -> needed for scaling to avoid scaling ice mask
EF = EF[['x-axis', 'y-axis', 'precipitation', 'precip_roll', 'air_temp', 'air_roll', 'ocean_temp',
         'ocean_roll', 'temp_diff', 'dist', 'ice_velocity', 'ice_thickness','ice_mask']]

#method to normalise the data -> scale each column between 0 and 1 
def scale (df):
    """uses the formula scaled val = (val-column minimum)/(column maximum - column minimum)"""
    scaled_df = df.copy()
    # for column in df.columns[:-1]  -> use this line instead for not having a scaled target 
    for column in scaled_df.columns[:-1]:
        min_value = scaled_df[column].min()
        max_value = scaled_df[column].max()
        scaled_df[column] = (scaled_df[column] - min_value) / (max_value - min_value)
            
    return scaled_df

def unscale_rmse(scaled_rmse, target_name, original_df):
    target_min = original_df[target_name].min()
    target_max = original_df[target_name].max()

    return scaled_rmse * (target_max - target_min)



In [2]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
import numpy as np

def doPolySVC(preprocessed_df, seed, n_runs):
    overall_accuracies = []
    training_accuracies = []
    category_accuracies = {'grounded_ice': [], 'floating_ice': [], 'open_ocean': []}
    training_category_accuracies = {'grounded_ice': [], 'floating_ice': [], 'open_ocean': []}

    for i in range(n_runs):
        randomState = i * seed
        X = preprocessed_df.iloc[:, :-1]
        y = preprocessed_df.iloc[:, -1]
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=randomState)
        
        #then remove the ice velocity from both xy train and test, but keep aside 
        
        model = SVC(kernel='poly', random_state=randomState, class_weight = 'balanced')
        model.fit(X_train, y_train)

        # Test data evaluation
        y_predicted = model.predict(X_test)
        test_accuracy = accuracy_score(y_test, y_predicted)
        overall_accuracies.append(test_accuracy)
        
        test_cm = confusion_matrix(y_test, y_predicted, labels=[2.0, 3.0, 4.0])
        category_accuracies['grounded_ice'].append(test_cm[0, 0] / test_cm[0, :].sum() if test_cm[0, :].sum() > 0 else 0)
        category_accuracies['floating_ice'].append(test_cm[1, 1] / test_cm[1, :].sum() if test_cm[1, :].sum() > 0 else 0)
        category_accuracies['open_ocean'].append(test_cm[2, 2] / test_cm[2, :].sum() if test_cm[2, :].sum() > 0 else 0)

        # Training data evaluation
        y_train_predicted = model.predict(X_train)
        train_accuracy = accuracy_score(y_train, y_train_predicted)
        training_accuracies.append(train_accuracy)

        train_cm = confusion_matrix(y_train, y_train_predicted, labels=[2.0, 3.0, 4.0])
        training_category_accuracies['grounded_ice'].append(train_cm[0, 0] / train_cm[0, :].sum() if train_cm[0, :].sum() > 0 else 0)
        training_category_accuracies['floating_ice'].append(train_cm[1, 1] / train_cm[1, :].sum() if train_cm[1, :].sum() > 0 else 0)
        training_category_accuracies['open_ocean'].append(train_cm[2, 2] / train_cm[2, :].sum() if train_cm[2, :].sum() > 0 else 0)

    # Calculate averages and standard deviations
    average_accuracy = np.mean(overall_accuracies)
    std_accuracy = np.std(overall_accuracies)
    average_training_accuracy = np.mean(training_accuracies)
    std_training_accuracy = np.std(training_accuracies)

    average_category_accuracies = {k: np.mean(v) for k, v in category_accuracies.items()}
    std_category_accuracies = {k: np.std(v) for k, v in category_accuracies.items()}
    average_training_category_accuracies = {k: np.mean(v) for k, v in training_category_accuracies.items()}
    std_training_category_accuracies = {k: np.std(v) for k, v in training_category_accuracies.items()}

    # Print average and standard deviation of accuracies
    print(f"Average Training Accuracy: {average_training_accuracy:.3f}, Std: {std_training_accuracy:.3f}")
    print("Training Category Metrics:")
    for category in training_category_accuracies:
        print(f"  {category}: Avg: {average_training_category_accuracies[category]:.3f}, Std: {std_training_category_accuracies[category]:.3f}") 

    print(f"\nAverage Test Accuracy: {average_accuracy:.3f}, Std: {std_accuracy:.3f}")
    print("Test Category Metrics:")
    for category in category_accuracies:
        print(f"  {category}: Avg: {average_category_accuracies[category]:.3f}, Std: {std_category_accuracies[category]:.3f}")
    


In [21]:
# NOTE NEED TO DO STRATIFIED SPLITTING SINCE THE CLASS IS SO IMBALANCED 
def doPiggyBack(preprocessed_df, seed, n_runs):

    for i in range(n_runs):
        randomState = i * seed
        X = preprocessed_df.iloc[:, :-1]
        y = preprocessed_df.iloc[:, -1]
        
        #split into training and test 
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=randomState)
        
        #then remove the ice velocity from both xy train and test, but keep aside 
        X_train_velocity = X_train['ice_velocity']
        X_train.drop(columns = ['ice_velocity'], inplace = True)
        X_test_velocity = X_test['ice_velocity']
        X_test.drop(columns = ['ice_velocity'], inplace = True)
        
        model = SVC(kernel='poly', random_state=randomState, class_weight = 'balanced')
        model.fit(X_train, y_train)

        # Training data evaluation
        y_train_predicted = model.predict(X_train)
        
        # Test data evaluation
        y_test_predicted = model.predict(X_test)
        
        #adding the data back together
        
        
seed = 101
preprocessed_df = EF.drop(columns = ['ice_thickness'], inplace = False)
preprocessed_df.fillna(value = -1, inplace = True)
scaled_preprocessed_df = scale(preprocessed_df)
scaled_preprocessed_df.head()  

doPiggyBack(scaled_preprocessed_df, seed, 1)


X_train_velocity
1623    0.000000
1103    0.000000
1851    0.004555
1933    0.001471
113     0.000000
          ...   
1033    0.000512
1731    0.000000
763     0.000685
835     0.001269
1653    0.010505
Name: ice_velocity, Length: 1579, dtype: float64
X train
      x-axis  y-axis  precipitation  precip_roll  air_temp  air_roll  \
1623    0.20    0.30       0.303468     0.486340  0.861506  0.861147   
1103    0.00    0.50       0.338658     0.540796  0.978627  0.977795   
1851    0.80    0.22       0.260878     0.500682  0.543994  0.588928   
1933    0.62    0.18       0.107907     0.207050  0.428930  0.475921   
113     0.18    0.92       0.283395     0.433198  0.900400  0.897791   
...      ...     ...            ...          ...       ...       ...   
1033    0.64    0.54       0.016987     0.001387  0.007904  0.000730   
1731    0.34    0.26       0.294340     0.479494  0.819511  0.818413   
763     0.34    0.64       0.077438     0.132742  0.531724  0.555130   
835     0.76    0.6