In [20]:
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor


# Load the regression datasets. The ice mask is not loaded here: predicting it is a
# classification problem, so it is left out of this regression.
xy_ice_thickness = pd.read_csv('data/ice_thickness.csv')
xy_ice_velocity = pd.read_csv('data/ice_velocity.csv')

# The frames above keep their x/y coordinate columns; the frames below are the
# coordinate-free variants of the same data.
ice_thickness = xy_ice_thickness.drop(columns=['x-axis', 'y-axis'])
ice_velocity = xy_ice_velocity.drop(columns=['x-axis', 'y-axis'])


#normalise the data -> scale each column between 0 and 1 
def scale(df, scale_target=True):
    """Min-max normalise the columns of ``df`` into the range [0, 1].

    Each processed column is transformed with
    ``scaled val = (val - column minimum) / (column maximum - column minimum)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to normalise. It is not modified; a copy is returned.
    scale_target : bool, default True
        When True every column is scaled. When False the last column
        (treated as the target by ``split_data``) is left untouched.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the selected columns rescaled.

    Notes
    -----
    A constant column has ``maximum == minimum``; dividing by that zero range
    would fill the column with NaN. Such columns are mapped to 0.0 instead.
    """
    scaled_df = df.copy()
    columns = df.columns if scale_target else df.columns[:-1]
    for column in columns:
        min_value = df[column].min()
        max_value = df[column].max()
        value_range = max_value - min_value
        if value_range == 0:
            # Constant column: avoid 0/0 -> NaN, which would break model fitting.
            scaled_df[column] = 0.0
        else:
            scaled_df[column] = (df[column] - min_value) / value_range

    return scaled_df

# Replace each of the four frames with its min-max scaled version.
xy_ice_thickness, xy_ice_velocity, ice_thickness, ice_velocity = map(
    scale,
    (xy_ice_thickness, xy_ice_velocity, ice_thickness, ice_velocity),
)


def split_data(df, test_size = 0.2, random_state=None):
    """Split ``df`` into training and testing features/target.

    The last column of ``df`` is used as the target; every other column is a
    feature. ``test_size`` and ``random_state`` are forwarded unchanged to
    ``train_test_split``.

    Returns a tuple ``(x_train, x_test, y_train, y_test)``.
    """
    features, target = df.iloc[:, :-1], df.iloc[:, -1]
    parts = train_test_split(
        features,
        target,
        test_size=test_size,
        random_state=random_state,
    )
    return tuple(parts)

# Preview the first rows of the scaled thickness frame that has no x/y coordinate columns.
ice_thickness.head()


Unnamed: 0,precipitation,air_temp,ocean_temp,ocean_salinity,ice_thickness
0,0.200196,0.861554,0.252584,0.355845,0.003529
1,0.187974,0.858077,0.23745,0.34803,0.003529
2,0.181589,0.855422,0.226182,0.338022,0.003529
3,0.177061,0.853527,0.218045,0.325885,0.003529
4,0.175708,0.853318,0.210816,0.3117,0.003529


In [21]:
# linear regression 
def linearRegression(df, random_state=None):
    """Fit a linear regression on ``df`` and report its test-set error.

    ``df`` is split by ``split_data`` (last column is the target). The model's
    ``score`` on the test split (scikit-learn's R^2 for regressors) is printed,
    and the mean squared error on the test split is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Features followed by the target in the last column.
    random_state : int or None, default None
        Seed forwarded to ``split_data``. Pass an int to make the train/test
        split (and therefore the reported numbers) reproducible; None keeps
        the previous unseeded behaviour.

    Returns
    -------
    float
        Mean squared error between the test targets and the predictions, in
        the units of the target column as supplied.
    """
    x_train, x_test, y_train, y_test = split_data(df, random_state=random_state)
    model = LinearRegression()
    model.fit(x_train, y_train)

    y_predicted = model.predict(x_test)
    mse = mean_squared_error(y_test, y_predicted)
    print(f"score {model.score(x_test, y_test):,.4f}")

    return mse

# non linear regression -> Random Forest 
def randomForest(df, random_state=None):
    """Fit a random forest regressor on ``df`` and report its test-set error.

    ``df`` is split by ``split_data`` (last column is the target). The model's
    ``score`` on the test split (scikit-learn's R^2 for regressors) is printed,
    and the mean squared error on the test split is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Features followed by the target in the last column.
    random_state : int or None, default None
        Seed used for both the train/test split and the forest itself. Both
        are random, so an int is needed for repeatable results; None keeps the
        previous unseeded behaviour.

    Returns
    -------
    float
        Mean squared error between the test targets and the predictions, in
        the units of the target column as supplied.
    """
    x_train, x_test, y_train, y_test = split_data(df, random_state=random_state)
    model = RandomForestRegressor(random_state=random_state)
    model.fit(x_train, y_train)

    y_predicted = model.predict(x_test)
    print(f"score {model.score(x_test, y_test):,.4f}")
    mse = mean_squared_error(y_test, y_predicted)

    return mse

def printing():
    """Run both models on every dataset variant and print the results.

    For each section a title is printed, followed by the MSE returned by the
    model for the thickness frame and then the velocity frame. Each model
    function also prints its own ``score`` line just before its MSE line.
    Sections are separated by ``print('\\n')``.

    All MSE values use the same ``:,.4f`` format so they can be compared
    directly; previously two of the eight values were printed with only three
    decimals.
    """
    sections = (
        ("NO XY LINEAR REGRESSION RESULTS", linearRegression, ice_thickness, ice_velocity),
        ("XY LINEAR REGRESSION RESULTS", linearRegression, xy_ice_thickness, xy_ice_velocity),
        ("NO XY RANDOM FOREST RESULTS", randomForest, ice_thickness, ice_velocity),
        ("XY RANDOM FOREST RESULTS", randomForest, xy_ice_thickness, xy_ice_velocity),
    )
    for index, (title, evaluate, thickness_df, velocity_df) in enumerate(sections):
        if index:
            # Separator between sections only, none after the last one.
            print('\n')
        print(title)
        print(f"Ice Thickness: {evaluate(thickness_df):,.4f}")
        print(f"Ice Velocity: {evaluate(velocity_df):,.4f}")


# Run every model/dataset combination and print the score and MSE for each.
printing()



NO XY LINEAR REGRESSION RESULTS
score 0.8634
Ice Thickness: 0.0082
score 0.0243
Ice Velocity: 0.0018


XY LINEAR REGRESSION RESULTS
score 0.8561
Ice Thickness: 0.0093
score 0.0643
Ice Velocity: 0.004


NO XY RANDOM FOREST RESULTS
score 0.9195
Ice Thickness: 0.006
score -0.1403
Ice Velocity: 0.0043


XY RANDOM FOREST RESULTS
score 0.9509
Ice Thickness: 0.0032
score -0.6503
Ice Velocity: 0.0013


In [25]:
# Summary statistics of the variables, used as a reference for interpreting the MSE.
# NOTE(review): this frame is not passed through scale(), whereas the MSE values printed
# earlier were computed on min-max scaled frames, so compare against these ranges
# rather than reading the MSE as raw units. Confirm.

metrics_table = pd.read_csv('data/full_df-0.csv').drop(columns=['x-axis', 'y-axis', 'ice_mask'])
metrics_table.describe()

Unnamed: 0,precipitation,air_temp,ocean_temp,ocean_salinity,ice_thickness,ice_velocity
count,2257.0,2257.0,2257.0,2257.0,2257.0,2257.0
mean,459.218673,253.250122,272.66112,14.612805,786.86356,32.871003
std,321.010247,17.451287,0.989742,7.036216,1165.98834,179.3694
min,-26.648338,212.949539,271.142242,0.112596,-16.252274,-123.940895
25%,128.605972,239.71521,271.964874,8.552329,0.0,-1.0
50%,477.346161,260.951752,272.414734,14.867147,0.0,-1.0
75%,704.524414,266.593628,273.05838,18.721558,1657.55835,5.325965
max,2290.933838,275.501678,277.330841,35.370872,4588.459961,3421.989746
