# Data Preprocessing

The following data preprocessing steps are the same ones as were used in the initial code for data cleaning.

In [61]:
pip install -U scikit-learn



In [62]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [63]:
df = pd.read_csv('https://raw.githubusercontent.com/snehasri2600/Ocean-Classification/refs/heads/main/Ocean_Class_Cleaned_Data.csv')
display(df)

Unnamed: 0,atm_surface_temp,atm_surface_humidity,zonal_wind_speed,meridional_wind_speed,wind_speed,atm_surface_pressure,seawater_density_anomaly,hydrostatic_pressure_anomaly,realm
0,258.80704,0.002396,-1.421412,1.504944,3.966997,98732.860,-1.497837,120.33860,Southern Cold Water
1,258.80704,0.002396,-1.421412,1.504944,3.966997,98732.860,-1.497837,120.33860,Southern Cold Water
2,259.48505,0.002413,-1.475978,1.528884,4.023276,98726.880,-1.499950,123.39962,Southern Cold Water
3,259.48505,0.002413,-1.475978,1.528884,4.023276,98726.880,-1.499950,123.39962,Southern Cold Water
4,260.28888,0.002427,-1.533721,1.645065,4.075678,98722.550,-1.505903,130.80351,Southern Cold Water
...,...,...,...,...,...,...,...,...,...
51941,253.90776,0.000719,-3.107174,4.886118,7.411502,102083.305,-3.039971,109.26570,Northern Cold Water
51942,253.90776,0.000719,-3.107174,4.886118,7.411502,102083.305,-3.039971,109.26570,Northern Cold Water
51943,253.90776,0.000719,-3.107174,4.886118,7.411502,102083.305,-3.039971,109.26570,Northern Cold Water
51944,253.90776,0.000719,-3.107174,4.886118,7.411502,102083.305,-3.039971,109.26570,Northern Cold Water


In [64]:
# Import libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

In [65]:
# Define features (X) and target (y)

# Dropping 'realm' from the features column so we can use all other variables as features, and label the target as the 'realm'
X = df.drop(['realm'], axis=1)
y = df['realm']

# Split into testing and training sets
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

In [66]:
# Apply the same transformations as were applied in the previous code
X_train['atm_surface_temp'] = (X_train['atm_surface_temp'])**2
X_test['atm_surface_temp'] = (X_test['atm_surface_temp'])**2

X_train['atm_surface_pressure'] = (X_train['atm_surface_pressure'])**2
X_test['atm_surface_pressure'] = (X_test['atm_surface_pressure'])**2

X_train['hydrostatic_pressure_anomaly'] = np.log(X_train['hydrostatic_pressure_anomaly'])
X_test['hydrostatic_pressure_anomaly'] = np.log(X_test['hydrostatic_pressure_anomaly'])

In [67]:
ohe = OneHotEncoder(sparse_output=False)

# Reshape y_train and y_test to be 2D arrays
y_train_reshaped = y_train.values.reshape(-1, 1)
y_test_reshaped = y_test.values.reshape(-1, 1)

# OneHotEncoder on y_train and y_test data
y_train_encoded = ohe.fit_transform(y_train_reshaped)
y_test_encoded = ohe.transform(y_test_reshaped)

We can now start running supervised learning models on the dataset.

# K-Nearest Neighbors

The reason I chose k-nearest neighbors for this analysis is that, although it works slowly on large datasets, it generally performs well and doesn't assume normality or a lack of relationship between the features in the dataset. I have attempted to transform the above variables to normal distributions, but was unable to get a clear picture.

I will use the original train-test split for this analysis.

In [68]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

I will start by using the default parameters, then conduct a grid search to find the best parameters and test some other ones to find patterns.

In [69]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(X_train, y_train_encoded)
print(f'knn score: {knn.score(X_train, y_train_encoded)}')
knn.predict(X_train)

knn score: 0.6967581303421546


array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]])

In [70]:
# Predicting on test set and checking accuracy score
y_pred = knn.predict(X_test)
accuracy = accuracy_score(y_test_encoded, y_pred)
print("Accuracy:", accuracy)

print(classification_report(y_test_encoded, y_pred))

Accuracy: 0.5377685377685377
              precision    recall  f1-score   support

           0       0.65      0.25      0.36       358
           1       0.67      0.63      0.65      4386
           2       0.71      0.60      0.65      2264
           3       0.60      0.51      0.55      1964
           4       0.63      0.52      0.57      3041
           5       0.74      0.30      0.42       273
           6       0.36      0.17      0.23       334
           7       0.35      0.12      0.18       176
           8       0.15      0.08      0.11        25
           9       0.50      0.11      0.18        83
          10       0.20      0.02      0.04        83

   micro avg       0.65      0.54      0.59     12987
   macro avg       0.50      0.30      0.36     12987
weighted avg       0.64      0.54      0.58     12987
 samples avg       0.54      0.54      0.54     12987



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


I can now check cross-validation scores on the default parameters for this model.

In [71]:
# Cross validation across training data
from sklearn.model_selection import cross_val_score
knn_cv = KNeighborsClassifier()
cv_scores = cross_val_score(knn_cv, X_train, y_train_encoded, cv=5)
print("Cross-validation scores:", cv_scores)
print("Mean cross-validation score:", cv_scores.mean())

Cross-validation scores: [0.47227926 0.46316735 0.46008727 0.45675051 0.4651521 ]
Mean cross-validation score: 0.46348729856515847


Cross-validation did not generate promising results with the default parameters. I will now perform a grid search across the training data to find best parameters.

In [72]:
# Performing grid search for best parameters
from sklearn.model_selection import GridSearchCV
param_grid = {'n_neighbors': np.arange(1, 25)}

knn_gscv = GridSearchCV(knn, param_grid, cv=5)
knn_gscv.fit(X_train, y_train_encoded)

0,1,2
,estimator,KNeighborsClassifier()
,param_grid,"{'n_neighbors': array([ 1, 2..., 22, 23, 24])}"
,scoring,
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,n_neighbors,np.int64(1)
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


In [73]:
# Checking the accuracy score of the best value of n_neighbors according to the Grid Search
knn_gscv.best_score_

np.float64(0.8236606986368994)

In [74]:
# Checking cross-validation results
knn_gscv.cv_results_

{'mean_fit_time': array([0.33138895, 0.14749818, 0.14127092, 0.15488057, 0.13437543,
        0.15185375, 0.24064274, 0.26177278, 0.13469372, 0.07460213,
        0.07387533, 0.0760469 , 0.07300339, 0.07426491, 0.07266078,
        0.07595072, 0.08417087, 0.1170711 , 0.08942523, 0.07169943,
        0.07279472, 0.0716013 , 0.0709662 , 0.07260461]),
 'std_fit_time': array([0.13724071, 0.0422982 , 0.02112345, 0.02763133, 0.02223707,
        0.02624296, 0.09758082, 0.13919227, 0.02489705, 0.00553356,
        0.001658  , 0.00666433, 0.00332758, 0.0055444 , 0.00327445,
        0.00639689, 0.01881022, 0.0048428 , 0.02033529, 0.00188965,
        0.00283265, 0.00144017, 0.0031228 , 0.00248294]),
 'mean_score_time': array([0.2145474 , 0.16696   , 0.208881  , 0.1861527 , 0.18097243,
        0.17934895, 0.33080044, 0.24224558, 0.17382007, 0.09768381,
        0.10777707, 0.09963608, 0.10227013, 0.11247354, 0.11240301,
        0.11625776, 0.14858065, 0.20229621, 0.15666776, 0.12847419,
        0.131915

n_neighbors = 1 ranked best in this model. A value of 10 neighbors fell somewhere in between the best and worst performance, and n_neighbors = 24 was the worst performing model.

In [75]:
# Top two best performing parameters: accuracy
knn1 = KNeighborsClassifier(n_neighbors=1)
knn1.fit(X_train, y_train_encoded)
print(f'knn (1 neighbor) score: {knn1.score(X_test, y_test_encoded)}')
print(f'Classification report (1 neighbor):\n {classification_report(y_test_encoded, knn1.predict(X_test))}')

knn2 = KNeighborsClassifier(n_neighbors=3)
knn2.fit(X_train, y_train_encoded)
print(f'knn (2 neighbors) score: {knn2.score(X_test, y_test_encoded)}')
print(f'Classification report (2 neighbors):\n {classification_report(y_test_encoded, knn2.predict(X_test))}')

knn (1 neighbor) score: 0.8784938784938785
Classification report (1 neighbor):
               precision    recall  f1-score   support

           0       0.72      0.62      0.67       358
           1       0.91      0.92      0.92      4386
           2       0.89      0.88      0.88      2264
           3       0.87      0.89      0.88      1964
           4       0.90      0.90      0.90      3041
           5       0.78      0.77      0.77       273
           6       0.75      0.69      0.72       334
           7       0.68      0.64      0.66       176
           8       0.56      0.56      0.56        25
           9       0.59      0.53      0.56        83
          10       0.56      0.41      0.47        83

   micro avg       0.88      0.88      0.88     12987
   macro avg       0.75      0.71      0.73     12987
weighted avg       0.88      0.88      0.88     12987
 samples avg       0.88      0.88      0.88     12987

knn (2 neighbors) score: 0.6615846615846616
Classific

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [76]:
# Middle-performing parameter: accuracy
knn3 = KNeighborsClassifier(n_neighbors=10)
knn3.fit(X_train, y_train_encoded)
print(f'knn (10 neighbors) score: {knn3.score(X_test, y_test_encoded)}')
print(f'Classification report (10 neighbors):\n {classification_report(y_test_encoded, knn3.predict(X_test))}')

knn (10 neighbors) score: 0.3327173327173327
Classification report (10 neighbors):
               precision    recall  f1-score   support

           0       0.67      0.04      0.08       358
           1       0.63      0.43      0.51      4386
           2       0.70      0.44      0.54      2264
           3       0.61      0.27      0.37      1964
           4       0.60      0.27      0.38      3041
           5       0.98      0.21      0.34       273
           6       0.42      0.02      0.05       334
           7       0.56      0.05      0.09       176
           8       0.00      0.00      0.00        25
           9       0.00      0.00      0.00        83
          10       0.25      0.01      0.02        83

   micro avg       0.64      0.33      0.44     12987
   macro avg       0.49      0.16      0.22     12987
weighted avg       0.63      0.33      0.42     12987
 samples avg       0.33      0.33      0.33     12987



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [77]:
# Worst performing parameter
knn4 = KNeighborsClassifier(n_neighbors=24)
knn4.fit(X_train, y_train_encoded)
print(f'knn (24 neighbors) score: {knn4.score(X_test, y_test_encoded)}')
print(f'Classification report (24 neighbors):\n {classification_report(y_test_encoded, knn4.predict(X_test))}')

knn (24 neighbors) score: 0.24116424116424118
Classification report (24 neighbors):
               precision    recall  f1-score   support

           0       0.00      0.00      0.00       358
           1       0.57      0.34      0.43      4386
           2       0.64      0.31      0.41      2264
           3       0.61      0.19      0.29      1964
           4       0.53      0.16      0.25      3041
           5       1.00      0.20      0.33       273
           6       0.50      0.01      0.02       334
           7       0.00      0.00      0.00       176
           8       0.00      0.00      0.00        25
           9       0.00      0.00      0.00        83
          10       0.00      0.00      0.00        83

   micro avg       0.59      0.24      0.34     12987
   macro avg       0.35      0.11      0.16     12987
weighted avg       0.56      0.24      0.33     12987
 samples avg       0.24      0.24      0.24     12987



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


# Decision Trees and Random Forests

I am going to start by attempting decision tree analysis on the dataset, since it doesn't need much scaling and makes no assumptions about the data. If the tree overfits the data, I will apply the random forest method to attempt more generalization in the model.

Like the KNN model, I will start without defining any parameters, and edit parameters after the grid search has been completed.

In [78]:
from sklearn.tree import DecisionTreeClassifier

# Fitting the decision tree model to the training sets and then calculating accuracy for both training and test sets
tree = DecisionTreeClassifier(random_state = 0)
tree.fit(X_train, y_train_encoded)
print(f'Accuracy on training set: {tree.score(X_train, y_train_encoded)}')
print(f'Accuracy on test set: {tree.score(X_test, y_test_encoded)}')

# Anayzing precision, recall, and f1 scores on the prediction model using the decision tree
print(classification_report(y_test_encoded, tree.predict(X_test)))

Accuracy on training set: 0.9689673759593419
Accuracy on test set: 0.9452529452529452
              precision    recall  f1-score   support

           0       0.80      0.73      0.76       358
           1       0.98      0.98      0.98      4386
           2       0.96      0.97      0.97      2264
           3       0.95      0.94      0.95      1964
           4       0.97      0.96      0.97      3041
           5       0.89      0.85      0.87       273
           6       0.88      0.84      0.86       334
           7       0.87      0.80      0.83       176
           8       0.76      0.52      0.62        25
           9       0.77      0.60      0.68        83
          10       0.63      0.45      0.52        83

   micro avg       0.96      0.95      0.95     12987
   macro avg       0.86      0.79      0.82     12987
weighted avg       0.96      0.95      0.95     12987
 samples avg       0.95      0.95      0.95     12987



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [79]:
# Checking which features were most significant in influencing the model
print(tree.feature_importances_)

[0.10406901 0.0638787  0.12560141 0.18686216 0.0885233  0.14876081
 0.20173016 0.08057444]


In [80]:
param_grid = {'max_depth': np.arange(1, 10)}
tree_gscv = GridSearchCV(tree, param_grid, cv=5)
tree_gscv.fit(X_train, y_train_encoded)

0,1,2
,estimator,DecisionTreeC...andom_state=0)
,param_grid,"{'max_depth': array([1, 2, ..., 6, 7, 8, 9])}"
,scoring,
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,np.int64(9)
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,0
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [81]:
tree_gscv.best_score_

np.float64(0.8169356121434277)

Since the best maximum depth for the decision tree is 9, I will first try to rebuild the model with a max depth of 9. I will then try 5 and 20 as values to determine how much the model might be overfitting to the training set.

In [82]:
# Best performing, max_depth = 9
tree1 = DecisionTreeClassifier(max_depth=9)
tree1.fit(X_train, y_train_encoded)
print(f'Accuracy on training set: {tree1.score(X_train, y_train_encoded)}')
print(f'Accuracy on test set: {tree1.score(X_test, y_test_encoded)}')
print(classification_report(y_test_encoded, tree1.predict(X_test)))

Accuracy on training set: 0.8365204445699325
Accuracy on test set: 0.8298298298298298
              precision    recall  f1-score   support

           0       0.79      0.49      0.60       358
           1       0.89      0.89      0.89      4386
           2       0.90      0.94      0.92      2264
           3       0.80      0.75      0.78      1964
           4       0.86      0.87      0.87      3041
           5       0.76      0.55      0.64       273
           6       0.82      0.45      0.58       334
           7       0.86      0.61      0.71       176
           8       0.88      0.28      0.42        25
           9       0.94      0.19      0.32        83
          10       0.33      0.02      0.04        83

   micro avg       0.87      0.83      0.85     12987
   macro avg       0.80      0.55      0.62     12987
weighted avg       0.86      0.83      0.84     12987
 samples avg       0.83      0.83      0.83     12987



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [83]:
# max_depth = 5
tree2 = DecisionTreeClassifier(max_depth=5)
tree2.fit(X_train, y_train_encoded)
print(f'Accuracy on training set: {tree2.score(X_train, y_train_encoded)}')
print(f'Accuracy on test set: {tree2.score(X_test, y_test_encoded)}')
print(classification_report(y_test_encoded, tree2.predict(X_test)))

Accuracy on training set: 0.6238353140481019
Accuracy on test set: 0.6163856163856164
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       358
           1       0.70      0.77      0.73      4386
           2       0.69      0.89      0.78      2264
           3       0.74      0.51      0.60      1964
           4       0.64      0.51      0.57      3041
           5       0.64      0.19      0.29       273
           6       0.00      0.00      0.00       334
           7       0.00      0.00      0.00       176
           8       0.00      0.00      0.00        25
           9       0.00      0.00      0.00        83
          10       0.00      0.00      0.00        83

   micro avg       0.69      0.62      0.65     12987
   macro avg       0.31      0.26      0.27     12987
weighted avg       0.63      0.62      0.62     12987
 samples avg       0.62      0.62      0.62     12987



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [84]:
# max_depth = 20
tree3 = DecisionTreeClassifier(max_depth=9)
tree3.fit(X_train, y_train_encoded)
print(f'Accuracy on training set: {tree3.score(X_train, y_train_encoded)}')
print(f'Accuracy on test set: {tree3.score(X_test, y_test_encoded)}')
print(classification_report(y_test_encoded, tree3.predict(X_test)))

Accuracy on training set: 0.8365204445699325
Accuracy on test set: 0.8295218295218295
              precision    recall  f1-score   support

           0       0.79      0.49      0.60       358
           1       0.89      0.89      0.89      4386
           2       0.90      0.94      0.92      2264
           3       0.80      0.75      0.78      1964
           4       0.86      0.87      0.87      3041
           5       0.77      0.55      0.64       273
           6       0.82      0.45      0.58       334
           7       0.87      0.60      0.71       176
           8       0.88      0.28      0.42        25
           9       0.94      0.19      0.32        83
          10       0.33      0.02      0.04        83

   micro avg       0.87      0.83      0.85     12987
   macro avg       0.80      0.55      0.62     12987
weighted avg       0.86      0.83      0.84     12987
 samples avg       0.83      0.83      0.83     12987



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


It looks like there isn't much of a difference between a maximum depth of 9 versus 20 in the decision tree, and both models perform relatively well. In order to ensure the least amount of overfitting, it is best to stick with a maximum depth of 9.