In [2]:
import pandas as pd
df = pd.read_csv('SST_Chlo_SSS_Wind.csv')
df

Unnamed: 0,CHL,SST,SSS,WIND
0,1.299,21.105,40.098,5.965
1,0.828,21.814,40.108,1.531
2,1.099,21.556,40.104,4.903
3,1.385,20.402,40.102,3.410
4,1.122,21.297,40.087,3.432
...,...,...,...,...
7279,2.258,24.238,39.563,1.381
7280,0.861,23.999,39.566,7.925
7281,0.825,23.772,39.562,5.035
7282,1.809,23.569,39.566,4.628


In [16]:
def detect_outliers(column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = df[(df[column] < lower_bound) | (df[column] > upper_bound)]
    return outliers
outliers_sst = detect_outliers('SST')
outliers_sss = detect_outliers('SSS')
outliers_wind = detect_outliers('WIND')
outliers_chl = detect_outliers('CHL')

# Display outliers
print("Outliers in SST:")
print(outliers_sst)

print("\nOutliers in SSS:")
print(outliers_sss)

print("\nOutliers in WIND:")
print(outliers_wind)

print("\nOutliers in CHL:")
print(outliers_chl)

Outliers in SST:
Empty DataFrame
Columns: [CHL, SST, SSS, WIND]
Index: []

Outliers in SSS:
        CHL     SST    SSS   WIND
2762  0.931  32.556  19.98  4.801
5520  1.385  19.586  20.00  4.874

Outliers in WIND:
        CHL     SST     SSS    WIND
16    1.429  20.131  40.035   8.468
46    1.975  21.249  39.949  10.510
57    0.997  19.922  39.931   9.432
58    1.177  19.844  39.969   9.312
70    1.725  20.417  39.941  11.179
...     ...     ...     ...     ...
7067  1.672  26.745  39.052   8.404
7079  0.872  30.549  38.950   9.634
7080  0.841  30.091  38.961  11.241
7177  1.585  33.698  39.126   9.238
7262  0.830  26.105  39.495   8.428

[148 rows x 4 columns]

Outliers in CHL:
         CHL     SST     SSS   WIND
15     4.030  19.937  40.043  4.155
19     2.340  21.001  40.079  2.733
20     3.640  20.813  40.074  4.747
33     2.227  19.973  40.033  3.711
37    11.989  20.574  39.997  3.184
...      ...     ...     ...    ...
7123   3.114  33.574  38.979  4.732
7191   4.719  31.551  39.

In [18]:
def remove_outliers(column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df_no_outliers = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]
    return df_no_outliers

# Remove outliers for each column
df = remove_outliers('SST')
df = remove_outliers('SSS')
df_ = remove_outliers('WIND')
df = remove_outliers('CHL')
df

Unnamed: 0,CHL,SST,SSS,WIND
0,1.299,21.105,40.098,5.965
1,0.828,21.814,40.108,1.531
2,1.099,21.556,40.104,4.903
3,1.385,20.402,40.102,3.410
4,1.122,21.297,40.087,3.432
...,...,...,...,...
7277,0.796,24.446,39.570,5.593
7280,0.861,23.999,39.566,7.925
7281,0.825,23.772,39.562,5.035
7282,1.809,23.569,39.566,4.628


In [19]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

X = df[['SST', 'SSS', 'WIND']]
y = df['CHL']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

model = DecisionTreeRegressor()

model.fit(X_train, y_train)

predictions = model.predict(X_test)

mse = mean_squared_error(y_test, predictions)

print(f'Mean Squared Error: {mse}')


Mean Squared Error: 0.22703567871287128


In [20]:
# Import necessary libraries
from sklearn.metrics import r2_score

# ... (previous code)

# Evaluate the model with R-squared
r2 = r2_score(y_test, predictions)
print(f'R-squared: {r2}')

# ... (remaining code)


R-squared: -0.7413489635087245


In [21]:
from sklearn.tree import export_text

# ... (previous code)

# Export rules from the decision tree model
tree_rules = export_text(model, feature_names=list(X.columns))
print(tree_rules)


|--- SSS <= 39.17
|   |--- SST <= 30.67
|   |   |--- SST <= 22.06
|   |   |   |--- SST <= 21.00
|   |   |   |   |--- SST <= 19.94
|   |   |   |   |   |--- value: [1.42]
|   |   |   |   |--- SST >  19.94
|   |   |   |   |   |--- value: [1.20]
|   |   |   |--- SST >  21.00
|   |   |   |   |--- value: [1.79]
|   |   |--- SST >  22.06
|   |   |   |--- WIND <= 6.18
|   |   |   |   |--- WIND <= 5.90
|   |   |   |   |   |--- SST <= 25.03
|   |   |   |   |   |   |--- WIND <= 3.84
|   |   |   |   |   |   |   |--- SSS <= 39.02
|   |   |   |   |   |   |   |   |--- SSS <= 38.99
|   |   |   |   |   |   |   |   |   |--- SST <= 23.61
|   |   |   |   |   |   |   |   |   |   |--- value: [1.06]
|   |   |   |   |   |   |   |   |   |--- SST >  23.61
|   |   |   |   |   |   |   |   |   |   |--- value: [1.19]
|   |   |   |   |   |   |   |   |--- SSS >  38.99
|   |   |   |   |   |   |   |   |   |--- value: [1.44]
|   |   |   |   |   |   |   |--- SSS >  39.02
|   |   |   |   |   |   |   |   |--- WIND <= 2.49


In [22]:

# Evaluate the model with modified R2 score
mean_chl = y_train.mean()
ss_total = ((y_test - mean_chl) ** 2).sum()
ss_residual = (predictions - y_test).dot(predictions - y_test)

modified_r2 = 1 - (ss_residual / ss_total)
print(f'Modified R-squared: {modified_r2}')



Modified R-squared: -0.7400844450616137


In [23]:

y_pred = model.predict(X_test)