In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

In [4]:
# import os
# from pathlib import Path

# print("CWD:", os.getcwd())
# print("Here files:", [p.name for p in Path(".").iterdir()][:30])

CWD: /Users/richard/1B A/EYC/Models/BaseData
Here files: ['.ipynb_checkpoints', 'linear_regression.ipynb']


In [11]:
water_quality_data = pd.read_csv("water_quality_training_dataset.csv")
terraclimate_data  = pd.read_csv("terraclimate_features_training.csv")
landsat_data       = pd.read_csv("landsat_features_training.csv")

In [14]:
print("Water quality columns:", water_quality_data.columns.tolist())
print("TerraClimate columns:", terraclimate_data.columns.tolist())
print("Landsat columns:", landsat_data.columns.tolist())

Water quality columns: ['Latitude', 'Longitude', 'Sample Date', 'Total Alkalinity', 'Electrical Conductance', 'Dissolved Reactive Phosphorus']
TerraClimate columns: ['Latitude', 'Longitude', 'Sample Date', 'pet']
Landsat columns: ['Latitude', 'Longitude', 'Sample Date', 'nir', 'green', 'swir16', 'swir22', 'NDMI', 'MNDWI']


In [16]:
# Merge datasets
merged_data = water_quality_data.merge(
    terraclimate_data, 
    on=['Latitude', 'Longitude', 'Sample Date'],  
    how='left'
).merge(
    landsat_data, 
    on=['Latitude', 'Longitude', 'Sample Date'],  
    how='left'
)

print("Merged shape:", merged_data.shape)
print(merged_data.head())
print("Columns:", merged_data.columns.tolist())

Merged shape: (9319, 13)
    Latitude  Longitude Sample Date  Total Alkalinity  Electrical Conductance  \
0 -28.760833  17.730278  02-01-2011           128.912                   555.0   
1 -26.861111  28.884722  03-01-2011            74.720                   162.9   
2 -26.450000  28.085833  03-01-2011            89.254                   573.0   
3 -27.671111  27.236944  03-01-2011            82.000                   203.6   
4 -27.356667  27.286389  03-01-2011            56.100                   145.1   

   Dissolved Reactive Phosphorus    pet      nir    green   swir16   swir22  \
0                           10.0  174.2  11190.0  11426.0   7687.5   7645.0   
1                          163.0  124.1  17658.5   9550.0  13746.5  10574.0   
2                           80.0  127.5  15210.0  10720.0  17974.0  14201.0   
3                          101.0  129.7  14887.0  10943.0  13522.0  11403.0   
4                          151.0  129.2  16828.5   9502.5  12665.5   9643.0   

       NDMI  

In [17]:

targets = ['Total Alkalinity', 'Electrical Conductance', 'Dissolved Reactive Phosphorus']

X = merged_data.drop(columns=targets + ['Sample Date'])  
y = merged_data[targets]


print("Missing values in X:\n", X.isnull().sum())
print("\nX shape:", X.shape)
print("y shape:", y.shape)

X = X.fillna(X.mean())

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
results = {}

for target in targets:
    print(f"\n{'='*50}")
    print(f"Training model for: {target}")
    print('='*50)
    
    # Train model
    model = LinearRegression()
    model.fit(X_train, y_train[target])
    
    # Predict
    y_pred = model.predict(X_test)
    
    # Evaluate
    mse = mean_squared_error(y_test[target], y_pred)
    r2 = r2_score(y_test[target], y_pred)
    
    results[target] = {'MSE': mse, 'R2': r2}
    
    print(f"MSE: {mse:.4f}")
    print(f"R² Score: {r2:.4f}")


print(f"\n{'='*50}")
print("SUMMARY - Linear Regression Results")
print('='*50)
for target, metrics in results.items():
    print(f"{target}: R² = {metrics['R2']:.4f}")

avg_r2 = sum([m['R2'] for m in results.values()]) / len(results)
print(f"\nAverage R² Score: {avg_r2:.4f}")

Missing values in X:
 Latitude        0
Longitude       0
pet             0
nir          1085
green        1085
swir16       1085
swir22       1085
NDMI         1085
MNDWI        1085
dtype: int64

X shape: (9319, 9)
y shape: (9319, 3)

Training model for: Total Alkalinity
MSE: 4553.5360
R² Score: 0.2090

Training model for: Electrical Conductance
MSE: 98022.4721
R² Score: 0.1789

Training model for: Dissolved Reactive Phosphorus
MSE: 2437.0925
R² Score: 0.0919

SUMMARY - Linear Regression Results
Total Alkalinity: R² = 0.2090
Electrical Conductance: R² = 0.1789
Dissolved Reactive Phosphorus: R² = 0.0919

Average R² Score: 0.1599


In [22]:
submission_template = pd.read_csv("submission_template.csv")
print("Submission template shape:", submission_template.shape)
print(submission_template.head())

Submission template shape: (200, 6)
    Latitude  Longitude Sample Date  Total Alkalinity  Electrical Conductance  \
0 -32.043333  27.822778  01-09-2014               NaN                     NaN   
1 -33.329167  26.077500  16-09-2015               NaN                     NaN   
2 -32.991639  27.640028  07-05-2015               NaN                     NaN   
3 -34.096389  24.439167  07-02-2012               NaN                     NaN   
4 -32.000556  28.581667  01-10-2014               NaN                     NaN   

   Dissolved Reactive Phosphorus  
0                            NaN  
1                            NaN  
2                            NaN  
3                            NaN  
4                            NaN  


# Submission Template

- Test the model R^2 using submission template

In [42]:
submission_template = pd.read_csv("submission_template.csv")
print("Submission template shape:", submission_template.shape)
print(submission_template.head())

Submission template shape: (200, 6)
    Latitude  Longitude Sample Date  Total Alkalinity  Electrical Conductance  \
0 -32.043333  27.822778  01-09-2014               NaN                     NaN   
1 -33.329167  26.077500  16-09-2015               NaN                     NaN   
2 -32.991639  27.640028  07-05-2015               NaN                     NaN   
3 -34.096389  24.439167  07-02-2012               NaN                     NaN   
4 -32.000556  28.581667  01-10-2014               NaN                     NaN   

   Dissolved Reactive Phosphorus  
0                            NaN  
1                            NaN  
2                            NaN  
3                            NaN  
4                            NaN  


In [43]:
submission_features = submission_template.merge(
    terraclimate_data,
    on=['Latitude', 'Longitude', 'Sample Date'],
    how='left'
).merge(
    landsat_data,
    on=['Latitude', 'Longitude', 'Sample Date'],
    how='left'
)

print("Submission features shape:", submission_features.shape)
print("Columns:", submission_features.columns.tolist())

Submission features shape: (200, 13)
Columns: ['Latitude', 'Longitude', 'Sample Date', 'Total Alkalinity', 'Electrical Conductance', 'Dissolved Reactive Phosphorus', 'pet', 'nir', 'green', 'swir16', 'swir22', 'NDMI', 'MNDWI']


In [44]:
# Keep only the same columns as training X
submission_X = submission_features[X.columns]
submission_X = submission_X.fillna(submission_X.mean())

print("Submission X shape:", submission_X.shape)
print("Any missing values:", submission_X.isnull().sum().sum())

Submission X shape: (200, 9)
Any missing values: 1400


In [46]:
# Prepare submission features (X)
submission_X = submission_features.drop(columns=['Sample Date'])

# Keep only matching columns in the same order
submission_X = submission_X[X.columns]

# Fill missing values - use training data statistics
for col in submission_X.columns:
    if submission_X[col].isnull().any():
        # Use the mean from training data, not submission data
        submission_X[col] = submission_X[col].fillna(X[col].mean())

print(f"Submission X shape: {submission_X.shape}")
print(f"Any NaN left: {submission_X.isnull().sum().sum()}")

Submission X shape: (200, 9)
Any NaN left: 0


In [48]:
# Display all columns including predictions
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
print(final_predictions.head())
print("\nColumns in file:", final_predictions.columns.tolist())
print("\nShape:", final_predictions.shape)

    Latitude  Longitude Sample Date
0 -32.043333  27.822778  01-09-2014
1 -33.329167  26.077500  16-09-2015
2 -32.991639  27.640028  07-05-2015
3 -34.096389  24.439167  07-02-2012
4 -32.000556  28.581667  01-10-2014

Columns in file: ['Latitude', 'Longitude', 'Sample Date']

Shape: (200, 3)


In [49]:
# Display the submission file with all columns visible
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_rows', 20)

print("Submission predictions:")
print(final_predictions)

Submission predictions:
      Latitude  Longitude Sample Date
0   -32.043333  27.822778  01-09-2014
1   -33.329167  26.077500  16-09-2015
2   -32.991639  27.640028  07-05-2015
3   -34.096389  24.439167  07-02-2012
4   -32.000556  28.581667  01-10-2014
..         ...        ...         ...
195 -33.771111  25.386667  06-12-2012
196 -33.185361  27.390750  04-09-2014
197 -32.043333  27.822778  28-09-2015
198 -33.001667  25.161389  08-01-2015
199 -33.237780  26.994720  27-03-2013

[200 rows x 3 columns]


In [50]:
final_predictions

Unnamed: 0,Latitude,Longitude,Sample Date
0,-32.043333,27.822778,01-09-2014
1,-33.329167,26.077500,16-09-2015
2,-32.991639,27.640028,07-05-2015
3,-34.096389,24.439167,07-02-2012
4,-32.000556,28.581667,01-10-2014
...,...,...,...
195,-33.771111,25.386667,06-12-2012
196,-33.185361,27.390750,04-09-2014
197,-32.043333,27.822778,28-09-2015
198,-33.001667,25.161389,08-01-2015
