In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Load and clean the data
file_path = r"C:\Users\riya kansal\Desktop\2016.xlsx"  # Update this path to your actual file path
data = pd.read_excel(file_path)
data.columns = data.columns.str.strip()

# Display dataset information and missing values
print(data.info())
print(data.isnull().sum())

# Handling missing values: assuming no rainfall data
data.dropna(subset=['NDVI1', 'MaxTemp1', 'MinTemp1'], inplace=True)
data.fillna(data.mean(), inplace=True)

# Define the number of fortnights
num_fortnights = 8

# List to store the combined features and targets
combined_features = []
combined_targets = []

# Loop through each fortnight to collect features and targets
for i in range(1, num_fortnights + 1):
    # Define the features and target for the current fortnight
    target = f'NDVI{i}'
    features = [f'MaxTemp{i}', f'MinTemp{i}', f'DaysMaxTempAbove16{i}',
                f'DaysMaxTempAbove18{i}', f'DaysMaxTempAbove20{i}', f'DaysMaxTempAbove24{i}',
                f'DaysMinTempBelow16{i}', f'DaysMinTempBelow18{i}', f'DaysMinTempBelow20{i}',
                f'DaysMinTempBelow24{i}', f'Percentile99_Max{i}', f'Percentile95_Max{i}',
                f'Percentile90_Max{i}', f'Percentile99_Min{i}', f'Percentile95_Min{i}',
                f'Percentile90_Min{i}']

    # Ensure the features exist in the dataset
    features = [feature for feature in features if feature in data.columns]

    # Collect features and targets for the current fortnight
    if features and target in data.columns:
        combined_features.append(data[features])
        combined_targets.append(data[target])

# Concatenate all features and targets
X = pd.concat(combined_features, axis=0)
y = pd.concat(combined_targets, axis=0)

# Handle missing values
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)

# Feature scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Define the Gradient Boosting model
gbr = GradientBoostingRegressor(random_state=42)

# Define the parameter grid for grid search
param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 0.9, 1.0]
}

# Grid search for the best parameters
grid_search = GridSearchCV(estimator=gbr, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Best parameters and model
best_params = grid_search.best_params_
best_gbr = grid_search.best_estimator_

# Predict and evaluate
y_train_pred = best_gbr.predict(X_train)
y_test_pred = best_gbr.predict(X_test)

# Training set evaluation
train_mse = mean_squared_error(y_train, y_train_pred)
train_rmse = np.sqrt(train_mse)
train_r2 = r2_score(y_train, y_train_pred)

print(f'Training Mean Squared Error: {train_mse}')
print(f'Training Root Mean Squared Error: {train_rmse}')
print(f'Training R² Score: {train_r2}')

# Testing set evaluation
test_mse = mean_squared_error(y_test, y_test_pred)
test_rmse = np.sqrt(test_mse)
test_r2 = r2_score(y_test, y_test_pred)

print(f'Testing Mean Squared Error: {test_mse}')
print(f'Testing Root Mean Squared Error: {test_rmse}')
print(f'Testing R² Score: {test_r2}')

# Output the predictions and actual values for further analysis
print("Testing Predictions: ", y_test_pred[:10])
print("Testing Actual values: ", y_test[:10].values)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7480 entries, 0 to 7479
Columns: 107 entries, YEAR to Percentile90_Min8
dtypes: float64(46), int64(61)
memory usage: 6.1 MB
None
YEAR                 0
LATITUDE             0
LONGITUDE            0
NDVI1                0
NDVI2                0
                    ..
Percentile90_Min4    0
Percentile90_Min5    0
Percentile90_Min6    0
Percentile90_Min7    0
Percentile90_Min8    0
Length: 107, dtype: int64
Fitting 5 folds for each of 54 candidates, totalling 270 fits
Training Mean Squared Error: 0.003594767398394434
Training Root Mean Squared Error: 0.05995637913011787
Training R² Score: 0.8701129980709474
Testing Mean Squared Error: 0.0038493135782567737
Testing Root Mean Squared Error: 0.06204283663934761
Testing R² Score: 0.860968739167267
Testing Predictions:  [0.69681312 0.82334829 0.68462096 0.36955101 0.6548771  0.73600916
 0.7859916  0.43496949 0.40117685 0.70927358]
Testing Actual values:  [0.65300006 0.80700004 0.72900003 0.4080