In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Load and clean the data
file_path = r"C:\Users\riya kansal\Desktop\2016.xlsx"  # Update this path to your actual file path
data = pd.read_excel(file_path)
data.columns = data.columns.str.strip()

# Display dataset information and missing values
print(data.info())
print(data.isnull().sum())

# Handling missing values: assuming no rainfall data
data.dropna(subset=['NDVI1', 'MaxTemp1', 'MinTemp1'], inplace=True)
data.fillna(data.mean(), inplace=True)

# Define the number of fortnights
num_fortnights = 8

# List to store the combined features and targets
combined_features = []
combined_targets = []

# Loop through each fortnight to collect features and targets
for i in range(1, num_fortnights + 1):
    # Define the features and target for the current fortnight
    target = f'NDVI{i}'
    features = [f'MaxTemp{i}', f'MinTemp{i}', f'DaysMaxTempAbove16{i}',
                f'DaysMaxTempAbove18{i}', f'DaysMaxTempAbove20{i}', f'DaysMaxTempAbove24{i}',
                f'DaysMinTempBelow16{i}', f'DaysMinTempBelow18{i}', f'DaysMinTempBelow20{i}',
                f'DaysMinTempBelow24{i}', f'Percentile99_Max{i}', f'Percentile95_Max{i}',
                f'Percentile90_Max{i}', f'Percentile99_Min{i}', f'Percentile95_Min{i}',
                f'Percentile90_Min{i}']

    # Ensure the features exist in the dataset
    features = [feature for feature in features if feature in data.columns]

    # Collect features and targets for the current fortnight
    if features and target in data.columns:
        combined_features.append(data[features])
        combined_targets.append(data[target])

# Concatenate all features and targets
X = pd.concat(combined_features, axis=0)
y = pd.concat(combined_targets, axis=0)

# Handle missing values
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)

# Feature scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Feature importance using Random Forest
rf = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)

# Get feature importances
importances = rf.feature_importances_
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")
for f in range(X_train.shape[1]):
    print(f"{f + 1}. Feature {indices[f]} ({importances[indices[f]]})")

# Select top features
top_features = indices[:int(0.5 * len(indices))]  # Select top 50% features

# Train with top features
X_train_top = X_train[:, top_features]
X_test_top = X_test[:, top_features]

# Grid search with top features
param_grid = {
    'n_estimators': [50, 100],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [5, 10, 15],
    'min_samples_split': [5, 10],
    'min_samples_leaf': [2, 4]
}

rf = RandomForestRegressor(random_state=42, n_jobs=-1)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train_top, y_train)

# Best parameters and model
best_params = grid_search.best_params_
best_rf = grid_search.best_estimator_

# Predict and evaluate
y_train_pred = best_rf.predict(X_train_top)
y_test_pred = best_rf.predict(X_test_top)

# Training set evaluation
train_mse = mean_squared_error(y_train, y_train_pred)
train_rmse = np.sqrt(train_mse)
train_r2 = r2_score(y_train, y_train_pred)

print(f'Training Mean Squared Error: {train_mse}')
print(f'Training Root Mean Squared Error: {train_rmse}')
print(f'Training R² Score: {train_r2}')

# Testing set evaluation
test_mse = mean_squared_error(y_test, y_test_pred)
test_rmse = np.sqrt(test_mse)
test_r2 = r2_score(y_test, y_test_pred)

print(f'Testing Mean Squared Error: {test_mse}')
print(f'Testing Root Mean Squared Error: {test_rmse}')
print(f'Testing R² Score: {test_r2}')

# Output the predictions and actual values for further analysis
print("Testing Predictions: ", y_test_pred[:10])
print("Testing Actual values: ", y_test[:10].values)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7480 entries, 0 to 7479
Columns: 107 entries, YEAR to Percentile90_Min8
dtypes: float64(46), int64(61)
memory usage: 6.1 MB
None
YEAR                 0
LATITUDE             0
LONGITUDE            0
NDVI1                0
NDVI2                0
                    ..
Percentile90_Min4    0
Percentile90_Min5    0
Percentile90_Min6    0
Percentile90_Min7    0
Percentile90_Min8    0
Length: 107, dtype: int64
Feature ranking:
1. Feature 81 (0.24297649444336372)
2. Feature 88 (0.1872655810663724)
3. Feature 87 (0.18715420830056392)
4. Feature 86 (0.1607671472204285)
5. Feature 5 (0.020050866661316075)
6. Feature 62 (0.018210736092683134)
7. Feature 70 (0.00673528658381051)
8. Feature 13 (0.006681767493860528)
9. Feature 23 (0.006084002559285687)
10. Feature 1 (0.005594438933794619)
11. Feature 60 (0.005534376917918253)
12. Feature 11 (0.005400060619077908)
13. Feature 10 (0.004777610430416774)
14. Feature 94 (0.004729698409715208)
15. Feature

In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Load and clean the data
file_path = r"C:\Users\riya kansal\Desktop\2016.xlsx"  # Update this path to your actual file path
data = pd.read_excel(file_path)
data.columns = data.columns.str.strip()

# Display dataset information and missing values
print(data.info())
print(data.isnull().sum())

# Handling missing values: assuming no rainfall data
data.dropna(subset=['NDVI1', 'MaxTemp1', 'MinTemp1'], inplace=True)
data.fillna(data.mean(), inplace=True)

# Define the number of fortnights
num_fortnights = 8

# List to store the combined features and targets
combined_features = []
combined_targets = []

# Loop through each fortnight to collect features and targets
for i in range(1, num_fortnights + 1):
    # Define the features and target for the current fortnight
    target = f'NDVI{i}'
    features = [f'MaxTemp{i}', f'MinTemp{i}', f'DaysMaxTempAbove16{i}',
                f'DaysMaxTempAbove18{i}', f'DaysMaxTempAbove20{i}', f'DaysMaxTempAbove24{i}',
                f'DaysMinTempBelow16{i}', f'DaysMinTempBelow18{i}', f'DaysMinTempBelow20{i}',
                f'DaysMinTempBelow24{i}', f'Percentile99_Max{i}', f'Percentile95_Max{i}',
                f'Percentile90_Max{i}', f'Percentile99_Min{i}', f'Percentile95_Min{i}',
                f'Percentile90_Min{i}']

    # Ensure the features exist in the dataset
    features = [feature for feature in features if feature in data.columns]

    # Collect features and targets for the current fortnight
    if features and target in data.columns:
        combined_features.append(data[features])
        combined_targets.append(data[target])

# Concatenate all features and targets
X = pd.concat(combined_features, axis=0)
y = pd.concat(combined_targets, axis=0)

# Handle missing values
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)

# Feature scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

# Reshape the data for LSTM
timesteps = num_fortnights  # Number of timesteps (fortnights)
num_features = X_scaled.shape[1] // num_fortnights

X_reshaped = X_scaled.reshape(-1, timesteps, num_features)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_reshaped, y, test_size=0.2, random_state=42)

# Build the LSTM model
model = Sequential()
model.add(LSTM(units=50, return_sequences=True, input_shape=(timesteps, num_features)))
model.add(Dropout(0.2))
model.add(LSTM(units=50))
model.add(Dropout(0.2))
model.add(Dense(1))

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')

# Train the model
history = model.fit(X_train, y_train, epochs=100, batch_size=32, validation_split=0.2, verbose=2)

# Predict and evaluate
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Training set evaluation
train_mse = mean_squared_error(y_train, y_train_pred)
train_rmse = np.sqrt(train_mse)
train_r2 = r2_score(y_train, y_train_pred)

print(f'Training Mean Squared Error: {train_mse}')
print(f'Training Root Mean Squared Error: {train_rmse}')
print(f'Training R² Score: {train_r2}')

# Testing set evaluation
test_mse = mean_squared_error(y_test, y_test_pred)
test_rmse = np.sqrt(test_mse)
test_r2 = r2_score(y_test, y_test_pred)

print(f'Testing Mean Squared Error: {test_mse}')
print(f'Testing Root Mean Squared Error: {test_rmse}')
print(f'Testing R² Score: {test_r2}')

# Output the predictions and actual values for further analysis
print("Testing Predictions: ", y_test_pred[:10].flatten())
print("Testing Actual values: ", y_test[:10].values)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7480 entries, 0 to 7479
Columns: 107 entries, YEAR to Percentile90_Min8
dtypes: float64(46), int64(61)
memory usage: 6.1 MB
None
YEAR                 0
LATITUDE             0
LONGITUDE            0
NDVI1                0
NDVI2                0
                    ..
Percentile90_Min4    0
Percentile90_Min5    0
Percentile90_Min6    0
Percentile90_Min7    0
Percentile90_Min8    0
Length: 107, dtype: int64
Epoch 1/100


  super().__init__(**kwargs)


1197/1197 - 10s - 9ms/step - loss: 0.0180 - val_loss: 0.0079
Epoch 2/100
1197/1197 - 7s - 6ms/step - loss: 0.0089 - val_loss: 0.0064
Epoch 3/100
1197/1197 - 8s - 6ms/step - loss: 0.0075 - val_loss: 0.0061
Epoch 4/100
1197/1197 - 8s - 7ms/step - loss: 0.0068 - val_loss: 0.0058
Epoch 5/100
1197/1197 - 8s - 7ms/step - loss: 0.0063 - val_loss: 0.0057
Epoch 6/100
1197/1197 - 8s - 6ms/step - loss: 0.0061 - val_loss: 0.0056
Epoch 7/100
1197/1197 - 7s - 6ms/step - loss: 0.0058 - val_loss: 0.0055
Epoch 8/100
1197/1197 - 7s - 6ms/step - loss: 0.0057 - val_loss: 0.0057
Epoch 9/100
1197/1197 - 7s - 6ms/step - loss: 0.0056 - val_loss: 0.0055
Epoch 10/100
1197/1197 - 7s - 6ms/step - loss: 0.0055 - val_loss: 0.0055
Epoch 11/100
1197/1197 - 7s - 6ms/step - loss: 0.0055 - val_loss: 0.0053
Epoch 12/100
1197/1197 - 9s - 7ms/step - loss: 0.0053 - val_loss: 0.0052
Epoch 13/100
1197/1197 - 9s - 7ms/step - loss: 0.0053 - val_loss: 0.0052
Epoch 14/100
1197/1197 - 8s - 6ms/step - loss: 0.0052 - val_loss: 0.005