<a href="https://colab.research.google.com/github/saksham-42/Summer-Analytics-2025-Assignments/blob/main/Hackathon_SA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import files
# Ask the Colab runtime for the input files; the value returned by
# files.upload() is kept in `uploaded`.
# NOTE(review): presumably the two CSVs read further down are supplied here --
# confirm when running outside Colab.
uploaded = files.upload()


# Libraries
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from scipy.signal import savgol_filter
from scipy.stats import skew, kurtosis

# Data: training and test tables are read from fixed file names in the
# working directory.
# NOTE(review): the upload log shows re-uploaded files being saved under
# suffixed names (e.g. 'hacktrain (3).csv'), so these fixed paths may pick up
# an older copy -- confirm the intended files are the ones being read.
train = pd.read_csv('hacktrain.csv')
test = pd.read_csv('hacktest.csv')

# NDVI Col Identification: every test column whose name contains '_N'
ndvi_cols = [name for name in test.columns if '_N' in name]

# Seasonal Mapping
def get_season(date_str):
    """Return the season name for a label that starts with ``YYYYMM``.

    The two characters at positions 4 and 5 of ``str(date_str)`` are parsed
    as the month number.

    Returns:
        'winter' (12, 1, 2), 'spring' (3-5), 'summer' (6-8), 'fall' (9-11),
        or 'unknown' when the parsed number is not a month in 1..12.

    Raises:
        ValueError: if those two characters cannot be parsed as an integer.
    """
    label = str(date_str)
    month = int(label[4:6])

    months_by_season = (
        ('winter', (12, 1, 2)),
        ('spring', (3, 4, 5)),
        ('summer', (6, 7, 8)),
        ('fall', (9, 10, 11)),
    )
    for season, months in months_by_season:
        if month in months:
            return season
    return 'unknown'
# Season label for every NDVI column, keyed by column name
season_map = dict(zip(ndvi_cols, map(get_season, ndvi_cols)))

# Preprocessing: fill missing NDVI values by interpolating across each row
# (axis=1), extending the fill in both directions.
for df in (train, test):
    df[ndvi_cols] = df[ndvi_cols].interpolate(limit_direction='both', axis=1)

def sg_smooth(row, window=5, poly=2):
    """Smooth a 1-D sequence with a Savitzky-Golay filter.

    The filter window is ``window`` capped at the largest odd length that
    fits in ``row``, and never less than 3.

    Args:
        row: 1-D sequence of values to smooth.
        window: Requested filter window length.
        poly: Polynomial order passed to ``savgol_filter``.

    Returns:
        The filtered array, or ``row`` itself (unchanged) when the filter
        rejects the input.
    """
    n = len(row)
    # Largest odd length not exceeding the row, capped at the requested window.
    w = min(window, n if n % 2 == 1 else n - 1)
    if w < 3:
        w = 3
    try:
        return savgol_filter(row, window_length=w, polyorder=poly)
    except (ValueError, TypeError):
        # Best-effort fallback: savgol_filter refuses e.g. a window longer
        # than the data or a polynomial order >= the window. Only those
        # input errors are absorbed; KeyboardInterrupt and unrelated bugs
        # now propagate instead of being silently swallowed.
        return row

# Run sg_smooth over every row's NDVI values and write the smoothed series
# back into the same NDVI columns of each table.
for df in (train, test):
    df[ndvi_cols] = df[ndvi_cols].apply(
        lambda series: sg_smooth(series.values),
        axis=1,
        result_type='expand',
    )

# Feature Engineering
def extract_features(df):
    """Build a table of per-row summary features from the NDVI columns.

    Reads the module-level ``ndvi_cols`` (which columns to use, in order)
    and ``season_map`` (column name -> season label).

    Args:
        df: DataFrame containing every column listed in ``ndvi_cols``.

    Returns:
        A new DataFrame with one row per row of ``df`` and one column per
        engineered feature; ``df`` itself is not modified.
    """
    ndvi = df[ndvi_cols]
    n_cols = len(ndvi_cols)
    first_cut = n_cols // 3
    second_cut = 2 * n_cols // 3

    def _slope(series):
        # Degree-1 coefficient of a least-squares line over positions 0..len-1.
        return np.polyfit(range(len(series)), series, 1)[0]

    def _label_prefix(label):
        # Integer parsed from the part of the column label before the first '_'.
        return int(str(label).split('_')[0])

    feats = pd.DataFrame()

    # Distribution statistics over the whole series
    feats['ndvi_mean'] = ndvi.mean(axis=1)
    feats['ndvi_std'] = ndvi.std(axis=1)
    feats['ndvi_min'] = ndvi.min(axis=1)
    feats['ndvi_max'] = ndvi.max(axis=1)
    feats['ndvi_median'] = ndvi.median(axis=1)
    feats['ndvi_amplitude'] = feats['ndvi_max'] - feats['ndvi_min']
    feats['ndvi_skew'] = ndvi.apply(skew, axis=1)
    feats['ndvi_kurtosis'] = ndvi.apply(kurtosis, axis=1)
    feats['ndvi_trend'] = ndvi.apply(_slope, axis=1)
    feats['ndvi_auc'] = ndvi.sum(axis=1)

    # Means over the first, middle and last third of the column list
    feats['ndvi_early_mean'] = df[ndvi_cols[:first_cut]].mean(axis=1)
    feats['ndvi_mid_mean'] = df[ndvi_cols[first_cut:second_cut]].mean(axis=1)
    feats['ndvi_late_mean'] = df[ndvi_cols[second_cut:]].mean(axis=1)

    # Threshold counts and end-point values
    feats['ndvi_above_05'] = (ndvi > 0.5).sum(axis=1)
    feats['ndvi_below_0'] = (ndvi < 0).sum(axis=1)
    feats['ndvi_first'] = ndvi.iloc[:, 0]
    feats['ndvi_last'] = ndvi.iloc[:, -1]
    feats['ndvi_diff_first_last'] = feats['ndvi_last'] - feats['ndvi_first']

    # Label prefix (as an integer) of the column holding each row's maximum
    feats['ndvi_peak_time'] = ndvi.idxmax(axis=1).apply(_label_prefix)

    # --- Seasonal means ---
    for season in ('winter', 'spring', 'summer', 'fall'):
        members = [name for name in ndvi_cols if season_map[name] == season]
        if members:
            feats[f'ndvi_{season}_mean'] = df[members].mean(axis=1)
        else:
            # No column falls in this season: constant placeholder.
            feats[f'ndvi_{season}_mean'] = 0
    return feats

# Feature matrices for both tables, and the training labels
X_train = extract_features(train)
y_train = train['class']
X_test = extract_features(test)

# Feature Scaling: statistics are fitted on the training features only and
# then reused for the test features.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Feature Expansion: degree-2 polynomial terms of the scaled features
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly.fit_transform(X_train_scaled)
X_test_poly = poly.transform(X_test_scaled)

# Hyperparameter Tuning with GridSearchCV
param_grid = {
    'C': [0.01, 0.1, 1],
    'solver': ['lbfgs', 'saga'],
    'max_iter': [500, 1000]
}
# `multi_class` is deliberately not passed: the argument is deprecated in
# recent scikit-learn releases, and for a multiclass target the lbfgs and
# saga solvers already fit a multinomial model by default.
# NOTE(review): the recorded output of this cell shows lbfgs
# "TOTAL NO. OF ITERATIONS REACHED LIMIT" warnings, i.e. the selected model
# did not converge within max_iter -- consider larger values and/or
# rescaling the expanded features.
grid = GridSearchCV(
    LogisticRegression(class_weight='balanced', random_state=42),
    param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid.fit(X_train_poly, y_train)

model = grid.best_estimator_
print("Best parameters:", grid.best_params_)

# Validation with Stratified K-Fold: the selected estimator is refitted on
# each training split and scored on the held-out split.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = []
for train_idx, val_idx in cv.split(X_train_poly, y_train):
    model.fit(X_train_poly[train_idx], y_train.iloc[train_idx])
    preds = model.predict(X_train_poly[val_idx])
    acc = accuracy_score(y_train.iloc[val_idx], preds)
    scores.append(acc)
    print("\nFold Classification Report:\n", classification_report(y_train.iloc[val_idx], preds))
    print("Confusion Matrix:\n", confusion_matrix(y_train.iloc[val_idx], preds))
print(f"\nAverage CV Accuracy: {np.mean(scores):.4f} ± {np.std(scores):.4f}")

# Model Training: final fit on the full training set (the fold loop above
# left the estimator fitted on the last training split only).
model.fit(X_train_poly, y_train)

# Prediction on Test Data
test['class'] = model.predict(X_test_poly)

# Submission file + Download
submission = test[['ID', 'class']]
submission.to_csv('submission.csv', index=False)
print("Submission file 'submission.csv' is ready!")
files.download('submission.csv')

Saving hacktest.csv to hacktest (3).csv
Saving hacktrain.csv to hacktrain (3).csv


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Best parameters: {'C': 1, 'max_iter': 500, 'solver': 'lbfgs'}


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Fold Classification Report:
               precision    recall  f1-score   support

        farm       0.49      0.72      0.59       168
      forest       0.97      0.81      0.88      1232
       grass       0.29      0.82      0.43        39
  impervious       0.73      0.81      0.77       134
     orchard       0.07      0.50      0.13         6
       water       0.56      0.71      0.62        21

    accuracy                           0.80      1600
   macro avg       0.52      0.73      0.57      1600
weighted avg       0.88      0.80      0.83      1600

Confusion Matrix:
 [[121  18   9  11   5   4]
 [112 998  64  25  26   7]
 [  3   1  32   2   1   0]
 [  8   6   5 109   5   1]
 [  0   3   0   0   3   0]
 [  1   0   1   3   1  15]]


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Fold Classification Report:
               precision    recall  f1-score   support

        farm       0.44      0.75      0.55       168
      forest       0.97      0.78      0.86      1232
       grass       0.22      0.59      0.32        39
  impervious       0.70      0.79      0.74       134
     orchard       0.03      0.17      0.06         6
       water       0.39      0.81      0.52        21

    accuracy                           0.77      1600
   macro avg       0.46      0.65      0.51      1600
weighted avg       0.86      0.77      0.80      1600

Confusion Matrix:
 [[126  15   9   9   4   5]
 [146 955  70  28  24   9]
 [  4   4  23   7   0   1]
 [  7   5   4 106   0  12]
 [  2   3   0   0   1   0]
 [  2   0   0   2   0  17]]


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Fold Classification Report:
               precision    recall  f1-score   support

        farm       0.48      0.74      0.59       168
      forest       0.97      0.81      0.88      1232
       grass       0.24      0.72      0.36        39
  impervious       0.74      0.75      0.74       134
     orchard       0.03      0.17      0.05         6
       water       0.47      0.76      0.58        21

    accuracy                           0.79      1600
   macro avg       0.49      0.66      0.53      1600
weighted avg       0.87      0.79      0.82      1600

Confusion Matrix:
 [[125  23   6   9   4   1]
 [119 992  66  20  26   9]
 [  0   4  28   5   0   2]
 [ 11   4  14 100   1   4]
 [  2   1   0   0   1   2]
 [  1   2   1   1   0  16]]


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Fold Classification Report:
               precision    recall  f1-score   support

        farm       0.43      0.68      0.52       169
      forest       0.96      0.81      0.88      1232
       grass       0.24      0.62      0.34        39
  impervious       0.74      0.81      0.78       133
     orchard       0.04      0.17      0.07         6
       water       0.47      0.67      0.55        21

    accuracy                           0.78      1600
   macro avg       0.48      0.62      0.52      1600
weighted avg       0.86      0.78      0.81      1600

Confusion Matrix:
 [[115  28   9  11   3   3]
 [135 992  59  17  20   9]
 [  8   2  24   5   0   0]
 [  6   6   9 108   0   4]
 [  5   0   0   0   1   0]
 [  1   2   0   4   0  14]]


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Fold Classification Report:
               precision    recall  f1-score   support

        farm       0.45      0.76      0.56       168
      forest       0.97      0.77      0.86      1231
       grass       0.30      0.88      0.45        40
  impervious       0.72      0.77      0.74       134
     orchard       0.04      0.33      0.08         6
       water       0.38      0.57      0.45        21

    accuracy                           0.77      1600
   macro avg       0.48      0.68      0.52      1600
weighted avg       0.87      0.77      0.80      1600

Confusion Matrix:
 [[127  22   6   4   4   5]
 [143 949  62  27  40  10]
 [  1   1  35   3   0   0]
 [  8   5  12 103   1   5]
 [  2   1   0   1   2   0]
 [  2   1   1   5   0  12]]

Average CV Accuracy: 0.7812 ± 0.0122
Submission file 'submission.csv' is ready!


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>