<a href="https://colab.research.google.com/github/pranavbakshi/Summer_Analytics/blob/main/summer_analytics_mid_hackathon.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from scipy.signal import savgol_filter
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt


def preprocess_ndvi(df, is_train=True, class_map=None, *, window_length=5, polyorder=2):
    """Gap-fill and smooth the NDVI time-series columns of ``df``.

    NDVI columns are the columns whose name contains ``"_N"``. Each row is
    treated as one time series, in the existing column order:

    1. Gaps are filled along the row by linear interpolation, followed by a
       backward and a forward fill for the edges. After this step a row is
       either fully populated or entirely NaN.
    2. Fully populated rows are smoothed with a Savitzky-Golay filter.
       Smoothing is skipped when there are fewer NDVI columns than
       ``window_length``; the filled values are then kept unchanged.
    3. When ``is_train`` is false and ``class_map`` is non-empty, any NaN
       still present in an NDVI column is replaced by ``class_map[col]``.
       Columns missing from ``class_map`` are reported with a printed
       warning and left untouched.

    Args:
        df: Input frame. Its NDVI columns are overwritten in place.
        is_train: Only consulted to decide whether ``class_map`` is applied.
        class_map: Optional mapping of NDVI column name to fill value.
        window_length: Savitzky-Golay window size (keyword-only).
        polyorder: Savitzky-Golay polynomial order (keyword-only).

    Returns:
        The same ``df`` object that was passed in.
    """
    ndvi_cols = [col for col in df.columns if "_N" in col]
    if not ndvi_cols:
        # Nothing to process; avoid indexing with an empty column list.
        return df

    filled = df[ndvi_cols].interpolate(axis=1).bfill(axis=1).ffill(axis=1)
    # Explicit float copy: the array is written to below and must not alias
    # (possibly read-only) DataFrame storage.
    values = filled.to_numpy(dtype=float, copy=True)

    # Rows that still contain NaN here are entirely NaN and cannot be smoothed.
    complete = ~np.isnan(values).any(axis=1)
    if values.shape[1] >= window_length and complete.any():
        values[complete] = savgol_filter(
            values[complete], window_length, polyorder, axis=1
        )

    df[ndvi_cols] = values

    if not is_train and class_map:
        for col in ndvi_cols:
            if col in class_map:
                df[col] = df[col].fillna(class_map[col])
            else:
                print(f"Warning: Column {col} not found in class_map. Cannot fill NaNs.")

    return df

# Load the raw splits from the Colab working directory.
train = pd.read_csv("/content/hacktrain.csv")
test = pd.read_csv("/content/hacktest.csv")

# Clean the training series first; the test fill values are derived from it.
train = preprocess_ndvi(train.copy())
ndvi_cols = [name for name in train.columns if "_N" in name]

# Per-column fill value for the test set: the median NDVI of each class,
# averaged across classes. Stays None when the labels are unavailable.
class_ndvi_map = None
if 'class' not in train.columns:
    print("Warning: 'class' column not found in train after initial load. Cannot create class_ndvi_map.")
else:
    per_class_median = train.groupby('class')[ndvi_cols].median()
    class_ndvi_map = per_class_median.mean().to_dict()

test = preprocess_ndvi(test.copy(), is_train=False, class_map=class_ndvi_map)


def extract_features(df):
    """Build per-row summary features from the NDVI time-series columns.

    NDVI columns are the columns whose name contains ``"_N"``; their order in
    ``df`` is taken as the time order. The result has one row per input row
    and these columns, in this order:

    * ``ndvi_mean``, ``ndvi_std``, ``ndvi_max``, ``ndvi_min``: row statistics.
    * ``ndvi_trend``: slope of a degree-1 least-squares fit of value against
      column position, using non-NaN points only (NaN if fewer than two).
    * ``ndvi_range``: ``ndvi_max - ndvi_min``.
    * ``seg_{i}_mean``, ``seg_{i}_std``, ``seg_{i}_max`` for ``i`` in 0..2:
      the same statistics over three consecutive slices of the series.
    * ``ndvi_peak_time``: ``int(name[:6])`` of the column holding the row
      maximum (NaN if the row is all NaN or the prefix is not an integer).
    * ``ndvi_peak_month``: ``ndvi_peak_time % 100``.

    Args:
        df: Frame containing the NDVI columns.

    Returns:
        A new DataFrame of features. It is empty (no columns) when ``df`` has
        no NDVI columns or no rows; a warning is printed in both cases.
    """
    ndvi_cols = [col for col in df.columns if "_N" in col]
    if not ndvi_cols:
        print("Warning: No NDVI columns found for feature extraction.")
        return pd.DataFrame()

    ndvi_vals = df[ndvi_cols]
    stats = pd.DataFrame()
    if ndvi_vals.empty:
        print("Warning: ndvi_vals is empty after selecting columns. No stats extracted.")
        return stats

    n_steps = len(ndvi_cols)
    positions = np.arange(n_steps)

    def _trend(row):
        """Return the linear-fit slope over the non-NaN points of ``row``."""
        values = row.to_numpy(dtype=float)
        valid = ~np.isnan(values)
        if valid.sum() < 2:
            return np.nan
        return np.polyfit(positions[valid], values[valid], 1)[0]

    def _peak_time(row):
        """Return the integer prefix of the column holding the row maximum."""
        values = row.to_numpy(dtype=float)
        if np.isnan(values).all():
            return np.nan
        # nanargmax, not argmax: argmax would return the position of the
        # first NaN whenever the row is only partially populated.
        peak_col = ndvi_cols[int(np.nanargmax(values))]
        try:
            return int(peak_col[:6])
        except ValueError:
            # Column name does not start with a six-digit number.
            return np.nan

    stats['ndvi_mean'] = ndvi_vals.mean(axis=1)
    stats['ndvi_std'] = ndvi_vals.std(axis=1)
    stats['ndvi_max'] = ndvi_vals.max(axis=1)
    stats['ndvi_min'] = ndvi_vals.min(axis=1)
    stats['ndvi_trend'] = ndvi_vals.apply(_trend, axis=1)
    stats['ndvi_range'] = stats['ndvi_max'] - stats['ndvi_min']

    # Split the series into three consecutive slices (early / mid / late).
    cut1, cut2 = n_steps // 3, 2 * n_steps // 3
    for i, (start, end) in enumerate([(0, cut1), (cut1, cut2), (cut2, n_steps)]):
        segment = ndvi_vals.iloc[:, start:end]
        if segment.empty:
            # Fewer than three NDVI columns can leave a slice with no columns.
            stats[f'seg_{i}_mean'] = np.nan
            stats[f'seg_{i}_std'] = np.nan
            stats[f'seg_{i}_max'] = np.nan
        else:
            stats[f'seg_{i}_mean'] = segment.mean(axis=1)
            stats[f'seg_{i}_std'] = segment.std(axis=1)
            stats[f'seg_{i}_max'] = segment.max(axis=1)

    stats['ndvi_peak_time'] = ndvi_vals.apply(_peak_time, axis=1)
    # NaN propagates through the modulo, so missing peaks stay missing.
    stats['ndvi_peak_month'] = stats['ndvi_peak_time'] % 100

    return stats

# Build the feature matrices for both splits.
X_train = extract_features(train)
X_test = extract_features(test)

# Training-set column means are the only source of imputation values.
# Compute them unconditionally: the test features may contain NaNs even when
# the training features do not, and the test branch below needs this name.
imputation_means = X_train.mean()

if X_train.isnull().any().any():
    print("Warning: NaNs found in X_train after feature extraction. Imputing with mean.")
    X_train = X_train.fillna(imputation_means)

if X_test.isnull().any().any():
    print("Warning: NaNs found in X_test after feature extraction. Imputing with mean from train data.")
    X_test = X_test.fillna(imputation_means)


# Encode the string labels; `y` stays None when no labels are available so
# the training step below can be skipped explicitly.
le = LabelEncoder()
y = None
if 'class' in train.columns:
    y = le.fit_transform(train['class'])
else:
    print("Error: 'class' column not found in train. Cannot fit LabelEncoder.")


# Standardise, then keep the principal components explaining 95% of the
# variance. Both transforms are fitted on the training features only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

pca = PCA(n_components=0.95, random_state=42)

X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

if y is not None:
    X_tr, X_val, y_tr, y_val = train_test_split(X_train_pca, y, test_size=0.2, stratify=y, random_state=42)

    # `multi_class` is deprecated in recent scikit-learn releases; with the
    # lbfgs solver a multiclass target is fitted as multinomial by default.
    model = LogisticRegression(max_iter=3000, solver='lbfgs', random_state=42)
    model.fit(X_tr, y_tr)

    y_pred = model.predict(X_val)
    print("Validation Accuracy:", accuracy_score(y_val, y_pred))
    # zero_division=0 reports 0.00 for classes that were never predicted
    # without emitting an UndefinedMetricWarning for each of them.
    print(classification_report(y_val, y_pred, target_names=le.classes_, zero_division=0))

    test_pred = model.predict(X_test_pca)

    submission = pd.DataFrame({
        'ID': test['ID'],
        'class': le.inverse_transform(test_pred)
    })

    submission.to_csv("submission.csv", index=False)
    print("Submission saved as 'submission.csv'")
else:
    print("Skipping model training and submission due to missing 'y' (class column).")



Validation Accuracy: 0.84125
              precision    recall  f1-score   support

        farm       0.40      0.14      0.20       168
      forest       0.86      0.97      0.92      1232
       grass       0.00      0.00      0.00        39
  impervious       0.79      0.82      0.81       134
     orchard       0.00      0.00      0.00         6
       water       0.86      0.57      0.69        21

    accuracy                           0.84      1600
   macro avg       0.49      0.42      0.44      1600
weighted avg       0.79      0.84      0.80      1600

Submission saved as 'submission.csv'


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
