<a href="https://colab.research.google.com/github/snehapathak9/Summer-Analytics/blob/main/NDVI_Hackathon.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV, StratifiedKFold

In [2]:
def preprocess_data(df):
    """Preprocess data: sort NDVI columns chronologically and impute missing values"""
    ndvi_cols = [col for col in df.columns if '_N' in col]

    # Extract and sort dates chronologically
    date_keys = [col.split('_')[0] for col in ndvi_cols]
    sorted_dates = sorted(date_keys, key=lambda x: datetime.strptime(x, '%Y%m%d'))
    sorted_cols = [f"{date}_N" for date in sorted_dates]

    # Reorder columns and impute missing values
    df_sorted = df[sorted_cols].copy()
    df_imputed = df_sorted.T.fillna(method='ffill').fillna(method='bfill').T
    return df_imputed, sorted_dates


In [3]:
def extract_features(df_imputed, sorted_dates):
    """Extract temporal features from imputed NDVI data"""
    # Basic statistics
    features = pd.DataFrame({
        'mean': df_imputed.mean(axis=1),
        'median': df_imputed.median(axis=1),
        'std': df_imputed.std(axis=1),
        'min': df_imputed.min(axis=1),
        'max': df_imputed.max(axis=1),
        'range': df_imputed.max(axis=1) - df_imputed.min(axis=1)
    })
    # Linear trend (slope)
    x = np.arange(len(sorted_dates))
    slopes = []
    for i in range(len(df_imputed)):
        y = df_imputed.iloc[i].values
        coef = np.polyfit(x, y, 1)
        slopes.append(coef[0])
    features['slope'] = slopes

    # Seasonal features (by year and season)
    seasonal_features = {}
    for j, date_str in enumerate(sorted_dates):
        dt = datetime.strptime(date_str, '%Y%m%d')
        year = dt.year
        month = dt.month
        if month in [12, 1, 2]:
            season = 'winter'
        elif month in [3, 4, 5]:
            season = 'spring'
        elif month in [6, 7, 8]:
            season = 'summer'
        else:
            season = 'fall'

        key = f"{season}_{year}"
        if key not in seasonal_features:
            seasonal_features[key] = []
        seasonal_features[key].append(j)

    for key, indices in seasonal_features.items():
        features[f"season_{key}"] = df_imputed.iloc[:, indices].mean(axis=1)

    return features

In [4]:
# Load data
train_df = pd.read_csv('/content/hacktrain.csv')
test_df = pd.read_csv('/content/hacktest.csv')

In [5]:
# Preprocess training data
train_df = train_df.drop(columns=['Unnamed: 0'], errors='ignore')
train_imputed, sorted_dates = preprocess_data(train_df)
X_train = extract_features(train_imputed, sorted_dates)
y_train = train_df['class']

  df_imputed = df_sorted.T.fillna(method='ffill').fillna(method='bfill').T


In [6]:
# Preprocess test data
test_imputed, _ = preprocess_data(test_df)
X_test = extract_features(test_imputed, sorted_dates)


  df_imputed = df_sorted.T.fillna(method='ffill').fillna(method='bfill').T


In [7]:
# Encode labels
le = LabelEncoder()
y_encoded = le.fit_transform(y_train)


In [8]:
# Model pipeline with hyperparameter tuning
pipeline = make_pipeline(
    StandardScaler(),
    LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000, random_state=42)
)

In [9]:
param_grid = {'logisticregression__C': [0.1, 1, 10, 100]}
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid = GridSearchCV(
    pipeline,
    param_grid,
    cv=cv,
    scoring='accuracy',
    n_jobs=-1
)
grid.fit(X_train, y_encoded)



In [10]:
# Predict and format submission
test_preds = grid.best_estimator_.predict(X_test)
predicted_classes = le.inverse_transform(test_preds)

submission = pd.DataFrame({
    'ID': test_df['ID'],
    'class': predicted_classes
})
submission.to_csv('submission.csv', index=False)