<a href="https://colab.research.google.com/github/shubhamjha16/UrbanIntelligence/blob/main/urbanintelligence_ndvi_phase1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive into the Colab filesystem; the CSV paths used later in
# this notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Step 1: Load and Clean Data

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Locations of the competition CSVs on the mounted Drive
train_path = '/content/drive/MyDrive/kagglesummerhackathon/hacktrain.csv'
test_path = '/content/drive/MyDrive/kagglesummerhackathon/hacktest.csv'

# Read each file and discard the 'Unnamed: 0' column when the file has one
# (errors='ignore' makes the drop a no-op if the column is absent).
train_df = pd.read_csv(train_path).drop(columns=['Unnamed: 0'], errors='ignore')
test_df = pd.read_csv(test_path).drop(columns=['Unnamed: 0'], errors='ignore')

Step 2: Feature Engineering from NDVI Time Series
We'll generate useful statistical and trend-based features.

In [None]:
import numpy as np

def _nan_aware_slope(values):
    """Least-squares slope of each row of `values` against its column position, ignoring NaNs.

    Parameters
    ----------
    values : 2-D float ndarray, one time series per row.

    Returns
    -------
    1-D float ndarray with one slope per row. Rows with fewer than two
    non-NaN observations (or no spread in position) yield NaN.
    """
    positions = np.arange(values.shape[1], dtype=float)
    valid = ~np.isnan(values)
    n_valid = valid.sum(axis=1)

    # 0/0 for rows without enough valid points is expected and maps to NaN.
    with np.errstate(divide='ignore', invalid='ignore'):
        x_mean = np.where(valid, positions, 0.0).sum(axis=1) / n_valid
        y_mean = np.where(valid, values, 0.0).sum(axis=1) / n_valid
        # Centre before multiplying; masked (missing) cells contribute zero.
        dx = np.where(valid, positions - x_mean[:, None], 0.0)
        dy = np.where(valid, values - y_mean[:, None], 0.0)
        return (dx * dy).sum(axis=1) / (dx * dx).sum(axis=1)


def extract_ndvi_features(data):
    """Summarise each row's NDVI columns as a small set of statistical features.

    NDVI columns are those whose name contains '_N'. The returned frame
    shares `data`'s index and has the columns: ID, ndvi_mean, ndvi_std,
    ndvi_min, ndvi_max, ndvi_amp (max - min), ndvi_first, ndvi_last and
    ndvi_slope.

    Missing values are skipped by the mean/std/min/max aggregations (pandas
    default) and by the slope, which is fitted on the non-missing points of
    each row instead of collapsing to NaN whenever a single value is absent.
    The slope is NaN only when a row has fewer than two valid observations.

    Parameters
    ----------
    data : pandas.DataFrame with an 'ID' column and at least one '_N' column.

    Returns
    -------
    pandas.DataFrame of engineered features, one row per input row.

    Raises
    ------
    ValueError
        If `data` has no column whose name contains '_N'.
    """
    ndvi_columns = [col for col in data.columns if '_N' in col]
    if not ndvi_columns:
        raise ValueError("no NDVI columns (names containing '_N') found in data")

    ndvi = data[ndvi_columns]

    features = pd.DataFrame()
    features['ID'] = data['ID']

    features['ndvi_mean'] = ndvi.mean(axis=1)
    features['ndvi_std'] = ndvi.std(axis=1)
    features['ndvi_min'] = ndvi.min(axis=1)
    features['ndvi_max'] = ndvi.max(axis=1)
    features['ndvi_amp'] = features['ndvi_max'] - features['ndvi_min']

    # NOTE(review): 'first' reads the LAST column and 'last' reads the FIRST
    # column. That is only correct if the NDVI columns are ordered newest to
    # oldest -- confirm against the CSV header. The slope below is taken in
    # plain column order, so its sign follows column order, not necessarily time.
    features['ndvi_first'] = ndvi.iloc[:, -1]
    features['ndvi_last'] = ndvi.iloc[:, 0]

    # Linear slope over column position, using only the observed values
    features['ndvi_slope'] = _nan_aware_slope(ndvi.to_numpy(dtype=float))

    return features

Step 3: Extract Features from Both Datasets

In [None]:
# Engineer the same NDVI features for the training and test frames
X_train, X_test = (
    extract_ndvi_features(frame) for frame in (train_df, test_df)
)

# Encode the 'class' labels as integers; `le` is kept so predictions can be
# mapped back to the original labels later.
le = LabelEncoder()
y_train = le.fit_transform(train_df['class'])

Step 4: Impute Missing Values on Feature Sets

In [None]:
# Mean-impute the engineered features with the ID column excluded. The imputer
# is fitted on the training features only and then applied to both sets.
# Note: the following cell rebinds `imputer`, `X_train_clean` and `X_test_clean`.
imputer = SimpleImputer(strategy='mean')
imputer.fit(X_train.drop(columns=['ID']))

X_train_clean, X_test_clean = (
    imputer.transform(frame.drop(columns=['ID'])) for frame in (X_train, X_test)
)

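Step 4 (continued): Remove ID Before Imputation. This cell repeats the imputation from the previous cell with explicit intermediate names; its outputs `X_train_imputed` and `X_test_imputed` are the arrays used in Step 5.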

In [None]:
from sklearn.impute import SimpleImputer

# The identifier is not a feature, so strip it from both sets first
X_train_clean, X_test_clean = (
    X_train.drop(columns=['ID']),
    X_test.drop(columns=['ID']),
)

# Learn per-column means from the training features, then fill the gaps in
# both sets with those training means.
imputer = SimpleImputer(strategy='mean').fit(X_train_clean)
X_train_imputed = imputer.transform(X_train_clean)
X_test_imputed = imputer.transform(X_test_clean)

Step 5: Scale Features and Train Logistic Regression Model

In [None]:
# Standardise the features: the scaler is fitted on the training data only
# and the same transformation is then applied to the test data.
scaler = StandardScaler().fit(X_train_imputed)
X_train_scaled = scaler.transform(X_train_imputed)
X_test_scaled = scaler.transform(X_test_imputed)

# Fit the classifier; max_iter is raised to 5000 iterations.
model = LogisticRegression(max_iter=5000)
model.fit(X_train_scaled, y_train)

Step 6: Predict on Test Set and Decode Labels


In [None]:
# Predict the encoded class for every test row ...
y_pred = model.predict(X=X_test_scaled)
# ... and translate the integer codes back into the original class labels.
y_labels = le.inverse_transform(y=y_pred)

Step 7: Create and Save Submission CSV

In [None]:
# Pair each test ID with its predicted class label
submission = X_test[['ID']].assign(**{'class': y_labels})

# Update the path below to save wherever you want in your Drive
submission.to_csv(
    '/content/drive/MyDrive/kagglesummerhackathon/submission.csv',
    index=False,
)