In [None]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Input locations of the competition CSV files.
TRAIN_PATH = "/kaggle/input/summer-analytics-mid-hackathon/hacktrain.csv"
TEST_PATH = "/kaggle/input/summer-analytics-mid-hackathon/hacktest.csv"

train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)

# Feature columns are selected by their '_N' name suffix, taken from the
# training frame; the same column list is then used to index the test frame.
ndvi_columns = [name for name in train.columns if name.endswith('_N')]

# Fill missing values with per-column means. The means are learned from the
# training rows only and then applied unchanged to the test rows.
imputer = SimpleImputer(strategy='mean')
imputer.fit(train[ndvi_columns])
X_train_ndvi = imputer.transform(train[ndvi_columns])
X_test_ndvi = imputer.transform(test[ndvi_columns])

# Wrap the imputed arrays back into labelled frames so that the row-wise
# feature extraction below can use the pandas API.
X_train_df = pd.DataFrame(X_train_ndvi, columns=ndvi_columns)
X_test_df = pd.DataFrame(X_test_ndvi, columns=ndvi_columns)

# Feature Engineering (proven set)
def extract_features(df):
    """Summarise every row of ``df`` with a fixed set of statistics.

    Args:
        df: Numeric DataFrame; each row is treated as one ordered sequence of
            values, with column order defining "first" and "last".

    Returns:
        A DataFrame sharing ``df``'s index with the columns, in this order:
        ``mean``, ``std``, ``min``, ``max``, ``median``, ``range``, ``q25``,
        ``q75``, ``iqr``, ``first``, ``last``, ``diff``, ``slope``.
    """
    # Pieces that are reused by more than one output column.
    row_min = df.min(axis=1)
    row_max = df.max(axis=1)
    lower_quartile = df.quantile(0.25, axis=1)
    upper_quartile = df.quantile(0.75, axis=1)
    first_value = df.iloc[:, 0]
    last_value = df.iloc[:, -1]
    net_change = last_value - first_value

    # Dict insertion order fixes the output column order.
    return pd.DataFrame({
        'mean': df.mean(axis=1),
        'std': df.std(axis=1),  # pandas default: sample standard deviation
        'min': row_min,
        'max': row_max,
        'median': df.median(axis=1),
        'range': row_max - row_min,
        'q25': lower_quartile,
        'q75': upper_quartile,
        'iqr': upper_quartile - lower_quartile,
        'first': first_value,
        'last': last_value,
        'diff': net_change,
        # Endpoint slope: total change divided by the number of steps
        # between the first and last column (not a least-squares fit).
        'slope': net_change / (df.shape[1] - 1),
    })

# Row-wise summary statistics computed from the imputed NDVI frames.
train_features = extract_features(X_train_df)
test_features = extract_features(X_test_df)

# Combine raw NDVI + engineered features (raw columns first, then statistics).
X_train_final = np.hstack([X_train_ndvi, train_features.values])
X_test_final = np.hstack([X_test_ndvi, test_features.values])

# Encode target. The categorical conversion is done once so that the integer
# codes and the code -> label mapping are guaranteed to come from the same
# category ordering.
class_categories = train['class'].astype('category')
y_train = class_categories.cat.codes
label_map = dict(enumerate(class_categories.cat.categories))

# Model: standardise the features, then fit a logistic regression.
# `multi_class` is intentionally not passed: it is deprecated since
# scikit-learn 1.5 (FutureWarning, scheduled for removal), and with the
# 'lbfgs' solver the default already fits a multinomial model when the target
# has three or more classes.
# NOTE(review): assumes train['class'] has 3+ distinct labels — with a binary
# target on scikit-learn < 1.5 the default would be one-vs-rest; confirm.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('lr', LogisticRegression(
        max_iter=3000,
        solver='lbfgs',
        random_state=42,
        C=1.5  # Slightly less regularization, can tune between 1.0 and 2.0
    ))
])
pipeline.fit(X_train_final, y_train)

# Predict integer codes and translate them back to the original labels.
# The Series is given test's index explicitly so that it lines up row-for-row
# with test['ID'] below even if `test` does not carry a default RangeIndex.
y_pred = pipeline.predict(X_test_final)
pred_labels = pd.Series(y_pred, index=test.index).map(label_map)

# Submission
submission = pd.DataFrame({'ID': test['ID'], 'class': pred_labels})
submission.to_csv('/kaggle/working/submission.csv', index=False)
print("🎯 Final optimized submission saved!")
