In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import f1_score, make_scorer

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

from sklearn.impute import SimpleImputer
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
input_file_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/iitgsumana/hacktest.csv
/kaggle/input/iitgsumana/hacktrain.csv


In [None]:
# Read the training and test tables from the competition input directory.
train, test = map(
    pd.read_csv,
    (
        '/kaggle/input/iitgsumana/hacktrain.csv',
        '/kaggle/input/iitgsumana/hacktest.csv',
    ),
)
# Report (rows, columns) of the training table.
print(train.shape)

In [None]:
# Show the column labels of the test frame.
print(test.columns)
#print(train.head)

In [None]:
# NDVI measurement columns are the ones whose name ends in '_N'; keep them in sorted order.
ndvi_columns = sorted(name for name in train.columns if name.endswith('_N'))

# Remove the index-like columns from train only. The test frame keeps them:
# the submission cell at the end of the notebook selects test[['ID', 'class']].
train = train.drop(['Unnamed: 0', 'ID'], axis=1)

# Name of the label column.
target_cols = ['class']

In [None]:
# Show the test column labels again; the drop in the previous cell was applied to train only.
print(test.columns)

In [None]:

# Missing-value map of the training frame: one heatmap cell per (row, column),
# highlighted where the value is null. The trailing ';' hides the Axes repr.
train_missing = train.isnull()
sns.heatmap(
    train_missing,
    xticklabels=train.columns,
    yticklabels=False,
    cbar=False,
    cmap='Blues',
);

In [None]:
# enable_iterative_imputer must be imported before IterativeImputer (experimental API).
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler

# Standardise the NDVI columns: the scaler is fitted on train only and the
# same transformation is then applied to both frames.
scaler = StandardScaler().fit(train[ndvi_columns])
for frame in (train, test):
    frame[ndvi_columns] = scaler.transform(frame[ndvi_columns])

# Fill missing NDVI values with an iterative imputer, again fitted on train only.
imputer = IterativeImputer(max_iter=10, random_state=42)
train[ndvi_columns] = imputer.fit_transform(train[ndvi_columns])
test[ndvi_columns] = imputer.transform(test[ndvi_columns])

# Re-draw the training missing-value map after imputation.
train_missing = train.isnull()
sns.heatmap(
    train_missing,
    xticklabels=train.columns,
    yticklabels=False,
    cbar=False,
    cmap='Blues',
);


In [None]:

# Missing-value map of the test frame; the trailing ';' hides the Axes repr.
test_missing = test.isnull()
sns.heatmap(
    test_missing,
    xticklabels=test.columns,
    yticklabels=False,
    cbar=False,
    cmap='Blues',
);

In [None]:
def filter_outliers_zscore(df, ndvi_columns, threshold=3):
    """Replace per-row NDVI outliers with NaN.

    For every row, the mean and (sample) standard deviation are taken across
    ``ndvi_columns``. A value is an outlier when its absolute distance from
    the row mean is strictly greater than ``threshold`` times the row
    standard deviation.

    The row statistics are computed once, from the values as they were
    passed in. Recomputing them after some columns had already been blanked
    made the result depend on column order and could flag additional values
    in cascade.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the NDVI columns. It is modified in place.
    ndvi_columns : sequence of column labels
        Columns that make up each row's NDVI series.
    threshold : float, default 3
        Number of row standard deviations beyond which a value is blanked.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with outliers in ``ndvi_columns`` set to NaN.
    """
    cols = list(ndvi_columns)
    if not cols:
        # Nothing to inspect; leave the frame untouched.
        return df

    values = df[cols]
    row_mean = values.mean(axis=1)
    row_std = values.std(axis=1)

    # Compare every value with its own row's statistics (aligned on the index).
    # Comparisons against a NaN std or a NaN value are False, so such entries
    # are never flagged.
    deviation = values.sub(row_mean, axis=0).abs()
    outliers = deviation.gt(threshold * row_std, axis=0)

    df[cols] = values.mask(outliers)
    return df

from sklearn.impute import KNNImputer

# Blank out per-row NDVI outliers (beyond 3 row standard deviations) in both frames.
train, test = [
    filter_outliers_zscore(frame, ndvi_columns, threshold=3)
    for frame in (train, test)
]

# Fill the blanked values from the 3 nearest neighbours; the imputer is fitted on train only.
imputer = KNNImputer(n_neighbors=3)
train_filled = imputer.fit_transform(train[ndvi_columns])
train[ndvi_columns] = train_filled
test[ndvi_columns] = imputer.transform(test[ndvi_columns])


In [None]:
from scipy.signal import savgol_filter

def smooth_ndvi_series(row, window_length=5, polyorder=2):
    """Smooth one NDVI series with a Savitzky–Golay filter.

    Parameters
    ----------
    row : array-like
        One-dimensional sequence of NDVI values (e.g. one DataFrame row).
    window_length : int, default 5
        Filter window size, forwarded to ``scipy.signal.savgol_filter``.
    polyorder : int, default 2
        Order of the fitted polynomial, forwarded to ``savgol_filter``.

    Returns
    -------
    numpy.ndarray
        Smoothed values, one per input value.

    Notes
    -----
    Input validation is left to ``savgol_filter``; per the SciPy docs it is
    expected to reject a window longer than the series in its default mode.
    """
    return savgol_filter(row, window_length=window_length, polyorder=polyorder)

# Smooth every row's NDVI series in both train and test; 'broadcast' keeps the
# original index and column labels on the result.
for frame in (train, test):
    frame[ndvi_columns] = frame[ndvi_columns].apply(
        smooth_ndvi_series, axis=1, result_type='broadcast'
    )

def extract_fft_features(row, n_components=3):
    """Return the amplitudes of the first ``n_components`` non-DC FFT bins of ``row``.

    Bin 0 (the zero-frequency term, i.e. the sum of the series) is skipped;
    the result covers bins 1 through ``n_components`` inclusive.
    """
    amplitude_spectrum = np.abs(np.fft.fft(row))
    first_bin = 1
    last_bin = n_components + 1
    return amplitude_spectrum[first_bin:last_bin]

# Compute the FFT amplitudes row by row and stack them into an
# (n_rows, n_components) array for each frame.
fft_train, fft_test = (
    np.vstack(frame[ndvi_columns].apply(extract_fft_features, axis=1))
    for frame in (train, test)
)

# One feature column per FFT component, named fft_1, fft_2, ...
fft_feats = [f'fft_{i+1}' for i in range(fft_train.shape[1])]
for position, feat_name in enumerate(fft_feats):
    train[feat_name] = fft_train[:, position]
    test[feat_name] = fft_test[:, position]


In [None]:
def add_ndvi_summary_features(df, ndvi_columns):
    """Add per-row summary statistics of the NDVI series to ``df``.

    Adds the columns ``ndvi_mean``, ``ndvi_std``, ``ndvi_min``, ``ndvi_max``,
    ``ndvi_range`` (max - min) and ``ndvi_missing`` (count of NaN values),
    all computed across ``ndvi_columns`` only.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the NDVI columns. It is modified in place.
    ndvi_columns : list of column labels
        Columns that make up each row's NDVI series.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the six feature columns added.
    """
    ndvi = df[ndvi_columns]
    df['ndvi_mean'] = ndvi.mean(axis=1)
    df['ndvi_std'] = ndvi.std(axis=1)
    df['ndvi_min'] = ndvi.min(axis=1)
    df['ndvi_max'] = ndvi.max(axis=1)
    df['ndvi_range'] = df['ndvi_max'] - df['ndvi_min']
    # NOTE(review): the NDVI columns were imputed in earlier cells, so this
    # count is expected to be 0 for every row here. To make it informative it
    # would have to be captured before imputation. It is kept because the
    # model cell lists 'ndvi_missing' among its features.
    df['ndvi_missing'] = ndvi.isna().sum(axis=1)
    return df


# Same feature definitions for both frames, via one shared helper.
train = add_ndvi_summary_features(train, ndvi_columns)
test = add_ndvi_summary_features(test, ndvi_columns)



#train['ndvi_trend'] = train[ndvi_columns].apply(lambda row: np.polyfit(range(len(row)), row, deg=1)[0], axis=1)
#test['ndvi_trend'] = test[ndvi_columns].apply(lambda row: np.polyfit(range(len(row)), row, deg=1)[0], axis=1)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Map the string class labels to integer codes. The fitted encoder `le` is
# reused in the submission cell to turn predictions back into class names.
le = LabelEncoder().fit(train['class'])
train['label_encoded'] = le.transform(train['class'])


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

# Model inputs: the smoothed NDVI series, the per-row summary statistics and the FFT amplitudes.
extra_feats = ['ndvi_mean', 'ndvi_std', 'ndvi_min', 'ndvi_max', 'ndvi_range', 'ndvi_missing']
features = [*ndvi_columns, *extra_feats, *fft_feats]

X, y = train[features], train['label_encoded']

# Stratified 80/20 split; the 20% hold-out is only used for the sanity check at the end.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Scaling lives inside the pipeline, so it is re-fitted on each CV training fold.
scaling_step = ('scaler', StandardScaler())
classifier_step = ('clf', LogisticRegression(multi_class='multinomial', solver='lbfgs'))
pipe = Pipeline([scaling_step, classifier_step])

# Candidate regularisation strengths and iteration budgets for the classifier step.
param_grid = {
    'clf__C': [0.001, 0.01, 0.1, 1, 10],
    'clf__max_iter': [1000, 2000, 5000]
}

# Exhaustive search over the grid with 5-fold cross-validation, scored by accuracy.
grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    verbose=1,
)
grid.fit(X_train, y_train)

# Report the winning configuration and its mean cross-validated score.
print("✅ Best Parameters:", grid.best_params_)
print("✅ Best CV Accuracy:", grid.best_score_)

# Score the selected model on the hold-out split.
y_val_pred = grid.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
print("✅ Validation Accuracy:", val_accuracy)


In [None]:

# `model` was never defined in this notebook, so the original cell failed with
# a NameError on a fresh kernel. Use the pipeline selected by the grid search.
# NOTE: `grid` was fitted on the 80% training split (X_train), not on all of train.
model = grid.best_estimator_

X_test = test[features]
test_preds = model.predict(X_test)

# Turn the integer predictions back into the original class names.
test['class'] = le.inverse_transform(test_preds)

# Write the two-column submission file (ID, class) without the index.
submission = test[['ID', 'class']]
submission.to_csv('submission.csv', index=False)
