In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory lazily and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/hackathon-set/hacktest.csv
/kaggle/input/hackathon-set/hacktrain.csv


## STEP 1: Preprocessing – Clean & Smooth NDVI Time Series

In [2]:
import pandas as pd
import numpy as np
from scipy.signal import savgol_filter

# Read the raw train/test CSVs, then discard the 'Unnamed: 0' column from each.
train = pd.read_csv("/kaggle/input/hackathon-set/hacktrain.csv")
train = train.drop(columns=['Unnamed: 0'])
test = pd.read_csv("/kaggle/input/hackathon-set/hacktest.csv")
test = test.drop(columns=['Unnamed: 0'])

# NDVI columns are the training columns whose name contains '_N', sorted by name.
# NOTE(review): name order equals chronological order only if the names start with
# a sortable date prefix (e.g. YYYYMMDD) - confirm against the CSV header.
ndvi_cols = sorted(column for column in train.columns if '_N' in column)

# Interpolate and apply Savitzky-Golay smoothing
def clean_ndvi(df, is_train=True, cols=None, max_window=7, polyorder=2):
    """Gap-fill and smooth the per-row NDVI time series of ``df``.

    Each row is treated as one time series laid out across ``cols``. Missing
    values are filled by linear interpolation along the row (leading and
    trailing gaps included), then every fully finite row is smoothed with a
    Savitzky-Golay filter. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``cols``, plus ``'class'`` when ``is_train`` is true or
        ``'ID'`` when it is false.
    is_train : bool, default True
        Selects the return shape (see Returns).
    cols : sequence of str, optional
        Time-ordered NDVI column names. Defaults to the module-level
        ``ndvi_cols``.
    max_window : int, default 7
        Upper bound for the Savitzky-Golay window length.
    polyorder : int, default 2
        Polynomial order of the Savitzky-Golay filter.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        ``(df_clean[cols], df_clean['class'])`` when ``is_train`` is true.
    pandas.DataFrame
        ``df_clean[['ID'] + cols]`` when ``is_train`` is false.
    """
    cols = list(ndvi_cols) if cols is None else list(cols)
    df_clean = df.copy()

    # Interpolation (linear along time); limit_direction='both' also fills edge gaps.
    df_clean[cols] = df_clean[cols].interpolate(method='linear', axis=1, limit_direction='both')

    # Use the largest odd window that fits inside the series. The previous
    # formula (n // 2 * 2 + 1) overshot the series length for even n < 7, which
    # made savgol_filter raise on every row; the bare `except` then hid the
    # failure and the data was silently left unsmoothed.
    window = min(max_window, len(cols))
    if window % 2 == 0:
        window -= 1

    # The filter needs polyorder < window_length; for shorter series smoothing
    # is skipped explicitly instead of failing row by row.
    if window > polyorder:
        values = df_clean[cols].to_numpy(dtype=float, copy=True)
        # Rows that are still non-finite after interpolation (e.g. entirely
        # NaN) cannot be smoothed and are left untouched.
        smoothable = np.isfinite(values).all(axis=1)
        if smoothable.any():
            # One vectorised call along the time axis replaces the per-row loop.
            values[smoothable] = savgol_filter(
                values[smoothable], window_length=window, polyorder=polyorder, axis=1
            )
            df_clean[cols] = values

    if is_train:
        return df_clean[cols], df_clean['class']
    return df_clean[['ID'] + cols]

# Training split: cleaned NDVI columns plus the 'class' labels (is_train defaults to True).
X_train, y_train = clean_ndvi(train)
# Test split: cleaned NDVI columns with the 'ID' column kept in front.
X_test = clean_ndvi(test, is_train=False)

## STEP 2: Feature Engineering

In [3]:
from scipy.stats import skew, kurtosis

def extract_features(df, cols=None):
    """Summarise each row's NDVI time series with seven statistics.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``cols``; any other columns are ignored.
    cols : sequence of str, optional
        Time-ordered NDVI column names. Defaults to the module-level
        ``ndvi_cols``.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``df`` with columns ``mean``, ``std``, ``min``, ``max``,
        ``skew``, ``kurtosis`` and ``trend``. ``trend`` is the least-squares
        slope of the values against their position 0..n-1 in ``cols``.
    """
    cols = list(ndvi_cols) if cols is None else list(cols)
    ndvi_data = df[cols]
    values = ndvi_data.to_numpy(dtype=float)

    # Closed-form least-squares slope for all rows at once. Because the centred
    # time steps sum to zero, sum(t_c * y) / sum(t_c ** 2) is the same slope
    # np.polyfit(..., 1)[0] returns, without a per-row fit. NaNs propagate to a
    # NaN slope instead of failing inside the least-squares solver.
    steps = np.arange(values.shape[1], dtype=float)
    centered_steps = steps - steps.mean()
    with np.errstate(invalid='ignore', divide='ignore'):
        slope = (values @ centered_steps) / (centered_steps @ centered_steps)

    # scipy's skew/kurtosis take an axis argument, so row-wise .apply is unnecessary.
    features = pd.DataFrame(
        {
            "mean": ndvi_data.mean(axis=1).to_numpy(),
            "std": ndvi_data.std(axis=1).to_numpy(),
            "min": ndvi_data.min(axis=1).to_numpy(),
            "max": ndvi_data.max(axis=1).to_numpy(),
            "skew": skew(values, axis=1),
            "kurtosis": kurtosis(values, axis=1),
            "trend": slope,
        },
        index=df.index,
    )

    return features

# Build the per-row feature tables for the train and test splits with the same extractor.
X_train_feat, X_test_feat = (
    extract_features(split) for split in (X_train, X_test)
)

## STEP 3: Model Training — Logistic Regression

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Encode target
# Convert to categorical once: cat.codes are positions into cat.categories, so
# label_mapping (code -> class name) is the exact inverse of the encoding.
y_train_categorical = y_train.astype('category')
y_train_encoded = y_train_categorical.cat.codes
label_mapping = dict(enumerate(y_train_categorical.cat.categories))

# Pipeline: Scale + Model
# `multi_class` is deprecated in recent scikit-learn releases and is therefore not
# passed; with the lbfgs solver a target with more than two classes is already fit
# as a multinomial model.
# NOTE(review): assumes the target has more than two classes - confirm.
pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(solver="lbfgs", max_iter=1000))
])

# Cross-validation
# Stratified 5-fold with a fixed seed so the reported accuracy is reproducible.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X_train_feat, y_train_encoded, cv=cv, scoring='accuracy')
print("Cross-validated Accuracy:", scores.mean())

Cross-validated Accuracy: 0.8317500000000001


## STEP 4: Train Final Model & Predict on Test Set

In [5]:
# Refit the scaler + classifier pipeline on the complete training feature table.
pipeline.fit(X_train_feat, y_train_encoded)

# Predict integer class codes for the test rows and map them back to class names.
test_preds = pipeline.predict(X_test_feat)
test_labels = [label_mapping[code] for code in test_preds]

# Two-column submission (ID, predicted class), written without the index.
submission = pd.DataFrame({"ID": X_test["ID"], "class": test_labels})
submission.to_csv("submission.csv", index=False)
print("Submission file saved!")

Submission file saved!


In [6]:
# Display the submission frame as the cell output to eyeball the ID/class columns.
submission

Unnamed: 0,ID,class
0,1,forest
1,2,forest
2,3,forest
3,4,forest
4,5,forest
...,...,...
2840,2841,water
2841,2842,water
2842,2843,water
2843,2844,water
