In [1]:
import numpy as np 
import pandas as pd


In [2]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import KNNImputer
from scipy.stats import skew, kurtosis

In [3]:
# Read the labelled training set and the unlabelled test set, in that order,
# from CSV files located in the current working directory.
train, test = (
    pd.read_csv(csv_path) for csv_path in ('hacktrain.csv', 'hacktest.csv')
)

In [4]:
# Quick look at the training data: the first five rows, followed by the
# number of rows per target class.
print(train.head(n=5))
print(train.loc[:, 'class'].value_counts())

   Unnamed: 0  ID  class  20150720_N  20150602_N  20150517_N  20150501_N  \
0           0   1  water    637.5950     658.668   -1882.030    -1924.36   
1           1   2  water    634.2400     593.705   -1625.790    -1672.32   
2           3   4  water     58.0174   -1599.160         NaN    -1052.63   
3           4   5  water     72.5180         NaN     380.436    -1256.93   
4           7   8  water   1136.4400         NaN         NaN     1647.83   

   20150415_N  20150330_N  20150314_N  ...  20140610_N  20140525_N  \
0     997.904   -1739.990     630.087  ...         NaN   -1043.160   
1     914.198    -692.386     707.626  ...         NaN    -933.934   
2         NaN   -1564.630         NaN  ...    -1025.88     368.622   
3     515.805   -1413.180    -802.942  ...    -1813.95     155.624   
4    1935.800         NaN    2158.980  ...     1535.00    1959.430   

   20140509_N  20140423_N  20140407_N  20140322_N  20140218_N  20140202_N  \
0   -1942.490     267.138         NaN        

In [7]:
# Columns whose name contains the '_N' marker are treated as the NDVI
# time-series readings.
ndvi_cols = [name for name in train.columns if '_N' in name]

# Fit the 5-nearest-neighbour imputer on the training readings only, then
# fill the missing values of both frames with that same fitted imputer.
imputer = KNNImputer(n_neighbors=5).fit(train[ndvi_cols])
train[ndvi_cols] = imputer.transform(train[ndvi_cols])
test[ndvi_cols] = imputer.transform(test[ndvi_cols])

In [8]:
def extract_features(df, cols=None):
    """Return a copy of ``df`` with row-wise summary statistics appended.

    The input frame is left untouched, so the cell can be re-run safely and
    the raw frame stays available under its own name.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns listed in ``cols``.
    cols : sequence of str, optional
        Columns to summarise across each row. Defaults to the notebook-level
        ``ndvi_cols`` list when omitted.

    Returns
    -------
    pandas.DataFrame
        A new frame with every original column plus ``ndvi_mean``,
        ``ndvi_std``, ``ndvi_min``, ``ndvi_max``, ``ndvi_range``,
        ``ndvi_skew``, ``ndvi_kurt`` and ``ndvi_trend``.
    """
    if cols is None:
        cols = ndvi_cols  # notebook-level default defined by an earlier cell
    cols = list(cols)

    readings = df[cols]
    out = df.copy()  # never write the new columns into the caller's frame

    out['ndvi_mean'] = readings.mean(axis=1)
    out['ndvi_std'] = readings.std(axis=1)
    out['ndvi_min'] = readings.min(axis=1)
    out['ndvi_max'] = readings.max(axis=1)
    out['ndvi_range'] = out['ndvi_max'] - out['ndvi_min']
    out['ndvi_skew'] = readings.skew(axis=1)
    out['ndvi_kurt'] = readings.kurtosis(axis=1)
    # Last listed column minus first listed column.
    # NOTE(review): the printed head suggests the date columns run newest to
    # oldest, in which case this is "oldest - newest" -- confirm the intended
    # sign of the trend.
    out['ndvi_trend'] = readings.iloc[:, -1] - readings.iloc[:, 0]
    return out

# Derive the summary-statistic columns for the training and test frames.
hacktrain, hacktest = (extract_features(frame) for frame in (train, test))

# Model inputs: every NDVI reading followed by the engineered statistics.
feature_cols = [
    *ndvi_cols,
    'ndvi_mean',
    'ndvi_std',
    'ndvi_min',
    'ndvi_max',
    'ndvi_range',
    'ndvi_skew',
    'ndvi_kurt',
    'ndvi_trend',
]


In [9]:
# Map the string class labels to integer codes; `le` is kept so predictions
# can be decoded back to class names later.
le = LabelEncoder()
hacktrain['class_encoded'] = le.fit_transform(y=hacktrain['class'])

In [10]:
# Target vector: the integer-encoded class labels.
y = hacktrain['class_encoded']

# Standardise the features using statistics learned from the training frame
# only, then apply that same scaling to both train and test.
scaler = StandardScaler().fit(hacktrain[feature_cols])
X = scaler.transform(hacktrain[feature_cols])
X_test = scaler.transform(hacktest[feature_cols])

In [12]:
# Logistic regression fitted with the SAGA solver.
# `multi_class` is deliberately not passed: the argument is deprecated in
# recent scikit-learn releases, and the default already fits a multinomial
# model when the target has more than two classes and the solver is not
# liblinear -- which is the case here.
# `n_jobs` is omitted as well: per the scikit-learn docs it only parallelises
# one-vs-rest fits, so it had no effect on this model.
clf = LogisticRegression(solver='saga', max_iter=1000, random_state=42)

In [13]:
# 5-fold stratified cross-validation with a fixed shuffle seed.
# NOTE(review): the imputer and the scaler were fitted on the full training
# set in earlier cells, so each validation fold has already influenced the
# preprocessing; the reported accuracy may be slightly optimistic.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(
    estimator=clf,
    X=X,
    y=y,
    scoring='accuracy',
    cv=skf,
)
print(f'Cross-validated accuracy: {scores.mean():.4f} ± {scores.std():.4f}')



Cross-validated accuracy: 0.9267 ± 0.0030


In [14]:
# Train the final model on the complete scaled training matrix.
clf.fit(X=X, y=y)



In [15]:
# Predict integer class codes for the scaled test matrix, then decode them
# back to the original class names and store them on the test frame.
test_prediction = clf.predict(X=X_test)
test['class'] = le.inverse_transform(y=test_prediction)

In [16]:
# Keep only the identifier and the predicted label, write them to disk
# without the index, and show the first rows as a sanity check.
mysubmission = test.loc[:, ['ID', 'class']]
mysubmission.to_csv(path_or_buf='mysubmission.csv', index=False)
print(mysubmission.head(n=5))

   ID    class
0   1  orchard
1   2  orchard
2   3  orchard
3   4   forest
4   5  orchard
