In [1]:
import pandas as pd
import numpy as np
# `plt` is the conventional alias for the pyplot interface; aliasing the
# top-level `matplotlib` package as `plt` makes calls such as `plt.plot(...)` fail.
import matplotlib.pyplot as plt


In [2]:
# Read the training and test tables from CSV files in the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ("hacktrain.csv", "hacktest.csv"))

In [3]:
# Plain-text preview of the first five training rows.
print(train.head(n=5))

   Unnamed: 0  ID  class  20150720_N  20150602_N  20150517_N  20150501_N  \
0           0   1  water    637.5950     658.668   -1882.030    -1924.36   
1           1   2  water    634.2400     593.705   -1625.790    -1672.32   
2           3   4  water     58.0174   -1599.160         NaN    -1052.63   
3           4   5  water     72.5180         NaN     380.436    -1256.93   
4           7   8  water   1136.4400         NaN         NaN     1647.83   

   20150415_N  20150330_N  20150314_N  ...  20140610_N  20140525_N  \
0     997.904   -1739.990     630.087  ...         NaN   -1043.160   
1     914.198    -692.386     707.626  ...         NaN    -933.934   
2         NaN   -1564.630         NaN  ...    -1025.88     368.622   
3     515.805   -1413.180    -802.942  ...    -1813.95     155.624   
4    1935.800         NaN    2158.980  ...     1535.00    1959.430   

   20140509_N  20140423_N  20140407_N  20140322_N  20140218_N  20140202_N  \
0   -1942.490     267.138         NaN        

In [4]:
# Number of training rows per target class (most frequent first).
print(train.loc[:, 'class'].value_counts())


class
forest        6159
farm           841
impervious     669
grass          196
water          105
orchard         30
Name: count, dtype: int64


In [5]:
# Count of missing values in each training column.
print(train.isna().sum())

Unnamed: 0       0
ID               0
class            0
20150720_N     560
20150602_N    1200
20150517_N     800
20150501_N     960
20150415_N     480
20150330_N    1120
20150314_N     720
20150226_N    1360
20150210_N     640
20150125_N    1040
20150109_N     880
20141117_N    1280
20141101_N     400
20141016_N    1440
20140930_N     800
20140813_N     560
20140626_N    1600
20140610_N     480
20140525_N     720
20140509_N     880
20140423_N    1760
20140407_N     640
20140322_N    1120
20140218_N    1440
20140202_N     560
20140117_N    1200
20140101_N     400
dtype: int64


In [6]:
from sklearn.impute import SimpleImputer

# The NDVI time-series columns are the ones whose name ends in "_N".
ndvi_cols = [name for name in train.columns if name.endswith('_N')]

# Learn per-column medians from the training table only, then fill the
# missing values of both tables with those same medians.
imputer = SimpleImputer(strategy='median').fit(train[ndvi_cols])
train[ndvi_cols] = imputer.transform(train[ndvi_cols])
test[ndvi_cols] = imputer.transform(test[ndvi_cols])


In [7]:
# Remove the leftover CSV index column. `errors='ignore'` keeps the cell
# re-runnable: a second execution is a no-op instead of raising KeyError.
train = train.drop(columns=['Unnamed: 0'], errors='ignore')
test = test.drop(columns=['Unnamed: 0'], errors='ignore')

In [8]:
def generate_features(df):
    """Add per-row summary statistics of the ``*_N`` columns to ``df``.

    The columns whose name ends in ``'_N'`` are summarised row-wise and six
    new columns are written onto ``df`` itself: ``ndvi_mean``, ``ndvi_std``,
    ``ndvi_max``, ``ndvi_min``, ``ndvi_range`` (max minus min) and
    ``ndvi_median``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the ``*_N`` columns. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the six feature columns added.
    """
    series_cols = [name for name in df.columns if name.endswith('_N')]
    values = df[series_cols]

    row_max = values.max(axis=1)
    row_min = values.min(axis=1)

    df['ndvi_mean'] = values.mean(axis=1)
    df['ndvi_std'] = values.std(axis=1)
    df['ndvi_max'] = row_max
    df['ndvi_min'] = row_min
    df['ndvi_range'] = row_max - row_min
    df['ndvi_median'] = values.median(axis=1)

    return df

# Add the NDVI summary-statistic columns to both tables (train first, then test).
train, test = (generate_features(frame) for frame in (train, test))


In [9]:
# Discard the raw NDVI columns now that the summary features exist.
# `errors='ignore'` keeps the cell re-runnable: a second execution is a no-op
# instead of raising KeyError.
# NOTE(review): the later cells In [15]-[17] search `train.columns` for '_N'
# columns to compute a trend; after this drop none remain, so that list is
# empty. Confirm whether the raw columns should be kept for those cells.
train = train.drop(columns=ndvi_cols, errors='ignore')
test = test.drop(columns=ndvi_cols, errors='ignore')


In [10]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler


In [11]:
from sklearn.preprocessing import LabelEncoder

# Map the string class names to integer codes; `le` is kept so predictions
# can be mapped back to class names later.
le = LabelEncoder().fit(train['class'])
train['class_encoded'] = le.transform(train['class'])


In [12]:
# Features: the six per-row NDVI summary statistics. Target: integer-encoded class.
X = train[['ndvi_mean', 'ndvi_std', 'ndvi_max', 'ndvi_min', 'ndvi_range', 'ndvi_median']]
y = train['class_encoded']

# Hold out 20% for validation, stratified so every class keeps its proportion.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Fit the scaler on the training fold only, then apply it to the validation fold.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# `multi_class='multinomial'` is intentionally not passed: the parameter is
# deprecated as of scikit-learn 1.5 (emits a FutureWarning) and the lbfgs solver
# already fits a multinomial model for targets with three or more classes.
# `class_weight='balanced'` reweights classes inversely to their frequency.
model = LogisticRegression(solver='lbfgs', class_weight='balanced', max_iter=1000)
model.fit(X_train_scaled, y_train)





In [13]:
# Predict on the scaled validation fold and report overall and per-class metrics.
y_val_pred = model.predict(X_val_scaled)

print("Accuracy:", accuracy_score(y_true=y_val, y_pred=y_val_pred))
print(
    "Classification Report:\n",
    classification_report(y_true=y_val, y_pred=y_val_pred, target_names=le.classes_),
)


Accuracy: 0.62875
Classification Report:
               precision    recall  f1-score   support

        farm       0.40      0.60      0.48       168
      forest       0.98      0.62      0.76      1232
       grass       0.20      0.62      0.30        39
  impervious       0.78      0.77      0.77       134
     orchard       0.02      0.83      0.03         6
       water       0.50      0.67      0.57        21

    accuracy                           0.63      1600
   macro avg       0.48      0.68      0.49      1600
weighted avg       0.88      0.63      0.71      1600



In [14]:
# Scale the test features with the scaler fitted on the training fold,
# predict, and translate the integer predictions back to class names.
X_test_scaled = scaler.transform(
    test[['ndvi_mean', 'ndvi_std', 'ndvi_max', 'ndvi_min', 'ndvi_range', 'ndvi_median']]
)
test_preds = model.predict(X_test_scaled)
test_labels = le.inverse_transform(test_preds)

# Two-column submission table: row identifier and predicted class name.
submission = pd.DataFrame({'ID': test['ID'], 'class': test_labels})
submission.to_csv('submission.csv', index=False)

print(submission.head())


   ID    class
0   1   forest
1   2  orchard
2   3  orchard
3   4  orchard
4   5  orchard


In [15]:
# Select NDVI columns with the same rule used elsewhere in this notebook
# (name ends in '_N'); a substring test would also match unrelated names
# that merely contain '_N'.
# NOTE(review): In [9] drops the '_N' columns from `train`, so in a
# top-to-bottom run this list is empty here. Confirm the intended cell order.
ndvi_cols = [col for col in train.columns if col.endswith('_N')]


In [16]:
# Coerce every NDVI column of both tables to a numeric dtype; any value that
# cannot be parsed becomes NaN.
for ndvi_col in ndvi_cols:
    for frame in (train, test):
        frame[ndvi_col] = pd.to_numeric(frame[ndvi_col], errors='coerce')


In [17]:
# Trend calculation
from scipy.stats import linregress
def compute_trend(row, cols=None):
    """Return the least-squares slope of a row's NDVI values over column position.

    The x-axis is the ordinal position of each column in ``cols``
    (0, 1, 2, ...), not a calendar date. NaN entries are excluded from the fit.
    NOTE(review): the ``train.head()`` output earlier in this notebook lists the
    ``*_N`` columns newest-first, so a positive slope would mean values rise
    toward older dates - confirm that this sign convention is intended.

    Parameters
    ----------
    row : pandas.Series
        One table row, indexable by the names in ``cols``.
    cols : sequence of str, optional
        Columns to fit, in x-axis order. Defaults to the notebook-level
        ``ndvi_cols`` list, which is the behaviour of a one-argument call.

    Returns
    -------
    float
        Slope of the fitted line, or ``0.0`` when fewer than two non-NaN
        values are available.
    """
    if cols is None:
        cols = ndvi_cols  # notebook-level default, looked up at call time

    ndvi_values = row[cols].values.astype(float)  # Ensure float
    mask = ~np.isnan(ndvi_values)

    # A line needs at least two points; otherwise report "no trend".
    if mask.sum() > 1:
        positions = np.arange(len(cols))
        return linregress(positions[mask], ndvi_values[mask]).slope
    return 0.0
