In [31]:
import pandas as pd
import numpy as np

# Load datasets
train_df = pd.read_csv('data/hacktrain.csv')
test_df = pd.read_csv('data/hacktest.csv')

# Check data
print(train_df.head())
print(test_df.head())


   Unnamed: 0  ID  class  20150720_N  20150602_N  20150517_N  20150501_N  \
0           0   1  water    637.5950     658.668   -1882.030    -1924.36   
1           1   2  water    634.2400     593.705   -1625.790    -1672.32   
2           3   4  water     58.0174   -1599.160         NaN    -1052.63   
3           4   5  water     72.5180         NaN     380.436    -1256.93   
4           7   8  water   1136.4400         NaN         NaN     1647.83   

   20150415_N  20150330_N  20150314_N  ...  20140610_N  20140525_N  \
0     997.904   -1739.990     630.087  ...         NaN   -1043.160   
1     914.198    -692.386     707.626  ...         NaN    -933.934   
2         NaN   -1564.630         NaN  ...    -1025.88     368.622   
3     515.805   -1413.180    -802.942  ...    -1813.95     155.624   
4    1935.800         NaN    2158.980  ...     1535.00    1959.430   

   20140509_N  20140423_N  20140407_N  20140322_N  20140218_N  20140202_N  \
0   -1942.490     267.138         NaN        

In [32]:
print(train_df.info())
print(train_df.isnull().sum())
print(train_df.describe())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 30 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  8000 non-null   int64  
 1   ID          8000 non-null   int64  
 2   class       8000 non-null   object 
 3   20150720_N  7440 non-null   float64
 4   20150602_N  6800 non-null   float64
 5   20150517_N  7200 non-null   float64
 6   20150501_N  7040 non-null   float64
 7   20150415_N  7520 non-null   float64
 8   20150330_N  6880 non-null   float64
 9   20150314_N  7280 non-null   float64
 10  20150226_N  6640 non-null   float64
 11  20150210_N  7360 non-null   float64
 12  20150125_N  6960 non-null   float64
 13  20150109_N  7120 non-null   float64
 14  20141117_N  6720 non-null   float64
 15  20141101_N  7600 non-null   float64
 16  20141016_N  6560 non-null   float64
 17  20140930_N  7200 non-null   float64
 18  20140813_N  7440 non-null   float64
 19  20140626_N  6400 non-null  

In [33]:
print(train_df['class'].value_counts())


class
forest        6159
farm           841
impervious     669
grass          196
water          105
orchard         30
Name: count, dtype: int64


In [39]:
ndvi_cols = [col for col in train_df.columns if '_N' in col]
train_df[ndvi_cols] = train_df[ndvi_cols].interpolate(axis=1)
test_df[ndvi_cols] = test_df[ndvi_cols].interpolate(axis=1)


In [38]:
train_df['ndvi_mean'] = train_df[ndvi_cols].mean(axis=1)
train_df['ndvi_std'] = train_df[ndvi_cols].std(axis=1)
test_df['ndvi_mean'] = test_df[ndvi_cols].mean(axis=1)
test_df['ndvi_std'] = test_df[ndvi_cols].std(axis=1)


In [40]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
train_df['class_encoded'] = le.fit_transform(train_df['class'])


In [41]:
features = ndvi_cols + ['ndvi_mean', 'ndvi_std']
X = train_df[features]
y = train_df['class_encoded']
X_test = test_df[features]


In [42]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [45]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

model = LogisticRegression(max_iter=1000, multi_class='multinomial', solver='saga')
model.fit(X_train_scaled, y_train)


In [46]:
model = LogisticRegression(max_iter=5000, multi_class='multinomial', solver='saga')
model.fit(X_train_scaled, y_train)


In [48]:
from sklearn.impute import SimpleImputer

# (A) Impute before scaling
imputer = SimpleImputer(strategy='mean')
X_train_imp = imputer.fit_transform(X_train)
X_val_imp = imputer.transform(X_val)
X_test_imp = imputer.transform(X_test)

# (B) Then scale
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imp)
X_val_scaled = scaler.transform(X_val_imp)
X_test_scaled = scaler.transform(X_test_imp)

# (C) Fit model
model = LogisticRegression(max_iter=5000, multi_class='multinomial', solver='saga')
model.fit(X_train_scaled, y_train)

# (D) Predict and evaluate
val_preds = model.predict(X_val_scaled)
print("Validation Accuracy:", accuracy_score(y_val, val_preds))


Validation Accuracy: 0.911875


In [49]:
print(X_train.isnull().sum().sum(), X_val.isnull().sum().sum())


0 637


In [53]:
# After imputation and scaling:
val_preds = model.predict(X_val_scaled)
print("Validation Accuracy:", accuracy_score(y_val, val_preds))


Validation Accuracy: 0.911875


In [54]:
print(np.isnan(X_test).sum())  # Should be 0 after imputation


20150720_N    0
20150602_N    0
20150517_N    0
20150501_N    0
20150415_N    0
20150330_N    0
20150314_N    0
20150226_N    0
20150210_N    0
20150125_N    0
20150109_N    0
20141117_N    0
20141101_N    0
20141016_N    0
20140930_N    0
20140813_N    0
20140626_N    0
20140610_N    0
20140525_N    0
20140509_N    0
20140423_N    0
20140407_N    0
20140322_N    0
20140218_N    0
20140202_N    0
20140117_N    0
20140101_N    0
ndvi_mean     0
ndvi_std      0
dtype: int64


In [55]:
# Predict the encoded class labels for the test set
test_preds = model.predict(X_test_scaled)


In [56]:
# Convert numeric predictions back to original class labels
test_class_labels = le.inverse_transform(test_preds)


In [57]:
# Add predictions to the test dataframe
test_df['class'] = test_class_labels

# Prepare submission DataFrame with only ID and class columns
submission = test_df[['ID', 'class']]


In [60]:
# Save to CSV (no index)
submission.to_csv('submissions/submission.csv', index=False)
