In [1]:
import pandas as pd
import numpy as np


In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split,StratifiedKFold,cross_val_score,GridSearchCV
from sklearn.preprocessing import LabelEncoder,StandardScaler,PolynomialFeatures
from sklearn.impute import KNNImputer
from sklearn.metrics import classification_report,accuracy_score
from sklearn.feature_selection import SelectKBest,f_classif

In [4]:
df=pd.read_csv("hacktrain.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,ID,class,20150720_N,20150602_N,20150517_N,20150501_N,20150415_N,20150330_N,20150314_N,...,20140610_N,20140525_N,20140509_N,20140423_N,20140407_N,20140322_N,20140218_N,20140202_N,20140117_N,20140101_N
0,0,1,water,637.595,658.668,-1882.03,-1924.36,997.904,-1739.99,630.087,...,,-1043.16,-1942.49,267.138,,,211.328,-2203.02,-1180.19,433.906
1,1,2,water,634.24,593.705,-1625.79,-1672.32,914.198,-692.386,707.626,...,,-933.934,-625.385,120.059,364.858,476.972,220.878,-2250.0,-1360.56,524.075
2,3,4,water,58.0174,-1599.16,,-1052.63,,-1564.63,,...,-1025.88,368.622,,-1227.8,304.621,,369.214,-2202.12,,-1343.55
3,4,5,water,72.518,,380.436,-1256.93,515.805,-1413.18,-802.942,...,-1813.95,155.624,,-924.073,432.15,282.833,298.32,-2197.36,,-826.727
4,7,8,water,1136.44,,,1647.83,1935.8,,2158.98,...,1535.0,1959.43,-279.317,-384.915,-113.406,1020.72,1660.65,-116.801,-568.05,-1357.14


In [5]:
ndvi_columns=[col for col in df.columns if'_N' in col]

In [6]:
imputer=KNNImputer(n_neighbors=3)
df[ndvi_columns]=imputer.fit_transform(df[ndvi_columns])

In [7]:
df['ndvi_mean'] = df[ndvi_columns].mean(axis=1)
df['ndvi_std'] = df[ndvi_columns].std(axis=1)
df['ndvi_max'] = df[ndvi_columns].max(axis=1)
df['ndvi_min'] = df[ndvi_columns].min(axis=1)
df['ndvi_range'] = df['ndvi_max'] - df['ndvi_min']

In [8]:
le=LabelEncoder()
df['class_encoded']=le.fit_transform(df['class'])

In [9]:
features = ndvi_columns + ['ndvi_mean', 'ndvi_std', 'ndvi_max', 'ndvi_min', 'ndvi_range']
X = df[features]
y = df['class_encoded']

In [10]:
poly=PolynomialFeatures(degree=3,include_bias=False)
X_poly=poly.fit_transform(X)

In [11]:
scaler=StandardScaler()
X_scaled=scaler.fit_transform(X)

In [12]:
selector=SelectKBest(score_func=f_classif,k=20)
X_selected=selector.fit_transform(X_poly,y)

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42,stratify=y)

In [14]:
param_grid={'C':[0.01,0.1,1,10,100]}
grid=GridSearchCV(LogisticRegression(max_iter=1000),param_grid,cv=5)
grid.fit(X_train,y_train)

In [15]:
model=grid.best_estimator_


In [16]:
y_pred=model.predict(X_test)


In [17]:
print("Test set accuracy:", accuracy_score(y_test, y_pred))

Test set accuracy: 0.914375


In [18]:
print(classification_report(
    y_test,
    y_pred,
    labels=list(range(len(le.classes_))),
    target_names=le.classes_
))

              precision    recall  f1-score   support

        farm       0.76      0.62      0.68       168
      forest       0.94      0.98      0.96      1232
       grass       0.85      0.56      0.68        39
  impervious       0.83      0.87      0.85       134
     orchard       0.50      0.17      0.25         6
       water       0.87      0.62      0.72        21

    accuracy                           0.91      1600
   macro avg       0.79      0.64      0.69      1600
weighted avg       0.91      0.91      0.91      1600



In [19]:
cv_score=cross_val_score(model,X_scaled,y,cv=5)
print(cv_score.mean())

0.9043749999999999


In [20]:
check=pd.read_csv("hacktest.csv")
check.shape

(2845, 29)

In [21]:
ID= check['ID']
check.drop('ID',axis=1,inplace=True)

In [22]:
check_ndvi=check[ndvi_columns]

In [23]:
check_imputed=imputer.transform(check_ndvi)
check_imputed_df=pd.DataFrame(check_imputed,columns=ndvi_columns)

In [24]:
check_imputed_df['ndvi_mean'] = check_imputed_df.mean(axis=1)
check_imputed_df['ndvi_std'] = check_imputed_df.std(axis=1)
check_imputed_df['ndvi_max'] = check_imputed_df.max(axis=1)
check_imputed_df['ndvi_min'] = check_imputed_df.min(axis=1)
check_imputed_df['ndvi_range'] = check_imputed_df['ndvi_max'] - check_imputed_df['ndvi_min']

In [25]:
check_scaled=scaler.transform(check_imputed_df)

In [26]:
check_poly=poly.transform(check_scaled)



In [27]:
check_predictions=model.predict(check_scaled)

In [28]:
y_test_labels=le.inverse_transform(check_predictions)

In [29]:
submission=pd.DataFrame({'ID':ID,'class':y_test_labels})

In [30]:
submission

Unnamed: 0,ID,class
0,1,orchard
1,2,orchard
2,3,orchard
3,4,forest
4,5,orchard
...,...,...
2840,2841,water
2841,2842,water
2842,2843,water
2843,2844,water


In [31]:
submission.to_csv('submission.csv',index=False)

In [32]:
from google.colab import files
files.download('submission.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>