In [1]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score


df = pd.read_csv("/Users/sanskarranjan/Downloads/hacktrain.csv")
df.drop(columns=['ID'], inplace=True, errors='ignore')


label_encoder = LabelEncoder()
df['class'] = label_encoder.fit_transform(df['class'])


df = pd.get_dummies(df, drop_first=True)


X = df.drop(columns=['class']).values
y = df['class']


imputer = KNNImputer(n_neighbors=5)
X_imputed = imputer.fit_transform(X)
print("NaNs after imputation:", np.isnan(X_imputed).sum())


scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)


selector = SelectKBest(f_classif, k=10)
X_selected = selector.fit_transform(X_scaled, y)


X_train, X_test, y_train, y_test = train_test_split(
    X_selected, y, test_size=0.2, stratify=y, random_state=42
)


param_dist = {
    'n_estimators': [100, 200, 300],
    'max_depth': [8, 12, 16, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

rf = RandomForestClassifier(class_weight='balanced', random_state=42)


random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=25,  
    scoring='f1_weighted',
    cv=5,
    verbose=1,
    n_jobs=-1,
    random_state=42
)


random_search.fit(X_train, y_train)
best_rf = random_search.best_estimator_
print("Best Parameters:", random_search.best_params_)


y_pred = best_rf.predict(X_test)
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))
print("Accuracy:", accuracy_score(y_test, y_pred))

test_data = pd.read_csv("/Users/sanskarranjan/Downloads/hacktest.csv")
test_clean = test_data.drop(columns=['ID'])


feature_columns = df.drop(columns=['class']).columns
test_clean = test_clean[feature_columns]

test_imputed = imputer.transform(test_clean)
test_scaled = scaler.transform(test_imputed)
test_selected = selector.transform(test_scaled)

y_test_pred = best_rf.predict(test_selected)
y_decoded = label_encoder.inverse_transform(y_test_pred)

submission = pd.DataFrame({
    'ID': test_data['ID'],
    'class': y_decoded
})
submission.to_csv("submission_rf_randomsearch.csv", index=False)


NaNs after imputation: 0
Fitting 5 folds for each of 25 candidates, totalling 125 fits
Best Parameters: {'n_estimators': 100, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'log2', 'max_depth': 16}
              precision    recall  f1-score   support

        farm       0.87      0.95      0.91       168
      forest       1.00      1.00      1.00      1232
       grass       0.96      0.67      0.79        39
  impervious       0.90      0.90      0.90       134
     orchard       0.60      0.50      0.55         6
       water       0.83      0.95      0.89        21

    accuracy                           0.97      1600
   macro avg       0.86      0.83      0.84      1600
weighted avg       0.97      0.97      0.97      1600

Accuracy: 0.9725


