In [4]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings('ignore')


class NDVILandCoverClassifier:
    """Random-forest classifier over summary features of NDVI columns.

    NDVI columns are the DataFrame columns whose name ends with ``'_N'``.
    The pipeline is: coerce to numeric -> impute / clip / rescale ->
    summary-feature extraction -> standard scaling -> random forest.

    Imputation medians, outlier bounds and the rescale factor are learned
    when ``preprocess_data`` is called with ``is_training=True`` and are
    reused when it is called with ``is_training=False``, so inference data
    goes through the same transformation as the training data.
    """

    # Imputation value for a column that has no valid observation at all.
    DEFAULT_NDVI = 0.3
    # When the largest absolute NDVI value exceeds this threshold, every
    # NDVI column is divided by that largest absolute value.
    RESCALE_THRESHOLD = 3

    # Preprocessing state learned from the training frame (None = not yet
    # learned). Class-level defaults keep instances usable even when they
    # were created without running __init__ (e.g. restored objects).
    _ndvi_stats = None
    _ndvi_scale = None

    def __init__(self, random_state=42):
        """Create the scaler, label encoder and random-forest model.

        Args:
            random_state: Seed forwarded to the random forest.
        """
        self.random_state = random_state
        self.scaler = StandardScaler()
        self.label_encoder = LabelEncoder()

        self.model = RandomForestClassifier(
            n_estimators=100,
            max_depth=10,
            min_samples_split=10,
            min_samples_leaf=4,
            max_features='sqrt',
            bootstrap=True,
            oob_score=True,
            class_weight='balanced',
            random_state=random_state,
            n_jobs=-1
        )
        self.ndvi_columns = []
        self._ndvi_stats = None
        self._ndvi_scale = None

    def extract_ndvi_columns(self, df):
        """Return the sorted names of the columns of *df* ending in ``'_N'``.

        Non-string column labels are skipped instead of raising.
        """
        return sorted(
            col for col in df.columns
            if isinstance(col, str) and col.endswith('_N')
        )

    def clean_data(self, df, ndvi_cols):
        """Return a copy of *df* with *ndvi_cols* coerced to numeric.

        Values that cannot be parsed become NaN; *df* itself is not modified.
        """
        df_clean = df.copy()
        for col in ndvi_cols:
            df_clean[col] = pd.to_numeric(df_clean[col], errors='coerce')
        return df_clean

    def _ndvi_preprocess(self, df, ndvi_cols, stats=None, scale=None):
        """Impute, clip and rescale *ndvi_cols*, optionally with known stats.

        Args:
            df: Frame whose NDVI columns are already numeric.
            ndvi_cols: List of NDVI column names to process.
            stats: Optional ``{column: (median, lower, upper)}`` learned
                earlier. Columns missing from it are computed from *df*.
            scale: Optional divisor learned earlier (``1.0`` = no rescale).
                When ``None`` it is derived from *df*.

        Returns:
            ``(processed_frame, stats_used, scale_used)``.
        """
        df_processed = df.copy()
        known = stats or {}
        used_stats = {}

        for col in ndvi_cols:
            series = df_processed[col]
            if col in known:
                median_val, lower, upper = known[col]
                series = series.fillna(median_val)
            else:
                median_val = series.median()
                if np.isnan(median_val):
                    median_val = self.DEFAULT_NDVI
                series = series.fillna(median_val)
                # Bounds are 1.5 spreads beyond the 1st / 99th percentiles.
                low_q = series.quantile(0.01)
                high_q = series.quantile(0.99)
                spread = high_q - low_q
                lower = low_q - 1.5 * spread
                upper = high_q + 1.5 * spread
            # Assign the result back: an in-place fillna on a column
            # selection does not reliably update the parent frame.
            df_processed[col] = series.clip(lower, upper)
            used_stats[col] = (median_val, lower, upper)

        if scale is None:
            values = df_processed[ndvi_cols]
            max_val = max(abs(values.min().min()), abs(values.max().max()))
            # NaN (no data) compares False and therefore means "no rescale".
            scale = max_val if max_val > self.RESCALE_THRESHOLD else 1.0

        for col in ndvi_cols:
            series = df_processed[col]
            if scale != 1.0:
                series = series / scale
            df_processed[col] = series.clip(-1, 1)

        return df_processed, used_stats, scale

    def preprocess_ndvi(self, df, ndvi_cols):
        """Return a copy of *df* with *ndvi_cols* imputed, clipped and rescaled.

        All statistics are computed from *df* itself and nothing is stored on
        the instance. Missing values are replaced by the column median (or
        ``DEFAULT_NDVI`` when the column has no valid value), outliers are
        clipped, the columns are divided by the largest absolute value when it
        exceeds ``RESCALE_THRESHOLD``, and the result is clipped to [-1, 1].
        """
        processed, _, _ = self._ndvi_preprocess(df, ndvi_cols)
        return processed

    def extract_features(self, df, ndvi_cols):
        """Build the per-row feature frame from the NDVI columns of *df*.

        Features: row statistics, threshold counts, mean/max of four
        consecutive column groups ("quarters"), differences between
        consecutive quarter means, and five evenly spaced raw values.
        Remaining NaNs are replaced by 0.

        Raises:
            ValueError: If *ndvi_cols* is empty.
        """
        total_points = len(ndvi_cols)
        if total_points == 0:
            raise ValueError(
                "No NDVI columns (names ending in '_N') to extract features from."
            )

        features = pd.DataFrame(index=df.index)
        ndvi_data = df[ndvi_cols].values

        features['mean'] = np.mean(ndvi_data, axis=1)
        features['std'] = np.std(ndvi_data, axis=1)
        features['min'] = np.min(ndvi_data, axis=1)
        features['max'] = np.max(ndvi_data, axis=1)
        features['range'] = features['max'] - features['min']
        features['median'] = np.median(ndvi_data, axis=1)

        features['above_0.2'] = (ndvi_data > 0.2).sum(axis=1)
        features['above_0.4'] = (ndvi_data > 0.4).sum(axis=1)
        features['above_0.6'] = (ndvi_data > 0.6).sum(axis=1)
        features['below_0'] = (ndvi_data < 0).sum(axis=1)
        features['below_0.2'] = (ndvi_data < 0.2).sum(axis=1)

        quarter = total_points // 4
        for i in range(4):
            start = i * quarter
            # The last quarter absorbs the remainder columns.
            end = (i + 1) * quarter if i < 3 else total_points
            quarter_data = ndvi_data[:, start:end]
            if quarter_data.shape[1] == 0:
                # Fewer than four columns: reducing an empty slice would
                # raise, so mark the quarter missing (becomes 0 below).
                features[f'q{i+1}_mean'] = np.nan
                features[f'q{i+1}_max'] = np.nan
            else:
                features[f'q{i+1}_mean'] = np.mean(quarter_data, axis=1)
                features[f'q{i+1}_max'] = np.max(quarter_data, axis=1)

        for i in range(3):
            features[f'diff_q{i+1}_to_q{i+2}'] = (
                features[f'q{i+2}_mean'] - features[f'q{i+1}_mean']
            )

        selected_points = np.linspace(0, total_points - 1, 5).astype(int)
        for i, idx in enumerate(selected_points):
            features[f'ndvi_t{i}'] = df[ndvi_cols[idx]]

        return features.fillna(0)

    def preprocess_data(self, df, is_training=True):
        """Run cleaning, NDVI preprocessing and feature extraction on *df*.

        Args:
            df: Raw input frame.
            is_training: When True, imputation medians, clip bounds and the
                rescale factor are learned from *df* and stored. When False,
                the stored values are reused; anything that was never learned
                is computed from *df*.

        Returns:
            ``(features, processed)``: the feature frame (with an ``'ID'``
            column when *df* has one) and the preprocessed copy of *df*.
        """
        print("Preprocessing data...")
        self.ndvi_columns = self.extract_ndvi_columns(df)
        cleaned = self.clean_data(df, self.ndvi_columns)

        if is_training:
            processed, self._ndvi_stats, self._ndvi_scale = self._ndvi_preprocess(
                cleaned, self.ndvi_columns
            )
        else:
            processed, _, _ = self._ndvi_preprocess(
                cleaned, self.ndvi_columns, self._ndvi_stats, self._ndvi_scale
            )

        features = self.extract_features(processed, self.ndvi_columns)

        if 'ID' in df.columns:
            features['ID'] = df['ID']
        return features, processed

    def fit(self, X, y):
        """Fit the label encoder, scaler and forest; print training diagnostics.

        Prints the out-of-bag score, a classification report on the training
        data and the distribution of the predicted training labels.

        Returns:
            ``self``.
        """
        print("Training Random Forest model...")
        y_encoded = self.label_encoder.fit_transform(y)
        X_scaled = self.scaler.fit_transform(X)
        self.model.fit(X_scaled, y_encoded)

        print(f"OOB Score: {self.model.oob_score_:.4f}")
        y_pred = self.model.predict(X_scaled)
        print("Training performance:")
        # The report expects string names; labels may be non-string.
        class_names = [str(name) for name in self.label_encoder.classes_]
        print(classification_report(y_encoded, y_pred, target_names=class_names))

        print("Predicted class distribution:")
        print(pd.Series(self.label_encoder.inverse_transform(y_pred)).value_counts())

        return self

    def predict(self, X):
        """Return predicted class labels (original label values) for *X*."""
        X_scaled = self.scaler.transform(X)
        return self.label_encoder.inverse_transform(self.model.predict(X_scaled))

    def predict_proba(self, X):
        """Return the model's class-probability estimates for *X*."""
        X_scaled = self.scaler.transform(X)
        return self.model.predict_proba(X_scaled)


def main():
    """Train on ``hacktrain.csv``, predict ``hacktest.csv``, write ``submission.csv``.

    Progress and diagnostics are printed along the way. Any exception raised
    inside the workflow is reported together with its traceback.

    Returns:
        ``(classifier, output)`` on success, where ``output`` is the frame
        written to ``submission.csv``; ``(None, None)`` when an exception
        occurred.
    """
    print("NDVI Land Cover Classification")
    print("=" * 50)

    model = NDVILandCoverClassifier(random_state=42)

    try:
        # --- training ---------------------------------------------------
        print("Loading training data...")
        training_frame = pd.read_csv("hacktrain.csv")
        print(f"Training set shape: {training_frame.shape}")
        print("Class distribution:")
        labels = training_frame['class']
        print(labels.value_counts())

        train_features = model.preprocess_data(training_frame)[0]
        # The identifier column is carried along for bookkeeping only.
        model.fit(train_features.drop(columns=['ID'], errors='ignore'), labels)

        # --- inference --------------------------------------------------
        print("Loading test data...")
        holdout_frame = pd.read_csv("hacktest.csv")
        print(f"Test set shape: {holdout_frame.shape}")

        holdout_features = model.preprocess_data(holdout_frame, is_training=False)[0]
        holdout_matrix = holdout_features.drop(columns=['ID'], errors='ignore')

        print("Generating predictions...")
        predicted_labels = model.predict(holdout_matrix)

        # --- export -----------------------------------------------------
        submission_frame = pd.DataFrame(
            {'ID': holdout_frame['ID'], 'class': predicted_labels}
        )
        submission_frame.to_csv('submission.csv', index=False)

        print("Submission file saved as 'submission.csv'")
        print("Class distribution in predictions:")
        print(submission_frame['class'].value_counts())
        print("Sample predictions:")
        print(submission_frame.head(10))

        return model, submission_frame

    except Exception as e:
        # Top-level boundary: report the failure and hand back empty results.
        print("Error during execution:", e)
        import traceback
        traceback.print_exc()
        return None, None


def analyze_feature_importance(classifier, features):
    """Print and return the model's feature importances, highest first.

    Args:
        classifier: Object whose ``model`` attribute may expose
            ``feature_importances_``.
        features: Feature names, one per importance value.

    Returns:
        A frame with ``feature`` and ``importance`` columns sorted by
        descending importance, or ``None`` when the model exposes no
        importances.
    """
    if not hasattr(classifier.model, 'feature_importances_'):
        print("Feature importance not available.")
        return None

    ranking = pd.DataFrame({
        'feature': features,
        'importance': classifier.model.feature_importances_
    })
    ranking = ranking.sort_values(by='importance', ascending=False)

    print("Top 15 important features:")
    print(ranking.head(15))
    return ranking


# Entry point: run the workflow only when executed as a script (or notebook
# cell) and keep both results in module-level names; each is None if main()
# caught an exception.
if __name__ == "__main__":
    classifier, submission = main()


NDVI Land Cover Classification
Loading training data...
Training set shape: (8000, 30)
Class distribution:
class
forest        6159
farm           841
impervious     669
grass          196
water          105
orchard         30
Name: count, dtype: int64
Preprocessing data...
Training Random Forest model...
OOB Score: 0.8769
Training performance:
              precision    recall  f1-score   support

        farm       0.65      0.93      0.77       841
      forest       0.99      0.92      0.95      6159
       grass       0.71      0.97      0.82       196
  impervious       0.91      0.91      0.91       669
     orchard       0.56      1.00      0.71        30
       water       0.94      0.96      0.95       105

    accuracy                           0.92      8000
   macro avg       0.79      0.95      0.85      8000
weighted avg       0.94      0.92      0.93      8000

Predicted class distribution:
forest        5706
farm          1195
impervious     669
grass          268
wate