In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from imblearn.over_sampling import SMOTE  # For handling class imbalance

# Load and filter data
wave7_csv = "data/preprocessed/filtered_wave_7.csv"
df = pd.read_csv(wave7_csv, low_memory=False)
df_usa = df[df["Country"] == 840].copy()

# Replace missing values
missing_values = [-1, -2, -4, -5]
df_usa = df_usa.replace(missing_values, np.nan)

# Select relevant columns
selected_columns = [
    "C Government", "C Political parties", "C Courts", "C Elections",
    "Age", "Sex", "Scale of incomes", "Highest educational level",
    "Importance of democracy", "Strong Leader", "Expert Non Govt Person"
]
df_model = df_usa[selected_columns]

# Fill missing values
for col in df_model.columns:
    if df_model[col].dtype == "object" or col in ["Sex", "Scale of incomes", "Highest educational level"]:
        df_model[col].fillna(df_model[col].mode()[0], inplace=True)
    else:
        df_model[col].fillna(df_model[col].median(), inplace=True)

# Encode categorical variables
encoder = LabelEncoder()
for col in ["Sex", "Scale of incomes", "Highest educational level", "Strong Leader", "Expert Non Govt Person"]:
    df_model[col] = encoder.fit_transform(df_model[col])

# Normalize continuous variables
scaler = MinMaxScaler()
df_model["Age"] = scaler.fit_transform(df_model[["Age"]].copy())
df_model["Importance of democracy"] = scaler.fit_transform(df_model[["Importance of democracy"]].copy())

# Define target and predictors
X = df_model.drop("C Government", axis=1)
y = df_model["C Government"]

# Handle class imbalance with SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)

# Train Random Forest with class weights
param_grid = {
    "n_estimators": [100, 200, 300],
    "max_depth": [10, 20, 30],
    "min_samples_split": [2, 5],
    "min_samples_leaf": [1, 2]
}
grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42, class_weight="balanced"),
                           param_grid=param_grid, cv=5, scoring="accuracy")
grid_search.fit(X_train, y_train)
print("Best Parameters:", grid_search.best_params_)

# Evaluate model
y_pred = grid_search.predict(X_test)
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nAccuracy Score:")
print(accuracy_score(y_test, y_pred))

# Feature Importance
feature_importances = pd.DataFrame({
    "Feature": X.columns,
    "Importance": grid_search.feature_importances_
}).sort_values(by="Importance", ascending=False)
print("\nFeature Importances:")
print(feature_importances)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_model[col].fillna(df_model[col].median(), inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_model[col].fillna(df_model[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] 

Best Parameters: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Confusion Matrix:
[[232  23   9  12]
 [ 35 180  62  39]
 [ 19  70 134  85]
 [  9  25  66 204]]

Classification Report:
              precision    recall  f1-score   support

         1.0       0.79      0.84      0.81       276
         2.0       0.60      0.57      0.59       316
         3.0       0.49      0.44      0.46       308
         4.0       0.60      0.67      0.63       304

    accuracy                           0.62      1204
   macro avg       0.62      0.63      0.62      1204
weighted avg       0.62      0.62      0.62      1204


Accuracy Score:
0.6229235880398671


AttributeError: 'GridSearchCV' object has no attribute 'feature_importances_'