In [1]:
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE  # If needed, install with: pip install imbalanced-learn
from imblearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder


In [2]:
# Read the dataset into a DataFrame.
# The path is relative, so it is resolved against the kernel's working directory.
df = pd.read_csv(filepath_or_buffer='Model2.csv')

In [3]:
# Replace the 'income' labels with integer codes.
# The fitted encoder stays available as `le`, so the codes can be mapped back
# to the original labels via le.classes_ / le.inverse_transform.
le = LabelEncoder().fit(df['income'])
df['income'] = le.transform(df['income'])


In [4]:
# Separate the target column from the predictors:
# y is the encoded 'income' column, X is every remaining column.
y = df['income']
X = df.drop('income', axis=1)


In [5]:
# Split data into training and testing sets (80% / 20%).
# stratify=y keeps the class proportions of the target identical in both
# splits. Without it, a skewed target can leave the held-out set with a
# different class ratio than the data it is supposed to represent.
# random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [6]:
# 1. Address class imbalance with SMOTE
# Only the training split is resampled; X_test / y_test are left untouched so
# evaluation happens on real rows.
sm = SMOTE(random_state=42)
(X_train_balanced, y_train_balanced) = sm.fit_resample(X=X_train, y=y_train)

In [None]:
# 2. Hyperparameter Tuning for RandomForestClassifier
#
# SMOTE runs as a pipeline step, so inside each CV split it is fitted on the
# training folds only and the validation fold contains real rows only.
# Searching on data that was balanced up front would put synthetic points
# interpolated from validation rows into the training folds and inflate the
# cross-validated score.
rf_search_pipeline = Pipeline([
    ('smote', SMOTE(random_state=42)),
    # Seeded so repeated runs select the same parameters.
    ('rf', RandomForestClassifier(random_state=42)),
])
param_grid = {
    'rf__n_estimators': [100, 200, 300],
    'rf__max_depth': [None, 10, 20],
    'rf__min_samples_split': [2, 5, 10],
    # Add more parameters as needed (keep the 'rf__' step prefix)
}
# 27 candidates x 5 folds = 135 fits; n_jobs=-1 spreads them over all cores.
grid_search_rf = GridSearchCV(
    rf_search_pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1
)
grid_search_rf.fit(X_train, y_train)
# Drop the 'rf__' step prefix so best_params_rf holds plain RandomForestClassifier
# keyword arguments, e.g. for RandomForestClassifier(**best_params_rf).
best_params_rf = {
    name.split('__', 1)[1]: value
    for name, value in grid_search_rf.best_params_.items()
}


In [None]:
# Train the RandomForestClassifier with the best parameters from the grid search.
# random_state is fixed because the forest is stochastic (bootstrap samples and
# feature sub-sampling); unseeded, the importances and predictions below would
# change on every run.
best_rf_model = RandomForestClassifier(**best_params_rf, random_state=42)
best_rf_model.fit(X_train_balanced, y_train_balanced)


In [None]:
# 3. Feature Importance
# The chart is drawn through pandas' plotting API because this notebook does
# not import matplotlib.pyplot, so calls on `plt` raise NameError on a fresh kernel.
feature_importances = best_rf_model.feature_importances_
# Label each importance with its column name and sort ascending, so the most
# important feature ends up at the top of the horizontal bar chart.
importance_by_feature = pd.Series(
    feature_importances, index=X_train.columns
).sort_values()
ax = importance_by_feature.plot.barh(figsize=(10, 6))
# Trailing ';' suppresses the repr of the objects returned by ax.set().
ax.set(
    xlabel='Feature Importance',
    ylabel='Feature',
    title='Random Forest Feature Importance',
);


In [None]:
# 4. Ensemble Methods: Voting Classifier
# Combine the tuned random forest with a logistic regression. voting='soft'
# averages the predicted class probabilities of the two estimators.
lr_model = LogisticRegression(solver='lbfgs', random_state=1)
models = [
    ('rf', best_rf_model),
    ('lr', lr_model),
]
voting_clf = VotingClassifier(estimators=models, voting='soft')
# Fit on the SMOTE-balanced training data, then predict the held-out test rows.
voting_preds = voting_clf.fit(X_train_balanced, y_train_balanced).predict(X_test)


In [None]:
# Evaluate the Voting Classifier on the held-out test set.
# classification_report returns a text table, which is printed under a header line.
report_voting = classification_report(y_true=y_test, y_pred=voting_preds)
print("Classification Report for Voting Classifier:", report_voting, sep="\n")