In [30]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt

import pandas as pd

In [31]:
# Read the preprocessed dataset from disk and preview its first five rows.
# NOTE(review): the preview contains an 'Unnamed: 0' column holding 0, 1, 2, ...
# which looks like a row index written out when the CSV was saved -- confirm
# whether it should be treated as an index rather than as a data column.
data = pd.read_csv(filepath_or_buffer='cleaned_data.csv')
data.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,income
0,0.776945,0.976646,0.000276,-0.697833,-1.128018,-1.68604,1.873544,-1.918672,-0.38487,-0.319098,0.0
1,0.833077,0.872461,0.747157,-0.90878,-0.59062,-1.670044,0.687414,-0.130746,0.529908,-1.014595,0.0
2,0.886616,-0.330346,0.643282,-0.683154,-0.108027,0.609371,-0.181776,0.705427,-0.300574,0.129765,0.0
3,0.601948,-1.452807,1.223392,0.495497,0.011877,0.094685,-0.625768,0.015312,-0.33019,-1.923799,0.0
4,-2.563063,-1.66923,0.953954,4.72153,0.67402,-0.418288,0.628209,0.497818,-0.658595,2.861867,0.0


In [32]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Build the feature matrix X and target vector y from the `data` DataFrame
# loaded earlier in the notebook, then hold out 20% of the rows for testing.

# Pull the target out first so a missing 'income' column fails right here.
y = data['income']

# Features = everything except the target. The CSV preview also shows an
# 'Unnamed: 0' column that appears to be a saved row index; it is not a real
# feature and would let the model key on row position, so drop it as well.
# errors='ignore' keeps this cell working if the index column is absent
# (e.g. the CSV was loaded with index_col=0).
X = data.drop(columns=['income']).drop(columns=['Unnamed: 0'], errors='ignore')

# Convert categorical variables into numerical representation (One-Hot Encoding)
X = pd.get_dummies(X)

# Split the data into training and testing sets (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Candidate hyper-parameter values for the random forest.
# 3 x 3 x 3 x 3 = 81 combinations, each evaluated with 5-fold CV below.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Exhaustive search over the grid, scored by accuracy with 5-fold
# cross-validation; n_jobs=-1 runs the fits on all available cores.
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)

# Run the search on the training split only.
grid_search.fit(X_train, y_train)

# Winning parameter combination and its mean cross-validated accuracy.
best_params, best_score = grid_search.best_params_, grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Accuracy:", best_score)

# Evaluate the tuned random forest on the held-out test split.

# The grid search above is created without a `refit` argument, so it uses the
# default refit=True: after the search it has already retrained the winning
# configuration on all of X_train. Reuse that fitted estimator instead of
# training an identical forest a second time.
best_rf_model = grid_search.best_estimator_

# Make predictions on the test data using the final model
best_predictions = best_rf_model.predict(X_test)

# Calculate accuracy using the final model
best_accuracy = accuracy_score(y_test, best_predictions)
print("Accuracy with Optimized Parameters:", best_accuracy)

# Generate classification report (per-class precision / recall / F1)
best_report = classification_report(y_test, best_predictions)
print("Classification Report with Optimized Parameters:")
print(best_report)

# Feature importances of the final model, one row per training column.
# The stored frame keeps the original column order; only the printed view is
# sorted so the most influential features appear first.
best_feature_importance = pd.DataFrame({'Feature': X_train.columns, 'Importance': best_rf_model.feature_importances_})
print("Feature Importance with Optimized Parameters:")
print(best_feature_importance.sort_values('Importance', ascending=False))
