In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import pandas as pd

In [11]:
# Read 'balanced_credit.csv'. Positionally, every column except the last is
# used as a feature and the last column is used as the target.
df = pd.read_csv('balanced_credit.csv')
X, y = df.iloc[:, :-1], df.iloc[:, -1]

# Hold out a test partition. No test_size is given, so train_test_split's
# default applies; the fixed seed makes the shuffled split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    shuffle=True,
    random_state=42,
)

In [12]:
# Base estimator; the fixed seed keeps every fitted forest reproducible.
clf = RandomForestClassifier(random_state=42)

# Hyperparameter grid for tuning:
# 5 depths x 3 forest sizes x 4 split thresholds x 2 criteria = 120 candidates.
param_grid = {
    'max_depth': range(1, 6),
    'n_estimators': [50, 100, 200],
    'min_samples_split': range(2, 10, 2),
    'criterion': ['gini', 'entropy'],
}

# Exhaustive search with 5-fold cross-validation (120 candidates -> 600 fits).
# The fits are independent of each other, so n_jobs=-1 spreads them across all
# available cores instead of running them one at a time. Because the estimator
# carries a fixed random_state, the candidate scores are the same as in a
# serial run; only the wall-clock time changes.
grid_search = GridSearchCV(clf, param_grid, cv=5, verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 120 candidates, totalling 600 fits


In [15]:
# Take the winning estimator from the grid search and score it on the
# held-out test data.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Report the selected hyperparameters, their cross-validation score, and the
# accuracy measured on the test set.
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best score: {grid_search.best_score_}")
print(f"Accuracy on test set: {accuracy}")

Best parameters: {'criterion': 'gini', 'max_depth': 5, 'min_samples_split': 2, 'n_estimators': 200}
Best score: 0.7111707205035727
Accuracy on test set: 0.7106690777576854


In [14]:
# Persist the grid search's cross-validation results table to CSV (without
# the DataFrame index) so it can be inspected outside the notebook.
results = pd.DataFrame(data=grid_search.cv_results_)
results.to_csv(path_or_buf='random_forest_results.csv', index=False)