In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV

# Load the dataset (make sure the file path is correct)
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data'
column_names = [
    'checking_status', 'duration', 'credit_history', 'purpose', 'credit_amount', 'savings_status',
    'employment', 'personal_status', 'other_parties', 'residence_since', 'property_magnitude', 'housing',
    'existing_credits', 'job', 'num_dependents', 'own_telephone', 'foreign_worker', 'class'
]

df = pd.read_csv(url, delimiter=' ', header=None, names=column_names)

# Preprocessing the data: Encoding categorical variables
label_encoder = LabelEncoder()

# Applying label encoding to all columns that are categorical
for column in df.columns:
    if df[column].dtype == 'object':  # If column is categorical
        df[column] = label_encoder.fit_transform(df[column])

# Splitting the data into features (X) and target (y)
X = df.drop('class', axis=1)
y = df['class']

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [5]:
# 2. Build the Random Forest Model
rf = RandomForestClassifier(random_state=42)

# Fit the model to the training data
rf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf.predict(X_test)

# Evaluate the model's performance
print("Accuracy Score: ", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy Score:  0.7
Classification Report:
               precision    recall  f1-score   support

           1       0.75      0.87      0.80       141
           2       0.49      0.29      0.36        59

    accuracy                           0.70       200
   macro avg       0.62      0.58      0.58       200
weighted avg       0.67      0.70      0.67       200

Confusion Matrix:
 [[123  18]
 [ 42  17]]


In [7]:
# 3. Hyperparameter Tuning Using GridSearchCV

# Set the parameters for GridSearchCV
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Perform Grid Search to find the best parameters
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

# Fit GridSearchCV
grid_search.fit(X_train, y_train)

# Get the best hyperparameters
print("Best Hyperparameters: ", grid_search.best_params_)

# Use the best estimator to predict on the test set
best_rf = grid_search.best_estimator_
y_pred_tuned = best_rf.predict(X_test)

# Evaluate the tuned model's performance
print("Accuracy Score (Tuned Model): ", accuracy_score(y_test, y_pred_tuned))
print("Classification Report (Tuned Model):\n", classification_report(y_test, y_pred_tuned))
print("Confusion Matrix (Tuned Model):\n", confusion_matrix(y_test, y_pred_tuned))

Fitting 5 folds for each of 216 candidates, totalling 1080 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Best Hyperparameters:  {'bootstrap': True, 'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 200}
Accuracy Score (Tuned Model):  0.725
Classification Report (Tuned Model):
               precision    recall  f1-score   support

           1       0.74      0.93      0.83       141
           2       0.58      0.24      0.34        59

    accuracy                           0.72       200
   macro avg       0.66      0.58      0.58       200
weighted avg       0.70      0.72      0.68       200

Confusion Matrix (Tuned Model):
 [[131  10]
 [ 45  14]]


In [19]:
import pandas as pd

# URL of the dataset
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data'

# Column names based on the UCI dataset description
column_names = [
    'checking_status', 'duration', 'credit_history', 'purpose', 'credit_amount', 'savings_status',
    'employment', 'personal_status', 'other_parties', 'residence_since', 'property_magnitude', 'housing',
    'existing_credits', 'job', 'num_dependents', 'own_telephone', 'foreign_worker', 'class'
]

# Load the dataset from the URL
df = pd.read_csv(url, delimiter=' ', header=None, names=column_names)

# Check for missing values
print(df.isnull().sum())


checking_status       0
duration              0
credit_history        0
purpose               0
credit_amount         0
savings_status        0
employment            0
personal_status       0
other_parties         0
residence_since       0
property_magnitude    0
housing               0
existing_credits      0
job                   0
num_dependents        0
own_telephone         0
foreign_worker        0
class                 0
dtype: int64


In [21]:
from sklearn.preprocessing import LabelEncoder

# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Apply label encoding to each categorical feature
for column in df.columns:
    if df[column].dtype == 'object':  # If column is categorical
        df[column] = label_encoder.fit_transform(df[column])

# Verify the changes
print(df.head())


            checking_status  duration  credit_history  purpose  credit_amount  \
A11 6  A34                4      1169               4        4              4   
A12 48 A32                4      5951               0        2              2   
A14 12 A34                7      2096               0        3              2   
A11 42 A32                3      7882               0        3              2   
    24 A33                0      4870               0        2              3   

            savings_status  employment  personal_status  other_parties  \
A11 6  A34               2           0                4              0   
A12 48 A32               1           0                2              0   
A14 12 A34               2           0                3              0   
A11 42 A32               2           2                4              1   
    24 A33               2           0                4              3   

            residence_since  property_magnitude  housing  existing_c

In [23]:
from sklearn.preprocessing import StandardScaler

# Initialize the scaler
scaler = StandardScaler()

# Apply scaling to the feature columns (excluding target 'class')
X = df.drop('class', axis=1)
X_scaled = scaler.fit_transform(X)

# Now X_scaled is the scaled version of your features


In [25]:
from sklearn.model_selection import train_test_split

# Define features and target
X = df.drop('class', axis=1)
y = df['class']

# Split the data into training and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Verify the shapes of the resulting splits
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)


(800, 17) (200, 17) (800,) (200,)


In [27]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Initialize the Random Forest model
rf = RandomForestClassifier(random_state=42)

# Train the model on the training data
rf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
class_report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Display results
print("Accuracy Score (Baseline Model):", accuracy)
print("\nClassification Report (Baseline Model):\n", class_report)
print("\nConfusion Matrix (Baseline Model):\n", conf_matrix)


Accuracy Score (Baseline Model): 0.735

Classification Report (Baseline Model):
               precision    recall  f1-score   support

           1       0.75      0.92      0.83       140
           2       0.62      0.30      0.40        60

    accuracy                           0.73       200
   macro avg       0.69      0.61      0.62       200
weighted avg       0.71      0.73      0.70       200


Confusion Matrix (Baseline Model):
 [[129  11]
 [ 42  18]]


In [29]:
# Initialize the Random Forest model with class_weight='balanced'
rf_balanced = RandomForestClassifier(random_state=42, class_weight='balanced', n_estimators=100, max_depth=10)

# Train the model on the training data
rf_balanced.fit(X_train, y_train)

# Make predictions on the test set
y_pred_balanced = rf_balanced.predict(X_test)

# Evaluate the model's performance
accuracy_balanced = accuracy_score(y_test, y_pred_balanced)
class_report_balanced = classification_report(y_test, y_pred_balanced)
conf_matrix_balanced = confusion_matrix(y_test, y_pred_balanced)

# Display results
print("Accuracy Score (Balanced Model):", accuracy_balanced)
print("\nClassification Report (Balanced Model):\n", class_report_balanced)
print("\nConfusion Matrix (Balanced Model):\n", conf_matrix_balanced)


Accuracy Score (Balanced Model): 0.73

Classification Report (Balanced Model):
               precision    recall  f1-score   support

           1       0.76      0.89      0.82       140
           2       0.58      0.35      0.44        60

    accuracy                           0.73       200
   macro avg       0.67      0.62      0.63       200
weighted avg       0.71      0.73      0.71       200


Confusion Matrix (Balanced Model):
 [[125  15]
 [ 39  21]]


In [31]:
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Get predicted probabilities for each class (class 1 and class 2)
y_prob = rf_balanced.predict_proba(X_test)

# Let's choose a threshold of 0.4 for class 2 (Bad Credit)
threshold = 0.4

# Adjust the prediction: if the probability of class 2 is greater than the threshold, predict class 2
y_pred_threshold = np.where(y_prob[:, 1] > threshold, 2, 1)

# Evaluate the model's performance with the adjusted threshold
accuracy_threshold = accuracy_score(y_test, y_pred_threshold)
class_report_threshold = classification_report(y_test, y_pred_threshold)
conf_matrix_threshold = confusion_matrix(y_test, y_pred_threshold)

# Display results
print("Accuracy Score (Threshold Adjusted Model):", accuracy_threshold)
print("\nClassification Report (Threshold Adjusted Model):\n", class_report_threshold)
print("\nConfusion Matrix (Threshold Adjusted Model):\n", conf_matrix_threshold)


Accuracy Score (Threshold Adjusted Model): 0.655

Classification Report (Threshold Adjusted Model):
               precision    recall  f1-score   support

           1       0.79      0.69      0.74       140
           2       0.44      0.58      0.50        60

    accuracy                           0.66       200
   macro avg       0.62      0.63      0.62       200
weighted avg       0.69      0.66      0.67       200


Confusion Matrix (Threshold Adjusted Model):
 [[96 44]
 [25 35]]
