In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder


In [5]:
# Input files (Kaggle dataset mount)
german_data_path = '/kaggle/input/csi-internship-project/german.data'
german_data_numeric_path = '/kaggle/input/csi-internship-project/german.data-numeric'

# Column names taken from the dataset documentation:
# 20 attributes followed by the target column, Credit_risk
columns = [
    'Status_of_existing_checking_account',
    'Duration_in_month',
    'Credit_history',
    'Purpose',
    'Credit_amount',
    'Savings_account/bonds',
    'Present_employment_since',
    'Installment_rate_in_percentage_of_disposable_income',
    'Personal_status_and_sex',
    'Other_debtors/guarantors',
    'Present_residence_since',
    'Property',
    'Age_in_years',
    'Other_installment_plans',
    'Housing',
    'Number_of_existing_credits_at_this_bank',
    'Job',
    'Number_of_people_being_liable_to_provide_maintenance_for',
    'Telephone',
    'Foreign_worker',
    'Credit_risk',
]

# german.data: space-separated, no header row; column names come from `columns`
german_data = pd.read_csv(
    german_data_path,
    names=columns,
    header=None,
    sep=' ',
    skipinitialspace=True,
)

# german.data-numeric: same parsing options, columns keep pandas' default integer labels
german_data_numeric = pd.read_csv(
    german_data_numeric_path,
    header=None,
    sep=' ',
    skipinitialspace=True,
)


In [8]:
# Separate numeric and categorical columns
numeric_columns = german_data.select_dtypes(include=[np.number]).columns
categorical_columns = german_data.select_dtypes(include=['object']).columns

# Impute numeric *features* with their column mean. The target is excluded on
# purpose: Credit_risk is a class label, and a mean-filled label would be a
# fractional value that belongs to no class. A missing label is therefore left
# as-is instead of being silently replaced.
numeric_feature_columns = numeric_columns.drop('Credit_risk', errors='ignore')
numeric_means = german_data[numeric_feature_columns].mean()
german_data[numeric_feature_columns] = german_data[numeric_feature_columns].fillna(numeric_means)

# Impute categorical columns with their most frequent value. DataFrame.mode()
# has no rows when there is nothing to count (no categorical columns, or every
# value missing), so only fill when a first mode row exists; row 0 holds the
# first mode of each column, matching Series.mode()[0].
categorical_modes = german_data[categorical_columns].mode()
if not categorical_modes.empty:
    german_data[categorical_columns] = german_data[categorical_columns].fillna(categorical_modes.iloc[0])



In [9]:
# One LabelEncoder per categorical column, kept by column name so the fitted
# encoders stay available after the columns are replaced by integer codes.
label_encoders = {column: LabelEncoder() for column in categorical_columns}

# Fit each encoder on its own column and overwrite the column with the codes
for column, encoder in label_encoders.items():
    german_data[column] = encoder.fit_transform(german_data[column])


In [10]:
scaler = StandardScaler()
X = scaler.fit_transform(german_data.drop('Credit_risk', axis=1))
y = german_data['Credit_risk'] - 1  # Adjust target values to be 0 and 1


In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [17]:
# Random forest with 100 trees; the fixed random_state makes repeated runs
# build the same forest from the same training data.
model = RandomForestClassifier(random_state=42, n_estimators=100)
model.fit(X=X_train, y=y_train)


In [18]:
# Predict on both splits so training and test performance can be compared
y_pred_train, y_pred_test = (model.predict(split) for split in (X_train, X_test))

# Accuracy on each split
train_accuracy = accuracy_score(y_train, y_pred_train)
test_accuracy = accuracy_score(y_test, y_pred_test)

print("Training Accuracy: ", train_accuracy)
print("Test Accuracy: ", test_accuracy)

# Per-class breakdown, computed on the held-out test split only
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_test))
print("Classification Report:\n", classification_report(y_test, y_pred_test))


Training Accuracy:  1.0
Test Accuracy:  0.81
Confusion Matrix:
 [[132   9]
 [ 29  30]]
Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.94      0.87       141
           1       0.77      0.51      0.61        59

    accuracy                           0.81       200
   macro avg       0.79      0.72      0.74       200
weighted avg       0.80      0.81      0.80       200



In [19]:
# Rank the features by the fitted forest's importance scores, highest first.
# Feature names are every column of german_data except the target.
feature_names = german_data.columns.drop('Credit_risk')
feature_importances = (
    pd.Series(model.feature_importances_, index=feature_names, name='importance')
    .sort_values(ascending=False)
    .to_frame()
)
print(feature_importances)


                                                    importance
Credit_amount                                         0.131240
Age_in_years                                          0.104386
Status_of_existing_checking_account                   0.103118
Duration_in_month                                     0.095828
Purpose                                               0.066581
Credit_history                                        0.061481
Present_employment_since                              0.055956
Savings_account/bonds                                 0.049017
Property                                              0.047091
Present_residence_since                               0.043563
Installment_rate_in_percentage_of_disposable_in...    0.042611
Personal_status_and_sex                               0.033586
Job                                                   0.032321
Housing                                               0.028624
Other_installment_plans                               0