In [61]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [2]:
# Column names for the German credit data file, listed in file order.
# The last entry, "credit_risk", is the label column.
columns = [
    "status",
    "duration",
    "credit_history",
    "purpose",
    "amount",
    "savings",
    "employment",
    "installment_rate",
    "personal_status",
    "guarantor",
    "residence_duration",
    "property",
    "age",
    "other_installment",
    "housing",
    "existing_credits",
    "job",
    "num_dependents",
    "own_telephone",
    "foreign_worker",
    "credit_risk",
]

In [14]:
# Location of the raw German credit file. The default is the original local
# path; set the GERMAN_CREDIT_PATH environment variable to point the notebook
# at the file on another machine without editing this cell.
data_path = os.environ.get(
    "GERMAN_CREDIT_PATH",
    "/Users/raya/DataspellProjects/fraud_detection/data/raw/german_credit.data",
)
# The file is space-separated and read without a header row, so the column
# names are supplied from `columns`.
df = pd.read_csv(data_path, sep=" ", header=None, names=columns)

In [15]:
# Preview the first rows to check that the supplied column names line up with the data.
df.head()

Unnamed: 0,status,duration,credit_history,purpose,amount,savings,employment,installment_rate,personal_status,guarantor,...,property,age,other_installment,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,credit_risk
0,A11,6,A34,A43,1169,A65,A75,4,A93,A101,...,A121,67,A143,A152,2,A173,1,A192,A201,1
1,A12,48,A32,A43,5951,A61,A73,2,A92,A101,...,A121,22,A143,A152,1,A173,1,A191,A201,2
2,A14,12,A34,A46,2096,A61,A74,2,A93,A101,...,A121,49,A143,A152,1,A172,2,A191,A201,1
3,A11,42,A32,A42,7882,A61,A74,2,A93,A103,...,A122,45,A143,A153,1,A173,2,A191,A201,1
4,A11,24,A33,A40,4870,A61,A73,3,A93,A101,...,A124,53,A143,A153,2,A173,2,A191,A201,2


In [45]:
# Count missing values per column (isna is the same check as isnull).
df.isna().sum()

status                0
duration              0
credit_history        0
purpose               0
amount                0
savings               0
employment            0
installment_rate      0
personal_status       0
guarantor             0
residence_duration    0
property              0
age                   0
other_installment     0
housing               0
existing_credits      0
job                   0
num_dependents        0
own_telephone         0
foreign_worker        0
credit_risk           0
dtype: int64

In [17]:
def replace_value(value):
    """Recode a raw ``credit_risk`` code as a binary flag.

    Returns 0 when ``value`` equals 1, and 1 for every other input.
    """
    return 0 if value == 1 else 1

In [18]:
# Recode the target with replace_value (1 -> 0, anything else -> 1).
# The raw column holds the codes 1 and 2, so the recode is applied only while
# a 2 is still present: re-running this cell on already-recoded 0/1 labels
# would otherwise swap the two classes (0 -> 1, 1 -> 0).
if df['credit_risk'].eq(2).any():
    df['credit_risk'] = df['credit_risk'].apply(replace_value)
df.head()

Unnamed: 0,status,duration,credit_history,purpose,amount,savings,employment,installment_rate,personal_status,guarantor,...,property,age,other_installment,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,credit_risk
0,A11,6,A34,A43,1169,A65,A75,4,A93,A101,...,A121,67,A143,A152,2,A173,1,A192,A201,0
1,A12,48,A32,A43,5951,A61,A73,2,A92,A101,...,A121,22,A143,A152,1,A173,1,A191,A201,1
2,A14,12,A34,A46,2096,A61,A74,2,A93,A101,...,A121,49,A143,A152,1,A172,2,A191,A201,0
3,A11,42,A32,A42,7882,A61,A74,2,A93,A103,...,A122,45,A143,A153,1,A173,2,A191,A201,0
4,A11,24,A33,A40,4870,A61,A73,3,A93,A101,...,A124,53,A143,A153,2,A173,2,A191,A201,1


In [20]:
# Inspect the column dtypes; they drive the categorical/numeric split in the cells below.
df.dtypes

status                object
duration               int64
credit_history        object
purpose               object
amount                 int64
savings               object
employment            object
installment_rate       int64
personal_status       object
guarantor             object
residence_duration     int64
property              object
age                    int64
other_installment     object
housing               object
existing_credits       int64
job                   object
num_dependents         int64
own_telephone         object
foreign_worker        object
credit_risk            int64
dtype: object

In [21]:
# Names of the object-dtype columns, which are treated as the categorical features.
categorical_cols = df.select_dtypes(include=['object']).columns
categorical_cols

Index(['status', 'credit_history', 'purpose', 'savings', 'employment',
       'personal_status', 'guarantor', 'property', 'other_installment',
       'housing', 'job', 'own_telephone', 'foreign_worker'],
      dtype='object')

In [22]:
# Names of the numeric-dtype columns.
# NOTE(review): this selection also contains the `credit_risk` target, so drop
# it before using the result as a list of features to scale.
numerical_cols = df.select_dtypes(include=[np.number]).columns
numerical_cols

Index(['duration', 'amount', 'installment_rate', 'residence_duration', 'age',
       'existing_credits', 'num_dependents', 'credit_risk'],
      dtype='object')

In [57]:
# One-hot encode every categorical column, then cast the whole frame to int
# so the indicator columns are stored as 0/1 integers.
one_hot_df = (
    pd.get_dummies(df, columns=categorical_cols)
    .astype(int)
)

In [58]:
# Preview the encoded frame: the numeric columns come first, followed by the indicator columns.
one_hot_df.head()

Unnamed: 0,duration,amount,installment_rate,residence_duration,age,existing_credits,num_dependents,credit_risk,status_A11,status_A12,...,housing_A152,housing_A153,job_A171,job_A172,job_A173,job_A174,own_telephone_A191,own_telephone_A192,foreign_worker_A201,foreign_worker_A202
0,6,1169,4,4,67,2,1,0,1,0,...,1,0,0,0,1,0,0,1,1,0
1,48,5951,2,2,22,1,1,1,0,1,...,1,0,0,0,1,0,1,0,1,0
2,12,2096,2,3,49,1,2,0,0,0,...,1,0,0,1,0,0,1,0,1,0
3,42,7882,2,4,45,1,2,0,1,0,...,0,1,0,0,1,0,1,0,1,0
4,24,4870,3,4,53,2,2,1,1,0,...,0,1,0,0,1,0,1,0,1,0


In [59]:
# Check the dtypes after encoding; the cast above should leave every column as an integer.
one_hot_df.dtypes

duration               int64
amount                 int64
installment_rate       int64
residence_duration     int64
age                    int64
                       ...  
job_A174               int64
own_telephone_A191     int64
own_telephone_A192     int64
foreign_worker_A201    int64
foreign_worker_A202    int64
Length: 62, dtype: object

In [60]:
# Separate the label column from the remaining predictor columns.
df_target = one_hot_df["credit_risk"]
df_feature = one_hot_df.drop("credit_risk", axis=1)

In [64]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
# NOTE(review): the split is not stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    df_feature,
    df_target,
    test_size=0.2,
    random_state=42,
)

In [79]:
# Standardise the numeric features with statistics learned from the training set only.
scalar = StandardScaler()
to_normalize = ['duration', 'amount', 'installment_rate', 'residence_duration', 'age',
                'existing_credits', 'num_dependents']

# Cast the columns to float on fresh copies first: the scaled values are floats,
# while the encoded frame stores these columns as integers, and astype returns
# new frames so the assignment below does not write into a slice of the
# pre-split data.
float_dtypes = {col: float for col in to_normalize}
X_train = X_train.astype(float_dtypes)
X_test = X_test.astype(float_dtypes)

# Fit on the training rows only, then reuse the same mean/std for the test rows.
# Re-fitting on the test set would scale the two sets inconsistently and leak
# test-set statistics into preprocessing.
X_train[to_normalize] = scalar.fit_transform(X_train[to_normalize])
X_test[to_normalize] = scalar.transform(X_test[to_normalize])
X_train.shape, X_test.shape


((800, 61), (200, 61))

In [80]:
# Display the training features after scaling (pandas truncates the rendered frame).
X_train

Unnamed: 0,duration,amount,installment_rate,residence_duration,age,existing_credits,num_dependents,status_A11,status_A12,status_A13,...,housing_A152,housing_A153,job_A171,job_A172,job_A173,job_A174,own_telephone_A191,own_telephone_A192,foreign_worker_A201,foreign_worker_A202
29,3.297082,1.199912,0.031196,1.044509,2.406187,1.017777,-0.409736,1,0,0,...,1,0,0,0,1,0,0,1,1,0
535,-0.008051,-0.359630,-0.860109,-1.671440,-0.224364,-0.710931,-0.409736,0,0,1,...,0,0,0,0,1,0,1,0,1,0
695,-1.279256,-0.733547,-0.860109,1.044509,1.266282,-0.710931,-0.409736,0,0,0,...,0,0,0,0,1,0,1,0,1,0
557,-0.008051,0.567050,-1.751413,1.044509,-0.575104,1.017777,-0.409736,0,0,0,...,1,0,0,0,1,0,0,1,1,0
836,-0.770774,-0.854388,0.922500,-0.766124,-1.276585,-0.710931,-0.409736,0,0,0,...,1,0,0,0,1,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,-0.262292,1.069404,-0.860109,1.044509,0.301746,1.017777,2.440599,0,0,0,...,1,0,0,0,0,1,0,1,1,0
270,-0.262292,-0.241206,0.922500,0.139192,-0.312049,-0.710931,-0.409736,0,0,0,...,1,0,0,0,1,0,1,0,0,1
860,0.246190,0.843603,0.922500,-0.766124,-0.750474,1.017777,-0.409736,0,0,0,...,1,0,0,0,1,0,1,0,1,0
435,-0.770774,-0.647923,-0.860109,-1.671440,-0.925844,-0.710931,-0.409736,0,1,0,...,1,0,0,0,1,0,0,1,1,0
