In [None]:
import numpy as np
import pandas as pd 

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)


The original dataset contains 1000 entries with 20 categorical/symbolic attributes prepared by Prof. Hofmann. In this dataset, each entry represents a person who takes a credit from a bank. Each person is classified as a good or bad credit risk according to the set of attributes.
https://archive.ics.uci.edu/ml/datasets/Statlog+%28German+Credit+Data%29

This is a simplified version of the original dataset:

* Age (numeric)
* Sex (text: male, female)
* Job (numeric: 0 - unskilled and non-resident, 1 - unskilled and resident, 2 - skilled, 3 - highly skilled)
* Housing (text: own, rent, or free)
* Saving accounts (text - little, moderate, quite rich, rich)
* Checking account (text - little, moderate, rich)
* Credit amount (numeric, in DM)
* Duration (numeric, in months)
* Purpose (text: car, furniture/equipment, radio/TV, domestic appliances, repairs, education, business, vacation/others)
* Risk (Value target - Good or Bad Risk)

In [None]:
import seaborn as sns #Graph library that use matplot in background
import matplotlib.pyplot as plt #to plot some parameters in seaborn

# Load the dataset; the first CSV column is used as the row index rather than as a feature.
DATA_PATH = "/kaggle/input/german-credit-data-risk/german_credit_data.csv"
df = pd.read_csv(DATA_PATH, index_col=0)

In [None]:
# Preview the first rows of the loaded frame to sanity-check the columns.
df.head()

In [None]:
# Count distinct values per column to see which columns are low-cardinality categoricals.
df.nunique()

In [None]:
# Inspect the raw labels of the target column before it is encoded.
df["Risk"].unique()

Transform the target into numbers:
1 is good and 0 is bad.

In [None]:
# Inspect the raw Housing labels before they are encoded.
df["Housing"].unique()

In [None]:
from sklearn.preprocessing import LabelEncoder

# A single encoder is refitted for each column in turn, so after this cell `LE`
# only remembers the classes of the last column it saw ("Housing").
# LabelEncoder assigns integer codes following the sorted order of the labels.
LE = LabelEncoder()
for column in ("Risk", "Housing"):
    df[column] = LE.fit_transform(df[column])
df.head()

In [None]:
# Check dtypes and non-null counts after encoding Risk and Housing.
df.info()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# List the raw Checking account labels before mapping them to numbers;
# unique() also reports NaN if the column has missing entries.
df["Checking account"].unique()

In [None]:
# Ordinal codes for the checking-account labels. Any value not listed here
# (including missing ones) comes out of .map() as NaN.
checking_account_codes = {'little': 1, 'moderate': 2, 'rich': 3}
df["Checking account"] = df["Checking account"].map(checking_account_codes)

In [None]:
# Anything still missing after the mapping is filled with 0.
# NOTE(review): presumably 0 stands for "no checking account recorded" — confirm this is the intended meaning.
df["Checking account"]  = df["Checking account"].fillna(0)

In [None]:
# Confirm the result of the mapping and fill: the column should now hold only the codes 0, 1, 2 and 3.
df["Checking account"].unique()

In [None]:
# Inspect the raw Purpose labels before plotting and grouping them.
df["Purpose"].unique()

In [None]:

# Count of rows per Purpose, split by Risk. height * aspect gives the facet width,
# so these values yield an 11.7 x 8.27 inch plot.
plot_height = 8.27
plot_width = 11.7
sns.catplot(
    data=df,
    x="Purpose",
    hue="Risk",
    kind="count",
    height=plot_height,
    aspect=plot_width / plot_height,
)

In [None]:
# Collapse the eight purposes into five numeric groups; radio/TV, furniture/equipment,
# domestic appliances and repairs all share code 1. Labels not listed become NaN.
purpose_codes = {
    'car': 0,
    'radio/TV': 1,
    'furniture/equipment': 1,
    'domestic appliances': 1,
    'repairs': 1,
    'education': 2,
    'business': 3,
    'vacation/others': 4,
}
df["Purpose"] = df["Purpose"].map(purpose_codes)

In [None]:
# List the raw Saving accounts labels before mapping them to numbers;
# unique() also reports NaN if the column has missing entries.
df["Saving accounts"].unique()

In [None]:
# Ordinal codes for the savings labels, from smallest to largest balance class.
# Any value not listed here (including missing ones) comes out of .map() as NaN.
saving_account_codes = {
    'little': 1,
    'moderate': 2,
    'quite rich': 3,
    'rich': 4,
}
df["Saving accounts"] = df["Saving accounts"].map(saving_account_codes)

In [None]:
# Anything still missing after the mapping is filled with 0.
# NOTE(review): presumably 0 stands for "no savings account recorded" — confirm this is the intended meaning.
df["Saving accounts"] = df["Saving accounts"].fillna(0)

In [None]:
# Confirm the result of the mapping and fill: the column should now hold only the codes 0 to 4.
df["Saving accounts"].unique()

In [None]:
# Re-check dtypes and non-null counts now that the account and purpose columns have been mapped to numbers.
df.info()

In [None]:
# Encode Sex as a single 0/1 indicator column named "male" (1 = male, 0 = otherwise).
# A direct comparison is used instead of pd.get_dummies(..., drop_first=True) because
# assigning the get_dummies frame to one column only works when Sex has exactly two
# categories, and the dtype it produces depends on the pandas version.
df["male"] = (df["Sex"] == "male").astype(int)

In [None]:
# The "male" indicator replaces the text column, so drop the original and re-check the frame.
df = df.drop(columns=["Sex"])
df.info()

In [None]:
# Count of rows per (encoded) Housing value, split by Risk.
sns.catplot(
    data=df,
    x="Housing",
    hue="Risk",
    kind="count",
);

In [None]:
# Average of the 0/1 Risk column for each Saving accounts code, with one bar per Purpose group.
sns.barplot(
    data=df,
    x="Saving accounts",
    y="Risk",
    hue="Purpose",
)

In [None]:
# Count of rows per value of the "male" indicator, split by Risk.
sns.catplot(
    data=df,
    x="male",
    hue="Risk",
    kind="count",
    height=8.27,
    aspect=11.7/8.27,
)

## 2. Define the features X and the target y

In [None]:
# Feature matrix: every column except the target.
X = df.drop(columns=["Risk"])
X.shape

In [None]:
# Select the target as a 1-D Series (shape (n_samples,)) rather than a one-column
# DataFrame: scikit-learn estimators expect a 1-D y and emit a DataConversionWarning
# when handed a column vector.
y = df["Risk"]
y.shape

In [None]:
from sklearn import preprocessing
scaler = preprocessing.StandardScaler()
X = scaler.fit_transform(X)

## 3. Divide the data into 2 splits: training set and testing set

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=0)

### 4. Create the model

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Number of neighbours that vote on each prediction.
N_NEIGHBORS = 10
knn = KNeighborsClassifier(n_neighbors=N_NEIGHBORS)

### 5. Train the model

In [None]:
# Fit the k-NN classifier on the training split only; the test split is kept for evaluation.
knn.fit(X_train, y_train)


### 6. Evaluate the model

In [None]:
# Mean accuracy on the training split (data the model has already seen, so this is an optimistic figure).
knn.score(X_train, y_train)

In [None]:
# Mean accuracy on the held-out test split; compare it with the training accuracy to gauge overfitting.
knn.score(X_test, y_test)