In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

In [None]:
### Loading dataset
### --------------------

# Read the raw CSV, then discard the 'Sno' column before any analysis.
credit_csv = pd.read_csv("./data/german_credit_dataset.csv")
df = credit_csv.drop(columns='Sno')
df.head()

In [None]:
### Plotting data
### --------------------

# Scatter plot of 'Credit amount' (y axis) against 'Age' (x axis).
df.plot.scatter(x='Age', y='Credit amount')

In [None]:
# One histogram of 'Credit amount' per distinct value of 'Risk'.
df.hist(
    by='Risk',
    column='Credit amount',
)

In [None]:
# 'Credit amount' against 'Duration', coloured by 'Risk'; fit_reg=False turns
# off the regression line so only the points are drawn.
# x and y are passed by keyword: seaborn deprecated positional x/y in 0.11 and
# rejects them with a TypeError from 0.12 on, while the keyword form works on
# both older and newer releases.
sns.lmplot(x='Duration', y='Credit amount', hue='Risk', data=df, fit_reg=False)

In [None]:
# Box plot of 'Credit amount', one box per 'Risk' value.
sns.boxplot(
    data=df,
    x='Risk',
    y='Credit amount',
)
plt.show()

In [None]:
### train ML model
### --------------------

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [None]:
# Features: every column except 'Risk'. Target: the 'Risk' column itself.
X_raw = df.drop(columns='Risk')
y_raw = df['Risk']

In [None]:
### sklearn pipeline
### --------------------

# Split the feature columns by dtype so each group gets its own preprocessing.
categorical_features = X_raw.select_dtypes(include=['object']).columns
numeric_features = X_raw.select_dtypes(include=['int64', 'float']).columns

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and 1.4 removed the old name, so `sparse=False` raises a TypeError on current
# releases. Try the new spelling first and fall back to the old one so this
# cell runs on both old and new versions with the same dense output.
try:
    onehot_encoder = OneHotEncoder(categories='auto', sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(categories='auto', sparse=False)

# Categorical columns: fill missing entries with the literal string "missing",
# then one-hot encode.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value="missing")),
    ('onehotencoder', onehot_encoder)])

# Numeric columns: standard scaling only.
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

# Apply each transformer to its column group; columns in neither group are
# dropped (remainder="drop").
feature_engineering_pipeline = ColumnTransformer(
    transformers=[
        ('numeric', numeric_transformer, numeric_features),
        ('categorical', categorical_transformer, categorical_features)
    ], remainder="drop")

# Encode Labels
le = LabelEncoder()
encoded_y = le.fit_transform(y_raw)

# Create sklearn pipeline: preprocessing followed by logistic regression.
lr_clf = Pipeline(steps=[('preprocessor', feature_engineering_pipeline),
                         ('classifier', LogisticRegression(solver="lbfgs"))])

In [None]:

# Train test split
# Hold out 20% of the rows for testing, stratified on the encoded labels;
# random_state is fixed so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X_raw,
    encoded_y,
    test_size=0.20,
    random_state=42,
    stratify=encoded_y,
)

In [None]:
### fit model
### --------------------

# Fit the whole pipeline on the training split only.
lr_clf.fit(X=X_train, y=y_train)

In [None]:
### evaluate model
### --------------------

# Score the fitted pipeline on the held-out rows.
test_accuracy = lr_clf.score(X_test, y_test)
test_predictions = lr_clf.predict(X_test)
# pos_label=0 assumes the LabelEncoder mapped the bad-risk label to 0
# -- TODO confirm against le.classes_.
bad_recall = recall_score(y_test, test_predictions, pos_label=0)

print("Accuracy on test data set: ", test_accuracy)
print("Recall for class 'Bad':   ", bad_recall)

In [None]:
### persisting model
### --------------------

# Serialize the fitted pipeline (preprocessing steps and classifier together)
# to 'model.pkl', a path relative to the current working directory.
joblib.dump(lr_clf, 'model.pkl')

In [None]:
### making 5 predictions
### --------------------

# Take the first five rows of the loaded frame as stand-in "new" data.
# They come from df rather than X_raw, so they still carry the 'Risk' column.
new_data = df.iloc[:5]
new_data.head()

In [None]:
# Predicted encoded labels for the sample rows, as produced by the fitted pipeline.
lr_clf.predict(X=new_data)

## Disclaimer

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.