In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import sklearn.model_selection

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the German credit CSV (semicolon-separated, ISO-8859-1 encoded) into `df`,
# then show the first rows as the cell output.
df = pd.read_csv(
    '/kaggle/input/cusersmarildownloadsgermancsv/german.csv',
    sep=";",
    encoding='ISO-8859-1',
)
df.head()

#Code by Liv Toft https://www.kaggle.com/livtoft/the-qlattice-shows-how-3-features-predict-toxicity/notebook

In [None]:
!pip install feyn

In [None]:
import feyn

#The QLattice works with both categorical and numerical data, but needs to be told which entries are categorical (i.e. it assumes they are numerical). 

In [None]:
# Count how many rows fall into each Creditability class.
df["Creditability"].value_counts()

#Splitting the data

Let's split the data into train and test sets, stratifying by Creditability and using about two-thirds (66%) of the entire dataset for training.

Note: the stratified split below fails when a class has too few samples (each Creditability class needs at least two rows).

In [None]:
# Stratified split on the Creditability column: 66% of the rows go to `train`,
# the remainder to `test`. The fixed random_state keeps the split reproducible.
train, test = sklearn.model_selection.train_test_split(
    df,
    train_size=.66,
    stratify=df["Creditability"],
    random_state=1,
)

#The actual QLattice is a Quantum Simulator that runs on Abzu's hardware, but we can allocate one with a single line of code. 

In [None]:
# Allocate a QLattice: connect through feyn and keep the handle in `ql`
# for the reset and model-search cells.

ql = feyn.connect_qlattice()

#Reset

In [None]:
# Reset the QLattice with a fixed seed (random_seed=1).
ql.reset(random_seed=1)

#Search for the best model

In [None]:
#Code by Liv Toft https://www.kaggle.com/livtoft/explainable-model-for-hf-using-the-qlattice

# Re-seed the QLattice, then run the automatic model search on the training split:
# a classification task with "Creditability" as the output column.
ql.reset(random_seed=1)
models = ql.auto_run(
    train,
    output_name="Creditability",
    kind="classification",
    criterion='aic',
    max_complexity=10,
)

#Models

models is a list of graphs sorted by accuracy. Each model shows how the selected features, or inputs, interact to achieve the output. We can access the best graph by calling:

In [None]:
# Display the first entry of the `models` list returned by ql.auto_run.
models[0]

#Performance on train vs. test

Let's see how the model performs on the train versus the test dataset. We are looking for high accuracy and AUC, with similar values across the two datasets (i.e. don't overfit!).

In [None]:
# Plot the first model against both the training and the test split.
models[0].plot(train, test)

#Compare this to three other Machine Learning algorithms: Random Forest, Gradient Boost, and Logistic Regression

In [None]:
#Code by Liv Toft https://www.kaggle.com/livtoft/explainable-model-for-hf-using-the-qlattice

# Build feyn's three reference classifiers from the training split,
# each with "Creditability" as the output column.
rf = feyn.reference.RandomForestClassifier(
    train,
    output_name="Creditability",
)
gb = feyn.reference.GradientBoostingClassifier(
    train,
    output_name="Creditability",
)
lr = feyn.reference.LogisticRegressionClassifier(
    train,
    output_name="Creditability",
    max_iter=10000,
)

#Visualize their relative performances using their respective ROC curves

In [None]:
#Code by Liv Toft https://www.kaggle.com/livtoft/explainable-model-for-hf-using-the-qlattice

# Call plot_roc_curve on the QLattice model and on each reference classifier,
# always with the same `test` data and a distinct label per curve.
models[0].plot_roc_curve(test, label='QLattice')
rf.plot_roc_curve(test, label="Random Forest")
gb.plot_roc_curve(test, label="Gradient Boosting")
lr.plot_roc_curve(test, label="Logistic Regression")

"We can see here that the QLattice outperforms Random Forest and Logistic Regression. The ROC curve for the QLattice and Gradient Boosting seem to resemble one another."

For me they are all very close.

#See the number of false negative predictions the QLattice model makes, using a confusion matrix.

In [None]:
# Confusion matrix of the first QLattice model on the test split.
models[0].plot_confusion_matrix(test)

##See the number of false negative predictions the Random Forest model makes, using a confusion matrix.

In [None]:
# Confusion matrix of the Random Forest reference classifier on the test split.
rf.plot_confusion_matrix(test)

We can see here that the QLattice model has a slightly lower number of false negatives (31 vs. 32). It predicts that 31 credit subjects are fine, compared to the 32 predicted by Random Forest. From a credit perspective, the QLattice model is therefore better, although in this case the difference is very small.

#Understanding the model

See how each feature contributes to the model using interactive_activation_flow.

In [None]:
from feyn.plots.interactive import interactive_activation_flow

In [None]:
# Run feyn's interactive activation-flow plot for the first model on the training split.
interactive_activation_flow(models[0], train)

In [None]:
# Plot the first model's probability scores on the test split.
models[0].plot_probability_scores(test)

In [None]:
#Code by Liv Toft https://www.kaggle.com/livtoft/the-qlattice-shows-how-3-features-predict-toxicity/notebook

# Recreate the stratified 66% train split from `df` (fixed random_state=1).
train, test = sklearn.model_selection.train_test_split(
    df,
    train_size=.66,
    stratify=df["Creditability"],
    random_state=1,
)
# Halve the remaining rows, again stratified by Creditability.
# NOTE: `test` is rebound here to one half; the other half becomes `holdout`.
test, holdout = sklearn.model_selection.train_test_split(
    test,
    test_size=.5,
    stratify=test["Creditability"],
    random_state=1,
)

In [None]:
# Run the first model on the holdout rows and keep the result in `predictions`.
# NOTE(review): for a classification model these may be probabilities rather than
# class labels — confirm before comparing them with holdout["Creditability"].
predictions = models[0].predict(holdout)

In [None]:
# Display the model output computed for the holdout rows.
predictions

In [None]:
# Display the Creditability column of the holdout rows, for comparison with `predictions`.
holdout["Creditability"]