In supervised learning, the dataset is labeled with the answer that the algorithm should come up with. Supervised learning takes input variables (x) along with an output variable (y). The output variable represents the column that you want to predict.

The algorithm then uses these variables to learn and approximate the mapping function from the input to the output. Supervised learning algorithms support classification and regression problems.

https://docs.h2o.ai/h2o/latest-stable/h2o-docs/data-science.html

![](https://5b378f06dc82cad32808-dde2f412afa97a75335b0e97fc82422c.ssl.cf2.rackcdn.com/h2o_eb874f02b7923924e633b4db54b4f6fe.png)training.h2o.ai

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import time

# plots
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns

# ML tools
import h2o
from h2o.estimators import H2ORandomForestEstimator
from h2o.estimators import H2OGradientBoostingEstimator

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Upper bound on the number of rows read from the CSV; use None to read the whole file
nRowsRead = 1000
df = pd.read_csv(
    '../input/cusersmarildownloadsgermancsv/german.csv',
    delimiter=';',
    encoding="ISO-8859-2",
    nrows=nRowsRead,
)
# Tag the frame with the name of the file it was loaded from
df.dataframeName = 'german.csv'
nRow, nCol = df.shape
print(f'There are {nRow} rows and {nCol} columns')
# Preview the first rows
df.head()

#All script by Chris X  https://www.kaggle.com/docxian/portugal-wine-quality

In [None]:
# Distribution of the target column: print the counts per value, then draw
# the same counts as a bar chart
creditability_counts = df['Creditability'].value_counts()
print(creditability_counts)

creditability_counts.plot(kind='bar')
plt.title('Target "Creditability"')
plt.grid()
plt.show()

In [None]:
# Columns treated as numerical features; reused below for the statistics,
# the plots and as the model's predictors
features_num = [
    'Credit_Amount',
    'Duration_of_Credit_monthly',
    'Purpose',
    'Account_Balance',
    'Age_years',
    'Payment_Status_of_Previous_Credit',
    'Occupation',
    'Instalment_per_cent',
    'No_of_Credits_at_this_Bank',
    'Foreign_Worker',
    'Guarantors',
]

In [None]:
# basic stats: summary table restricted to the columns listed in features_num
df[features_num].describe(include='all')

In [None]:
# pairwise scatter plot and histograms [this takes a few minutes!!!]
# Drawn with kind='reg'; the fitted line is magenta and the points are faint
# (alpha=0.1). t1/t2 bracket the plotting so the elapsed time can be printed.
t1 = time.time()
sns.pairplot(df[features_num],kind='reg', 
             plot_kws={'line_kws':{'color':'magenta'},
                       'scatter_kws': {'alpha': 0.1}})
plt.show()
t2 = time.time()
print('Elapsed time:', np.round(t2-t1,2))

In [None]:
# Correlation matrices of the selected features (Pearson and Spearman), each
# rendered as an annotated heatmap on a fixed [-1, +1] colour scale
corr_pearson = df[features_num].corr(method='pearson')
corr_spearman = df[features_num].corr(method='spearman')

for corr_matrix, corr_title in (
    (corr_pearson, 'Pearson Correlation'),
    (corr_spearman, 'Spearman Correlation'),
):
    fig = plt.figure(figsize=(10, 8))
    sns.heatmap(corr_matrix, annot=True, cmap='RdYlGn', vmin=-1, vmax=+1)
    plt.title(corr_title)
    plt.show()

In [None]:
# Plot the target against every feature, one figure per feature; the title
# carries the correlation between the feature and the target, rounded to
# four decimals
for f in features_num:
    plt.figure(figsize=(10, 5))
    plt.scatter(df[f], df.Creditability, alpha=0.15)
    corr_target = np.round(df[f].corr(df.Creditability), 4)
    my_title = f'Target vs {f}, corr={corr_target}'
    plt.title(my_title)
    plt.grid()
    # Show each figure as soon as it is complete, exactly as the violin-plot
    # loop further down does, instead of relying on the notebook to render
    # all still-open figures implicitly at the end of the cell
    plt.show()

In [None]:
#Alternative visualization - Plot feature distribution by target level
# One horizontal violin plot per feature: feature on x, 'Creditability' on y

for f in features_num:
    plt.figure(figsize=(10,5))
    sns.violinplot(data=df, y='Creditability', x=f, orient='h')
    plt.title(f)
    plt.grid()
    plt.show()  # show this figure before the next one is started

In [None]:
# define target
target = 'Creditability'

# select predictors (this is the features_num list itself, not a copy)
predictors = features_num
print('Number of predictors: ', len(predictors))
print(predictors)

In [None]:
# start H2O; the cells below that build H2OFrame objects and train the model
# are written to run after this call
h2o.init(max_mem_size='12G', nthreads=4) # Use maximum of 12 GB RAM and 4 cores

In [None]:
# upload data frame in H2O environment
df_hex = h2o.H2OFrame(df)

# train / test split (80/20); the seed is fixed so the split can be repeated
# NOTE(review): H2O's split_frame is presumably a probabilistic split, so the
# resulting sizes may only approximate 80/20 - confirm against the H2O docs
train_hex, test_hex = df_hex.split_frame(ratios=[0.8], seed=999)

In [None]:
# Gradient Boosting model definition; n_cv is the number of cross-validation
# folds and is reused later when plotting the per-fold scoring history
n_cv = 5
gbm_params = {
    'ntrees': 50,
    'max_depth': 6,
    'min_rows': 5,
    'sample_rate': 1,
    'col_sample_rate': 0.5,
    'nfolds': n_cv,
    'seed': 999,
}
fit_1 = H2OGradientBoostingEstimator(**gbm_params)

In [None]:
# train model on the training split only; t1/t2 bracket the call so the
# elapsed fit time can be printed
# NOTE(review): the target column is passed as loaded - presumably numeric, so
# this fits a regression (later cells round the predictions to classes); confirm
t1 = time.time()
fit_1.train(x=predictors,
            y=target,
            training_frame=train_hex)
t2 = time.time()
print('Elapsed time [s]: ', np.round(t2-t1,2))

In [None]:
# show cross validation metrics of the trained model (nfolds was set to n_cv
# when the estimator was defined)
fit_1.cross_validation_metrics_summary()

In [None]:
# show scoring history - training vs cross validations
# One figure per cross-validation model: training and validation RMSE plotted
# against the number of trees.

# Fetch the fold models once instead of on every loop iteration, and iterate
# over what the model actually holds rather than indexing by range(n_cv)
cv_models = fit_1.cross_validation_models()
for i, cv_model_temp in enumerate(cv_models):
    # scoring_history() is the current method name; score_history() is only a
    # deprecated alias for it
    df_cv_score_history = cv_model_temp.scoring_history()
    my_title = f'CV {1 + i} - Scoring History [RMSE]'
    plt.scatter(df_cv_score_history.number_of_trees,
                y=df_cv_score_history.training_rmse,
                c='blue', label='training')
    plt.scatter(df_cv_score_history.number_of_trees,
                y=df_cv_score_history.validation_rmse,
                c='darkorange', label='validation')
    plt.title(my_title)
    plt.xlabel('Number of Trees')
    plt.legend()
    plt.grid()
    plt.show()

#SHAP Summary

SHAP summary plot shows the contribution of the features for each instance (row of data). The sum of the feature contributions and the bias term is equal to the raw prediction of the model, i.e., prediction before applying inverse link function.



SHAP Local Explanation

The SHAP explanation shows the contribution of each feature for a given instance. The sum of the feature contributions and the bias term is equal to the raw prediction of the model, i.e., the prediction before applying the inverse link function. H2O implements TreeSHAP, which, when the features are correlated, can increase the contribution of a feature that had no influence on the prediction.

https://docs.h2o.ai/h2o/latest-stable/h2o-docs/explain.html

In [None]:
# variable importance using shap values => see direction as well as severity of feature impact
# The plot is computed on the training frame; t1/t2 bracket the call so the
# elapsed time can be printed
t1 = time.time()
fit_1.shap_summary_plot(train_hex);
t2 = time.time()
print('Elapsed time [s]: ', np.round(t2-t1,2))

In [None]:
# Score the training frame with the fitted model, then put the actual target
# values and the model's 'predict' column side by side in a pandas data frame
pred_train = fit_1.predict(train_hex)
y_train_act = train_hex.as_data_frame()[target].values  # actuals
y_train_pred = pred_train.as_data_frame()['predict'].values  # predictions
df_train_eval = pd.DataFrame({
    'Actual': y_train_act,
    'PredNum': y_train_pred,
})

In [None]:
# plot predictions vs actual for the training data (faint points, alpha=0.15)
p=sns.jointplot(data=df_train_eval,
                x='Actual', y='PredNum',
                joint_kws={'alpha' : 0.15})
p.fig.suptitle('Prediction vs Actual - Training Data')
# NOTE(review): plt.xlabel/ylabel act on pyplot's current axes - presumably the
# joint axes after jointplot(); confirm the labels land where intended
plt.xlabel('Actual')
plt.ylabel('Prediction')
plt.show()

#Regression => Classification:

In [None]:
# we have to map the continuous values from our regression exercise to the classes now
# NOTE(review): rounding to the nearest integer does not clip, so a prediction
# far enough outside the target's range would yield an unseen class - confirm
y_train_pred_class = np.round(y_train_pred,0).astype(int)
# also add to data frame
df_train_eval['PredClass'] = y_train_pred_class
# preview the first rows, now including the new PredClass column
df_train_eval.head()

In [None]:
# confusion matrix; rows ~ actual observations, cols ~ predictions
# (training data: cross-tabulates Actual against the rounded PredClass)
conf_train = pd.crosstab(df_train_eval.Actual, df_train_eval.PredClass)
# visualize as an annotated heatmap; fmt='d' prints the cell counts as integers
sns.heatmap(conf_train, cmap='Blues', annot=True, 
            cbar=False, fmt='d',
            linecolor='black',
            linewidths=0.1)
plt.show()

#Evaluate on test set

In [None]:
# Score the held-out test frame with the fitted model, then put the actual
# target values and the model's 'predict' column side by side in a data frame
pred_test = fit_1.predict(test_hex)
y_test_act = test_hex.as_data_frame()[target].values  # actual values
y_test_pred = pred_test.as_data_frame()['predict'].values  # predictions
df_test_eval = pd.DataFrame({
    'Actual': y_test_act,
    'PredNum': y_test_pred,
})

In [None]:
# plot predictions vs actuals for the test data (faint points, alpha=0.15)
p=sns.jointplot(data=df_test_eval,
                x='Actual', y='PredNum',
                joint_kws={'alpha' : 0.15})
p.fig.suptitle('Prediction vs Actual - Test Data')
# NOTE(review): plt.xlabel/ylabel act on pyplot's current axes - presumably the
# joint axes after jointplot(); confirm the labels land where intended
plt.xlabel('Actual')
plt.ylabel('Prediction')
plt.show()

#Regression => Classification:

In [None]:
# map the continuous values to classes again (same rounding as for the training data)
# NOTE(review): rounding to the nearest integer does not clip, so a prediction
# far enough outside the target's range would yield an unseen class - confirm
y_test_pred_class = np.round(y_test_pred,0).astype(int)
# also add to data frame
df_test_eval['PredClass'] = y_test_pred_class
# preview the first rows, now including the new PredClass column
df_test_eval.head()

In [None]:
# confusion matrix; rows ~ actual observations, cols ~ predictions
# (test data: cross-tabulates Actual against the rounded PredClass)
conf_test = pd.crosstab(df_test_eval.Actual, df_test_eval.PredClass)
# visualize as an annotated heatmap; fmt='d' prints the cell counts as integers
sns.heatmap(conf_test, cmap='Blues', annot=True, 
            cbar=False, fmt='d',
            linecolor='black',
            linewidths=0.1)
plt.show()

In [None]:
#Code by Olga Belitskaya https://www.kaggle.com/olgabelitskaya/sequential-data/comments
from IPython.display import display,HTML
# Styling presets: two colours, two Google Fonts family names, two font sizes
# (px). dhtml() uses the first of each as its defaults.
c1, c2, f1, f2, fs1, fs2 = '#eb3434', '#eb3446', 'Akronim', 'Smokum', 30, 15


def _dhtml_markup(string, fontcolor, font, fontsize):
    """Return the <style> + <h1> HTML snippet that dhtml() displays."""
    # The Google Fonts URL wants '+' in place of spaces in the family name,
    # while the CSS font-family value keeps the plain name.
    font_url = font.replace(' ', '+')
    return (
        "<style>\n"
        f"    @import 'https://fonts.googleapis.com/css?family={font_url}"
        "&effect=3d-float';</style>\n"
        f"    <h1 class='font-effect-3d-float' style='font-family:{font}; "
        f"color:{fontcolor}; font-size:{fontsize}px;'>{string}</h1>"
    )


def dhtml(string, fontcolor=c1, font=f1, fontsize=fs1):
    """Display *string* as a styled <h1> heading in the notebook output.

    Args:
        string: Text (or HTML fragment) placed inside the heading; it is
            inserted as-is, without escaping.
        fontcolor: CSS colour of the heading.
        font: Google Fonts family name, used both for the @import URL and
            for the CSS font-family.
        fontsize: Font size in pixels.

    The markup is built with f-strings rather than a trailing ``% string``
    so that the text is interpolated the same way whatever its type (the
    old form treated a tuple argument as a tuple of format arguments).
    """
    display(HTML(_dhtml_markup(string, fontcolor, font, fontsize)))
    
    
# Closing credit, rendered with dhtml()'s default colour, font and size
dhtml('Thanks Chris X, @docxian for all the script' )