In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [2]:
# Show the installed scikit-learn version so the environment of this run is recorded.
import sklearn
sklearn.__version__

'1.2.2'

# Database connection

In [None]:
def get_data_from_redshift(connection_parameters: dict, query: str):
    """Run ``query`` on Redshift and return the result as a DataFrame.

    Args:
        connection_parameters: Mapping with the keys ``host``, ``database``,
            ``user``, ``password`` and ``port`` (``port`` is cast with ``int``).
        query: SQL text handed unchanged to ``pandas.read_sql_query``.

    Returns:
        The query result as a ``pandas.DataFrame``, or ``None`` when the
        connect/query step raised. The error is logged as a warning and is
        not re-raised.
    """
    import logging

    import redshift_connector
    import pandas as pd

    # Resolve the logger locally so the function also works on a fresh kernel
    # where no global `logger` has been created.
    logger = logging.getLogger(__name__)
    df = None

    # Never write the password to the log; every other parameter is logged as-is.
    loggable_parameters = {
        key: ('***' if key == 'password' else value)
        for key, value in connection_parameters.items()
    }

    logger.info(f'Running {query} on RedShift!')
    logger.info(f'Redshift connection_parameters: {loggable_parameters}')
    try:
        with redshift_connector.connect(
                host=connection_parameters['host'],
                database=connection_parameters['database'],
                user=connection_parameters['user'],
                password=connection_parameters['password'],
                port=int(connection_parameters['port']),
                ssl=False
        ) as conn:
            # read_sql_query is given the connection itself, so no explicit
            # cursor has to be opened here.
            df = pd.read_sql_query(query, conn)
    except Exception as e:
        # Best effort: report the failure and fall through to `return None`.
        logger.warning(f'Can not connect to Redshift: {e}')

    return df

# Data

In [3]:
from sklearn import datasets
from sklearn.model_selection import train_test_split

def _frame_with_target(bunch, feature_names):
    """Return a DataFrame with one column per feature plus a trailing 'target' column."""
    return pd.DataFrame(
        data=np.column_stack((bunch['data'], bunch['target'])),
        columns=feature_names + ['target'],
    )


# Multiclass example data
dataset = datasets.load_wine()
df_mulitclass = _frame_with_target(dataset, dataset['feature_names'])

# Regression example data
dataset = datasets.fetch_california_housing()
df_regression = _frame_with_target(dataset, dataset['feature_names'])

# Binary example data; the feature names are converted with .tolist() so they
# can be concatenated with the ['target'] list.
dataset = datasets.load_breast_cancer()
df_binary_class = _frame_with_target(dataset, dataset['feature_names'].tolist())

# The frame that the rest of the notebook works on.
df = df_regression

In [4]:
from sklearn.model_selection import train_test_split
# Every column except the label is a feature.
feature_columns = list(df.drop(['target'], axis=1).columns)
X = df[feature_columns]
y = df['target']
# Hold out 30% of the rows for testing. The fixed random_state makes the split,
# and therefore every score computed from it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42
)

In [7]:
# Sanity check: shapes of the four pieces produced by the split.
print(*[part.shape for part in (X_train, X_test, y_train, y_test)])

(14448, 8) (6192, 8) (14448,) (6192,)


# EDA

## Profiling 

In [None]:
from pandas_profiling import ProfileReport
# Build a profiling report for the working frame `df` ...
profile = ProfileReport(df, title="Pandas Profiling Report")
# ... and render it inline in the notebook.
profile.to_notebook_iframe()

## Correlation

In [None]:
# Pearson correlation
#  continuous vs. continuous
# NOTE(review): 'continous' / 'continous2' look like placeholder column names --
# replace them with real columns of `df` before running this cell.
from scipy import stats
stats.pearsonr(df['continous'], df['continous2'])

In [None]:
# Point-biserial correlation
#  continuous vs. binary
# NOTE(review): 'continous' / 'binary' look like placeholder column names --
# replace them with real columns of `df` before running this cell.
from scipy import stats
stats.pointbiserialr(df['continous'], df['binary'])

In [None]:
# Phi coefficient
#  binary vs. binary
# For two binary variables the Matthews correlation coefficient equals the phi coefficient.
# NOTE(review): 'binary' / 'binary2' look like placeholder column names --
# replace them with real columns of `df` before running this cell.
from sklearn.metrics import matthews_corrcoef
matthews_corrcoef(df['binary'], df['binary2'])

# Feature engineering

In [None]:
# Cyclical encoding: map the hour onto a circle with period 24 and store its
# sine and cosine as two new columns of `df`.
_hour_angle = 2*np.pi/24*df['hour']
df['hour_sin'] = np.sin(_hour_angle)
df['hour_cos'] = np.cos(_hour_angle)

# Hyperparameter optimization

In [11]:
import hyperopt
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras import Input
from keras.optimizers import Adadelta, Adam
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
import sys
from sklearn.model_selection import KFold
import tensorflow as tf
from sklearn.preprocessing import RobustScaler, MinMaxScaler
from sklearn.metrics import f1_score

class Hyperparam_opt():
    """K-fold cross-validated hyperparameter search driven by hyperopt's TPE.

    Construct it with a model-building function, a hyperopt search space and a
    scoring function, then call ``train``. Afterwards ``best_args`` holds the
    winning hyperparameters, ``model`` the model refitted with them on the
    full training data, ``trials`` the hyperopt ``Trials`` object, and
    ``models`` one record (space, per-fold scores, loss) per evaluated sample.
    """

    def __init__(self, buildmodel, space, score, loss=False):
        """
        Args:
          * buildmodel -- f(params, training_X, training_y), a function that trains
                          and returns a single model
          * space -- the search space for the hyperparameter optimization
          * score -- f(y_true, y_pred), how a model is scored; must agree with 'loss':
                     if a higher score means a better model, set 'loss' to False
          * loss -- True if the score is a loss (smaller is better), False otherwise,
                    e.g. f1 is not a loss, as 1 is the best and 0 is the worst value
        """
        self.buildmodel = buildmodel
        self.space = space
        self.score = score
        self.loss = loss
        self.models = []

    def hyperparameter_tuning(self, space):
        """Objective for hyperopt: cross-validate one hyperparameter sample.

        Trains one model per fold on ``self.X_train`` / ``self.y_train`` (both
        are indexed with integer arrays, so they must support that), scores
        each fold with ``self.score`` and appends a record to ``self.models``.

        Args:
          * space -- one concrete sample drawn from the search space

        Returns:
          dict with the keys 'loss' (the value hyperopt minimizes) and 'status'.
        """
        # Define the K-fold cross validator
        kfold = KFold(n_splits=self.n_splits)
        scores = []
        for train_idx, test_idx in kfold.split(self.X_train, self.y_train):
            model = self.buildmodel(space, self.X_train[train_idx], self.y_train[train_idx])
            fold_X = self.X_train[test_idx]
            try:
                # Some models (e.g. Keras) accept `verbose`; use it to keep the output quiet.
                fold_pred = model.predict(fold_X, verbose=0)
            except TypeError:
                # predict() does not take `verbose`: call it without.
                # Only TypeError is handled so that interrupts and real
                # prediction errors are no longer swallowed.
                fold_pred = model.predict(fold_X)
            scores.append(self.score(self.y_train[test_idx], fold_pred))

        mean_score = np.mean(scores)
        # hyperopt minimizes: a "higher is better" score is flipped into a loss.
        loss = mean_score if self.loss else 1 - mean_score
        self.models.append({'space': space, 'scores': scores, 'loss': loss})
        return {'loss': loss, 'status': STATUS_OK}

    def train(self, X_train, y_train, X_test, y_test, n_splits=5, max_eval=100):
        """Search the space with TPE and refit the best model on all training data.

        Args:
          * X_train, y_train -- training data; indexed with integer arrays during
                                cross validation (NumPy arrays work)
          * X_test, y_test -- stored on the instance; not used by the search itself
          * n_splits -- number of folds for the K-fold cross validation
          * max_eval -- number of hyperparameter samples hyperopt evaluates
        """
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test
        # split for K fold CV
        self.n_splits = n_splits

        trials = Trials()
        best = fmin(
            fn=self.hyperparameter_tuning,
            space=self.space,
            algo=tpe.suggest,
            max_evals=max_eval,
            trials=trials)
        # Keep the search history available for later inspection.
        self.trials = trials
        # fmin returns raw values keyed by hyperopt label (e.g. an index for
        # hp.choice); space_eval turns them into the same structure that
        # hyperparameter_tuning hands to buildmodel.
        self.best_args = hyperopt.space_eval(self.space, best)
        # the best model, refitted on the whole training set
        self.model = self.buildmodel(self.best_args, X_train, y_train)
        

## LightGBM binary classification

In [None]:
from sklearn.metrics import f1_score
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
import lightgbm as ltb

# Hyperparameter search space for LightGBM.
# hp.quniform(label, low, high, q) samples uniformly and rounds to a multiple of q.
space_boost = {
    # between 4 and 160 in steps of 2
    'num_leaves': hp.quniform("num_leaves", 4, 160, 2),
    # between 100 and 3000 in steps of 100
    'min_child_samples': hp.quniform("min_child_samples", 100, 3000, 100),
}

# Fit one LightGBM classifier for a single hyperparameter sample
def build_boost_model(params, traning_X, traning_y):
    """Train and return an ``LGBMClassifier`` configured from ``params``.

    ``params['num_leaves']`` and ``params['min_child_samples']`` are cast to
    ``int`` before being passed on; ``is_unbalance`` is always switched on.
    """
    settings = {
        'num_leaves': int(params['num_leaves']),
        'min_child_samples': int(params['min_child_samples']),
        'is_unbalance': True,
    }
    classifier = ltb.LGBMClassifier(**settings)
    # Fit on the data that was handed in and give the fitted estimator back.
    classifier.fit(traning_X, traning_y)
    return classifier

# Hyperparameter optimization
boost_model = Hyperparam_opt(
    build_boost_model,
    space_boost, 
    f1_score, 
    loss=False
)
boost_model.train(X_train.values, y_train.values, X_test.values, y_test.values, max_eval=1)

# Predict with the optimized model. The model was fitted on NumPy arrays
# (`.values`), so it is given a NumPy array here too, like the other LightGBM cells do.
predicted_y = boost_model.model.predict(X_test.values)
score = f1_score(y_test, predicted_y)
print(score)

## LightGBM multiclass

In [None]:
from sklearn.metrics import f1_score, roc_auc_score
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
import lightgbm as ltb

# Hyperparameter search space for LightGBM.
# hp.quniform(label, low, high, q) samples uniformly and rounds to a multiple of q.
space_boost = {
    # between 4 and 160 in steps of 2
    'num_leaves': hp.quniform("num_leaves", 4, 160, 2),
    # between 100 and 3000 in steps of 100
    'min_child_samples': hp.quniform("min_child_samples", 100, 3000, 100),
}

# ROC AUC is the better choice when the classes are balanced
def roc_multiclass(y_test, y_pred):
    """Weighted one-vs-rest ROC AUC; both arguments go to ``roc_auc_score`` unchanged."""
    auc_options = {'average': "weighted", 'multi_class': "ovr"}
    return roc_auc_score(y_test, y_pred, **auc_options)

# Fit one LightGBM classifier for a single hyperparameter sample
def build_boost_model(params, traning_X, traning_y):
    """Train and return a multiclass ``LGBMClassifier`` whose ``predict`` yields probabilities.

    Args:
        params: Mapping with 'num_leaves' and 'min_child_samples'; both are
            cast to ``int`` before being passed to LightGBM.
        traning_X: Training features, passed to ``fit`` unchanged.
        traning_y: Training labels, passed to ``fit`` unchanged.

    Returns:
        The fitted classifier. Its ``predict`` attribute is rebound to
        ``predict_proba`` on this instance only.
    """
    model = ltb.LGBMClassifier(
            num_leaves=int(params['num_leaves']),
            min_child_samples=int(params['min_child_samples']),
            # `is_unbalance` only applies to binary / multiclassova objectives;
            # `class_weight` is LightGBM's option for multiclass tasks.
            class_weight='balanced'
    )
    # Overwrite predict with the probability prediction because ROC AUC is
    # used for evaluation and the tuner calls model.predict().
    model.predict = model.predict_proba
    # train model
    model.fit(traning_X, traning_y)
    return model

# Hyperparameter optimization
boost_model = Hyperparam_opt(
    build_boost_model,
    space_boost, 
    roc_multiclass, 
    loss=False
)
boost_model.train(X_train.values, y_train.values, X_test.values, y_test.values, max_eval=10)

# Predict with the optimized model. predict() was rebound to predict_proba in
# build_boost_model, so this yields one probability column per class.
predicted_proba = boost_model.model.predict(X_test.values)
score = roc_multiclass(y_test, predicted_proba)
print(score)
# Turn probabilities into class labels. argmax gives the column index, which
# is mapped through classes_ so the result is also correct when the labels
# are not exactly 0..k-1.
predicted_y = boost_model.model.classes_[predicted_proba.argmax(axis=-1)]


In [None]:
from sklearn.metrics import f1_score, roc_auc_score
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
import lightgbm as ltb

# Hyperparameter search space for LightGBM.
# hp.quniform(label, low, high, q) samples uniformly and rounds to a multiple of q.
space_boost = {
    # between 4 and 160 in steps of 2
    'num_leaves': hp.quniform("num_leaves", 4, 160, 2),
    # between 100 and 3000 in steps of 100
    'min_child_samples': hp.quniform("min_child_samples", 100, 3000, 100),
}

# Custom multiclass F1; a good choice when the classes are imbalanced
def f1_multiclass(y_true, y_pred):
    """F1 score with ``average='weighted'``; both arguments go to ``f1_score`` unchanged."""
    weighted_f1 = f1_score(y_true, y_pred, average='weighted')
    return weighted_f1

# Fit one LightGBM classifier for a single hyperparameter sample
def build_boost_model(params, traning_X, traning_y):
    """Train and return a multiclass ``LGBMClassifier`` configured from ``params``.

    Args:
        params: Mapping with 'num_leaves' and 'min_child_samples'; both are
            cast to ``int`` before being passed to LightGBM.
        traning_X: Training features, passed to ``fit`` unchanged.
        traning_y: Training labels, passed to ``fit`` unchanged.

    Returns:
        The fitted classifier.
    """
    model = ltb.LGBMClassifier(
            num_leaves=int(params['num_leaves']),
            min_child_samples=int(params['min_child_samples']),
            # `is_unbalance` only applies to binary / multiclassova objectives;
            # `class_weight` is LightGBM's option for multiclass tasks.
            class_weight='balanced'
    )
    # train model
    model.fit(traning_X, traning_y)
    return model

# Hyperparameter optimization: tune on the training split with weighted F1
boost_model = Hyperparam_opt(
    buildmodel=build_boost_model,
    space=space_boost,
    score=f1_multiclass,
    loss=False,
)
boost_model.train(
    X_train=X_train.values,
    y_train=y_train.values,
    X_test=X_test.values,
    y_test=y_test.values,
    max_eval=100,
)

# Score the optimized model on the held-out split
predicted_y = boost_model.model.predict(X_test.values)
score = f1_multiclass(y_test, predicted_y)
print(score)

## LightGBM regression

In [12]:
from sklearn.metrics import mean_absolute_percentage_error
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
import lightgbm as ltb

# Hyperparameter search space for LightGBM.
# hp.quniform(label, low, high, q) samples uniformly and rounds to a multiple of q.
space_boost = {
    # between 4 and 160 in steps of 2
    'num_leaves': hp.quniform("num_leaves", 4, 160, 2),
    # between 100 and 3000 in steps of 100
    'min_child_samples': hp.quniform("min_child_samples", 100, 3000, 100),
}

# Fit one LightGBM regressor for a single hyperparameter sample
def build_boost_model(params, traning_X, traning_y):
    """Train and return an ``LGBMRegressor`` configured from ``params``.

    ``params['num_leaves']`` and ``params['min_child_samples']`` are cast to
    ``int`` before being passed on.
    """
    settings = {
        'num_leaves': int(params['num_leaves']),
        'min_child_samples': int(params['min_child_samples']),
    }
    regressor = ltb.LGBMRegressor(**settings)
    # Fit on the data that was handed in and give the fitted estimator back.
    regressor.fit(traning_X, traning_y)
    return regressor

# Hyperparameter optimization: MAPE is a loss, so smaller is better (loss=True)
boost_model = Hyperparam_opt(
    buildmodel=build_boost_model,
    space=space_boost,
    score=mean_absolute_percentage_error,
    loss=True,
)
boost_model.train(
    X_train=X_train.values,
    y_train=y_train.values,
    X_test=X_test.values,
    y_test=y_test.values,
    max_eval=1,
)

# Score the optimized model on the held-out split
predicted_y = boost_model.model.predict(X_test.values)
score = mean_absolute_percentage_error(y_test, predicted_y)
print(score)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.81trial/s, best loss: 0.27576107166512165]
0.24998006441316417


# Clustering

# Model evaluation 

In [None]:
# confusion_matrix
from sklearn.metrics import confusion_matrix,  ConfusionMatrixDisplay
%matplotlib inline

# Two views of the same predictions: each row normalized over the true label
# (normalize='true'), and the raw counts.
cm = confusion_matrix(y_test, predicted_y, normalize='true')
cm_no_norm = confusion_matrix(y_test, predicted_y)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 8))
ConfusionMatrixDisplay(cm).plot(ax=ax1)
ConfusionMatrixDisplay(cm_no_norm).plot(ax=ax2)
# Title the panels so the figure can be read without looking at the code.
ax1.set_title("Normalized by true label")
ax2.set_title("Raw counts")
plt.show()

In [None]:
# SHAP values for the tuned tree model
import shap
# TreeExplainer is built from the model selected by the hyperparameter search.
explainer = shap.TreeExplainer(boost_model.model)
# Explain the held-out rows ...
shap_values = explainer.shap_values(X_test)
# ... and plot a summary of the values against the same rows.
shap.summary_plot(shap_values, X_test)

In [None]:
# precission recall
from sklearn.metrics import PrecisionRecallDisplay
import matplotlib.pyplot as plt


# from model
pr_boost = PrecisionRecallDisplay.from_estimator(
    boost_model.model, X_test, y_test, name="Boosting"
)
# from prediction
pr_nn = PrecisionRecallDisplay.from_predictions(
    y_test, pre_nn, name="NN"
)

%matplotlib inline
fig, ax1 = plt.subplots(1, 1, figsize=(12, 8))
_ = pr_boost.plot(ax=ax1);
_ = pr_nn.plot(ax=ax1);
plt.legend()
plt.show()