# Funzioni

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import metrics
from sklearn.model_selection import cross_val_score, train_test_split

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

def fit_predict_print(train, test, target_columns, estimator):
  """Separate features from targets in both frames, then fit and report.

  `target_columns` is dropped from `train` and `test` to obtain the feature
  matrices and selected from them to obtain the targets. The four pieces
  are handed to `fit_predict_print_1` together with `estimator`, and
  whatever that call returns is passed straight back to the caller.
  """
  features_train, labels_train = (train.drop(columns=target_columns),
                                  train[target_columns])
  features_test, labels_test = (test.drop(columns=target_columns),
                                test[target_columns])

  return fit_predict_print_1(features_train, features_test,
                             labels_train, labels_test, estimator)


def fit_predict_print_1(X_train, X_test, y_train, y_test, estimator):
  """Fit `estimator`, predict on the test features and report the scores.

  Args:
    X_train: training features passed to `estimator.fit`.
    X_test: test features passed to `predict`.
    y_train: training targets passed to `estimator.fit`.
    y_test: true test targets the predictions are scored against.
    estimator: object exposing `fit`, `predict` and, once fitted, a
      `classes_` attribute (used as confusion-matrix display labels).

  Returns:
    The predictions produced for `X_test`.
  """
  y_pred = estimator.fit(X_train, y_train).predict(X_test)
  # Score the predictions against the held-out labels.
  print_accuracy(y_test, y_pred, display_labels=estimator.classes_)
  return y_pred

def print_accuracy(y_test, y_pred, display_labels=None):
  """Print accuracy and the raw confusion matrix, then plot a normalized one.

  The plotted matrix is computed with `normalize='true'`; `display_labels`
  is forwarded unchanged to `ConfusionMatrixDisplay`.
  """
  accuracy = metrics.accuracy_score(y_test, y_pred)
  print('Accuracy:', accuracy)

  raw_counts = metrics.confusion_matrix(y_test, y_pred)
  print('Confusion matrix:\n', raw_counts)

  normalized = metrics.confusion_matrix(y_test, y_pred, normalize='true')
  display = ConfusionMatrixDisplay(normalized, display_labels=display_labels)
  display.plot()

In [None]:
def balanced_subsample(X, y, subsample_size=1.0, random_state=None):
    """Return a class-balanced random subsample of ``X`` and ``y``.

    Every class contributes the same number of rows: the size of the rarest
    class, scaled down by ``subsample_size`` when that is below 1.

    Args:
        X: pandas DataFrame of features; it is filtered with a boolean mask
            built from ``y``, so both must share the same index.
        y: pandas Series of class labels.
        subsample_size: fraction of the rarest class's size to keep per
            class. Values >= 1 keep the full rarest-class size.
        random_state: optional seed forwarded to ``DataFrame.sample`` so the
            draw can be reproduced.

    Returns:
        ``(Xb, yb)``: the sampled rows and their labels. ``yb`` carries the
        same index as ``Xb`` and keeps the dtype and name of ``y``.
    """
    counts = y.value_counts()
    if counts.empty:
        # No (non-missing) labels at all: nothing to balance.
        return X.iloc[:0], y.iloc[:0]

    min_elems = int(counts.min())
    use_elems = min_elems
    if subsample_size < 1:
        use_elems = int(min_elems * subsample_size)

    X_parts = []
    y_parts = []
    # Missing labels never compare equal to anything, so they cannot select
    # rows; skip them instead of emitting labels without features.
    for c in y.dropna().unique():
        X_c = X[y == c]
        if len(X_c) > use_elems:
            X_c = X_c.sample(use_elems, random_state=random_state)

        X_parts.append(X_c)
        # Build the labels on the sampled rows' index so features and
        # targets stay aligned, and keep the label dtype (works for
        # strings as well as numbers).
        y_parts.append(pd.Series(c, index=X_c.index, dtype=y.dtype,
                                 name=y.name))

    return pd.concat(X_parts), pd.concat(y_parts)

In [None]:
from sklearn.preprocessing import OneHotEncoder

def one_hot_encode(dataset, columns, drop=None):
  """One-hot encode the given columns of `dataset`.

  Args:
    dataset: DataFrame holding the columns to encode.
    columns: a column name or a list of column names.
    drop: optional category to leave out of the encoding; either a single
      string or a list with one entry per column, in the same order as
      `columns`. Forwarded to `OneHotEncoder(drop=...)`.

  Returns:
    A DataFrame with one indicator column per retained category, named
    after the category value and indexed like `dataset`.

  Raises:
    ValueError: if a `drop` entry is not among the values of its column.
  """
  if isinstance(columns, str):
    columns = [columns]
  if isinstance(drop, str):
    drop = [drop]

  categories = []
  ret_columns = []
  for i, column in enumerate(columns):
    ctg = dataset[column].unique().tolist()
    categories.append(ctg)

    # Output column names are the categories minus the dropped one.
    kept = list(ctg)
    if drop is not None:
      kept.remove(drop[i])
    ret_columns.extend(kept)

  encoder = OneHotEncoder(categories=categories, drop=drop).fit(dataset[columns])
  encoded = encoder.transform(dataset[columns]).toarray()

  # Reuse the input's index so the result lines up row-for-row with
  # `dataset`, e.g. when the two are concatenated along axis=1.
  return pd.DataFrame(encoded, columns=ret_columns, index=dataset.index)


def add_dummies(dataset, columns, drop_cat=None, drop_col=True):
  """Return `dataset` with one-hot indicator columns for `columns` appended.

  The indicators come from `one_hot_encode(dataset, columns, drop_cat)`.
  When `drop_col` is true the encoded source columns are removed from the
  result; otherwise they are kept next to the indicators.
  """
  indicator_frame = one_hot_encode(dataset, columns, drop_cat)
  base = dataset.drop(columns=columns) if drop_col else dataset
  return pd.concat([base, indicator_frame], axis=1)

In [None]:
def xtab(rows_data, cols_data, normalize_rows=False):
  """Cross-tabulate two label sequences, draw a heatmap and print the table.

  Args:
    rows_data: values that become the table's rows.
    cols_data: values that become the table's columns.
    normalize_rows: when true, each row is divided by the number of entries
      of `rows_data` equal to that row's label, and the heatmap color scale
      is pinned to [0, 1].

  Returns:
    The (optionally normalized) cross-tabulation as a DataFrame.
  """
  table = pd.crosstab(rows_data, cols_data)
  if normalize_rows:
    # Divide all rows in one operation: this yields a float table directly
    # instead of writing float rows into integer columns one at a time,
    # which depends on pandas' deprecated upcast-on-assignment behavior.
    row_totals = pd.Series(
        [(rows_data == label).sum() for label in table.index],
        index=table.index)
    table = table.div(row_totals, axis=0)
    _ = sns.heatmap(table, vmin=0, vmax=1)
  else:
    _ = sns.heatmap(table)
  print(table)
  return table

In [None]:
def voting_classifier(estimators, X_train_list, y_train_list, X_test):
    """Hard-voting ensemble over estimators trained on separate datasets.

    Each estimator is fitted on its own ``(X, y)`` pair (taken in lockstep
    from ``X_train_list`` and ``y_train_list``) and asked to predict
    ``X_test``; the most frequent prediction per test sample wins.

    Args:
        estimators: objects whose ``fit(X, y)`` returns something exposing
            ``predict``.
        X_train_list: one training feature set per estimator.
        y_train_list: one training target set per estimator.
        X_test: samples to classify, passed to every estimator.

    Returns:
        1-D numpy array with the majority label for each test sample. Ties
        are resolved in favor of the smallest label. Labels may be any
        sortable values (ints, negative ints, floats, strings).
    """
    preds = np.asarray([clf.fit(X, y).predict(X_test) for clf, X, y in
                        zip(estimators, X_train_list, y_train_list)])

    def _majority(votes):
        # np.unique returns sorted labels, so argmax picks the smallest
        # label among equally frequent ones.
        labels, counts = np.unique(votes, return_counts=True)
        return labels[counts.argmax()]

    # preds is (n_estimators, n_samples); iterate over samples.
    return np.array([_majority(sample_votes) for sample_votes in preds.T],
                    dtype=preds.dtype)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier

def models():
    """Return a new list of (short name, classifier) pairs.

    Every classifier is created with its default settings, except the SVM,
    which is built with ``probability=True``.
    """
    return [
        ('LR', LogisticRegression()),
        ('LDA', LinearDiscriminantAnalysis()),
        ('KNN', KNeighborsClassifier()),
        ('CART', DecisionTreeClassifier()),
        ('NB', GaussianNB()),
        ('SVM', SVC(probability=True)),
        ('AB', AdaBoostClassifier()),
        ('GBM', GradientBoostingClassifier()),
        ('RF', RandomForestClassifier()),
        ('ET', ExtraTreesClassifier()),
    ]


from sklearn.model_selection import StratifiedKFold

def cross_val_models(X, y, models=models(), num_folds=10, random_state=None,
                     scoring='accuracy'):
    """Cross-validate every (name, estimator) pair and print its mean and std.

    A shuffled `StratifiedKFold` with `num_folds` splits and the given
    `random_state` is used for all estimators; each one is scored through
    `cross_val_score` with `scoring` and `n_jobs=-1`.

    Note that the default for `models` is evaluated once, when this function
    is defined.

    Returns:
        (names, results): the estimator names and, in the same order, the
        per-fold score arrays.
    """
    splitter = StratifiedKFold(n_splits=num_folds, shuffle=True,
                               random_state=random_state)
    names, results = [], []
    for label, estimator in models:
        scores = cross_val_score(estimator, X, y, cv=splitter,
                                 scoring=scoring, n_jobs=-1)
        print("%s: %f (%f)" % (label, scores.mean(), scores.std()))
        names.append(label)
        results.append(scores)

    return names, results


def boxplot_models_performance(X, y, models=models(), num_folds=10, random_state=None,
                     scoring='accuracy'):
  """Cross-validate the models and draw one box per model from the fold scores.

  All arguments are forwarded positionally to `cross_val_models`; nothing is
  returned.
  """
  names, results = cross_val_models(X, y, models, num_folds, random_state, scoring)
  # Transpose so that each model becomes a column and each fold a row.
  scores_by_model = pd.DataFrame(np.array(results).T, columns=names)
  scores_by_model.plot(kind='box')


In [None]:
def train_test_classes_split(dataset, target_key, train_sizes, random_state=None):
  """Split `dataset` into train/test parts with a separate train size per class.

  For every class label in `train_sizes`, the rows whose `target_key` equals
  that label are split with `train_test_split`, using the mapped value as
  `train_size` and the shared `random_state`. The per-class pieces are then
  stacked in the order the labels appear in `train_sizes`.

  Returns:
    (X_tr, X_ts, y_tr, y_ts). Each accumulator starts as an empty DataFrame.
    NOTE(review): because the per-class targets are Series concatenated onto
    a DataFrame, y_tr / y_ts presumably come back as single-column
    DataFrames rather than Series -- confirm this is what callers expect.
  """
  accumulated = [pd.DataFrame() for _ in range(4)]

  for label, size in train_sizes.items():
    subset = dataset[dataset[target_key] == label]
    targets = subset[target_key]
    features = subset.drop(columns=target_key)
    # train_test_split returns X_train, X_test, y_train, y_test -- the same
    # order as the accumulators.
    pieces = train_test_split(features, targets,
                              random_state=random_state,
                              train_size=size)
    accumulated = [pd.concat([so_far, piece])
                   for so_far, piece in zip(accumulated, pieces)]

  X_tr, X_ts, y_tr, y_ts = accumulated
  return X_tr, X_ts, y_tr, y_ts