In [10]:
import itertools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns
import scipy.stats as _stats
import sklearn.metrics as metrics
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier


# Default colormap shared by the image-style plots in this notebook
# (plot_confusion_matrix passes it to imshow).
CMAP = cm.Pastel2
# Register CMAP as the default image colormap through rcParams. On a fresh
# kernel this has the same effect as plt.set_cmap(CMAP), but it does not look up
# the "current image", which implicitly creates (and displays) an empty figure.
plt.rc('image', cmap=CMAP.name)

# Number of columns used when laying out grids of subplots (see choose_grid).
NR_FIGS = 7

<Figure size 432x288 with 0 Axes>

In [11]:
''' ___________________________________________________________
    FUNCTIONS FOR PLOTTING CHARTS
    ___________________________________________________________
'''


def choose_grid(nr):
    """Return the (rows, cols) subplot grid able to hold `nr` charts.

    The column count is always NR_FIGS. The row count is `nr / NR_FIGS`
    rounded up, so the result is 0 rows when `nr` is 0.
    """
    full_rows, leftover = divmod(nr, NR_FIGS)
    if leftover:
        # A partially filled last row still needs a row of its own.
        full_rows += 1
    return full_rows, NR_FIGS


def line_chart(ax: plt.Axes, series: pd.Series, title: str, x_label: str, y_label: str, percentage=False):
    """Plot `series` as a single line on `ax`.

    Parameters
    ----------
    ax : axes to draw on.
    series : values to plot; its index is also used as the x tick positions.
    title, x_label, y_label : texts for the chart title and the two axes.
    percentage : when truthy, the y axis is fixed to the [0.0, 1.0] range.
    """
    decorations = (
        (ax.set_title, title),
        (ax.set_xlabel, x_label),
        (ax.set_ylabel, y_label),
    )
    for apply_text, text in decorations:
        apply_text(text)

    # One tick per index entry of the series.
    ax.set_xticks(series.index)

    y_range = (0.0, 1.0) if percentage else None
    if y_range is not None:
        ax.set_ylim(*y_range)

    ax.plot(series)


def multiple_line_chart(ax: plt.Axes, x_values: list, y_values: dict, title: str, x_label: str, y_label: str,
                        percentage=False):
    """Plot several named series against the same x values on `ax`.

    Parameters
    ----------
    ax : axes to draw on.
    x_values : shared x coordinates; also used as the x tick positions.
    y_values : mapping of series name -> y coordinates, one line per entry.
    title, x_label, y_label : texts for the chart title and the two axes.
    percentage : when truthy, the y axis is fixed to the [0.0, 1.0] range.
    """
    ax.set_title(title)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
    ax.set_xticks(x_values)
    if percentage:
        ax.set_ylim(0.0, 1.0)

    for curve in y_values.values():
        ax.plot(x_values, curve)

    # Legend entries follow the insertion order of the mapping, i.e. the order
    # in which the lines were drawn above.
    series_names = list(y_values)
    ax.legend(series_names, loc='lower center', bbox_to_anchor=(0.5, -0.2), fancybox=True, shadow=True)


def bar_chart(ax: plt.Axes, x_values: list, y_values: list, title: str, x_label: str, y_label: str, percentage=False):
    """Draw one bar per entry of `x_values` on `ax`.

    Parameters
    ----------
    ax : axes to draw on.
    x_values : bar labels; any iterable is accepted (list, pandas Index,
        dict keys view, ...).
    y_values : bar heights, in the same order as `x_values`.
    title, x_label, y_label : texts for the chart title and the two axes.
    percentage : when truthy, the y axis is fixed to the [0.0, 1.0] range.
    """
    # Materialise the inputs so that views and indexes behave like plain lists.
    labels = list(x_values)
    heights = list(y_values)
    positions = list(range(len(labels)))

    ax.set_title(title)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
    if percentage:
        ax.set_ylim(0.0, 1.0)

    # Bars are placed at explicit integer positions and the ticks are fixed to
    # those same positions *before* the labels are assigned; calling
    # set_xticklabels without fixed tick locations can attach labels to the
    # wrong ticks.
    ax.bar(positions, heights, edgecolor='grey')
    ax.set_xticks(positions)
    ax.set_xticklabels(labels, rotation=90, fontsize='small')


def multiple_bar_chart(ax: plt.Axes, x_values: list, y_values: dict, title: str, x_label: str, y_label: str,
                       percentage=False):
    """Draw a grouped bar chart on `ax`: one group per x value, one bar per series.

    Parameters
    ----------
    ax : axes to draw on.
    x_values : group labels, one per tick.
    y_values : mapping of series name -> bar heights (one height per group).
    title, x_label, y_label : texts for the chart title and the two axes.
    percentage : when truthy, the y axis is fixed to the [0.0, 1.0] range.

    When `y_values` is empty only the axes decorations are applied; no bars or
    legend are drawn.
    """
    ax.set_title(title)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
    positions = np.arange(len(x_values))  # the label locations
    ax.set_xticks(positions)
    ax.set_xticklabels(x_values, fontsize='small')
    if percentage:
        ax.set_ylim(0.0, 1.0)

    if not y_values:
        # Nothing to draw; also avoids dividing the group width by zero below.
        return

    group_width = 0.8  # total width taken by one group of bars
    bar_width = group_width / len(y_values)
    # Shift the first bar left so that the whole group is centred on its tick
    # instead of starting at it.
    first_offset = -(group_width - bar_width) / 2
    for idx, (name, heights) in enumerate(y_values.items()):
        ax.bar(positions + first_offset + idx * bar_width, heights, bar_width, label=name)
    ax.legend(loc='lower center', ncol=len(y_values), bbox_to_anchor=(0.5, -0.2), fancybox=True, shadow=True)

In [12]:
def plot_confusion_matrix(ax: plt.Axes, cnf_matrix: np.ndarray, classes_names: list, normalize: bool = False):
    """Render a confusion matrix on `ax` as a coloured grid with one number per cell.

    Parameters
    ----------
    ax : axes to draw on.
    cnf_matrix : 2-D array of counts; rows are labelled 'True label' and
        columns 'Predicted label'.
    classes_names : tick labels used on both axes.
    normalize : when True each row is divided by its sum and cells are shown
        with two decimals; otherwise the raw values are shown as integers.
    """
    if normalize:
        row_totals = cnf_matrix.sum(axis=1)[:, np.newaxis]
        # A class without any true sample yields an all-zero row. Dividing by 1
        # keeps that row at 0.0 instead of producing nan and a RuntimeWarning.
        # (row_totals is a fresh array, so cnf_matrix itself is not modified.)
        row_totals[row_totals == 0] = 1
        cmt = cnf_matrix.astype('float') / row_totals
        title = "Normalized confusion matrix"
    else:
        cmt = cnf_matrix
        title = 'Confusion matrix, without normalization'

    # Kept from the original: shortens NumPy's global print precision. Nothing
    # in this function prints arrays, but later cells may rely on the setting.
    np.set_printoptions(precision=2)

    tick_marks = np.arange(0, len(classes_names), 1)
    ax.set_title(title)
    ax.set_ylabel('True label')
    ax.set_xlabel('Predicted label')
    ax.set_xticks(tick_marks)
    ax.set_yticks(tick_marks)
    ax.set_xticklabels(classes_names)
    ax.set_yticklabels(classes_names)
    ax.imshow(cmt, interpolation='nearest', cmap=CMAP)

    # Write every cell's value at its centre: x is the column, y is the row.
    fmt = '.2f' if normalize else 'd'
    for i, j in itertools.product(range(cmt.shape[0]), range(cmt.shape[1])):
        ax.text(j, i, format(cmt[i, j], fmt), horizontalalignment="center")


def plot_roc_chart(ax: plt.Axes, models: dict, tst_X: np.ndarray, tst_y: np.ndarray, target: str = 'class'):
    """Draw one ROC curve per model on `ax`, plus the diagonal of a random classifier.

    Parameters
    ----------
    ax : axes to draw on.
    models : mapping of display name -> fitted classifier exposing
        `predict_proba`.
    tst_X, tst_y : test features and true labels used to score every model.
    target : name of the target variable, only used in the chart title.

    NOTE(review): the positive class is hard-coded as the label 'positive' and
    the scores are taken from column 1 of `predict_proba`; this presumably
    assumes a binary problem whose second class is 'positive' - confirm
    against the datasets used by the callers.
    """
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.0])
    ax.set_xlabel('FP rate')
    ax.set_ylabel('TP rate')
    ax.set_title('ROC chart for %s' % target)
    ax.plot([0, 1], [0, 1], color='navy', label='random', linestyle='--')

    for name, model in models.items():
        scores = model.predict_proba(tst_X)[:, 1]
        # pos_label must be passed by keyword: recent scikit-learn releases
        # accept only y_true and y_score positionally.
        fpr, tpr, _ = metrics.roc_curve(tst_y, scores, pos_label='positive')
        roc_auc = metrics.roc_auc_score(tst_y, scores)
        ax.plot(fpr, tpr, label='%s (auc=%0.2f)' % (name, roc_auc))
    ax.legend(loc="lower center")

In [13]:
''' ___________________________________________________________
    FUNCTIONS FOR SINGLE VARIABLE ANALYSIS
    ___________________________________________________________
'''


def process_symbolic(data: pd.DataFrame):
    """Convert every object-typed column of `data` to the pandas 'category' dtype.

    The frame is modified in place. The distinct values of each object column
    are printed before the conversion, and the resulting dtypes of the whole
    frame are printed once at the end.
    """
    symbolic_columns = data.select_dtypes(include='object').columns

    # First report the distinct values of every symbolic column...
    for column in symbolic_columns:
        print(column, data[column].unique())

    # ...then replace each of those columns by its categorical version.
    for column in symbolic_columns:
        data[column] = data[column].astype('category')

    print(data.dtypes)


def plot_data(data: pd.DataFrame):
    """Plot every column of `data` as a line chart, one subplot per column.

    Subplots are laid out on the grid returned by choose_grid and filled row by
    row. Each chart is titled with the column name and its x axis is labelled
    'date'. Nothing is drawn when `data` has no columns.
    """
    if data.shape[1] == 0:
        # choose_grid would return zero rows, which plt.subplots rejects.
        return

    rows, cols = choose_grid(data.shape[1])
    # squeeze=False keeps `axs` two-dimensional even when the grid has a single
    # row; without it the 2-D indexing used by the original code failed. No
    # separate plt.figure() call is made: plt.subplots creates its own figure.
    fig, axs = plt.subplots(rows, cols, figsize=(cols*5, rows*5), squeeze=False)

    # axs.flat walks the grid row by row, i.e. in the order the charts fill it.
    for ax, col in zip(axs.flat, data):
        line_chart(ax, data[col], col, 'date', col)
    fig.tight_layout()
    plt.show()

In [14]:
def show_missing_values(data: pd.DataFrame):
    """Show a bar chart with the number of missing values of every column of `data`."""
    # Count the missing entries of each variable first...
    mv = {var: data[var].isna().sum() for var in data}

    # ...and draw the chart once. (Drawing inside the counting loop, as before,
    # stacked a new set of bars on the axes for every variable and labelled the
    # x axis with the name of the last column.)
    fig = plt.figure(figsize=(10, 7))
    bar_chart(plt.gca(), list(mv.keys()), list(mv.values()), 'Number of missing values per variable',
              'variables', 'nr. missing values')
    fig.tight_layout()
    plt.show()

In [15]:
def show_boxplots(data: pd.DataFrame):
    """Show one boxplot per numeric column of `data`, ignoring missing values.

    Subplots are laid out on the grid returned by choose_grid and filled row by
    row. Nothing is drawn when `data` has no numeric column.
    """
    columns = data.select_dtypes(include='number').columns
    if len(columns) == 0:
        # choose_grid would return zero rows, which plt.subplots rejects.
        return

    rows, cols = choose_grid(len(columns))
    # plt.subplots creates its own figure; a preceding plt.figure() only left
    # an empty extra figure behind.
    fig, axs = plt.subplots(rows, cols, figsize=(cols*4, rows*4), squeeze=False)
    for ax, col in zip(axs.flat, columns):
        ax.set_title('Boxplot for %s' % col)
        ax.boxplot(data[col].dropna().values)
    fig.tight_layout()
    plt.show()

In [16]:
def show_hist(data: pd.DataFrame):
    """Show the distribution of every numeric and every categorical column of `data`.

    Two figures are produced, each laid out on the grid returned by choose_grid:
    one histogram per numeric column, then one bar chart of relative
    frequencies per 'category' column. Missing values are ignored. A figure is
    skipped when there is no column of the corresponding kind.
    """
    numeric_columns = data.select_dtypes(include='number').columns
    if len(numeric_columns) > 0:
        rows, cols = choose_grid(len(numeric_columns))
        # plt.subplots creates its own figure; no separate plt.figure() needed.
        fig, axs = plt.subplots(rows, cols, figsize=(cols*4, rows*4), squeeze=False)
        for ax, col in zip(axs.flat, numeric_columns):
            ax.set_title('Histogram for %s' % col)
            # Axes exposes set_xlabel / set_ylabel; the previously used
            # set_x_label / set_y_label do not exist and raised AttributeError.
            ax.set_xlabel(col)
            # NOTE(review): the bars show raw counts (hist is not called with
            # density=True) although the axis is labelled "probability".
            ax.set_ylabel("probability")
            ax.hist(data[col].dropna().values, 'auto')
        fig.tight_layout()
        plt.show()

    symbolic_columns = data.select_dtypes(include='category').columns
    if len(symbolic_columns) > 0:
        rows, cols = choose_grid(len(symbolic_columns))
        fig, axs = plt.subplots(rows, cols, figsize=(cols*4, rows*4), squeeze=False)
        for ax, col in zip(axs.flat, symbolic_columns):
            counts = data[col].dropna().value_counts(normalize=True)
            bar_chart(ax, counts.index, counts.values, 'Histogram for %s' % col, col, 'probability')
        fig.tight_layout()
        plt.show()


def show_hist_dist_seaborn(data: pd.DataFrame):
    """Show, for each numeric column of `data`, a normalised histogram drawn by seaborn.

    Subplots are laid out on the grid returned by choose_grid and filled row by
    row; missing values are ignored. Nothing is drawn when `data` has no
    numeric column.

    NOTE(review): sns.distplot is deprecated in recent seaborn releases -
    confirm that the installed version still provides it.
    """
    columns = data.select_dtypes(include='number').columns
    if len(columns) == 0:
        # choose_grid would return zero rows, which plt.subplots rejects.
        return

    rows, cols = choose_grid(len(columns))
    # plt.subplots creates its own figure; a preceding plt.figure() only left
    # an empty extra figure behind.
    fig, axs = plt.subplots(rows, cols, figsize=(cols*4, rows*4), squeeze=False)
    for ax, col in zip(axs.flat, columns):
        ax.set_title('Histogram with trend for %s' % col)
        ax.set_ylabel("probability")
        sns.distplot(data[col].dropna().values, norm_hist=True, ax=ax, axlabel=col)
    fig.tight_layout()
    plt.show()


def compute_known_distributions(x_values, n_bins) -> dict:
    """Fit known distributions to `x_values` and evaluate their densities.

    A normal and an exponential distribution are fitted with scipy.stats. The
    result maps a descriptive label, e.g. 'Normal(3.0,1.41)' or 'Exp(0.50)',
    to the fitted density evaluated at every entry of `x_values`.

    `n_bins` is accepted but not used by the computation.
    """
    # Gaussian: fitted mean and standard deviation.
    mu, std = _stats.norm.fit(x_values)
    normal_label = 'Normal(%.1f,%.2f)' % (mu, std)
    normal_pdf = _stats.norm.pdf(x_values, mu, std)

    # Exponential: the label shows the rate, i.e. the inverse of the fitted scale.
    exp_loc, exp_scale = _stats.expon.fit(x_values)
    exp_label = 'Exp(%.2f)' % (1 / exp_scale)
    exp_pdf = _stats.expon.pdf(x_values, exp_loc, exp_scale)

    # Disabled in the original code, kept here for reference:
    # LogNorm
    # sigma, loc, scale = _stats.lognorm.fit(x_values)
    # distributions['LogNor(%.1f,%.2f)'%(np.log(scale),sigma)] = _stats.lognorm.pdf(x_values, sigma, loc, scale)
    # SkewNorm
    # a, loc, scale = _stats.skewnorm.fit(x_values)
    # distributions['SkewNorm(%.2f)'%a] = _stats.skewnorm.pdf(x_values, a, loc, scale)

    distributions = {}
    distributions[normal_label] = normal_pdf
    distributions[exp_label] = exp_pdf
    return distributions


def histogram_with_distributions(ax: plt.Axes, series: pd.Series, var: str):
    """Draw a 20-bin density histogram of `series` on `ax` and overlay fitted distributions.

    The sorted values are passed to compute_known_distributions, and the
    resulting curves are drawn over the histogram with multiple_line_chart,
    titled 'Best fit for <var>'.

    NOTE(review): multiple_line_chart uses its x values as tick positions, so
    the x axis receives one tick per sample - confirm this is intended.
    """
    ordered = series.sort_values().values
    # Only the bin edges of the histogram result are used afterwards.
    _, bin_edges, _ = ax.hist(ordered, 20, density=True, edgecolor='grey')
    fitted_curves = compute_known_distributions(ordered, bin_edges)
    multiple_line_chart(ax, ordered, fitted_curves, 'Best fit for %s' % var, var, 'probability')


def show_hist_dist(data: pd.DataFrame):
    """Show, for each numeric column of `data`, its histogram with fitted distributions.

    Each subplot is produced by histogram_with_distributions on the column with
    missing values removed. Subplots are laid out on the grid returned by
    choose_grid and filled row by row. Nothing is drawn when `data` has no
    numeric column.
    """
    columns = data.select_dtypes(include='number').columns
    if len(columns) == 0:
        # choose_grid would return zero rows, which plt.subplots rejects.
        return

    rows, cols = choose_grid(len(columns))
    # plt.subplots creates its own figure; a preceding plt.figure() only left
    # an empty extra figure behind.
    fig, axs = plt.subplots(rows, cols, figsize=(cols*4, rows*4), squeeze=False)
    for ax, col in zip(axs.flat, columns):
        histogram_with_distributions(ax, data[col].dropna(), col)
    fig.tight_layout()
    plt.show()

In [17]:
''' ___________________________________________________________
    FUNCTIONS FOR MULTIVARIATE ANALYSIS
    ___________________________________________________________
'''


def scatter_plots(data: pd.DataFrame):
    """Show a scatter plot for every pair of numeric columns of `data`.

    The figure is an (n-1) x (n-1) grid for n numeric columns. The pair
    (columns[i], columns[j]) with i < j is drawn at row i, column j-1, so only
    the upper triangle of the grid is used. Nothing is drawn when there are
    fewer than two numeric columns.
    """
    columns = data.select_dtypes(include='number').columns
    if len(columns) < 2:
        # No pair to plot, and plt.subplots rejects an empty grid.
        return

    rows, cols = len(columns)-1, len(columns)-1
    # plt.subplots creates its own figure; a preceding plt.figure() only left
    # an empty extra figure behind.
    fig, axs = plt.subplots(rows, cols, figsize=(cols*4, rows*4), squeeze=False)
    for i, j in itertools.combinations(range(len(columns)), 2):
        var1, var2 = columns[i], columns[j]
        ax = axs[i, j-1]
        ax.set_title("%s x %s" % (var1, var2))
        ax.set_xlabel(var1)
        ax.set_ylabel(var2)
        ax.scatter(data[var1], data[var2])
    fig.tight_layout()
    plt.show()


def heat_map(data: pd.DataFrame):
    """Show an annotated heat map of the pairwise correlations between the columns of `data`.

    Only numeric and boolean columns take part in the correlation; other
    columns (strings, categories, ...) are left out.
    """
    plt.figure(figsize=[12, 12])
    # Restrict the frame explicitly: DataFrame.corr() raises on non-numeric
    # columns in pandas 2.x instead of silently dropping them as it used to.
    corr_mtx = data.select_dtypes(include=['number', 'bool']).corr()
    sns.heatmap(corr_mtx, xticklabels=corr_mtx.columns, yticklabels=corr_mtx.columns, annot=True, cmap='Blues')
    plt.title('Correlation analysis')
    plt.show()

In [18]:
''' ___________________________________________________________
    FUNCTIONS FOR TRAINING MODELS
    ___________________________________________________________
'''


def naive_bayes(trn_X, tst_X, trn_y, tst_y, labels):
    """Train the three Naive Bayes variants and compare them on the test set.

    Parameters
    ----------
    trn_X, trn_y : training features and labels, passed to each estimator's `fit`.
    tst_X, tst_y : test features and labels used to compute accuracy and
        confusion matrices.
    labels : class labels, in the order to use for the confusion matrix rows
        and columns.

    Two figures are shown: the confusion matrix of each estimator (one panel
    per estimator) and a bar chart comparing their accuracies.
    """
    estimators = {'GaussianNB': GaussianNB(),
                  'MultinomialNB': MultinomialNB(),
                  'BernoulyNB': BernoulliNB()}

    x_values = []
    y_values = []
    # One panel per estimator: drawing every matrix on the current axes, as
    # before, superimposed the three matrices on top of each other.
    cm_fig, cm_axs = plt.subplots(1, len(estimators), figsize=(4 * len(estimators), 4), squeeze=False)
    for ax, (name, clf) in zip(cm_axs.flat, estimators.items()):
        x_values.append(name)
        clf.fit(trn_X, trn_y)
        prd_y = clf.predict(tst_X)
        y_values.append(metrics.accuracy_score(tst_y, prd_y))
        # `labels` must be passed by keyword: recent scikit-learn releases
        # accept only y_true and y_pred positionally.
        cnf_mtx = metrics.confusion_matrix(tst_y, prd_y, labels=labels)
        plot_confusion_matrix(ax, cnf_mtx, labels)
        ax.set_title(name)
    cm_fig.tight_layout()

    plt.figure()
    bar_chart(plt.gca(), x_values, y_values, 'Comparison of Naive Bayes Models', '', 'accuracy', percentage=True)
    plt.show()


def knn(trn_X, tst_X, trn_y, tst_y, labels):
    """Train KNN classifiers over a grid of k values and distance metrics and compare them.

    Parameters
    ----------
    trn_X, trn_y : training features and labels, passed to each model's `fit`.
    tst_X, tst_y : test features and labels used to compute accuracy and the
        confusion matrix.
    labels : class labels, in the order to use for the confusion matrix rows
        and columns.

    Two figures are shown: the confusion matrix of the most accurate
    configuration (the first one found in case of ties) and a line chart of
    accuracy versus number of neighbours, one line per distance metric.
    """
    n_values = [1, 3, 5, 7, 9, 11, 13, 15, 17, 19]
    dist = ['manhattan', 'euclidean', 'chebyshev']
    values = {}
    best = None  # (accuracy, description, confusion matrix) of the best model so far
    for d in dist:
        y_values = []
        for n in n_values:
            model = KNeighborsClassifier(n_neighbors=n, metric=d)
            model.fit(trn_X, trn_y)
            prd_y = model.predict(tst_X)
            accuracy = metrics.accuracy_score(tst_y, prd_y)
            y_values.append(accuracy)
            if best is None or accuracy > best[0]:
                # `labels` must be passed by keyword: recent scikit-learn
                # releases accept only y_true and y_pred positionally.
                cnf_mtx = metrics.confusion_matrix(tst_y, prd_y, labels=labels)
                best = (accuracy, 'n=%d, %s' % (n, d), cnf_mtx)
        values[d] = y_values

    # Only the best model's matrix is drawn: plotting all 30 matrices on the
    # current axes, as before, superimposed them into an unreadable image.
    _, cm_ax = plt.subplots()
    plot_confusion_matrix(cm_ax, best[2], labels)
    cm_ax.set_title('Confusion matrix for best KNN (%s)' % best[1])

    plt.figure()
    multiple_line_chart(plt.gca(), n_values, values, 'KNN variants', 'n', 'accuracy', percentage=True)
    plt.show()


def trees(trn_X, tst_X, trn_y, tst_y, labels):
    """Train decision trees over a grid of hyper-parameters and compare their accuracy.

    Parameters
    ----------
    trn_X, trn_y : training features and labels, passed to each model's `fit`.
    tst_X, tst_y : test features and labels used to compute accuracy and the
        confusion matrix.
    labels : class labels, in the order to use for the confusion matrix rows
        and columns.

    Two figures are shown: one accuracy chart per split criterion (accuracy
    versus min_samples_leaf, one line per max depth) and the confusion matrix
    of the most accurate configuration (the first one found in case of ties).
    """
    min_samples_leaf = [.05, .025, .01, .0075, .005, .0025, .001]
    max_depths = [5, 10, 25, 50]
    criteria = ['entropy', 'gini']

    # plt.subplots creates its own figure; a preceding plt.figure() only left
    # an empty extra figure behind.
    fig, axs = plt.subplots(1, 2, figsize=(16, 4), squeeze=False)
    best = None  # (accuracy, description, confusion matrix) of the best model so far
    for k, f in enumerate(criteria):
        values = {}
        for d in max_depths:
            y_values = []
            for n in min_samples_leaf:
                model = DecisionTreeClassifier(min_samples_leaf=n, max_depth=d, criterion=f)
                model.fit(trn_X, trn_y)
                prd_y = model.predict(tst_X)
                accuracy = metrics.accuracy_score(tst_y, prd_y)
                y_values.append(accuracy)
                if best is None or accuracy > best[0]:
                    # `labels` must be passed by keyword: recent scikit-learn
                    # releases accept only y_true and y_pred positionally.
                    cnf_mtx = metrics.confusion_matrix(tst_y, prd_y, labels=labels)
                    best = (accuracy, 'criterion=%s, max_depth=%d, min_samples_leaf=%s' % (f, d, n), cnf_mtx)
            values[d] = y_values
        # The x axis holds the min_samples_leaf values (it was mislabelled
        # 'nr estimators').
        multiple_line_chart(axs[0, k], min_samples_leaf, values, 'Decision Trees with %s criteria' % f,
                            'min_samples_leaf',
                            'accuracy', percentage=True)

    # The matrix gets a figure of its own: drawing every matrix on the current
    # axes, as before, painted them over the second accuracy subplot.
    _, cm_ax = plt.subplots()
    plot_confusion_matrix(cm_ax, best[2], labels)
    cm_ax.set_title('Confusion matrix for best tree\n(%s)' % best[1])
    plt.show()


def random_forests(trn_X, tst_X, trn_y, tst_y, labels):
    """Train random forests over a grid of hyper-parameters and compare their accuracy.

    Parameters
    ----------
    trn_X, trn_y : training features and labels, passed to each model's `fit`.
    tst_X, tst_y : test features and labels used to compute accuracy and the
        confusion matrix.
    labels : class labels, in the order to use for the confusion matrix rows
        and columns.

    Two figures are shown: one accuracy chart per `max_features` setting
    (accuracy versus number of estimators, one line per max depth) and the
    confusion matrix of the most accurate configuration (the first one found
    in case of ties).

    NOTE(review): no random_state is passed to RandomForestClassifier, so
    results can differ between runs.
    """
    n_estimators = [5, 10, 25, 50, 75, 100, 150, 200, 250, 300]
    max_depths = [5, 10, 25, 50]
    max_features = ['sqrt', 'log2']

    # plt.subplots creates its own figure; a preceding plt.figure() only left
    # an empty extra figure behind.
    fig, axs = plt.subplots(1, 2, figsize=(10, 4), squeeze=False)
    best = None  # (accuracy, description, confusion matrix) of the best model so far
    for k, f in enumerate(max_features):
        values = {}
        for d in max_depths:
            y_values = []
            for n in n_estimators:
                model = RandomForestClassifier(n_estimators=n, max_depth=d, max_features=f)
                model.fit(trn_X, trn_y)
                prd_y = model.predict(tst_X)
                accuracy = metrics.accuracy_score(tst_y, prd_y)
                y_values.append(accuracy)
                if best is None or accuracy > best[0]:
                    # `labels` must be passed by keyword: recent scikit-learn
                    # releases accept only y_true and y_pred positionally.
                    cnf_mtx = metrics.confusion_matrix(tst_y, prd_y, labels=labels)
                    best = (accuracy, 'max_features=%s, max_depth=%d, n_estimators=%d' % (f, d, n), cnf_mtx)
            values[d] = y_values
        multiple_line_chart(axs[0, k], n_estimators, values, 'Random Forests with %s features' % f, 'nr estimators',
                            'accuracy', percentage=True)

    # The matrix gets a figure of its own: drawing every matrix on the current
    # axes, as before, painted them over the second accuracy subplot.
    _, cm_ax = plt.subplots()
    plot_confusion_matrix(cm_ax, best[2], labels)
    cm_ax.set_title('Confusion matrix for best forest\n(%s)' % best[1])
    plt.show()

