### 2018/2019 - Task List 10

1. Implement Naive Bayes classifier with pyro
    - create apropriate parameters (mean and std for a and b, sigma - noise)
    - provide optimization procedure
    - check appropriateness of implemented method with selected dataset


# Required imports

In [71]:
%matplotlib inline
import pyro
import torch
import numpy as np
import matplotlib.pyplot as plt
import pyro.optim as optim
import pyro.distributions as dist
from torch.distributions import constraints
from tqdm import tqdm_notebook as tqdm
import seaborn as sns
from matplotlib import animation, rc
from IPython.display import HTML
import torch.nn as nn
from functools import partial
import pandas as pd
from pyro.contrib.autoguide import AutoDiagonalNormal, AutoMultivariateNormal
from pyro.infer import EmpiricalMarginal, SVI, Trace_ELBO, TracePredictive
from pyro.optim import Adam
import operator
from sklearn.model_selection import train_test_split
from sklearn import model_selection, metrics
from sklearn.preprocessing import normalize

In [72]:
pyro.set_rng_seed(1)
pyro.enable_validation(True)

## Solutions

In [102]:
wine = pd.read_csv('wine.csv', header=None)
wine.columns = ["classname", "Alcohol", "MalicAcid", "Ash", "AlcalinityOfAsh", "Magnesium", "TotalPhenols","Flavanoids", 
                "NonflavanoidPhenols", "Proanthocyanins", "ColorIntensivity", "Hue","OD280/OD315", "Proline"]

df = wine
# num_columns = df.shape[1]
# num_rows = df.shape[0]
data = df

In [125]:
def model(data):
    data = torch.tensor(data.values).float()
    num_columns = data.shape[1]
    
    mean_prior = data[0]
    std_prior = torch.ones(num_columns)
#     noise_std = torch.ones(num_columns)
    
    prior = pyro.distributions.Normal(loc=mean_prior, scale=std_prior).independent(1)
    weight = pyro.sample("weight", prior)
    
    with pyro.plate("map", len(data)):
        #sample = pyro.sample("obs", pyro.distributions.Normal(weight, noise_std), obs=data)
        sample = pyro.sample("obs", prior, obs=data)
        return sample


def guide(data):
    num_columns = data.shape[1]
    
    mean = pyro.param("mean", torch.ones(1, num_columns)*0)
    std = pyro.param("std", torch.ones(1, num_columns)*1, constraint=constraints.positive)
    
    dists = pyro.distributions.Normal(loc=mean, scale=std).independent(1)
    sample = pyro.sample("weight", dists)  # , infer={'is_auxiliary': True}
    return sample


def train(data, num_steps=5000):
    pyro.clear_param_store()
    
    optim = Adam({"lr": 0.01})
    svi = pyro.infer.SVI(model=model,
                         guide=guide,
                         optim=optim,
                         loss=pyro.infer.Trace_ELBO(), num_samples=len(data))

    losses = []
    for t in tqdm(range(num_steps)):
        losses.append(svi.step(data))
    return pyro.param("mean"), pyro.param("std"), losses


def plot_loss(losses, learned_mean, learned_std, print_info=False):
    columns_data = [df[i] for i in df.columns]
    
    true_mean = [np.mean(x) for x in columns_data]
    true_std = [np.std(x) for x in columns_data]
    if (print_info):
        print(df.columns)
        print("{}\n{}".format(true_mean, true_std))
        print()

    plt.plot(losses)
    plt.title("evidence lower bound (ELBO)")
    plt.xlabel("step")
    plt.ylabel("loss");
    if (print_info):
        print('learned mean = ', learned_mean)
        print('learned std = ', learned_std)
        print()

    diff_mean = [learned_mean[i] - true_mean[i] for i in range(len(true_mean))]
    diff_std = [learned_std[i] - true_std[i] for i in range(len(true_std))]
    if (print_info):
        print(diff_mean, "\n", diff_std)

In [126]:
# %%time
# _, _, losses = train(data, 20000)

# learned_mean = pyro.param("mean").tolist()[0]
# learned_std = pyro.param("std").tolist()[0]

# plot_loss(losses, learned_mean, learned_std, print_info=True)

In [134]:
def extract_labels(dataset):
    # extract labels
    dataset_labels = dataset["classname"].copy()
    dataset = dataset.drop("classname", axis=1)

    return dataset, dataset_labels


def preprocess_data(X):
    # normalize [0,1]
    labels = X["classname"].copy()
    X = X.drop("classname", axis=1)
    columns = X.columns
    
    X = pd.DataFrame(normalize(X, axis=0), columns=columns)
    X = X.merge(labels, left_index=True, right_index=True)

    return X


def split_data(dataset, test_size=0.3):
    # split into train and test sets
    train_set, test_set = train_test_split(dataset, test_size=test_size, stratify=dataset['classname']) # random_state=42,

    # extract labels
    train_set_labels = train_set["classname"].copy()
    train_set = train_set.drop("classname", axis=1)

    test_set_labels = test_set["classname"].copy()
    test_set = test_set.drop("classname", axis=1)

    return train_set, train_set_labels, test_set, test_set_labels


def evaluate(labels_true, labels_predicted):
    labels_true = labels_true.values.tolist()
    accuracy = metrics.accuracy_score(y_true=labels_true, y_pred=labels_predicted)
    precision = metrics.precision_score(y_true=labels_true, y_pred=labels_predicted, average='macro')
    recall = metrics.recall_score(y_true=labels_true, y_pred=labels_predicted, average='macro')
    f1 = metrics.f1_score(y_true=labels_true, y_pred=labels_predicted, average='macro')

    return accuracy, precision, recall, f1


def test_classifier(train, train_labels, test, test_labels, model):
    model.fit(train, train_labels)
    labels_predicted = model.predict(test)
    labels_true = test_labels
    accuracy, precision, recall, f1 = evaluate(labels_true, labels_predicted)
    return accuracy, precision, recall, f1

In [135]:
class NaiveBayesClassifier:
    def __init__(self):
        self.mean_for_classes = {}
        self.std_for_classes = {}
        self.classes_probs = {}
    
    def fit(self, X, y):
        num_columns = X.shape[1]
        self.classes = y.unique()
        
        for classname in self.classes:
            current_data = X[y==classname]
            self.classes_probs[classname] = len(current_data)/len(X)
            mean, std, _ = train(current_data, num_steps=10000)
            self.mean_for_classes[classname] = mean[0]
            self.std_for_classes[classname] = std[0]
            print(self.classes_probs[classname], self.mean_for_classes[classname], self.std_for_classes[classname])
            
    
    def predict(self, X):
        probs = {}
        predicted = []
        
        for row in X.values:
            for classname in self.classes:
                p = self.classes_probs[classname]
                for i, element in enumerate(row):
                    mean = self.mean_for_classes[classname][i].detach().numpy()
                    std = self.std_for_classes[classname][i].detach().numpy()
                    p *= (1/(np.sqrt(2*np.pi*(std ** 2)))) * (np.e ** (-((element-mean) ** 2)/(2*std ** 2)))
                    # print(p)
                probs[classname] = p
            # print(probs)
            chosen_class = max(probs.items(), key=operator.itemgetter(1))[0]
            # print(chosen_class)
            predicted.append(chosen_class)
        
        return predicted


In [136]:
def classify(dataset, show=False):
    dataset = preprocess_data(dataset)
    if (show):
        columns_data = [dataset[i] for i in dataset.columns]
        true_mean = [np.mean(x) for x in columns_data]
        true_std = [np.std(x) for x in columns_data]
        print(dataset.columns)
        print("{}\n{}".format(true_mean, true_std))
        print()
    train_set, train_labels, test_set, test_labels = split_data(dataset)
    model = NaiveBayesClassifier()
    accuracy, precision, recall, f1 = test_classifier(train_set, train_labels, test_set, test_labels, model)
    return accuracy, precision, recall, f1

In [137]:
%%time
print(classify(data, show=True))

Index(['Alcohol', 'MalicAcid', 'Ash', 'AlcalinityOfAsh', 'Magnesium',
       'TotalPhenols', 'Flavanoids', 'NonflavanoidPhenols', 'Proanthocyanins',
       'ColorIntensivity', 'Hue', 'OD280/OD315', 'Proline', 'classname'],
      dtype='object')
[0.07480827469396367, 0.06765587278178141, 0.07445730939417845, 0.0738829589921611, 0.07420052722354029, 0.0723268778565136, 0.0672848416443273, 0.07089929832226441, 0.07055035959065165, 0.068170567305383, 0.0729155184159942, 0.0723421477467212, 0.0690946925050136, 1.9382022471910112]
[0.004658279231901258, 0.03225926853208698, 0.008607357658972517, 0.012620851740375043, 0.010595248361343046, 0.019667238433975748, 0.03302616558106141, 0.02431598292276999, 0.025310556882843026, 0.03115688176554472, 0.01735813072357437, 0.019610996595824815, 0.029049974108895607, 0.7728548591122252]



HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))

0.33064516129032256 tensor([ 0.0004,  0.0981, -0.0124,  0.0148,  0.1130,  0.0467,  0.0562,  0.0175,
         0.1192,  0.0415,  0.0687,  0.1489,  0.1416], grad_fn=<SelectBackward>) tensor([1.0059, 0.9044, 1.1225, 0.8951, 0.9719, 0.9127, 1.0057, 0.9269, 1.0342,
        1.0506, 0.9526, 0.9987, 0.9747], grad_fn=<SelectBackward>)


HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))

0.4032258064516129 tensor([0.1844, 0.1272, 0.0156, 0.0172, 0.1691, 0.0646, 0.0514, 0.0349, 0.1408,
        0.0224, 0.0987, 0.0858, 0.0673], grad_fn=<SelectBackward>) tensor([0.9032, 0.9864, 1.0281, 1.0764, 1.0618, 1.0223, 1.0122, 0.9944, 1.0167,
        1.0205, 0.9654, 0.9641, 0.8752], grad_fn=<SelectBackward>)


HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))

0.2661290322580645 tensor([ 0.0134,  0.1126,  0.0354, -0.0006,  0.0929, -0.0046,  0.0394,  0.1143,
        -0.0304,  0.2536,  0.0268,  0.0738,  0.0660], grad_fn=<SelectBackward>) tensor([0.9244, 0.9364, 1.0169, 0.9345, 0.9834, 1.0211, 1.0139, 1.0035, 1.0152,
        1.0014, 1.0290, 0.9841, 1.0786], grad_fn=<SelectBackward>)


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


(0.3888888888888889, 0.12962962962962962, 0.3333333333333333, 0.18666666666666668)
Wall time: 1min 39s
