In [2]:
import numpy as np
import pandas as pd
import time
import sys
import os
import shutil

import torch
import torch.nn as nn
from torch.utils.data import DataLoader

from dataloader import get_data
from models import TransformerClassifier

In [3]:
ls

Debug Models.ipynb     dataloader.py          [1m[34mtrained_fake[m[m/
Test dataloader.ipynb  models.py              [1m[34mtrained_wiki[m[m/
[1m[34m__pycache__[m[m/           train.py


In [3]:
vocab, data_dict = get_data()

In [4]:
wiki_data, fake_data = data_dict['wiki'], data_dict['fake news']

In [6]:
class NaiveBayes(nn.Module):
    def __init__(self, vocab, num_labels, train_data, alpha=0.001):
        super(NaiveBayes, self).__init__()
        self.vocab_len = len(vocab)
        self.classes = num_labels
        self.p_class = np.zeros(self.classes)
        self.p_vocab = alpha * np.ones((self.classes, self.vocab_len))
        for (x, y) in train_data:
            self.p_class[y] += 1
            for i in x:
                if (i == 0): break # 0 padding
                self.p_vocab[y, i] += 1
        self.p_class /= np.sum(self.p_class)
        self.p_vocab = (self.p_vocab.T / np.sum(self.p_vocab, axis=1)).T
        
    def forward(self, src):
        log_probs = np.log(self.p_class)
        for i in src:
            if (i == 0): break
            log_probs += np.log(self.p_vocab[:,i])
        return np.argmax(log_probs)

In [7]:
nb_model = NaiveBayes(vocab, wiki_data.num_labels(), wiki_data)

In [None]:
nb_model([1, 2, 3 ,4])

1

In [5]:
# split dataset into train, validation, and test
def split_dataset(dataset, train_size, val_size, test_size):
    return torch.utils.data.random_split(dataset, [train_size, val_size, test_size])

def evaluate(model, val_dataset):
    n = len(val_dataset)
    pred, true = np.zeros(n), np.zeros(n)
    for i, (x, y) in enumerate(val_dataset):
        true[i] = y
        pred[i] = model(x)
    print("score: ", round(np.mean(pred == true), 4))
    return np.mean(pred == true)

In [6]:
def run_model_on_dataset(model_class, dataset):
    n = len(dataset)
    n_train, n_val, n_test = n - 2*int(0.15*n), int(0.15*n), int(0.15*n)
    train, val, test = split_dataset(dataset, n_train, n_val, n_test)
    best_model, best_score = None, 0.0
    for alpha in [0.01, 0.05, 0.1, 0.5, 1, 3, 6, 10]:
        nb_model = model_class(vocab, dataset.num_labels(), train, alpha=alpha)
        print("evaluate model, alpha = ", alpha)
        score = evaluate(nb_model, val)
        if (score > best_score):
            best_model = nb_model
            best_score = score
    print("best model, evaluated on final test dataset")
    return evaluate(nb_model, test)

In [None]:
run_model_on_dataset(NaiveBayes, wiki_data)

evaluate model, alpha =  0.01
score:  0.8575
evaluate model, alpha =  0.05
score:  0.8578
evaluate model, alpha =  0.1
score:  0.8699
evaluate model, alpha =  0.5
score:  0.9224
evaluate model, alpha =  1
score:  0.917
evaluate model, alpha =  3
score:  0.8975
evaluate model, alpha =  6
score:  0.8918
evaluate model, alpha =  10
score:  0.8893
best model, evaluated on final test dataset
score:  0.886


0.885959245992131

In [None]:
run_model_on_dataset(NaiveBayes, fake_data)

evaluate model, alpha =  0.01
score:  0.5703
evaluate model, alpha =  0.05
score:  0.5833
evaluate model, alpha =  0.1
score:  0.5999
evaluate model, alpha =  0.5
score:  0.723
evaluate model, alpha =  1
score:  0.7574
evaluate model, alpha =  3
score:  0.7611
evaluate model, alpha =  6
score:  0.7404
evaluate model, alpha =  10
score:  0.7404
best model, evaluated on final test dataset
score:  0.7283


0.7282993197278912

In [7]:
# slightly more sophisticated model
class NaiveBayesBigram(nn.Module):
    def __init__(self, vocab, num_labels, train_data, alpha=0.001):
        super(NaiveBayesBigram, self).__init__()
        self.vocab_len = len(vocab)
        self.classes = num_labels
        self.p_class = np.zeros(self.classes)
        self.p_vocab = alpha * np.ones((self.classes, self.vocab_len, self.vocab_len))
        for (x, y) in train_data:
            self.p_class[y] += 1
            for i in range(len(x)):
                if i == 0: continue # skip first one
                if (x[i+1] == 0): break # 0 padding
                self.p_vocab[y, x[i], x[i+1]] += 1
        self.p_class /= np.sum(self.p_class)
        for i in range(self.p_vocab.shape[0]):
            self.p_vocab[i] = self.p_vocab[i] / np.sum(self.p_vocab[i])
        
    def forward(self, src):
        log_probs = np.log(self.p_class)
        for i in range(len(src)):
            if (i == 0): continue
            if (src[i+1] == 0): break
            log_probs += np.log(self.p_vocab[:, src[i], src[i+1]])
        return np.argmax(log_probs)

In [None]:
run_model_on_dataset(NaiveBayesBigram, wiki_data)