In [1]:
import numpy as np
import pandas as pd
import time
import sys
import os
import shutil

import torch
import torch.nn as nn
from torch.utils.data import DataLoader

from dataloader import get_data
from models import TransformerClassifier

In [2]:
ls

Debug Models.ipynb        bert_dataloader.py        models.py
Evaluate.ipynb            bert_evaluate.py          multi_fake_pickle.p
Huggingface Bert.ipynb    bert_models.py            multi_wiki_pickle.p
Model Loading.ipynb       bert_train.py             [1m[34mprecomputed[m[m/
Multi Head model.ipynb    bert_train_multi_head.py  train.py
Test dataloader.ipynb     dataloader.py             [1m[34mwiki[m[m/
[1m[34m__pycache__[m[m/              fake_eval.txt


In [3]:
vocab, data_dict = get_data()

In [4]:
wiki_data, fake_data = data_dict['wiki'], data_dict['fake news']

In [5]:
class NaiveBayes(nn.Module):
    def __init__(self, vocab, num_labels, train_data, alpha=0.001):
        super(NaiveBayes, self).__init__()
        self.vocab_len = len(vocab)
        self.classes = num_labels
        self.p_class = np.zeros(self.classes)
        self.p_vocab = alpha * np.ones((self.classes, self.vocab_len))
        for (x, y) in train_data:
            self.p_class[y] += 1
            for i in x:
                if (i == 0): break # 0 padding
                self.p_vocab[y, i] += 1
        self.p_class /= np.sum(self.p_class)
        self.p_vocab = (self.p_vocab.T / np.sum(self.p_vocab, axis=1)).T
        
    def forward(self, src):
        log_probs = np.log(self.p_class)
        for i in src:
            if (i == 0): break
            log_probs += np.log(self.p_vocab[:,i])
        return np.argmax(log_probs)

In [6]:
nb_model = NaiveBayes(vocab, wiki_data.num_labels(), wiki_data)

In [7]:
nb_model([1, 2, 3 ,4])

1

In [26]:
# split dataset into train, validation, and test
def split_dataset(dataset, train_size, val_size, test_size):
    return torch.utils.data.random_split(dataset, [train_size, val_size, test_size])

def evaluate(model, val_dataset):
    n = len(val_dataset)
    pred, true = np.zeros(n, dtype=int), np.zeros(n, dtype=int)
    for i, (x, y) in enumerate(val_dataset):
        true[i] = y
        pred[i] = model(x)
    print("accuracy: ", round(np.mean(pred == true), 4))
    class_scores = [np.mean(pred[true==i] == true[true==i]) for i in range(np.max(true)+1)]
    print("class-wise accuracies: ", class_scores)
    return np.mean(pred == true)

In [27]:
def run_model_on_dataset(model_class, dataset):
    n = len(dataset)
    n_train, n_val, n_test = n - 2*int(0.15*n), int(0.15*n), int(0.15*n)
    train, val, test = split_dataset(dataset, n_train, n_val, n_test)
    best_model, best_score = None, 0.0
    for alpha in [0.01, 0.05, 0.1, 0.5, 1, 3, 6, 10]:
        nb_model = model_class(vocab, dataset.num_labels(), train, alpha=alpha)
        print("evaluate model, alpha = ", alpha)
        score = evaluate(nb_model, val)
        if (score > best_score):
            best_model = nb_model
            best_score = score
    print("best model, evaluated on final test dataset")
    return evaluate(nb_model, test)

In [28]:
run_model_on_dataset(NaiveBayes, wiki_data)

evaluate model, alpha =  0.01
accuracy:  0.8514
class-wise accuracies:  [0.8589156546552525, 0.7933673469387755]
evaluate model, alpha =  0.05
accuracy:  0.8547
class-wise accuracies:  [0.8591147388678745, 0.8204081632653061]
evaluate model, alpha =  0.1
accuracy:  0.8683
class-wise accuracies:  [0.8769659565996416, 0.801530612244898]
evaluate model, alpha =  0.5
accuracy:  0.9218
class-wise accuracies:  [0.9688765014267702, 0.560204081632653]
evaluate model, alpha =  1
accuracy:  0.9172
class-wise accuracies:  [0.9913730174530493, 0.3469387755102041]
evaluate model, alpha =  3
accuracy:  0.8966
class-wise accuracies:  [0.9993363859579268, 0.10714285714285714]
evaluate model, alpha =  6
accuracy:  0.8889
class-wise accuracies:  [0.9998009157873781, 0.036224489795918365]
evaluate model, alpha =  10
accuracy:  0.8873
class-wise accuracies:  [1.0, 0.020918367346938777]
best model, evaluated on final test dataset
accuracy:  0.8837
class-wise accuracies:  [0.999933368869936, 0.0202869866402

0.8836690351752892

In [29]:
run_model_on_dataset(NaiveBayes, fake_data)

evaluate model, alpha =  0.01
accuracy:  0.5635
class-wise accuracies:  [0.519542772861357, 0.540952380952381, 0.6992481203007519, 0.7468454258675079]
evaluate model, alpha =  0.05
accuracy:  0.5796
class-wise accuracies:  [0.5444321533923304, 0.5219047619047619, 0.631578947368421, 0.748422712933754]
evaluate model, alpha =  0.1
accuracy:  0.6095
class-wise accuracies:  [0.5877581120943953, 0.49714285714285716, 0.6015037593984962, 0.75]
evaluate model, alpha =  0.5
accuracy:  0.7216
class-wise accuracies:  [0.7771017699115044, 0.3314285714285714, 0.09774436090225563, 0.7113564668769716]
evaluate model, alpha =  1
accuracy:  0.7611
class-wise accuracies:  [0.8724188790560472, 0.14476190476190476, 0.0, 0.6198738170347003]
evaluate model, alpha =  3
accuracy:  0.7615
class-wise accuracies:  [0.9917035398230089, 0.005714285714285714, 0.0, 0.1695583596214511]
evaluate model, alpha =  6
accuracy:  0.7384
class-wise accuracies:  [1.0, 0.005714285714285714, 0.0, 0.0]
evaluate model, alpha =  1

0.7336054421768707

In [30]:
# slightly more sophisticated model
class NaiveBayesBigram(nn.Module):
    def __init__(self, vocab, num_labels, train_data, alpha=0.001):
        super(NaiveBayesBigram, self).__init__()
        self.vocab_len = len(vocab)
        self.classes = num_labels
        self.p_class = np.zeros(self.classes)
        self.p_vocab = {y:{} for y in np.arange(self.classes)} # dict of dicts: second key is bigram tuple
        self.class_sum = np.zeros(self.classes) # NOT THE SAME AS P_CLASS
        for (x, y) in train_data:
            self.p_class[y] += 1
            for i in range(len(x)):
                if i+1 == len(x) or x[i+1] == 0: break # 0 padding
                if (x[i], x[i+1]) not in self.p_vocab[y]: 
                    self.p_vocab[y][(x[i], x[i+1])] = 0
                self.p_vocab[y][(x[i], x[i+1])] += 1
                self.class_sum[y] += 1
        for y, d in self.p_vocab.items():
            for bigram in d.keys():
                d[bigram] = (d[bigram] + alpha) / (self.class_sum[y] + self.vocab_len**2 * alpha)
        self.p_class /= np.sum(self.p_class)
        self.no_show = alpha * np.ones(self.classes) / (self.class_sum + self.vocab_len**2 * alpha)
        
    def forward(self, src):
        log_probs = np.log(self.p_class)
        for i in range(len(src)):
            if i+1 == len(src) or src[i+1] == 0: break # 0 padding
            bigram = (src[i], src[i+1])
            scores = [self.p_vocab[y][bigram] if bigram in self.p_vocab[y] else self.no_show[y] \
                      for y in range(self.classes)]
            log_probs += np.log(scores)
        return np.argmax(log_probs)

In [31]:
run_model_on_dataset(NaiveBayesBigram, wiki_data)

evaluate model, alpha =  0.01
accuracy:  0.9088
class-wise accuracies:  [0.9977532544769708, 0.19883966244725737]
evaluate model, alpha =  0.05
accuracy:  0.9062
class-wise accuracies:  [0.9986783849864534, 0.16772151898734178]
evaluate model, alpha =  0.1
accuracy:  0.9045
class-wise accuracies:  [0.9988105464878081, 0.1518987341772152]
evaluate model, alpha =  0.5
accuracy:  0.9008
class-wise accuracies:  [0.9996695962466133, 0.11128691983122363]
evaluate model, alpha =  1
accuracy:  0.8993
class-wise accuracies:  [0.9998678384986454, 0.09651898734177215]
evaluate model, alpha =  3
accuracy:  0.8968
class-wise accuracies:  [1.0, 0.07278481012658228]
evaluate model, alpha =  6
accuracy:  0.8952
class-wise accuracies:  [1.0, 0.058544303797468354]
evaluate model, alpha =  10
accuracy:  0.8943
class-wise accuracies:  [1.0, 0.05063291139240506]
best model, evaluated on final test dataset
accuracy:  0.8948
class-wise accuracies:  [1.0, 0.07057602490918526]


0.8948264724881085

In [39]:
run_model_on_dataset(NaiveBayesBigram, fake_data)

evaluate model, alpha =  0.01
accuracy:  0.8018
class-wise accuracies:  [0.9987049028677151, 0.1297709923664122, 0.015384615384615385, 0.3292021688613478]
evaluate model, alpha =  0.05
accuracy:  0.7985
class-wise accuracies:  [0.9985198889916743, 0.11068702290076336, 0.015384615384615385, 0.31913245546088304]
evaluate model, alpha =  0.1
accuracy:  0.7965
class-wise accuracies:  [0.9987049028677151, 0.09541984732824428, 0.007692307692307693, 0.313710302091402]
evaluate model, alpha =  0.5
accuracy:  0.7854
class-wise accuracies:  [0.9994449583718779, 0.04198473282442748, 0.0, 0.2703330751355538]
evaluate model, alpha =  1
accuracy:  0.7739
class-wise accuracies:  [0.9994449583718779, 0.02099236641221374, 0.0, 0.21301316808675447]
evaluate model, alpha =  3
accuracy:  0.7544
class-wise accuracies:  [0.9994449583718779, 0.0019083969465648854, 0.0, 0.10999225406661503]
evaluate model, alpha =  6
accuracy:  0.7464
class-wise accuracies:  [0.9994449583718779, 0.0019083969465648854, 0.0, 0.

0.7330612244897959

In [None]:
# OLD NAIVE BAYES - VERY BAD, DONT USE (runs out of memory)
"""
# slightly more sophisticated model
class NaiveBayesBigram(nn.Module):
    def __init__(self, vocab, num_labels, train_data, alpha=0.001):
        super(NaiveBayesBigram, self).__init__()
        self.vocab_len = len(vocab)
        self.classes = num_labels
        self.p_class = np.zeros(self.classes)
        self.p_vocab = alpha * np.ones((self.classes, self.vocab_len, self.vocab_len))
        print("start training")
        for (x, y) in train_data:
            self.p_class[y] += 1
            for i in range(len(x)):
                if i == 0: continue # skip first one
                if (x[i+1] == 0): break # 0 padding
                self.p_vocab[y, x[i], x[i+1]] += 1
        print("done with training")
        self.p_class /= np.sum(self.p_class)
        for i in range(self.p_vocab.shape[0]):
            self.p_vocab[i] = self.p_vocab[i] / np.sum(self.p_vocab[i])
        
    def forward(self, src):
        log_probs = np.log(self.p_class)
        for i in range(len(src)):
            if (i == 0): continue
            if (src[i+1] == 0): break
            log_probs += np.log(self.p_vocab[:, src[i], src[i+1]])
        return np.argmax(log_probs)
"""

# Data analysis

In [33]:
import pandas as pd
# wiki dataset
cutoff = 0.3
comment_df = pd.read_csv("../data/attack_annotated_comments.tsv", sep ='\t')
comment_df["comment"] = comment_df["comment"].apply(lambda x: x.replace("NEWLINE_TOKEN", " "))
comment_df["comment"] = comment_df["comment"].apply(lambda x: x.replace("TAB_TOKEN", " "))
annotation_df = pd.read_csv("../data/attack_annotations.tsv",  sep='\t')
annotation_df = (annotation_df.groupby("rev_id")["attack"].mean() > cutoff)
annotation_df = annotation_df.to_frame().reset_index()
final_df = pd.merge(comment_df, annotation_df, how='inner', on=['rev_id'])

In [34]:
for i, row in enumerate(final_df[final_df['attack'] == True]['comment']):
    if len(row) < 100:
        print(row)
        print()
    if i > 10: break

  Iraq is not good  ===  ===  USA is bad   

Anon  :What the heck are you talking about? This is an encyclopedia, not a book store. 

i have a dick, its bigger than yours! hahaha



In [35]:
for i, row in enumerate(final_df[final_df['attack'] == False]['comment']):
    if len(row) < 100:
        print(row)
        print()
    if i > 10: break

This page will need disambiguation. 



In [36]:
# fake news dataset
body_df = pd.read_csv("../data/fake_news_bodies.csv")
stance_df = pd.read_csv("../data/fake_news_stances.csv")

In [38]:
print(len(stance_df))
print(len(comment_df))

49972
115864
