In [2]:
import numpy as np
import spacy
import random
import pandas as pd
from tqdm import tqdm
from collections import Counter

In [3]:
# Load the spam/ham dataset from CSV; the 'text' column is the input to the
# tokenization and topic-sampling cells further down.
df = pd.read_csv("data/spam_ham_dataset.csv")

In [4]:
# Dirichlet smoothing hyperparameters, topic count and tokenizer.
# In the sampler, ALPHA is added to the per-document topic counts and BETA to
# the per-topic word counts when the topic probabilities are computed.
ALPHA = 0.1
BETA = 0.1
NUM_TOPICS = 20
# spaCy pipeline; the cells shown here use only its tokenizer and its default
# stop-word set.
sp = spacy.load("en_core_web_sm")

# Reproducibility: both generators are seeded because the sampler draws its
# initial topic assignments from numpy and its resampled topics from `random`.
np.random.seed(42)
random.seed(42)

In [10]:
def generate_frequencies(data, max_docs=10000):
    """Count lower-cased alphabetic, non-stop-word tokens in the first ``max_docs`` documents.

    Args:
        data: Sliceable sequence of raw document strings.
        max_docs: Upper bound on the number of documents read from ``data``.

    Returns:
        collections.Counter mapping lower-cased token text to its number of
        occurrences.
    """
    # Work on a copy: sp.Defaults.stop_words is shared pipeline state, and
    # adding the corpus-specific word in place would leak into every other
    # user of that set (and persist across re-runs of this function).
    stopwords = set(sp.Defaults.stop_words)
    stopwords.add("enron")

    freqs = Counter()
    for doc in data[:max_docs]:
        for token in sp.tokenizer(doc):
            token_text = token.text.lower()
            if token.is_alpha and token_text not in stopwords:
                freqs[token_text] += 1

    return freqs

def get_vocab(freqs, freq_threshold=3):
    """Build word<->index lookup tables for the sufficiently frequent words.

    Words are numbered consecutively from 0, following the iteration order of
    ``freqs``.

    Args:
        freqs: Mapping from word to its count.
        freq_threshold: Minimum count (inclusive) a word needs to be kept.

    Returns:
        Tuple ``(vocab, vocab_idx_str)`` where ``vocab`` maps word -> index and
        ``vocab_idx_str`` maps index -> word.
    """
    kept_words = [word for word, count in freqs.items() if count >= freq_threshold]
    vocab = {word: idx for idx, word in enumerate(kept_words)}
    vocab_idx_str = dict(enumerate(kept_words))
    return vocab, vocab_idx_str


def tokenize_dataset(data, vocab, max_docs=10000):
    """Tokenize documents, keep in-vocabulary tokens, and map them to vocabulary ids.

    Documents whose raw tokenization has at most one token are skipped. A
    document with more tokens but none of them in ``vocab`` is kept as an
    empty entry. The document and token totals are printed.

    Args:
        data: Sliceable sequence of raw document strings.
        vocab: Mapping from lower-cased token text to integer id.
        max_docs: Upper bound on the number of documents read from ``data``.

    Returns:
        Tuple ``(docs, corpus)``: ``docs`` is a list of token-string lists and
        ``corpus`` is the parallel list of integer numpy arrays of vocabulary ids.
    """
    nr_tokens = 0
    docs = []

    for raw_doc in data[:max_docs]:
        tokens = sp.tokenizer(raw_doc)
        if len(tokens) <= 1:
            continue

        # The filtered tokens get their own name so the raw document being
        # iterated over is not shadowed.
        lowered = (token.text.lower() for token in tokens)
        kept = [token_text for token_text in lowered if token_text in vocab]
        nr_tokens += len(kept)
        docs.append(kept)

    print(f"Number of Labels : {len(docs)}")
    print(f"Number of tokens: {nr_tokens}")

    # Numericalize. The explicit integer dtype matters for documents that end
    # up empty: np.array([]) would otherwise default to float64, which cannot
    # be used to index the count tables.
    corpus = [np.array([vocab[token] for token in doc], dtype=int) for doc in docs]

    return docs, corpus
    

In [11]:
# Inspect the loaded pipeline's default stop-word set (a Python set, so the
# printed order is arbitrary).
print(sp.Defaults.stop_words)

{'wherever', 'doing', 'nor', 'several', 'nowhere', 'up', 'give', 'made', 'last', 'me', 'they', 'in', 'seeming', 'someone', 'more', 'get', 'regarding', 'really', 'say', 'hereupon', 'unless', 'ours', 'something', 'seem', '’m', 'always', 'alone', 'can', 'go', 'serious', 'them', 'used', 'be', 'itself', 'whatever', 'sixty', "n't", 'he', 'upon', 'empty', 'with', 'themselves', 'throughout', 'top', 'using', 'might', 'hers', 'thru', 'five', 'been', 'during', 'indeed', 'noone', 'not', 'your', 'our', 'within', 'somewhere', 'thus', 'least', 'all', 'because', '’re', 'onto', 'thereby', 'twenty', 'else', 'has', 'meanwhile', 'four', 'latter', '‘ll', 'first', 'latterly', 'former', 'about', 'everywhere', 'though', 'hereby', 'here', 'being', 'herein', 'yours', 'n’t', 'among', 'ca', 'over', 'some', 'part', 'most', 'may', 'that', 'neither', 'yourselves', 'along', 'nobody', 'herself', '’ll', 'will', 'whose', 'must', 'somehow', 'back', '’s', 'until', 'one', 'keep', 'this', 'further', 'their', 'sometime', 'is

In [12]:
# Take a reproducible (random_state=42) 1% sample of the 'text' column.
data = df['text'].sample(frac=0.01, random_state=42).values
# Count tokens, keep the frequent ones as the vocabulary, then map every
# document to an array of vocabulary ids.
freqs = generate_frequencies(data)
vocab, vocab_idx_str = get_vocab(freqs)
docs, corpus = tokenize_dataset(data, vocab)
# vocab_size is read as a global by LDS_Collapsed_Gibbs, so this cell has to
# run before the sampler is called.
vocab_size = len(vocab)
print(f"VOcab size : {vocab_size}")

Number of Labels : 52
Number of tokens: 7092
VOcab size : 692


In [8]:
# Show the column names of the loaded dataset.
print(df.columns)

Index(['Unnamed: 0', 'label', 'text', 'label_num'], dtype='object')


In [13]:
def LDS_Collapsed_Gibbs(corpus, num_iter=200, num_topics=None, n_vocab=None,
                        alpha=None, beta=None):
    """Collapsed Gibbs sampler over per-token topic assignments.

    Every token starts with a uniformly random topic. Each sweep then visits
    every token, removes its current assignment from the count tables, draws a
    new topic with probability proportional to
    ``(ndk + alpha) * (nkw + beta) / (nk + beta * n_vocab)``, and adds the new
    assignment back.

    Args:
        corpus: Sequence of documents, each a sequence of integer word ids in
            ``[0, n_vocab)``.
        num_iter: Number of full sweeps over the corpus.
        num_topics: Number of topics; defaults to the global ``NUM_TOPICS``.
        n_vocab: Vocabulary size; defaults to the global ``vocab_size``.
        alpha: Smoothing added to document-topic counts; defaults to the
            global ``ALPHA``.
        beta: Smoothing added to topic-word counts; defaults to the global
            ``BETA``.

    Returns:
        Tuple ``(Z, ndk, nkw, nk)``: ``Z`` is a list with one integer array of
        topic assignments per document, ``ndk`` the (documents x topics) count
        table, ``nkw`` the (topics x words) count table, and ``nk`` the
        per-topic token totals.
    """
    # Fall back to the notebook-level configuration at call time, so existing
    # calls that pass only `corpus` behave exactly as before.
    if num_topics is None:
        num_topics = NUM_TOPICS
    if n_vocab is None:
        n_vocab = vocab_size
    if alpha is None:
        alpha = ALPHA
    if beta is None:
        beta = BETA

    num_docs = len(corpus)

    # Initialize Z with one random topic per token (one randint call per
    # document, in corpus order).
    Z = [np.random.randint(low=0, high=num_topics, size=(len(doc))) for doc in corpus]

    # Build the count tables from the initial assignments.
    ndk = np.zeros((num_docs, num_topics))
    nkw = np.zeros((num_topics, n_vocab))
    for doc_idx, doc in enumerate(corpus):
        ndk[doc_idx] = np.bincount(Z[doc_idx], minlength=num_topics)
        # The integer cast keeps empty documents stored as float arrays usable
        # as indices; add.at accumulates repeated (topic, word) pairs.
        word_ids = np.asarray(doc, dtype=np.intp)
        np.add.at(nkw, (Z[doc_idx], word_ids), 1)

    nk = np.sum(nkw, axis=1)
    topic_list = list(range(num_topics))
    # Loop-invariant part of the denominator.
    beta_total = beta * n_vocab

    for _ in tqdm(range(num_iter)):
        for doc_idx, doc in enumerate(corpus):
            z_doc = Z[doc_idx]
            for i in range(len(doc)):
                word = doc[i]
                topic = z_doc[i]

                # Remove z_i: the conditional is over all other assignments.
                ndk[doc_idx, topic] -= 1
                nkw[topic, word] -= 1
                nk[topic] -= 1

                # Unnormalized topic weights; random.choices normalizes them.
                p_z = (ndk[doc_idx, :] + alpha) * (nkw[:, word] + beta) / (nk + beta_total)
                topic = random.choices(topic_list, weights=p_z, k=1)[0]

                # Record the new assignment in Z and in the count tables.
                z_doc[i] = topic
                ndk[doc_idx, topic] += 1
                nkw[topic, word] += 1
                nk[topic] += 1

    return Z, ndk, nkw, nk

# Run the sampler with its default number of sweeps; this yields the per-token
# topic assignments and the three count tables.
Z, ndk, nkw, nk = LDS_Collapsed_Gibbs(corpus)




KeyboardInterrupt: 