# LDA with Gibbs Sampling

## Import Libraries

Libraries for data processing

In [27]:
import numpy as np
import spacy
spacy.load('en_core_web_sm')
from spacy.lang.en import English
import nltk
from nltk.corpus import reuters, wordnet as wn
from nltk.corpus import stopwords

Libraries for word embeddings

In [None]:
from gensim.models import Word2Vec

In [None]:
from LDA import LDA_word_embed

## Data Preprocessing

In [None]:
# Start from NLTK's English stop-word list.
stops = stopwords.words("english")
# Extend it in place with additional hand-picked stop words.
stops.extend([
    "a", "about", "above", "across", "after", "afterwards", "again", "against",
    "all", "almost", "alone", "along", "already", "also", "although", "always",
    "am", "among", "amongst", "amoungst", "amount", "an", "and", "another",
    "any", "anyhow", "anyone", "anything", "anyway", "anywhere", "are",
    "around", "as", "at", "back", "be", "became", "because", "become",
    "becomes", "becoming", "been", "before", "beforehand", "behind", "being",
    "below", "beside", "besides", "between", "beyond", "bill", "both",
    "bottom", "but", "by", "call", "can", "cannot", "cant", "co", "con",
    "could", "couldnt", "cry", "de", "describe", "detail", "do", "done",
    "down", "due", "during", "each", "eg", "eight", "either", "eleven", "else",
    "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone",
    "everything", "everywhere", "except", "few", "fifteen", "fifty", "fill",
    "find", "fire", "first", "five", "for", "former", "formerly", "forty",
    "found", "four", "from", "front", "full", "further", "get", "give", "go",
    "had", "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter",
    "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his",
    "how", "however", "hundred", "i", "ie", "if", "in", "inc", "indeed",
    "interest", "into", "is", "it", "its", "itself", "keep", "last", "latter",
    "latterly", "least", "less", "ltd", "made", "many", "may", "me",
    "meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly",
    "move", "much", "must", "my", "myself", "name", "namely", "neither",
    "never", "nevertheless", "next", "nine", "no", "nobody", "none", "noone",
    "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on",
    "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our",
    "ours", "ourselves", "out", "over", "own", "part", "per", "perhaps",
    "please", "put", "rather", "re", "same", "see", "seem", "seemed"
])

In [None]:
def get_lemma(word):
    """Return the WordNet base form of ``word``, or ``word`` itself as a fallback.

    The lookup is done with ``wn.morphy``; when it yields ``None`` the input
    is returned unchanged.
    """
    base_form = wn.morphy(word)
    return word if base_form is None else base_form

### Dataset 1: NLTK reuters
    A dataset of news articles; only the titles are used.

In [None]:
# Build the Reuters corpus: titles only, capped at `max_docs` documents.
# Title words are approximated by keeping only upper-case tokens
# (presumably headlines are upper-cased in this corpus -- confirm).
max_docs = 2000
dataset = []  # one list of lemmas per kept document
vocab = []    # flat list of every kept lemma (duplicates included)

i = 0  # number of documents kept so far
for file_id in reuters.fileids():
    lemmas = [
        get_lemma(w.lower())
        for w in reuters.words(file_id)
        if w.isupper() and w.lower() not in stops and not w.isnumeric()
    ]
    # Drop single-character tokens *before* the emptiness check, so that a
    # document left with no tokens is neither stored nor counted.
    doc = [t for t in lemmas if len(t) > 1]
    if not doc:
        continue

    dataset.append(doc)
    vocab += doc
    i += 1

    if i >= max_docs:
        break

In [None]:
# Report how many documents were collected for the first corpus.
print(len(dataset))
# Bare expression: shown as the cell output when run in a notebook.
dataset

### Dataset 2: dataset.csv
    a dataset of research paper titles

In [None]:
# Shared spaCy English pipeline used by tokenize().
parser = English()
def tokenize(text):
    """Split ``text`` into lower-cased tokens.

    Whitespace-only tokens are skipped, and any token that spaCy flags as
    URL-like is replaced by the literal placeholder ``'URL'``.
    """
    return [
        'URL' if tok.like_url else tok.lower_
        for tok in parser(text)
        if not tok.orth_.isspace()
    ]

In [None]:
def tokenize_text(text):
    """Tokenize ``text`` and return the lemmas of its informative tokens.

    A token is kept only if it is longer than four characters and is not in
    ``stops``; each kept token is then mapped through ``get_lemma``.
    """
    return [
        get_lemma(tok)
        for tok in tokenize(text)
        if len(tok) > 4 and tok not in stops
    ]

In [None]:
# Build the second corpus: one token list per line of dataset.csv.
dataset2 = []  # one list of lemmas per input line
vocab2 = []    # flat list of every kept lemma (duplicates included)
with open('dataset.csv') as csv_file:
    for raw_line in csv_file:
        doc_tokens = tokenize_text(raw_line)
        dataset2.append(doc_tokens)
        vocab2.extend(doc_tokens)

In [None]:
# Report how many documents (input lines) were read for the second corpus.
print(len(dataset2))
# Bare expression: shown as the cell output when run in a notebook.
dataset2

## Word Embeddings

In [None]:
# Train one Word2Vec model per corpus, each on its own documents.
# min_count=1 keeps every word in the vocabulary, so each token of the
# training corpus can later be looked up in the model's `wv`.
model_dataset = Word2Vec(dataset, vector_size=100, window=5, min_count=1, workers=4)
# Must be trained on dataset2 (not dataset): its vectors are looked up for
# the words of dataset2, which would otherwise be out of vocabulary.
model_dataset2 = Word2Vec(dataset2, vector_size=100, window=5, min_count=1, workers=4)

In [None]:
# For each corpus, replace every token by its vector from the matching model:
# the result is a list (per document) of lists (per token) of vectors.
embeddings = [
    [model_dataset.wv[token] for token in tokens]
    for tokens in dataset
]
embeddings2 = [
    [model_dataset2.wv[token] for token in tokens]
    for tokens in dataset2
]

In [None]:
# Sanity check: print the 10 nearest neighbours of 'computer' in the model
# trained on `dataset`.
# NOTE(review): presumably raises KeyError if 'computer' is not in that
# model's vocabulary -- confirm the word occurs in the corpus.
print(model_dataset.wv.most_similar('computer',topn=10))