In [11]:
import warnings
import pandas as pd
warnings.filterwarnings(action='ignore', category=UserWarning)
import gensim
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
from nltk.tokenize import RegexpTokenizer
from gensim.models import TfidfModel
from gensim.models.coherencemodel import CoherenceModel
from nltk.stem.snowball import SnowballStemmer
import re
from gensim.test.utils import datapath
import tunning_lda as tlda 
%matplotlib inline

regex_tokenizer = RegexpTokenizer(r'\w+')
stemmer = SnowballStemmer("english")
wn = WordNetLemmatizer()

## 1. Readding and cleaning the training data

This process is different depending on the input data, in this case, the data contained weird characters and some of the observations were error messages, among some other particularities. Because of that the cleaning process is done in the following lines intead of using the cleaning functions used in other parts of the project. 

In [2]:
# Reading training data
data = pd.read_csv('text_training.csv')

In [3]:
# These documents does not contain useful text
not_included = [642, 569, 567, 582, 595, 597, 611, 640, 636, 598]

In [4]:
# Created a dictionary that has a unique id as key and raw text as value
corpus_dic = {r['domain_url']:r['text'] for i, r in data.iterrows() if r['domain_url'] not in not_included}

In [5]:
# English stop words
sw = list(stopwords.words('english'))

In [6]:
# In these lines we tokenize the text, eliminate punctuation and convert all characters to lower
original_corpus = [regex_tokenizer.tokenize(doc) for doc in corpus_dic.values()]
original_corpus = [[t.lower() for t in doc if t not in sw] for doc in original_corpus]

In [7]:
# Then, we create the original vocabulary of the corpus
original_dictionary = gensim.corpora.Dictionary(original_corpus)
original_vocab = [original_dictionary[i] for i in range(len(original_dictionary))]

In [8]:
# Then, we define the words that are too short or contain rare characters or numbers.
short = [w for w in original_vocab if len(w) <3]
rare =[w for w in original_vocab if re.match("[\d=@}#;%`>*'{):~,+|!/_<?\\(.&-]", w)]
invalid = short+rare

In [9]:
# Finally, we create a clean corpus, dictionary and bag of words
corpus = [[t for t in doc if t not in invalid] for doc in original_corpus]
corpus = [[stemmer.stem(t) for t in doc] for doc in corpus]
dictionary = gensim.corpora.Dictionary(corpus)
bow = [dictionary.doc2bow(line) for line in corpus]

In [10]:
# We also create a TF-IDF bow, as an alternative to common bag of words
TFIDF = TfidfModel(bow)
tf_idf = [TFIDF[b] for b in bow]

## 2. Testing different models

We are using the LDA model, but this model could take different parameters, the most important is the number of topics, but we can also change the number of iterations and passses throgh the data and the decay rate.
The function called "dif_models" simplifies this process, you only have to provide the vectors of the documents considered, the dictionary created previously and a list of different values for every parameter. This funtion returns a data frame that summarizes the results of the different models, the best model (according to coherence) and the topics top terms of the best model.

The performance of every model is evaluated using log-perplexity (the lower, the better) and coherence measure (the higher, the better), these two mettrics are reported in the summary data frame.

In [13]:
# Lists that contains different values for the parameters of the LDA model.
n_list = [3, 5, 10, 20, 25]
iter_list = [200]
pass_list = [150]
decay_list = [0.7]

### 2.1 LDA model using simple bag of words

In [14]:
# Testing model with different parameters
results_df, best_model, best_topics = tlda.dif_models(bow, dictionary, n_list, iter_list, pass_list, decay_list)
results_df

Unnamed: 0,Number of Topics,Iterations,Passes,Decay,Perplexity,Coherence
1,5,200,150,0.7,-7.187546,-0.342533
0,3,200,150,0.7,-7.363833,-0.421302
4,25,200,150,0.7,-7.193414,-1.177357
2,10,200,150,0.7,-7.165929,-1.237508
3,20,200,150,0.7,-7.199592,-2.113676


In [15]:
# Exploring the content of each topic of the best model
tlda.print_topics(best_topics)

Topic  0
princeton, hackprinceton, travel, student, train, about, workshop, team, hardwar, faq
**************************************************************************************************
Topic  1
hackni, fellow, hackathon, student, startup, work, post, univers, new, the
**************************************************************************************************
Topic  2
hack, team, hackathon, event, sponsor, student, project, what, email, provid
**************************************************************************************************
Topic  3
univers, event, team, host, hackathon, learn, america, east, student, the
**************************************************************************************************
Topic  4
builtworld, compani, industri, construct, member, technolog, the, build, confer, built
**************************************************************************************************


### 2.3 LDA model using TF-IDF bag of words

In [16]:
tfidf_results_df, tfidf_best_model, tfidf_best_topics = tlda.dif_models(tf_idf, dictionary, n_list, iter_list, pass_list, decay_list)

  perwordbound, np.exp2(-perwordbound), len(chunk), corpus_words


In [17]:
tfidf_results_df

Unnamed: 0,Number of Topics,Iterations,Passes,Decay,Perplexity,Coherence
4,25,200,150,0.7,-28.976312,-8.674017
3,20,200,150,0.7,-24.678418,-10.647248
2,10,200,150,0.7,-13.859426,-18.34874
1,5,200,150,0.7,-12.10647,-22.725884
0,3,200,150,0.7,-10.9327,-23.616555


In [18]:
tlda.print_topics(tfidf_best_topics)

Topic  0
hackric, vcu, ramhack, citizen, rice, field, privaci, fill, virginia, commonwealth
**************************************************************************************************
Topic  1
cgs, techtogeth, boston, wayfair, shehack, prehack, aggani, beginn, host, women
**************************************************************************************************
Topic  2
stanciu, scherocman, synthes, swag, ste, tatyana, shestopalova, parekh, prev, theo
**************************************************************************************************
Topic  3
cypher, swem, gregg, snacki, earl, python, mari, alexa, mushroom, whereowar
**************************************************************************************************
Topic  4
revolutionuc, hackxx, cuhackit, demonhack, houston, coder, helloworld, cougarc, voyag, cloudflar
**************************************************************************************************
Topic  5
hacknc, slo, desk, front, chumas

### 2.3 Conclusions

After testing different models it is possible to decide which model is better, in this case the LDA model with 5 topics using simple bag of words is the one with best performance. Despite its performance, it is possible that the topics produced by the best model are not easy to interpret, it is possible to provide guidance to the model to make the traing process focus in some predefined terms and improve the interpretation of the topics. This is made in the following section.

## 3. Guided LDA model

The gensim LDA model allows us to create a matrix that give more weight to defined terms in certain topics. By doing this, we can guide the model and produce more choherence topics. 

In [52]:
'''
Defining importants terms and its topics. 
The key is the term and the value is the number of the topic where the term is assigned.
'''
apriori_terms = {
    'sponsoring': 2,
    'sponsors': 2,
    'sponsorship': 2,
    'sponsor': 2,
    'sponsored': 2,
    'sponsorships': 2,
    'promoter': 2,
    'benefactor': 2, 
    'funding':2,
    'aid':2,
    'organizer':2,
    'help':2,
    'support':2
}

#We use the same stemmer that we previously used in the cleaning process
apriori_terms = {stemmer.stem(k): v for k,v in apriori_terms.items()}

In [53]:
# Building a model with the parameters of the best model but including the 
bm = tlda.make_LDA(bow, dictionary, 5, apriori_terms)

In [54]:
# Obtaining the topics and computing the perplexity and coherence of that model
bm_topics, bm_perplexity, bm_coherence = tlda.evaluate_model(bm, bow, dictionary, 5)

In [55]:
tlda.print_topics(bm_topics)

Topic  0
princeton, hackprinceton, travel, student, train, about, workshop, team, hardwar, faq
**************************************************************************************************
Topic  1
hackni, fellow, hackathon, student, startup, work, post, univers, new, the
**************************************************************************************************
Topic  2
hack, team, hackathon, event, sponsor, student, project, what, email, provid
**************************************************************************************************
Topic  3
univers, event, team, host, hackathon, learn, america, east, student, the
**************************************************************************************************
Topic  4
builtworld, compani, industri, construct, member, technolog, the, build, confer, built
**************************************************************************************************


## 4. Saving the best model

In [56]:
# Saving the model and the dictionary to use it to evaluate fresh texts latter.
tlda.save_model(bm, 'best_model')
tlda.save_model(dictionary, 'best_dictionary')