In [34]:
import pandas as pd
import numpy as np

import nltk
import spacy

from gensim.models import Phrases
from gensim.models.phrases import Phraser 
from gensim.corpora import Dictionary
from gensim.models import LdaMulticore
from gensim.models import CoherenceModel

import pyLDAvis.gensim_models
import pyLDAvis

import warnings
warnings.filterwarnings('ignore')

In [53]:
df=pd.read_csv("frame2.csv")

In [54]:
df['words'] = df['words'].str.replace("'", "")

In [80]:
# Only take English transcripts
df = df[df.language == 'en']
df.language.value_counts()

language
en    398
Name: count, dtype: int64

### Clean text (get tri- and bi-grams, lemmatize, and take only selected POS)

In [56]:
# Build bigram and trigram Phrases objects
bigram_phrases = Phrases(df.words, min_count=10)
trigram_phrases = Phrases(bigram_phrases[df.words], min_count=5)

# Create Phraser model objects for faster processing
bigram_model = Phraser(bigram_phrases)
trigram_model = Phraser(trigram_phrases)
trigrams = [trigram_model[bigram_model[word.split()]] for word in df.words]

nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
allowed_postags = ['NOUN', 'ADJ', 'VERB', 'ADV']
lemmatized_words = []
for sent in trigrams:
    doc = nlp(" ".join(sent))
    lemmatized_words.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])

In [58]:
# Print the first 50 lemmatized words of the first row
print(lemmatized_words[0][:50])

['sledgehammer', 'call', 'name', 'thank', 'much', 'thank', 'thank', 'take', 'seat', 'thank', 'much', 'thank', 'come', 'man', 'let', 'ask', 'guy', 'man', 'go', 'keep', 'taking', 'confederate', 'statue', 'taking', 'temperature', 'room', 'know', 's', 'know', 'weird', 'start', 've', 'phenomenal', 'tour', 've', 've', 'meet', 'people', 'never', 'thought', 'meet', 'life', 'recently', 'meet', 'wild', 'think', 'want', 'meet', 'clear', 'fancy']


In [59]:
id2word = Dictionary(lemmatized_words)
id2word.filter_extremes(no_below=10, no_above=0.4)
id2word.compactify()
corpus = [id2word.doc2bow(word) for word in lemmatized_words]

### Create LDA model and print out topics

In [128]:
num_topics = 7
lda_model = LdaMulticore(corpus=corpus, 
                             id2word=id2word, 
                             num_topics=num_topics, 
                             random_state=1,
                             chunksize=30,
                             passes=40,
                             alpha=0.5,
                             eta=0.91,
                             eval_every=1,
                             per_word_topics=True,
                             workers=2)

In [129]:
lda_model.print_topics(7,num_words=10)

[(0,
  '0.005*"folk" + 0.005*"fart" + 0.004*"hat" + 0.004*"planet" + 0.003*"sport" + 0.003*"list" + 0.003*"television" + 0.003*"tit" + 0.003*"religion" + 0.003*"begin"'),
 (1,
  '0.008*"mate" + 0.007*"quite" + 0.005*"mum" + 0.005*"cheer" + 0.005*"lovely" + 0.004*"round" + 0.004*"film" + 0.004*"cock" + 0.004*"applause" + 0.004*"bloke"'),
 (2,
  '0.005*"police" + 0.005*"jail" + 0.005*"daddy" + 0.004*"folk" + 0.004*"weed" + 0.004*"president" + 0.004*"chicken" + 0.003*"btch" + 0.003*"goin" + 0.003*"rich"'),
 (3,
  '0.008*"fluffy" + 0.005*"rape" + 0.005*"freakin" + 0.004*"basket" + 0.004*"cake" + 0.004*"gift" + 0.004*"frankie" + 0.004*"snoop" + 0.004*"band" + 0.004*"freaking"'),
 (4,
  '0.012*"indian" + 0.007*"immigrant" + 0.007*"asian" + 0.006*"accent" + 0.006*"british" + 0.005*"racism" + 0.005*"german" + 0.005*"bro" + 0.005*"trevor" + 0.005*"snake"'),
 (5,
  '0.002*"awesome" + 0.002*"horrible" + 0.002*"husband" + 0.002*"cute" + 0.002*"ice" + 0.002*"coffee" + 0.002*"adult" + 0.002*"text" +

By looking at some of the key words we try to derive a topic:
- Topic 0 : "Culture"
- Topic 1 : "UK"
- Topic 2 : "Crimes"
- Topic 3 : "Situational"
- Topic 4 : "Immigrants"
- Topic 5 : "Relationships"
- Topic 6 : "Politics"

### Coherence Score

In [130]:
coherence_model_lda = CoherenceModel(model=lda_model, 
                                     texts=lemmatized_words, 
                                     dictionary=id2word, 
                                     coherence='c_v')
coherence_model_lda.get_coherence()

0.34477313969112583

### The pyLDAvis

In [132]:
# Convert LDA model to a format compatible with pyLDAvis
pyLDAvis.enable_notebook()
vis_data = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
vis_data

In [133]:
# Get a list of vectors of topic probabilities
topic_vecs = []
for i in range(len(df.words)):
    top_topics = lda_model.get_document_topics(corpus[i], minimum_probability=0.0)
    topic_vec = [top_topics[i][1] for i in range(num_topics)]
    topic_vecs.append(topic_vec)

In [134]:
# Add topic probabilities into main df. Create a new column for each topic.
topic_columns = ['Culture', 'UK', 'Crimes', 'Situational', 'Immigrants', 'Relationships', 'Politics']
LDA_probs = pd.DataFrame(data=topic_vecs, columns=topic_columns, index=df.index)
df = pd.concat([df, LDA_probs], axis=1)

In [136]:
# Saving the csv file
df.to_csv("frame3.csv", index=False)