# Topic modelling and entity extraction

In [73]:
# Usual imports
import numpy as np
import pandas as pd
import string
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE
import concurrent.futures
import time
import pyLDAvis.sklearn
from pylab import bone, pcolor, colorbar, plot, show, rcParams, savefig
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline
import os

# Plotly based imports for visualization
from plotly import tools
import plotly.plotly as py
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.figure_factory as ff

# spaCy based imports
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
# !python -m spacy download en_core_web_lg

ModuleNotFoundError: No module named 'gensim'

In [53]:
# --- Notebook configuration -------------------------------------------------

# When True, the question columns are cleaned/tokenized with spaCy before use
CLEAN_AND_TOKENIZE = True

# Number of topics requested from the LDA model
NUM_TOPICS = 10

# Input file and the number of rows read from it
INPUT_FILE = 'train.csv'
SAMPLE_SIZE = 50000

# Data is read from and features are written to the same sibling directory
INPUT_PATH = OUT_PATH = os.path.join(os.pardir, 'Datasets')

In [54]:
# Load the first SAMPLE_SIZE rows and index them by the pair `id`
df = (
    pd.read_csv(os.path.join(INPUT_PATH, INPUT_FILE), nrows=SAMPLE_SIZE)
    .set_index('id')
    # Fill only the two text columns: a blanket fillna would also write the
    # placeholder string into any numeric column that happens to contain NaN.
    .fillna({'question1': 'Empty question', 'question2': 'Empty question'})
)
df.head()

Unnamed: 0_level_0,qid1,qid2,question1,question2,is_duplicate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [57]:
# ASCII punctuation characters (a single string) and spaCy's English stop words
punctuations = string.punctuation
stopwords = list(STOP_WORDS)
# Frozen copy of `stopwords` so each membership test is O(1) instead of a list
# scan; re-run this cell if `stopwords` is modified afterwards.
_stopword_set = frozenset(stopwords)
# Bare `English` language object used to split sentences into tokens
parser = English()


def spacy_tokenizer(sentence):
    """Normalise a sentence into a space-separated string of kept tokens.

    Each token is replaced by its lower-cased, stripped lemma (or by its
    lower-cased text when the lemma is the ``"-PRON-"`` placeholder), then
    dropped if it is a stop word or is found in ``punctuations``.

    Parameters
    ----------
    sentence : str
        Raw text to clean.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    kept = []
    for word in parser(sentence):
        if word.lemma_ != "-PRON-":
            token = word.lemma_.lower().strip()
        else:
            token = word.lower_
        # `punctuations` is a str, so `in` is a substring test; as a side
        # effect an empty token ('') is always discarded here.
        if token not in _stopword_set and token not in punctuations:
            kept.append(token)
    return " ".join(kept)

In [58]:
if CLEAN_AND_TOKENIZE:
    # Overwrite each question column with its cleaned, space-joined tokens
    for question_col in ('question1', 'question2'):
        df[question_col] = df[question_col].apply(spacy_tokenizer)

## Entity Extraction

In [59]:
# Load spaCy's `en_core_web_lg` pipeline; the model has to be installed
# beforehand, e.g. with `python -m spacy download en_core_web_lg`.
nlp = spacy.load('en_core_web_lg')

Single example:

In [62]:
# Run the pipeline on the question1 text of pair id 98 and highlight its entities
ex = nlp(df.loc[98, 'question1'])
spacy.displacy.render(ex, style='ent', jupyter=True)

Full dataset:

In [63]:
def get_entities(x):
    """Count the named entities of each type found in every text of a Series.

    Each value of ``x`` is cleaned with ``spacy_tokenizer`` and then run
    through the global ``nlp`` pipeline with the dependency parser disabled.

    Parameters
    ----------
    x : pandas.Series
        Texts to analyse.

    Returns
    -------
    pandas.DataFrame
        Integer counts with the same index as ``x`` and one column per entity
        type, named ``<TYPE>_COUNT``. Entity labels that are not listed in
        ``entity_types`` are ignored.
    """
    entity_types = (
        'PERSON',
        'NORP',
        'FAC',
        'ORG',
        'GPE',
        'LOC',
        'PRODUCT',
        'EVENT',
        'WORK_OF_ART',
        'LAW',
        'LANGUAGE',
        'DATE',
        'TIME',
        'PERCENT',
        'MONEY',
        'QUANTITY',
        'ORDINAL',
        'CARDINAL'
    )
    column_of = {label: pos for pos, label in enumerate(entity_types)}

    # Accumulate in a plain array: positional writes are much cheaper than
    # label-based `.loc` updates and stay correct with duplicate index labels.
    counts = np.zeros((len(x), len(entity_types)), dtype=int)

    # Report progress roughly every 10% of the rows. Using the row position
    # (not the index label) and integer division makes this work for any
    # index and any length.
    progress_step = max(len(x) // 10, 1)

    # `Series.items()` replaces `iteritems()`, which was removed in pandas 2.0
    for row, (_, val) in enumerate(x.items()):
        if row % progress_step == 0:
            print('Entry {}/{}'.format(row, len(x)))
        doc = nlp(spacy_tokenizer(val), disable=['parser'])
        for ent in doc.ents:
            col = column_of.get(ent.label_)
            if col is not None:
                counts[row, col] += 1

    return pd.DataFrame(
        data=counts,
        columns=[label + '_COUNT' for label in entity_types],
        index=x.index
    )

In [64]:
# Entity counts for each question column, then their difference per pair
ents_df_1, ents_df_2 = (
    get_entities(df[question_col]) for question_col in ('question1', 'question2')
)
ents_diff = ents_df_1.sub(ents_df_2)
assert len(ents_diff) == ents_diff.index.nunique(), 'Index not unique'
ents_diff.to_csv(os.path.join(OUT_PATH, 'FEATURE_entity_counts.csv'))

Entry 0/50000
Entry 5000/50000
Entry 10000/50000
Entry 15000/50000
Entry 20000/50000
Entry 25000/50000
Entry 30000/50000
Entry 35000/50000
Entry 40000/50000
Entry 45000/50000
Entry 0/50000
Entry 5000/50000
Entry 10000/50000
Entry 15000/50000
Entry 20000/50000
Entry 25000/50000
Entry 30000/50000
Entry 35000/50000
Entry 40000/50000
Entry 45000/50000


## Parts of Speech tagging

In [65]:
# # POS tagging
# for i in nlp(review):
#     print(i,"=>",i.pos_)

## Topic-modelling

In [66]:
# Bag-of-words vectorizer, fitted on both question columns together so that
# question1 and question2 share a single vocabulary.
vectorizer = CountVectorizer(
    min_df=5,
    max_df=0.9,
    stop_words='english',
    lowercase=True,
    # Raw string: '\-' is not a valid Python string escape, so the non-raw
    # form triggers an invalid-escape warning. The regex itself is unchanged:
    # tokens are runs of at least three letters/hyphens.
    token_pattern=r'[a-zA-Z\-][a-zA-Z\-]{2,}'
)
corpus = pd.concat([df['question1'], df['question2']])
vectorized_corpus = vectorizer.fit_transform(corpus)
q1_vectorized = vectorizer.transform(df['question1'])
q2_vectorized = vectorizer.transform(df['question2'])

In [67]:
# Fit the LDA topic model on the combined corpus, then project each question
# column onto the fitted topics.
lda_params = dict(
    n_components=NUM_TOPICS,
    max_iter=10,
    verbose=True,
    random_state=0,
)
lda = LatentDirichletAllocation(**lda_params)
lda.fit(vectorized_corpus)
q1_lda, q2_lda = (lda.transform(matrix) for matrix in (q1_vectorized, q2_vectorized))

iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10


In [68]:
# Element-wise product of the two questions' LDA outputs, one column per topic
topic_similarity = np.multiply(q1_lda, q2_lda)
topic_columns = [
    'PROB_BOTH_SHARE_TOPIC_' + str(i) for i in range(topic_similarity.shape[1])
]
topic_df = pd.DataFrame(topic_similarity, index=df.index, columns=topic_columns)
topic_df.to_csv(os.path.join(OUT_PATH, 'FEATURE_topic_sharing.csv'))

In [69]:
# Preview the first rows of the topic-sharing features
topic_df.head()

Unnamed: 0_level_0,PROB_BOTH_SHARE_TOPIC_0,PROB_BOTH_SHARE_TOPIC_1,PROB_BOTH_SHARE_TOPIC_2,PROB_BOTH_SHARE_TOPIC_3,PROB_BOTH_SHARE_TOPIC_4,PROB_BOTH_SHARE_TOPIC_5,PROB_BOTH_SHARE_TOPIC_6,PROB_BOTH_SHARE_TOPIC_7,PROB_BOTH_SHARE_TOPIC_8,PROB_BOTH_SHARE_TOPIC_9
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0.000179,0.000179,0.000179,0.000179,0.000179,0.001964,0.664464,0.000179,0.000179,0.000179
1,0.000556,0.000556,0.000556,0.000556,0.011667,0.000556,0.128333,0.006111,0.000556,0.006111
2,0.000286,0.000286,0.034571,0.000286,0.000286,0.305993,0.000286,0.000286,0.000286,0.003143
3,0.0005,0.0205,0.0005,0.0005,0.0005,0.0005,0.0155,0.0005,0.0005,0.0005
4,0.00025,0.030249,0.00025,0.395254,0.00025,0.00025,0.00025,0.00275,0.00025,0.00025


In [70]:
# Functions for printing keywords for each topic
def selected_topics(model, vectorizer, top_n=10):
    """Print the ``top_n`` highest-weighted terms of every topic in ``model``.

    Parameters
    ----------
    model : fitted topic model
        Must expose ``components_``, iterated as one weight vector per topic.
    vectorizer : fitted vectorizer
        Supplies the vocabulary through ``get_feature_names_out()`` (or the
        older ``get_feature_names()`` when the former is not available).
    top_n : int, default 10
        Number of terms printed per topic.

    Returns
    -------
    None
        For each topic a ``Topic <idx>:`` line is printed, followed by a list
        of ``(term, weight)`` tuples sorted by decreasing weight.
    """
    # Fetch the vocabulary once instead of once per printed term, and prefer
    # the current scikit-learn accessor (`get_feature_names` was removed in 1.2).
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        # Indices of the top_n largest weights, largest first
        top_indices = topic.argsort()[:-top_n - 1:-1]
        # Cast to built-in str/float so the printed tuples look the same
        # whatever NumPy scalar types the model and vectorizer return.
        print([(str(feature_names[i]), float(topic[i])) for i in top_indices])

In [71]:
# Print the top keywords, with their weights, of each topic found by the
# Latent Dirichlet Allocation model
print("LDA Model:")
selected_topics(lda, vectorizer)

LDA Model:
Topic 0:
[('good', 3409.307142278148), ('way', 3217.075125788998), ('learn', 1720.3605737193564), ('start', 1543.3585665382416), ('want', 1416.0468971222376), ('feel', 1270.5732357857507), ('buy', 1001.1150715571399), ('girl', 928.3349284319263), ('person', 859.8524973607471), ('stop', 849.3465175210454)]
Topic 1:
[('like', 3314.766080513468), ('job', 1392.9958490821577), ('live', 897.6805499693997), ('help', 721.3931393154149), ('place', 693.8916388329501), ('math', 642.4429974876906), ('video', 613.3276711712149), ('google', 608.6539645148972), ('software', 565.3066171682856), ('bad', 546.3823265472535)]
Topic 2:
[('time', 2099.822364160718), ('year', 1864.8161738274844), ('difference', 1837.2759309418432), ('doe', 1712.6513252080438), ('mean', 1553.7560450325666), ('book', 1444.164188681905), ('love', 1132.180040813065), ('old', 919.8394864817185), ('improve', 863.266936301231), ('woman', 859.5473070510342)]
Topic 3:
[('money', 1773.1930470195214), ('movie', 1305.42528512

Each list above shows the highest-weighted keywords of one topic. For a given question, the index with the largest value in its topic distribution represents the most dominant topic for that question.


# Visualizing LDA results with pyLDAvis

In [72]:
# Enable inline rendering of pyLDAvis output in the notebook
pyLDAvis.enable_notebook()
# NOTE(review): only the question1 matrix is passed here although `lda` was
# fitted on both question columns combined - confirm this is intended.
# `mds='tsne'` presumably selects t-SNE for laying out the topics - verify
# against the pyLDAvis documentation.
dash = pyLDAvis.sklearn.prepare(lda, q1_vectorized, vectorizer, mds='tsne')
dash

## How to interpret this graph?
1. Topics are shown on the left, and their respective keywords are shown on the right.
2. Larger topics are more frequent, and the closer two topics are, the more similar they are.
3. Keywords are selected based on their frequency and on how well they discriminate between topics.

**Hover over the topics on the left to get information about their keywords on the right.**