# Topic Modelling in python - data source is a textfile

In [1]:
# coding: utf-8 
#encoding=utf-8
import pandas as pd
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk 
import unicodedata
import re
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

### Set working directory ( only if not using a vitual environment)

In [2]:

import os
os.chdir('/home/visa/Topic')

In [3]:
df = pd.DataFrame()
f = open("polarity_pos.txt", "r+", encoding="latin1")
txt = f.readlines()


In [4]:
imd = pd.DataFrame(txt, columns=["comments"])
imd["row"] = imd.index

In [5]:
imd.head()

Unnamed: 0,comments,row
0,the rock is destined to be the 21st century's ...,0
1,"the gorgeously elaborate continuation of "" the...",1
2,effective but too-tepid biopic\n,2
3,if you sometimes like to go to the movies to h...,3
4,"emerges as something rare , an issue movie tha...",4


In [6]:
stop_words = nltk.corpus.stopwords.words('english')
extended_stopwords = ['\'ll','\'d','\'m','\'re','\'s','\'ve','ca n\'t','r','n\'t','ca','see','get','movies','movie','go','say','come','many','another','could','would','made','really','want','even','odd','films','plot','ever','actually','also','movie','film']
stops = stop_words + extended_stopwords

#### Check with stemming and lemmatization to clean the data

In [8]:
text = " this is a test for stemmer and stemming "

from nltk.tokenize import word_tokenize
tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
stm = PorterStemmer()
lemm = WordNetLemmatizer()

#tokens = [stm.stem(w) for w in tokens]
tokens = [lemm.lemmatize(w) for w in tokens]
#tokens = lemm.lemmatize(tokens)
print(tokens)

['this', 'is', 'a', 'test', 'for', 'stemmer', 'and', 'stemming']


#### Define function to tokenize and lemmatize the data

In [7]:
def tokenize(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    stm = PorterStemmer()
    lemm = WordNetLemmatizer()
    #tokens = [stm.stem(w) for w in tokens]
    tokens = [lemm.lemmatize(w) for w in tokens]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    #import pdb;pdb.set_trace()
    return filtered_tokens

#### Setiment Analysis with textblob 

In [13]:
imd['polarity'] = imd.comments.apply(lambda s: TextBlob(s).sentiment.polarity)

In [15]:
imd.head()
imd.tail()

Unnamed: 0,comments,row,polarity
10657,a terrible movie that some people will neverth...,10657,-1.0
10658,there are many definitions of 'time waster' bu...,10658,0.5
10659,"as it stands , crocodile hunter has the hurrie...",10659,-0.35
10660,the thing looks like a made-for-home-video qui...,10660,0.0
10661,"enigma is well-made , but it's just too dry an...",10661,-0.183333


#### Document term matrix with TF-IDF values 

In [16]:
term_idf_vectorizer       = TfidfVectorizer(max_df=0.99, max_features=2000,min_df=0.005, stop_words=stops, use_idf=True, tokenizer=tokenize, ngram_range=(1,1))
%time term_idf_matrix     = term_idf_vectorizer.fit_transform(imd.comments) 
term_idf_feature_names    = term_idf_vectorizer.get_feature_names()
term_idf_matrix.shape


CPU times: user 4.48 s, sys: 3.97 ms, total: 4.48 s
Wall time: 4.49 s


(10662, 268)

In [17]:
term_idf_feature_names

['acting',
 'action',
 'actor',
 'almost',
 'although',
 'always',
 'american',
 'amusing',
 'anyone',
 'anything',
 'around',
 'art',
 'attempt',
 'audience',
 'away',
 'back',
 'bad',
 'beautiful',
 'beautifully',
 'becomes',
 'best',
 'better',
 'big',
 'bit',
 'book',
 'boring',
 'boy',
 'care',
 'cast',
 'certainly',
 'character',
 'charm',
 'charming',
 'child',
 'cinema',
 'cinematic',
 'classic',
 'clever',
 'comedy',
 'comic',
 'compelling',
 'culture',
 'dark',
 'day',
 'de',
 'debut',
 'despite',
 'dialogue',
 'direction',
 'director',
 'documentary',
 'doe',
 'done',
 'drama',
 'dull',
 'easy',
 'effect',
 'effort',
 'else',
 'emotional',
 'end',
 'ending',
 'engaging',
 'enjoy',
 'enjoyable',
 'enough',
 'entertaining',
 'entertainment',
 'especially',
 'every',
 'everything',
 'exercise',
 'experience',
 'eye',
 'face',
 'fall',
 'familiar',
 'family',
 'fan',
 'far',
 'fascinating',
 'feature',
 'feel',
 'feeling',
 'filmmaker',
 'find',
 'first',
 'flick',
 'full',
 'fu

In [18]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()

#### Topic Modelling using LDA 

In [19]:
lda = LatentDirichletAllocation(n_topics=5, max_iter=10,learning_method='online',learning_offset=10.,random_state=1)
%time lda.fit(term_idf_matrix)
print("\nTopics using Latent Dirichlet Allocation model with Term frequencies: \n")
print_top_words(lda, term_idf_feature_names, 10)



CPU times: user 8.08 s, sys: 12 ms, total: 8.09 s
Wall time: 8.09 s

Topics using Latent Dirichlet Allocation model with Term frequencies: 

Topic #0:
ha comedy work doe much audience way thing thriller le
Topic #1:
good feel never bad time director something better every like
Topic #2:
one performance funny u best well sense make often yet
Topic #3:
little life character drama take like story script big give
Topic #4:
wa love nothing action enough fun kind moment de first



#### Avoid noise in unigrams, TFIDF matrix on bigrams and trigrams

In [22]:
term_idf_vectorizer       = TfidfVectorizer(max_df=0.99, max_features=2000,min_df=0.0005, stop_words=stops, use_idf=True, tokenizer=tokenize, ngram_range=(2,3))
%time term_idf_matrix     = term_idf_vectorizer.fit_transform(imd.comments) 
term_idf_feature_names    = term_idf_vectorizer.get_feature_names()
term_idf_matrix.shape

CPU times: user 4.58 s, sys: 12 ms, total: 4.59 s
Wall time: 4.6 s


(10662, 395)

#### Topic Modelling using LDA ( with bigrams and trigams)

In [23]:
lda = LatentDirichletAllocation(n_topics=5, max_iter=10,learning_method='online',learning_offset=10.,random_state=1)
%time lda.fit(term_idf_matrix)
print("\nTopics using Latent Dirichlet Allocation model with Term frequencies: \n")
print_top_words(lda, term_idf_feature_names, 10)



CPU times: user 4.91 s, sys: 64 µs, total: 4.91 s
Wall time: 4.91 s

Topics using Latent Dirichlet Allocation model with Term frequencies: 

Topic #0:
soap opera running time character study one thing sense humor young woman good intention enough make much better ha done
Topic #1:
love story worth seeing blue crush long time stealing harvard doe make woody allen motion picture every bit home video
Topic #2:
romantic comedy play like look like two hour new york make u high school whole lot try hard waste time
Topic #3:
big screen action sequence give u whole thing may find reign fire guilty pleasure doe much seem like human nature
Topic #4:
feel like special effect subject matter never quite good time de niro one best year ago high crime pretty much



#### Topic modeling using NMF 

In [25]:
# Fit the NMF model
%time nmf = NMF(n_components=5, random_state=1,alpha=.1, l1_ratio=.5).fit(term_idf_matrix)
print("\nFitting the Non-negative Matrix Factorization model with tf-idf features: \n")
print_top_words(nmf, term_idf_feature_names, 10)

CPU times: user 256 ms, sys: 233 ms, total: 489 ms
Wall time: 183 ms

Fitting the Non-negative Matrix Factorization model with tf-idf features: 

Topic #0:
romantic comedy sandra bullock hugh grant interesting character new york much fun start finish point view might well one greatest
Topic #1:
feel like still feel doe feel make feel one feel like one two hour spy kid tv series three hour
Topic #2:
play like like one like bad big screen young woman seem like whole thing point view never seen after-school special
Topic #3:
love story lan yu story one birthday girl edge seat one best like bad good intention good job good thing
Topic #4:
special effect action sequence jackie chan minority report hollywood ending queen damned time machine funny moment after-school special harry potter

