In [1]:
import numpy as np
import pandas as pd
import re, nltk, spacy
# Sklearn
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint
# Plotting tools
# import pyLDAvis
# import pyLDAvis.sklearn
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# f=open("pt.txt","rt")
# g=open("sample.txt","wt")
# f1=f.readlines()
# searchFor=(",")
# replaceWith=("")
# for line in f1:
#     tweetId=line[0:19]
#     g.write(tweetId.replace(","," "))
#     g.write(","+line[20:].replace(","," "))

# f.close()
# g.close()

In [3]:
df = pd.read_csv(r"C:\Users\Nikhil\Desktop\Topicchanges\updated\Countrywiseupdated\INDIA\topic\abc.csv", error_bad_lines=False,encoding='ISO-8859-1')

df['text']=df['text'].apply(str)

df


Unnamed: 0,text,id,Unnamed: 2
0,Paid tributes to Ram Jethmalani ji at his Shra...,1.170000e+18,
1,"Arun Jaitley ji was a remarkable leader, who l...",1.170000e+18,
2,"Live: Speaking at ""Shradhanjali Sabha"" in reme...",1.170000e+18,
3,The states of North-East are some concerns abo...,1.170000e+18,
4,I will be in Assam not a single intruder that ...,1.170000e+18,
...,...,...,...
27773,????????????????????????????? https: //tiksio/...,1.050000e+18,
27774,????????????????????????????? https: //tiksio/...,1.050000e+18,
27775,????????????????????????????? https: //tiksio/...,1.050000e+18,
27776,,,


In [4]:
# Convert to list
data = df.text.values.tolist()
# Remove Emails
data = [re.sub(r'\S*@\S*\s?', '', sent) for sent in data]
# Remove new line characters
data = [re.sub(r'\s+', ' ', sent) for sent in data]
# Remove distracting single quotes
data = [re.sub(r"\'", "", sent) for sent in data]
pprint(data[0])

('Paid tributes to Ram Jethmalani ji at his Shradhanjali Sabha. '
 'https://t.co/bIdUcItZyB')


In [5]:
import gensim

def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations
data_words = list(sent_to_words(data))
print(data_words[:1])

[['paid', 'tributes', 'to', 'ram', 'jethmalani', 'ji', 'at', 'his', 'shradhanjali', 'sabha', 'https', 'co', 'biducitzyb']]


In [6]:
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']): #'NOUN', 'ADJ', 'VERB', 'ADV'
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append(" ".join([token.lemma_ if token.lemma_ not in ['-PRON-'] else '' for token in doc if token.pos_ in allowed_postags]))
    return texts_out

In [7]:
# Initialize spacy ‘en’ model, keeping only tagger component (for efficiency)
# Run in terminal: python -m spacy download 
import os
nlp = spacy.load('en', disable=['parser', 'ner'])
# Do lemmatization keeping only Noun, Adj, Verb, Adverb
data_lemmatized = lemmatization(data_words, allowed_postags=['NOUN', 'VERB']) #select noun and verb
print(data_lemmatized[:2])

['pay tribute ram http co biducitzyb', 'leader leave last impression polity']


In [8]:
vectorizer = CountVectorizer(analyzer='word',       
                             min_df=10,
# minimum reqd occurences of a word 
                             stop_words='english',             
# remove stop words
                             lowercase=True,                   
# convert all words to lowercase
                             token_pattern='[a-zA-Z0-9]{3,}',  
# num chars > 3
                             # max_features=50000,             
# max number of uniq words    
                            )
data_vectorized = vectorizer.fit_transform(data_lemmatized)

In [9]:
# Build LDA Model
lda_model = LatentDirichletAllocation(n_components=20,               # Number of topics
                                      max_iter=10,               
# Max learning iterations
                                      learning_method='online',   
                                      random_state=100,          
# Random state
                                      batch_size=128,            
# n docs in each learning iter
                                      evaluate_every = -1,       
# compute perplexity every n iters, default: Don't
                                      n_jobs = -1,               
# Use all available CPUs
                                     )
lda_output = lda_model.fit_transform(data_vectorized)
print(lda_model)  # Model attributes

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='online', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=20, n_jobs=-1,
                          perp_tol=0.1, random_state=100, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)


In [10]:
# Log Likelyhood: Higher the better
print("Log Likelihood: ", lda_model.score(data_vectorized))
# Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
print("Perplexity: ", lda_model.perplexity(data_vectorized))
# See model parameters
pprint(lda_model.get_params())

Log Likelihood:  -792568.8326270529
Perplexity:  1174.0799723352377
{'batch_size': 128,
 'doc_topic_prior': None,
 'evaluate_every': -1,
 'learning_decay': 0.7,
 'learning_method': 'online',
 'learning_offset': 10.0,
 'max_doc_update_iter': 100,
 'max_iter': 10,
 'mean_change_tol': 0.001,
 'n_components': 20,
 'n_jobs': -1,
 'perp_tol': 0.1,
 'random_state': 100,
 'topic_word_prior': None,
 'total_samples': 1000000.0,
 'verbose': 0}


In [11]:
# Define Search Param
search_params = {'n_components': [10, 15, 20, 25, 30]}
# Init the Model
lda = LatentDirichletAllocation(max_iter=5, learning_method='online', learning_offset=50.,random_state=0)
# Init Grid Search Class
model = GridSearchCV(lda, param_grid=search_params)
# Do the Grid Search
model.fit(data_vectorized)
# GridSearchCV(cv=None, error_score='raise',
#        estimator=LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
#              evaluate_every=-1, learning_method=None,
#              max_doc_update_iter=100, mean_change_tol=0.001, n_components=10, n_jobs=1,
#              n_topics=None, perp_tol=0.1, random_state=None,
#              topic_word_prior=None, total_samples=1000000.0, verbose=0),
#        fit_params=None, iid=True, n_jobs=1,
#        param_grid={'n_topics': [10, 15, 20, 25, 30], 'learning_decay': [0.5, 0.7, 0.9]},
#        pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
#        scoring=None, verbose=0)



GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=LatentDirichletAllocation(batch_size=128,
                                                 doc_topic_prior=None,
                                                 evaluate_every=-1,
                                                 learning_decay=0.7,
                                                 learning_method='online',
                                                 learning_offset=50.0,
                                                 max_doc_update_iter=100,
                                                 max_iter=5,
                                                 mean_change_tol=0.001,
                                                 n_components=10, n_jobs=None,
                                                 perp_tol=0.1, random_state=0,
                                                 topic_word_prior=None,
                                                 total_samples=1000000.0,
             

In [12]:
best_lda_model=model.best_estimator_
print("Model Parameters", model.best_params_)
print("Model log likelihood score", model.best_score_)
print("Model Perplexity", best_lda_model.perplexity(data_vectorized))

Model Parameters {'n_components': 10}
Model log likelihood score -276454.8901105505
Model Perplexity 1045.406557505429


In [13]:
# Create Document — Topic Matrix
lda_output = best_lda_model.transform(data_vectorized)
# column names
topicnames = ["Topic" + str(i) for i in range(best_lda_model.n_components)]
# index names
docnames = ["Doc" + str(i) for i in range(len(data))]
# Make the pandas dataframe
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)
# Get dominant topic for each document
dominant_topic = np.argmax(df_document_topic.values, axis=1)
df_document_topic['dominant_topic'] = dominant_topic
# Styling
def color_green(val):
 color = 'green' if val > .1 else 'black'
 return 'color: {col}'.format(col=color)
def make_bold(val):
 weight = 700 if val > .1 else 400
 return 'font-weight: {weight}'.format(weight=weight)
# Apply Style
df_document_topics = df_document_topic.head(15).style.applymap(color_green).applymap(make_bold)
df_document_topics

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,dominant_topic
Doc0,0.02,0.42,0.42,0.02,0.02,0.02,0.02,0.02,0.02,0.02,1
Doc1,0.03,0.37,0.03,0.37,0.03,0.03,0.03,0.03,0.03,0.03,1
Doc2,0.03,0.03,0.37,0.03,0.03,0.03,0.03,0.37,0.03,0.03,2
Doc3,0.39,0.02,0.02,0.02,0.02,0.21,0.02,0.02,0.28,0.02,0
Doc4,0.03,0.03,0.03,0.03,0.03,0.77,0.03,0.03,0.03,0.03,5
Doc5,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.89,0.01,0.01,7
Doc6,0.02,0.02,0.02,0.02,0.02,0.85,0.02,0.02,0.02,0.02,5
Doc7,0.01,0.01,0.01,0.01,0.01,0.01,0.16,0.01,0.73,0.01,8
Doc8,0.7,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0
Doc9,0.03,0.03,0.03,0.03,0.7,0.03,0.03,0.03,0.03,0.03,4


In [14]:
# Topic-Keyword Matrix
df_topic_keywords = pd.DataFrame(best_lda_model.components_)
# Assign Column and Index
df_topic_keywords.columns = vectorizer.get_feature_names()
df_topic_keywords.index = topicnames
# View
df_topic_keywords.head()

Unnamed: 0,ability,abuse,accept,access,accident,accord,account,accountability,accuse,achieve,...,worship,write,writer,year,yesterday,yoga,youngster,youth,zaye,zone
Topic0,0.100009,0.100006,0.100009,0.100016,0.100006,0.10001,54.08542,0.100001,0.100033,0.100005,...,0.100006,0.100007,0.100004,0.100021,0.100013,0.100024,12.373998,0.100033,0.1,0.100007
Topic1,0.100006,0.100004,0.100008,6.059498,0.100011,0.100012,0.100007,0.100002,0.100023,39.935527,...,0.100018,0.100013,0.100004,0.100031,0.100009,0.100012,0.100016,0.100019,0.100007,0.100001
Topic2,0.100005,0.100004,59.677361,0.10001,63.379,83.467179,0.100003,7.880796,0.100011,0.100015,...,0.100004,0.100008,0.100006,0.10002,0.100007,0.100006,0.100029,0.10002,0.100001,0.10001
Topic3,0.100014,0.100009,0.100008,0.100023,0.100014,0.100007,0.100004,0.100005,0.100012,0.100007,...,0.100009,0.100009,0.100013,0.100016,0.100009,0.100006,0.100003,0.100013,0.100003,0.100007
Topic4,0.100006,0.100012,0.100011,0.100008,0.100065,0.100013,0.100007,0.100002,0.100005,0.100008,...,98.661148,0.100008,0.100004,99.31996,0.10001,0.100003,0.10002,0.100012,0.100002,0.100005


In [15]:
# Show top n keywords for each topic
def show_topics(vectorizer=vectorizer, lda_model=lda_model, n_words=20):
    keywords = np.array(vectorizer.get_feature_names())
    topic_keywords = []
    for topic_weights in lda_model.components_:
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords
topic_keywords = show_topics(vectorizer=vectorizer, lda_model=best_lda_model, n_words=15)
# Topic - Keywords Dataframe
df_topic_keywords = pd.DataFrame(topic_keywords)
df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])]
df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])]
df_topic_keywords

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14
Topic 0,address,farmer,meet,worker,rally,party,case,https,government,reach,student,issue,promise,price,stand
Topic 1,http,minister,mother,leave,economy,report,start,child,opportunity,place,answer,foundation,progress,remain,lay
Topic 2,people,election,tiksio,https,pay,need,tribute,live,team,tell,police,decision,lose,head,believe
Topic 3,wish,birthday,greeting,amp,leader,bjp,death,people,candidate,life,respect,http,constituency,https,kill
Topic 4,family,support,affection,pray,http,love,citizen,vote,power,https,question,program,water,area,news
Topic 5,country,government,make,https,message,ask,district,state,fight,year,want,http,world,lead,village
Topic 6,thank,work,anniversary,look,people,launch,attend,modi,continue,attack,kind,learn,project,road,https
Topic 7,come,time,congratulation,woman,know,https,win,speak,campaign,hear,corruption,write,force,money,order
Topic 8,say,state,development,government,land,society,service,shall,law,democracy,remember,https,truth,leadership,yesterday
Topic 9,today,day,meeting,occasion,hold,year,nation,https,visit,bring,lucknow,change,youth,fund,share


In [17]:
Topics = ["Farmers","Economy","Elections","Greetings","Election Campaign", 
          "Government rule", "Transportation", "Woman Empowerment", "Democracy", "Speech"]
df_topic_keywords["Topics"]=Topics
df_topic_keywords

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14,Topics
Topic 0,address,farmer,meet,worker,rally,party,case,https,government,reach,student,issue,promise,price,stand,Farmers
Topic 1,http,minister,mother,leave,economy,report,start,child,opportunity,place,answer,foundation,progress,remain,lay,Economy
Topic 2,people,election,tiksio,https,pay,need,tribute,live,team,tell,police,decision,lose,head,believe,Elections
Topic 3,wish,birthday,greeting,amp,leader,bjp,death,people,candidate,life,respect,http,constituency,https,kill,Greetings
Topic 4,family,support,affection,pray,http,love,citizen,vote,power,https,question,program,water,area,news,Election Campaign
Topic 5,country,government,make,https,message,ask,district,state,fight,year,want,http,world,lead,village,Government rule
Topic 6,thank,work,anniversary,look,people,launch,attend,modi,continue,attack,kind,learn,project,road,https,Transportation
Topic 7,come,time,congratulation,woman,know,https,win,speak,campaign,hear,corruption,write,force,money,order,Woman Empowerment
Topic 8,say,state,development,government,land,society,service,shall,law,democracy,remember,https,truth,leadership,yesterday,Democracy
Topic 9,today,day,meeting,occasion,hold,year,nation,https,visit,bring,lucknow,change,youth,fund,share,Speech


In [18]:
# Define function to predict topic for a given text document.
nlp = spacy.load('en', disable=['parser', 'ner'])
def predict_topic(text, nlp=nlp):
    global sent_to_words
    global lemmatization
# Step 1: Clean with simple_preprocess
    mytext_2 = list(sent_to_words(text))
# Step 2: Lemmatize
    mytext_3 = lemmatization(mytext_2, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
# Step 3: Vectorize transform
    mytext_4 = vectorizer.transform(mytext_3)
# Step 4: LDA Transform
    topic_probability_scores = best_lda_model.transform(mytext_4)
    topic = df_topic_keywords.iloc[np.argmax(topic_probability_scores), 1:14].values.tolist()
    
    # Step 5: Infer Topic
    infer_topic = df_topic_keywords.iloc[np.argmax(topic_probability_scores), -1]
    
    #topic_guess = df_topic_keywords.iloc[np.argmax(topic_probability_scores), Topics]
    return infer_topic, topic, topic_probability_scores
# Predict the topic
mytext = ["Very Useful in diabetes age 30. I need control sugar. thanks Good deal"]
infer_topic, topic, prob_scores = predict_topic(text = mytext)
print(topic)
print(infer_topic)

['work', 'anniversary', 'look', 'people', 'launch', 'attend', 'modi', 'continue', 'attack', 'kind', 'learn', 'project', 'road']
Transportation


In [19]:
def apply_predict_topic(text):
 text = [text]
 infer_topic, topic, prob_scores = predict_topic(text = text)
 return(infer_topic)


df["Topic_key_word"]= df['text'].apply(apply_predict_topic)
df.head()

Unnamed: 0,text,id,Unnamed: 2,Topic_key_word
0,Paid tributes to Ram Jethmalani ji at his Shra...,1.17e+18,,Elections
1,"Arun Jaitley ji was a remarkable leader, who l...",1.17e+18,,Economy
2,"Live: Speaking at ""Shradhanjali Sabha"" in reme...",1.17e+18,,Woman Empowerment
3,The states of North-East are some concerns abo...,1.17e+18,,Farmers
4,I will be in Assam not a single intruder that ...,1.17e+18,,Government rule


In [20]:
df.groupby('Topic_key_word')
# df.nunique()
df


Unnamed: 0,text,id,Unnamed: 2,Topic_key_word
0,Paid tributes to Ram Jethmalani ji at his Shra...,1.170000e+18,,Elections
1,"Arun Jaitley ji was a remarkable leader, who l...",1.170000e+18,,Economy
2,"Live: Speaking at ""Shradhanjali Sabha"" in reme...",1.170000e+18,,Woman Empowerment
3,The states of North-East are some concerns abo...,1.170000e+18,,Farmers
4,I will be in Assam not a single intruder that ...,1.170000e+18,,Government rule
...,...,...,...,...
27773,????????????????????????????? https: //tiksio/...,1.050000e+18,,Farmers
27774,????????????????????????????? https: //tiksio/...,1.050000e+18,,Farmers
27775,????????????????????????????? https: //tiksio/...,1.050000e+18,,Farmers
27776,,,,Farmers


In [21]:
#df.to_csv("enFinal.csv")

df.to_json("topicupdatedindia.json",orient='records')