In [287]:
import re
import pandas as pd
import numpy as np
import scipy.sparse as ss
import matplotlib.pyplot as plt
import corextopic as ct
import vis_topic as vt # jupyter notebooks will complain matplotlib is being loaded twice
from sklearn.feature_extraction.text import CountVectorizer
import gensim
import csv
from gensim.parsing.preprocessing import STOPWORDS
%matplotlib inline

In [288]:
file = "../CSV/big_data.csv"
# 15 Days blocks
file1 = "../CSV/big_data1.csv"
file2 = "../CSV/big_data2.csv"
file3 = "../CSV/big_data3.csv"
file4 = "../CSV/big_data4.csv"

In [289]:
def remove_url(text):
    result = re.sub(r"http\S+", "", text)
    return result

In [290]:
def rem_user(t):
    t = re.sub('@[^\s]+','',t)
    return re.sub('@[^\s]+','',t)

In [291]:
def rem_hash(t):
    return re.sub('#[^\s]+','',t)

In [292]:
def rem_RT(t):
    return re.sub('RT','',t)

In [293]:
def spellcheck(text):
    return  re.sub(r'[^a-zA-Z ]', '', text)

In [294]:
def preprocess(text):
    text = remove_url(text)
    text = rem_user(text)
    text = rem_hash(text)
    text = rem_RT(text)
    text = spellcheck(text)
    return text

In [295]:
def is_valid_word(word):
    if not word.isdigit() and len(word) > 3 and word not in gensim.parsing.preprocessing.STOPWORDS:
        return True
    return False

In [296]:
def save_output(topics, output_file):
    with open(output_file, 'w') as writeFile:
        writer = csv.writer(writeFile)
        writer.writerows([['Topic','Values']])
        total = []
        for n,topic in enumerate(topics):
            topic_words,_ = zip(*topic)
            topic_list = list(topic_words)
            topic_list.insert(0, n+1)
            total.append(topic_list)
        writer.writerows(list(total))
    writeFile.close()

In [317]:
#Importing the data

data = pd.read_csv(file4, error_bad_lines=False, encoding='latin-1');
document = data[['text']]
print(len(document))
document.head()

8950


Unnamed: 0,text
0,https://t.co/BJJ43TYjYl
1,RT @sujakrao: Watch | Does India Have the Fund...
2,RT @dm_ghaziabad: _Â_¢____ _Ñ___Ï_À_ø...
3,RT @iScrew: Here's a (partial) list of all the...
4,RT @iScrew: Here's a (partial) list of all the...


In [318]:
processed_docs = document
processed_docs['text'] = processed_docs['text'].map(preprocess)
processed_docs.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,text
0,
1,Watch Does India Have the Funds to Run the ...
2,Ayushman Bharat scheme of Ministry of h...
3,Heres a partial list of all the preexisting ...
4,Heres a partial list of all the preexisting ...


In [319]:
# Transform tweets data into a sparse matrix
vectorizer = CountVectorizer(stop_words='english', max_features=20000, binary=True)
doc_word = vectorizer.fit_transform(processed_docs['text'])
doc_word = ss.csr_matrix(doc_word)
doc_word.shape # n_docs x m_words

(8950, 2988)

In [320]:
# Get words that label the columns (needed to extract readable topics and make anchoring easier)
words = list(np.asarray(vectorizer.get_feature_names()))
len(words)

2988

In [321]:
not_digit_inds = [ind for ind,word in enumerate(words) if is_valid_word(word)]
doc_word = doc_word[:,not_digit_inds]
words = [word for ind,word in enumerate(words) if is_valid_word(word)]

doc_word.shape[1] == len(words) # n_docs x m_words

True

In [322]:
len(words)

2660

In [323]:
# Train the CorEx topic model with 50 topics
topic_model = ct.Corex(n_hidden=20, words=words, max_iter=200, verbose=False, seed=1)
topic_model.fit(doc_word, words=words);

In [324]:
# Print a single topic from CorEx topic model
topic_model.get_topics(topic=1, n_words=10)

[('consider', 0.27025435896443695),
 ('talk', 0.2675288596829946),
 ('want', 0.2658126091051127),
 ('dhan', 0.16403645993721658),
 ('importantif', 0.1630687675684841),
 ('padhao', 0.1630687675684841),
 ('record', 0.16197536850135028),
 ('swacch', 0.16197536850135028),
 ('ujjwala', 0.16066301586308157),
 ('beti', 0.15692797033679637)]

In [325]:
# Print all topics from the CorEx topic model
topics = topic_model.get_topics()
for n,topic in enumerate(topics):
    topic_words,_ = zip(*topic)
    print('{}: '.format(n+1) + ','.join(topic_words))

1: unparalleled,enthusiasm,seeing,pmjayayushman,karyakarta,ghazipur,asked,ayushman,narendramodi,karyak
2: consider,talk,want,dhan,importantif,padhao,record,swacch,ujjwala,beti
3: wait,benefitting,lacs,seen,comment,liberandu,tragedy,scheme,mantri,pradhan
4: benefit,going,providing,country,coverage,medical,people,admissions,approved,weeks
5: hospitals,treatment,railway,anil,today,soon,team,india,free,alag
6: private,hospital,players,doubts,viability,owners,district,especially,program,demand
7: universal,step,major,provision,components,healthcar,complete,families,initiative,wonderful
8: kangra,college,addresses,convocation,rajendra,prasad,says,presents,lies,halftruths
9: beneficiaries,sipping,wine,star,intellectual,letters,send,unaware,schemeas,reports
10: launch,lucknow,gift,swach,treated,denied,holder,success,card,worlds
11: modicare,jobs,benefits,create,days,modis,single,rupee,charging,start
12: services,live,work,centre,video,organise,workshop,thanks,traning,mpprovide
13: benefitted,l

In [326]:
output_file = "results_corex_tweets"+str(4)+".csv"
save_output(topics, output_file)