In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict, Counter
import glob
import json
from random import sample
import sklearn

from pandarallel import pandarallel
pandarallel.initialize()

datafiles = glob.glob("/home/phadke/ONR/ONR/big_data/Twitter/*.csv")

INFO: Pandarallel will run on 128 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
plist = ['_BJP.csv', '_INC.csv', '_AAP.csv']
pcount = defaultdict(list)
for d in datafiles:
    for p in plist:
        if p in d:
            pcount[p].append(d)

In [3]:
for p in pcount.keys():
    print(p, len(pcount[p]))

_INC.csv 3006
_AAP.csv 430
_BJP.csv 3255


In [4]:
final_files = defaultdict(list)
for p in pcount:
    final_files[p] = sample(pcount[p], 430)

In [5]:
for p in final_files.keys():
    print(p, len(final_files[p]))

_INC.csv 430
_AAP.csv 430
_BJP.csv 430


## read randomly sampled data

In [6]:
rowlist = []
for p in final_files.keys():
    party = p.replace("_", "").replace(".csv", "")
    for d in final_files[p]:
        with open(d, "r") as jsonfile:
            for line in jsonfile:
                job= json.loads(line)
                if "text" in job:
                    text = job['text']
                else:
                    text = None
                if "screen_name" in job:
                    sn = job['screen_name']
                else:
                    sn = None
                    
                if "lang" in job:
                    language = job['lang']
                else:
                    language = None
                    
                row = [sn, text, party, language]
                rowlist.append(row)


In [7]:
frame = pd.DataFrame(rowlist, columns=['screen_name','text','party', 'language'])

In [8]:
frame.head()

Unnamed: 0,screen_name,text,party,language
0,,‡§∏‡§Ç‡§ó‡§†‡§® ‡§ï‡•á ‡§®‡§ø‡§∞‡•ç‡§¶‡•á‡§∂‡§æ ‡§Ö‡§®‡•Å‡§∏‡§æ‡§∞ ‡§Ü‡§ú ‡§∞‡§æ‡§Æ‡§¨‡§æ‡§ó ‡§Æ‡§£‡•ç‡§°‡§≤ ‡§Æ‡•á‡§Ç ‡§∏...,INC,hi
1,,‡§≠‡§æ‡§ú‡§™‡§æ ‡§Ø‡•Å‡§µ‡§æ ‡§Æ‡•ã‡§∞‡•ç‡§ö‡§æ ‡§ï‡•Ä ‡§∞‡§æ‡§∑‡•ç‡§ü‡•ç‡§∞‡•Ä‡§Ø ‡§â‡§™‡§æ‡§ß‡•ç‡§Ø‡§ï‡•ç‡§∑‡§æ @The...,INC,hi
2,,RT @Amit_BJYM_: ‡§ú‡§ø‡§≤‡§æ ‡§Ü‡§ó‡§∞‡§æ ‡§ï‡•Ä ‡§´‡§§‡•á‡§π‡§™‡•Å‡§∞ ‡§∏‡•Ä‡§ï‡§∞‡•Ä ‡§µ‡§ø‡§ß...,INC,hi
3,,"‡§∏‡§π‡§ú ‡§∏‡§∞‡§≤ ‡§∏‡•ç‡§µ‡§≠‡§æ‡§µ ‡§ï‡•á ‡§ß‡§®‡•Ä, ‡§ï‡•Å‡§∂‡§≤ ‡§∏‡§Ç‡§ó‡§†‡§®‡§ï‡§∞‡•ç‡§§‡§æ, ‡§ï‡§æ‡§∞‡•ç‡§Ø‡§ï...",INC,hi
4,,‡§µ‡§ø‡§∂‡•ç‡§µ ‡§ï‡•Ä ‡§∏‡§¨‡§∏‡•á ‡§¨‡§°‡§º‡•Ä ‡§™‡§æ‡§∞‡•ç‡§ü‡•Ä ‡§∏‡•á ‡§ú‡•Å‡§°‡§º‡§®‡•á ‡§ï‡•á ‡§≤‡§ø‡§è 750...,INC,hi


In [9]:
frame.sample(5)

Unnamed: 0,screen_name,text,party,language
1045400,Pankaj_speak94,RT @SahilVastad: ‡§Æ‡•Å‡§∏‡•ç‡§≤‡§ø‡§Æ ‡§Ü‡§Ç‡§¶‡•ã‡§≤‡§® ‡§ï‡§∞‡§§ ‡§Ö‡§∏‡§§‡•Ä‡§≤ ‡§§‡§∞ ‡§§...,AAP,mr
1365422,,"1. Ideas and innovation,\n2. ‡§ú‡•ã‡§ñ‡§º‡§ø‡§Æ ‡§≤‡•á‡§®‡§æ,\n3. ...",BJP,hi
1500508,vardhan08,Today is the 2nd anniversary of Balakot Air st...,BJP,en
675617,Vaibhav_AAP,UP ‡§Æ‡•á ‡§™‡•ç‡§∞‡§æ‡§á‡§µ‡•á‡§ü ‡§∏‡•ç‡§ï‡•Ç‡§≤‡•ã‡§Ç ‡§ï‡•á ‡§¨‡§æ‡§π‡§∞ ‡§µ‡§ø‡§∞‡•ã‡§ß ‡§™‡•ç‡§∞‡§¶‡§∞‡•ç‡§∂‡§® ...,AAP,hi
1023863,honeychd82,RT @Tractor2twitr: Today‚Äôs hashtag is:\n\n#Far...,AAP,en


In [10]:
frame = frame.dropna(subset=['text'])

In [11]:
stopwords = []

stopwordfiles = glob.glob("../lite_data/stopwords/*.txt")
for s in stopwordfiles:
    with open(s, "r") as sfile:
        for line in sfile:
            stopwords.append(line.strip())


In [12]:
print(stopwords)

['‡§Ö‡§ß‡§ø‡§ï', '‡§Ö‡§®‡•á‡§ï', '‡§Ö‡§∂‡•Ä', '‡§Ö‡§∏‡§≤‡§Ø‡§æ‡§ö‡•á', '‡§Ö‡§∏‡§≤‡•á‡§≤‡•ç‡§Ø‡§æ', '‡§Ö‡§∏‡§æ', '‡§Ö‡§∏‡•Ç‡§®', '‡§Ö‡§∏‡•á', '‡§Ü‡§ú', '‡§Ü‡§£‡§ø', '‡§Ü‡§§‡§æ', '‡§Ü‡§™‡§≤‡•ç‡§Ø‡§æ', '‡§Ü‡§≤‡§æ', '‡§Ü‡§≤‡•Ä', '‡§Ü‡§≤‡•á', '‡§Ü‡§π‡•á', '‡§Ü‡§π‡•á‡§§', '‡§è‡§ï', '‡§è‡§ï‡§æ', '‡§ï‡§Æ‡•Ä', '‡§ï‡§∞‡§£‡§Ø‡§æ‡§§', '‡§ï‡§∞‡•Ç‡§®', '‡§ï‡§æ', '‡§ï‡§æ‡§Æ', '‡§ï‡§æ‡§Ø', '‡§ï‡§æ‡§π‡•Ä', '‡§ï‡§ø‡§µ‡§æ', '‡§ï‡•Ä', '‡§ï‡•á‡§≤‡§æ', '‡§ï‡•á‡§≤‡•Ä', '‡§ï‡•á‡§≤‡•á', '‡§ï‡•ã‡§ü‡•Ä', '‡§ó‡•á‡§≤‡•ç‡§Ø‡§æ', '‡§ò‡•á‡§ä‡§®', '‡§ú‡§æ‡§§', '‡§ù‡§æ‡§≤‡§æ', '‡§ù‡§æ‡§≤‡•Ä', '‡§ù‡§æ‡§≤‡•á', '‡§ù‡§æ‡§≤‡•á‡§≤‡•ç‡§Ø‡§æ', '‡§ü‡§æ', '‡§°‡•â', '‡§§‡§∞', '‡§§‡§∞‡•Ä', '‡§§‡§∏‡•á‡§ö', '‡§§‡§æ', '‡§§‡•Ä', '‡§§‡•Ä‡§®', '‡§§‡•á', '‡§§‡•ã', '‡§§‡•ç‡§Ø‡§æ', '‡§§‡•ç‡§Ø‡§æ‡§ö‡§æ', '‡§§‡•ç‡§Ø‡§æ‡§ö‡•Ä', '‡§§‡•ç‡§Ø‡§æ‡§ö‡•ç‡§Ø‡§æ', '‡§§‡•ç‡§Ø‡§æ‡§®‡§æ', '‡§§‡•ç‡§Ø‡§æ‡§®‡•Ä', '‡§§‡•ç‡§Ø‡§æ‡§Æ‡•Å‡§≥‡•á', '‡§§‡•ç‡§∞‡•Ä', '‡§¶‡§ø‡§≤‡•Ä', '‡§¶‡•ã‡§®', '‡§®', '‡§®‡§æ‡§π‡•Ä', '‡§®‡§ø‡§∞‡•ç‡§£‡•ç‡§Ø', '‡§™‡§£', '‡§™‡§Æ', '‡§™‡

## text cleaning

In [13]:
import re
import string
import warnings
from bs4 import BeautifulSoup
#from markdown import markdown

emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U0001F1F2-\U0001F1F4"  # Macau flag
        u"\U0001F1E6-\U0001F1FF"  # flags
        u"\U0001F600-\U0001F64F"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U0001F1F2"
        u"\U0001F1F4"
        u"\U0001F620"
        u"\u200d"
        u"\u2640-\u2642"
        "]+", flags=re.UNICODE)

text = emoji_pattern.sub(r'', text)


remove_urls = lambda x: re.sub("http(.+)?(\W|$)", ' ', x)
remove_RT = lambda x: x.replace("RT ", "")
remove_mentions = lambda x: re.sub("@\S+", '', x)
remove_hashtags = lambda x: re.sub("#\S+", '', x)
remove_digits = lambda x: re.sub("\d+", "", x)
remove_punct = lambda x: re.sub("!|\||\%|\.|\-|\/|:|‚Ä¶|,|\?|‡•§+|'+", "", x)
remove_emojis = lambda x: emoji_pattern.sub("", x)
normalize_spaces = lambda x: re.sub("[\n\r\t ]+", ' ', x)
remove_stop = lambda x: " ".join(i for i in x.lower().split() if i not in stopwords)


preproc_text = lambda x: remove_stop(
                            normalize_spaces(
                                remove_emojis(
                                    remove_punct(
                                        remove_digits(
                                            remove_hashtags(
                                                remove_mentions(
                                                    remove_RT(
                                                        remove_urls(x)))))))))

frame['clean_text'] = frame['text'].parallel_apply(lambda x: preproc_text(x))

In [14]:
testFrame = frame.sample(5)
for idx, row in testFrame.iterrows():
    print(row['text'], row['clean_text'].split())

RT @narendramodi: Continuing the reform trajectory, the Cabinet has approved a PLI Scheme for the Auto Industry and Drone Industry. This wi‚Ä¶ ['continuing', 'reform', 'trajectory', 'cabinet', 'approved', 'pli', 'scheme', 'auto', 'industry', 'drone', 'industry', 'wi']
RT @amitmalviya: In 2018, Rajkumar Roy, a school teacher and father of 2, was found mutilated and dead after he resisted TMC‚Äôs booth captur‚Ä¶ ['rajkumar', 'roy', 'school', 'teacher', 'father', 'mutilated', 'dead', 'resisted', 'tmc‚Äôs', 'booth', 'captur']
@Mysteriousgrl_R @ Iske aage bhi kuchh likh dete üòÇüòÇ

Aur lehnga pehle se hi taiyaar üòÅüòÅ ['@', 'iske', 'aage', 'bhi', 'kuchh', 'likh', 'dete', 'aur', 'lehnga', 'pehle', 'se', 'taiyaar']
RT @narendramodi: With our complementary strengths, India and US can creatively collaborate on a 2030 agenda for clean and green technologi‚Ä¶ ['complementary', 'strengths', 'creatively', 'collaborate', 'agenda', 'clean', 'green', 'technologi']
@myauditors like vivek Oberoi g

In [15]:
for idx, row in testFrame.iterrows():
    print(row['language'], row['clean_text'].split())

en ['continuing', 'reform', 'trajectory', 'cabinet', 'approved', 'pli', 'scheme', 'auto', 'industry', 'drone', 'industry', 'wi']
en ['rajkumar', 'roy', 'school', 'teacher', 'father', 'mutilated', 'dead', 'resisted', 'tmc‚Äôs', 'booth', 'captur']
hi ['@', 'iske', 'aage', 'bhi', 'kuchh', 'likh', 'dete', 'aur', 'lehnga', 'pehle', 'se', 'taiyaar']
en ['complementary', 'strengths', 'creatively', 'collaborate', 'agenda', 'clean', 'green', 'technologi']
ht ['vivek', 'oberoi', ')']


In [16]:
frame.language.unique()

array(['hi', 'en', 'und', 'in', 'ne', 'mr', 'ht', 'tl', 'sv', 'et', 'te',
       'bn', 'kn', 'vi', 'de', 'gu', 'es', 'ca', 'pl', 'or', 'pt', 'cs',
       'sl', 'lv', 'ro', 'fr', 'nl', 'ur', 'pa', 'tr', 'is', 'cy', 'eu',
       'it', 'lt', 'da', 'ta', 'fi', 'ml', 'no', 'hu', 'ar', 'ja', 'zh',
       'iw', 'fa', 'uk', 'bo', 'ps', 'si', 'ru'], dtype=object)

In [17]:
frame.loc[frame['language']=='in']

Unnamed: 0,screen_name,text,party,language,clean_text
38,,@abhinaw121 @AnujBajpai_ @eurasiawale Number s...,INC,in,number send karo enka
428,Divyachauhanbjp,RT @Upadhyaymayank5: @Divyachauhanbjp Janamdin...,INC,in,janamdin ki hardik shubhkamnaye apko
693,Divyachauhanbjp,@AMISHDEVGAN We all Real Indians are with you ...,INC,in,real indians apka kaam ek nationalist journali...
733,Divyachauhanbjp,@papernirbandh Banke bihari bless you,INC,in,banke bihari bless
1086,Divyachauhanbjp,Heartiest Congratulations @ArunSinghbjp Ji Bha...,INC,in,heartiest bhai sahab
...,...,...,...,...,...
1638245,BJP4Palamuru,RT @AvinashButtaBjp: #EcoFriendlyChristmas\n\n...,BJP,in,xmas tree telangana
1638651,BJP4Palamuru,RT @Mayurmatam: Some iconic photos on Sardar P...,BJP,in,iconic photos sardar patel jayanti iron man
1638864,BJP4Palamuru,RT @Mayurmatam: üïâÔ∏èüïâÔ∏èNamah Shivaiah‚ú°Ô∏è‚ú°Ô∏è‚öõÔ∏è‚öõÔ∏è htt...,BJP,in,namah shivaiah
1639369,BJP4Palamuru,RT @drlaxmanbjp: Remembering former union mini...,BJP,in,remembering union padma vibhushan arun jaitely...


In [18]:
print(Counter(frame['language'].tolist()))

Counter({'hi': 817077, 'en': 510354, 'und': 94975, 'mr': 65080, 'gu': 56482, 'kn': 24557, 'ta': 12811, 'pa': 10624, 'bn': 8596, 'in': 8377, 'te': 5738, 'or': 4243, 'ml': 3737, 'ne': 3735, 'tl': 3454, 'et': 2771, 'fr': 929, 'ht': 835, 'es': 681, 'it': 551, 'ca': 368, 'da': 360, 'ro': 338, 'nl': 307, 'pt': 287, 'tr': 285, 'de': 244, 'sv': 227, 'eu': 205, 'fi': 163, 'cs': 149, 'no': 143, 'lt': 116, 'pl': 109, 'sl': 107, 'hu': 102, 'vi': 96, 'lv': 94, 'ur': 85, 'cy': 83, 'is': 38, 'iw': 29, 'ja': 21, 'ar': 20, 'zh': 13, 'uk': 10, 'fa': 4, 'si': 3, 'ps': 2, 'bo': 1, 'ru': 1})


In [19]:
keep_lang = ['en','hi','mr']
lang_frame = frame.loc[frame['language'].isin(keep_lang)]
print(Counter(lang_frame['party'].tolist()))

Counter({'AAP': 506846, 'INC': 462235, 'BJP': 423430})


## phrasing based on languages

In [20]:
corp = lang_frame['clean_text'].tolist()
parties = lang_frame['party'].tolist()
corp_tokens = [c.split() for c in corp]

In [21]:
from gensim.models.phrases import Phrases

In [22]:
phrases = Phrases(corp_tokens, min_count=5, threshold=0.1)

In [23]:
testSent = ['‡§∏‡•ç‡§µ‡§∞‡•ç‡§£‡§ø‡§Æ', '‡§¶‡§ø‡§µ‡§∏', '‡§á‡§§‡§ø‡§π‡§æ‡§∏', '‡§Ö‡§Ç‡§ï‡§ø‡§§', '‡§ö‡•Å‡§ï‡§æ', '‡§ï‡•ç‡§Ø‡•ã‡§Ç‡§ï‡§ø', '‡§Ö‡§Ø‡•ã‡§ß‡•ç‡§Ø‡§æ', '‡§≠‡§ó‡§µ‡§æ‡§®', '‡§∞‡§æ‡§Æ‡§ö‡§Ç‡§¶‡•ç‡§∞', '‡§Æ‡§Ç‡§¶‡§ø‡§∞', '‡§®‡§ø‡§∞‡•ç‡§Æ‡§æ‡§£', '‡§™‡•Å‡§®‡•Ä‡§§']

In [24]:
print(phrases[testSent])

['‡§∏‡•ç‡§µ‡§∞‡•ç‡§£‡§ø‡§Æ', '‡§¶‡§ø‡§µ‡§∏', '‡§á‡§§‡§ø‡§π‡§æ‡§∏', '‡§Ö‡§Ç‡§ï‡§ø‡§§', '‡§ö‡•Å‡§ï‡§æ', '‡§ï‡•ç‡§Ø‡•ã‡§Ç‡§ï‡§ø', '‡§Ö‡§Ø‡•ã‡§ß‡•ç‡§Ø‡§æ_‡§≠‡§ó‡§µ‡§æ‡§®', '‡§∞‡§æ‡§Æ‡§ö‡§Ç‡§¶‡•ç‡§∞', '‡§Æ‡§Ç‡§¶‡§ø‡§∞_‡§®‡§ø‡§∞‡•ç‡§Æ‡§æ‡§£', '‡§™‡•Å‡§®‡•Ä‡§§']


In [25]:
tokes = [phrases[t] for t in corp_tokens]

In [26]:
BJP_data = []
INC_data = []
AAP_data = []

for i in range(len(tokes)):
    if parties[i]=='BJP':
        BJP_data.append(tokes[i])
    if parties[i]=='INC':
        INC_data.append(tokes[i])
    if parties[i]=='AAP':
        AAP_data.append(tokes[i])


## BJP topic modeling

In [41]:
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel


In [42]:
len(BJP_data)

423430

In [43]:
BJP_id2word = corpora.Dictionary(BJP_data)

In [44]:
BJP_corpus = [BJP_id2word.doc2bow(text) for text in BJP_data]
print(BJP_corpus[:1])

[[(0, 1), (1, 2), (2, 1), (3, 1), (4, 1), (5, 1), (6, 2), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 2), (15, 1), (16, 1)]]


In [47]:
%%time
import gensim
BJP_model = gensim.models.ldamulticore.LdaMulticore(corpus=BJP_corpus,
                                                    id2word=BJP_id2word,
                                                    workers=128,
                                                    num_topics=5,
                                                    random_state=100,
                                                    chunksize=100,
                                                    passes=10,
                                                    per_word_topics=True)

CPU times: user 7min 36s, sys: 5min 14s, total: 12min 51s
Wall time: 10min 51s


In [48]:
from pprint import pprint
pprint(BJP_model.print_topics())
doc_lda = BJP_model[BJP_corpus]

[(0,
  '0.002*"‡§∞‡§æ‡§Æ" + 0.002*"namo_app" + 0.001*"days" + 0.001*"hai" + 0.001*"‚Ä¢" + '
  '0.001*"support" + 0.001*"team" + 0.001*"bengal" + 0.001*"gujarat" + '
  '0.001*"hospital"'),
 (1,
  '0.002*"namo_app" + 0.002*""" + 0.001*"‡§Ü‡§≠‡§æ‡§∞" + 0.001*"‡§ï‡§æ‡§Æ‡§®‡§æ" + '
  '0.001*"wishes" + 0.001*"‡§â‡§§‡•ç‡§§‡§Æ_‡§∏‡•ç‡§µ‡§æ‡§∏‡•ç‡§•‡•ç‡§Ø" + 0.001*"watch" + '
  '0.001*"‡§à‡§∂‡•ç‡§µ‡§∞_‡§Ü‡§™‡§ï‡•á" + 0.001*"‡§à‡§∂‡•ç‡§µ‡§∞_‡§™‡•ç‡§∞‡§æ‡§∞‡•ç‡§•‡§®‡§æ" + 0.001*"‡•ê_‡§∂‡§æ‡§Ç‡§§‡§ø"'),
 (2,
  '0.001*"‡§¶‡§ø‡§µ‡§∏" + 0.001*"assam" + 0.001*"‡§™‡•Ä‡§è‡§Æ" + 0.001*"‡§≤‡•ã‡§ó" + '
  '0.001*"‡§∞‡§æ‡§ú‡§∏‡•ç‡§•‡§æ‡§®" + 0.001*"namo_app" + 0.001*"‡§∏‡•ç‡§µ‡§§‡§Ç‡§§‡•ç‡§∞‡§§‡§æ_‡§∏‡§Ç‡§ó‡•ç‡§∞‡§æ‡§Æ" + '
  '0.001*"‡§Ö‡§µ‡§∏‡§∞" + 0.001*"‡§∏‡§æ‡§¶‡§∞" + 0.001*"‡§Æ‡§æ‡§®‡§®‡•Ä‡§Ø"'),
 (3,
  '0.001*"‡§∂‡•Å‡§≠‡§ï‡§æ‡§Æ‡§®‡§æ‡§ì‡§Ç_‡§ß‡§®‡•ç‡§Ø‡§µ‡§æ‡§¶" + 0.001*"‡§∏‡§¨" + 0.001*"sir" + 0.001*"‡§™‡•Ä‡§è‡§Æ" + '
  '0.001*"‡§Ü‡§¶‡§∞‡§£‡•Ä‡§Ø" + 0.001*"‡§®‡§æ‡§Æ" + 0.001*"‡§π‡•ã‡§ó‡§æ" + 0.001*"‡