In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict, Counter
import glob
import json
from random import sample
import sklearn
import re
import string
import warnings
from bs4 import BeautifulSoup
import gensim
from gensim.models.phrases import Phrases
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from analysis_func.text_preproc import preproc_text
from analysis_func.topic_preproc import prepare_corp
from analysis_func.topic_model import topicmodel
from analysis_func.saveload_topicmodels import save_modelanddata, load_modelanddata
import datetime
import os
import time
import sys


from pandarallel import pandarallel
pandarallel.initialize()

INFO: Pandarallel will run on 128 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
# Load the tweet table. The file carries two leftover index columns
# ("Unnamed: 0.1", "Unnamed: 0") in addition to the real fields.
# The string-valued columns are read with an explicit dtype so pandas does not
# infer a different type per chunk; missing values still load as NaN.
# NOTE(review): presumably this is what triggered the warning whose tail is shown
# in this cell's output — confirm which columns were reported as mixed-type.
# NOTE(review): absolute, machine-specific path; consider a configurable data dir.
data = pd.read_csv(
    "/data/shruti/ONR/small_data/Twitter_text/twitter_text_bjp_inc_withRT.csv",
    header=0,
    dtype={'screen_name': str, 'text': str, 'party': str, 'language': str, 'url': str},
)
data.head()

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0.1,Unnamed: 0,screen_name,text,party,language,url,is_RT
0,0,,@umashankarsingh मतलब पुतिन पर भी पनौती चढ़ गयी।,INC,hi,,
1,1,,@sambitswaraj @BJP4India @PMOIndia मतलब जो कां...,INC,hi,,
2,2,,@myogiadityanath शायद इसीलिए आपने राम मंदिर ट्...,INC,hi,,
3,3,,@dmbas_ के आदेश के बावजूद DSO द्वारा कोई कार्य...,INC,hi,https://twitter.com/Thakur_deepak_s/status/146...,
4,4,,भारत को जिहादी राष्ट्र बताने वाली देशद्रोही #क...,INC,hi,,


In [3]:
# Total number of rows loaded from the CSV.
len(data)

25828521

In [4]:
# Number of rows with a non-missing screen_name. Counting on the column avoids
# materialising a filtered copy of the whole frame just to take its length.
int(data['screen_name'].notna().sum())

20857064

In [5]:
# Number of rows that carry a URL. Counting on the column avoids materialising
# a filtered copy of the whole frame just to take its length.
int(data['url'].notna().sum())

5275905

In [6]:
from tld import get_tld

In [7]:
def get_domain(link):
    """Return the first-level domain of ``link`` (e.g. ``"twitter.com"``), or ``None``.

    Parameters
    ----------
    link : str or missing value
        URL to parse. ``None`` and NaN are treated as "no URL".

    Returns
    -------
    str or None
        The ``fld`` attribute of the ``get_tld`` result, or ``None`` when the
        URL is missing or cannot be parsed.
    """
    # Most rows have no URL; short-circuit instead of paying for an exception.
    if link is None or (isinstance(link, float) and link != link):
        return None
    try:
        return get_tld(link, as_object=True).fld
    except Exception:
        # Best-effort extraction: unparseable URLs yield no domain. Unlike a bare
        # `except:`, this lets KeyboardInterrupt/SystemExit propagate.
        return None


# Derive the linked domain for every row, in parallel across workers.
data['domain'] = data['url'].parallel_apply(get_domain)

In [8]:
# Preview the frame with the new `domain` column.
data.head()

Unnamed: 0.1,Unnamed: 0,screen_name,text,party,language,url,is_RT,domain
0,0,,@umashankarsingh मतलब पुतिन पर भी पनौती चढ़ गयी।,INC,hi,,,
1,1,,@sambitswaraj @BJP4India @PMOIndia मतलब जो कां...,INC,hi,,,
2,2,,@myogiadityanath शायद इसीलिए आपने राम मंदिर ट्...,INC,hi,,,
3,3,,@dmbas_ के आदेश के बावजूद DSO द्वारा कोई कार्य...,INC,hi,https://twitter.com/Thakur_deepak_s/status/146...,,twitter.com
4,4,,भारत को जिहादी राष्ट्र बताने वाली देशद्रोही #क...,INC,hi,,,


In [9]:
# Domains excluded from the link analysis.
common_domains = [
    'twitter.com',
    'facebook.com',
    'google.com',
    'm.tech',
    'm.sc',
    'b.tech',
    'page.link',
    'youtu.be',
    'bit.ly',
    'instagram.com',
    'youtube.com',
]

# Rows whose domain is missing are kept here (NaN is not in the list).
filtered_link = data[~data['domain'].isin(common_domains)]


In [10]:
# Preview after excluding the common domains; rows without a domain are still present.
filtered_link.head()

Unnamed: 0.1,Unnamed: 0,screen_name,text,party,language,url,is_RT,domain
0,0,,@umashankarsingh मतलब पुतिन पर भी पनौती चढ़ गयी।,INC,hi,,,
1,1,,@sambitswaraj @BJP4India @PMOIndia मतलब जो कां...,INC,hi,,,
2,2,,@myogiadityanath शायद इसीलिए आपने राम मंदिर ट्...,INC,hi,,,
4,4,,भारत को जिहादी राष्ट्र बताने वाली देशद्रोही #क...,INC,hi,,,
5,5,,"मंगल पांडेय, चंद्रशेखर आज़ाद, भगत सिंह, सुखदेव...",INC,hi,,,


In [11]:
# Keep only rows for which a domain was extracted.
filtered_link = filtered_link[filtered_link['domain'].notna()]

In [12]:
# Preview: every remaining row now has a domain.
filtered_link.head()

Unnamed: 0.1,Unnamed: 0,screen_name,text,party,language,url,is_RT,domain
16,16,,दुनिया की पहली पार्टी है ये जो इस हद तक गिरी ह...,INC,hi,https://www.bhaskar.com/local/maharashtra/news...,,bhaskar.com
139,139,Thakur_deepak_s,@myogiadityanath आपकी ऐसी गैर जिम्मेदाराना हरक...,INC,hi,https://navbharattimes.indiatimes.com/metro/lu...,,indiatimes.com
141,141,Thakur_deepak_s,@narendramodi आपने कहा था कि देश नबी झुकने दूं...,INC,hi,https://indianexpress.com/article/india/global...,,indianexpress.com
190,190,Thakur_deepak_s,This is absolutely unacceptable. A bloody repo...,INC,en,https://thelogicalindian.com/trending/arnab-go...,,thelogicalindian.com
216,216,Thakur_deepak_s,@sambitswaraj What about dis?\nhttps://t.co/mx...,INC,en,https://timesofindia.indiatimes.com/city/hubba...,,indiatimes.com


In [13]:
# Annotated domain table; the columns used below are `domain` and `international`.
# The first line of the file is the header row (the read_csv default).
news_domains = pd.read_csv("../lite_data/domaintitles_annotated_no_international.csv")
news_domains.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,domain,title_text,is_news,titleDEsc,international,if_annotated_international
0,0,0,indiatoday.in,"Latest News, Breaking News Today - Bollywood, ...",1.0,"indiatoday.in\nLatest News, Breaking News Toda...",domestic,True
1,1,3,prajavani.net,"Prajavani | ಪ್ರಜಾವಾಣಿ ತಾಜಾ ಸುದ್ದಿ, ಸಮಾಚಾರ, ವಾರ...",1.0,prajavani.net\nPrajavani | ಪ್ರಜಾವಾಣಿ ತಾಜಾ ಸುದ್...,domestic,True
2,2,4,thehindu.com,"\nThe Hindu: Breaking News, India News, Sports...",1.0,"thehindu.com\n\nThe Hindu: Breaking News, Indi...",domestic,True
3,3,6,abplive.com,"Hindi News, Breaking News in Hindi, हिंदी न्यू...",1.0,"abplive.com\nHindi News, Breaking News in Hind...",domestic,True
4,4,8,news18.com,"News, Breaking News, Latest News, News Headlin...",1.0,"news18.com\nNews, Breaking News, Latest News, ...",domestic,True


In [14]:
# Domains annotated as 'domestic', selected with a single row/column .loc lookup.
is_domestic = news_domains['international'] == 'domestic'
newslist = news_domains.loc[is_domestic, 'domain'].tolist()

In [15]:
# Restrict to rows that link to one of the domestic domains in `newslist`.
filtered_link = filtered_link[filtered_link['domain'].isin(newslist)]
filtered_link.head()

Unnamed: 0.1,Unnamed: 0,screen_name,text,party,language,url,is_RT,domain
16,16,,दुनिया की पहली पार्टी है ये जो इस हद तक गिरी ह...,INC,hi,https://www.bhaskar.com/local/maharashtra/news...,,bhaskar.com
139,139,Thakur_deepak_s,@myogiadityanath आपकी ऐसी गैर जिम्मेदाराना हरक...,INC,hi,https://navbharattimes.indiatimes.com/metro/lu...,,indiatimes.com
141,141,Thakur_deepak_s,@narendramodi आपने कहा था कि देश नबी झुकने दूं...,INC,hi,https://indianexpress.com/article/india/global...,,indianexpress.com
216,216,Thakur_deepak_s,@sambitswaraj What about dis?\nhttps://t.co/mx...,INC,en,https://timesofindia.indiatimes.com/city/hubba...,,indiatimes.com
218,218,Thakur_deepak_s,Does any media channel has d courage to show d...,INC,en,https://timesofindia.indiatimes.com/city/hubba...,,indiatimes.com


In [16]:
# Text preprocessing: filter English, Hindi and Marathi stop words; remove
# punctuation, hashtags, mentions, URLs, stray whitespace, etc.
# assign() returns a new frame, so the column is not written into a filtered
# slice of `data` (which can raise SettingWithCopyWarning).
filtered_link = filtered_link.assign(
    clean_text=filtered_link['text'].parallel_apply(preproc_text)
)


In [17]:
# Compare the raw `text` with the new `clean_text` column.
filtered_link.head()

Unnamed: 0.1,Unnamed: 0,screen_name,text,party,language,url,is_RT,domain,clean_text
16,16,,दुनिया की पहली पार्टी है ये जो इस हद तक गिरी ह...,INC,hi,https://www.bhaskar.com/local/maharashtra/news...,,bhaskar.com,दुनिया पहली हद गिरी घटिया भरी सब बढ़कर चोर मक्...
139,139,Thakur_deepak_s,@myogiadityanath आपकी ऐसी गैर जिम्मेदाराना हरक...,INC,hi,https://navbharattimes.indiatimes.com/metro/lu...,,indiatimes.com,आपकी ऐसी गैर जिम्मेदाराना हरकतें पूरे उत्तर शर...
141,141,Thakur_deepak_s,@narendramodi आपने कहा था कि देश नबी झुकने दूं...,INC,hi,https://indianexpress.com/article/india/global...,,indianexpress.com,आपने नबी झुकने दूंगा आपने कलंकित कसर छोड़ी
216,216,Thakur_deepak_s,@sambitswaraj What about dis?\nhttps://t.co/mx...,INC,en,https://timesofindia.indiatimes.com/city/hubba...,,indiatimes.com,dis
218,218,Thakur_deepak_s,Does any media channel has d courage to show d...,INC,en,https://timesofindia.indiatimes.com/city/hubba...,,indiatimes.com,media channel courage dis isnt dis news runnin...


In [18]:
import re
import string
import warnings
from bs4 import BeautifulSoup
#from markdown import markdown

# Character class of emoji / pictograph code points stripped by `remove_emojis`.
# NOTE(review): several ranges overlap, and \U000024C2-\U0001F251 is very wide
# (it spans far more than emoji) — confirm that breadth is intended.
emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U0001F1F2-\U0001F1F4"  # Macau flag
        u"\U0001F1E6-\U0001F1FF"  # flags
        u"\U0001F600-\U0001F64F"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U0001F1F2"
        u"\U0001F1F4"
        u"\U0001F620"
        u"\u200d"
        u"\u2640-\u2642"
        "]+", flags=re.UNICODE)


# Single-purpose string cleaners; each takes a str and returns a str.
# NOTE(review): nothing in the visible cells calls these helpers.

def remove_urls(x):
    """Replace each token starting with ``http`` by one space, keeping the text after it."""
    # A greedy `http(.+)?` would also swallow everything after the URL on that line.
    return re.sub(r"http\S*", " ", x)


def remove_RT(x):
    """Drop the retweet marker ``"RT "`` when it is a standalone token."""
    # \b keeps words that merely end in "RT" (e.g. "ALERT ") intact.
    return re.sub(r"\bRT ", "", x)


def remove_mentions(x):
    """Remove ``@`` followed by any run of non-whitespace characters."""
    return re.sub(r"@\S+", "", x)


def remove_hashtags(x):
    """Remove ``#`` followed by any run of non-whitespace characters."""
    return re.sub(r"#\S+", "", x)


def remove_digits(x):
    """Remove every run of digits."""
    return re.sub(r"\d+", "", x)


def remove_punct(x):
    """Remove the listed punctuation marks, including the Devanagari danda."""
    return re.sub(r"!|\||\%|\.|\-|\/|:|…|,|\?|।+", "", x)


def remove_emojis(x):
    """Remove every character matched by ``emoji_pattern``."""
    return emoji_pattern.sub("", x)


def normalize_spaces(x):
    """Collapse runs of newlines, carriage returns, tabs and spaces into one space."""
    return re.sub(r"[\n\r\t ]+", " ", x)


def remove_stop(x):
    """Lower-case ``x`` and drop whitespace-separated tokens found in ``stopwords``.

    NOTE(review): ``stopwords`` is a global that is not defined in the visible
    cells; it must exist before this function is called.
    """
    return " ".join(tok for tok in x.lower().split() if tok not in stopwords)


In [19]:
# Number of rows left after the domain filters.
len(filtered_link)

97694

## Extract phrases

In [23]:
print("prepping corpus for topicmodel")
# Pull the cleaned text and party labels out of the frame, in row order.
corp = filtered_link['clean_text'].tolist()
parties = filtered_link['party'].tolist()
# Whitespace-tokenise every cleaned document.
corp_tokens = [doc.split() for doc in corp]
# Learn frequent collocations, then rewrite each document with the detected
# pairs joined into single tokens.
# NOTE(review): threshold=0.1 looks very permissive — in the sample printed
# further down every adjacent pair was merged; confirm this is intended.
phrases = Phrases(corp_tokens, min_count=10, threshold=0.1)
tokes = [phrases[doc_tokens] for doc_tokens in corp_tokens]
# Sanity check: the three lengths must agree for later positional alignment.
print(len(filtered_link), len(corp_tokens), len(tokes))

prepping corpus for topicmodel
97694 97694 97694


In [30]:
# Spot-check one document: plain tokens vs. tokens after phrase merging.
print(corp_tokens[250], tokes[250])

['कैसे', 'अच्छे', 'आमजन', 'जेब', 'काटो', 'कारोबार', 'बर्बाद', 'करो'] ['कैसे_अच्छे', 'आमजन_जेब', 'काटो_कारोबार', 'बर्बाद_करो']


In [31]:
# Pool the phrase tokens of every row under the domain that row links to.
# `tokes` was built from `filtered_link` in row order, so zipping the domain
# column with it pairs each row's domain with that row's tokens.
# extend() appends in place; rebuilding the list with `+` for every row made
# the loop quadratic in the number of tokens per domain.
domain_dict = defaultdict(list)
for dom, row_tokens in zip(filtered_link['domain'], tokes):
    domain_dict[dom].extend(row_tokens)

In [32]:
# Domains that have at least one row in `filtered_link`.
domain_dict.keys()

dict_keys(['bhaskar.com', 'indiatimes.com', 'indianexpress.com', 'livemint.com', 'patrika.com', 'altnews.in', 'oneindia.com', 'indiatoday.in', 'outlookindia.com', 'npg.news', 'lalluram.com', 'ibc24.in', 'thewirehindi.com', 'aajtak.in', 'ndtv.com', 'firstpost.com', 'economictimes.com', 'timesofindia.com', 'businessinsider.in', 'republicworld.com', 'eurasiantimes.com', 'opindia.com', 'india.com', 'kashmirobserver.net', 'aninews.in', 'sundayguardianlive.com', 'news18.com', 'jagran.com', 'etvbharat.com', 'thedailyguardian.com', 'dailyo.in', 'newsnationtv.com', 'tribuneindia.com', 'tv9hindi.com', 'abplive.com', 'theprint.in', 'news9live.com', 'greaterkashmir.com', 'crosstownnews.in', 'theweek.in', 'telegraphindia.com', 'rajyasameeksha.com', 'thewire.in', 'thehindubusinessline.com', 'carandbike.com', 'thelallantop.com', 'thehindu.com', 'scroll.in', 'barandbench.com', 'thequint.com', 'japantimes.co.jp', 'sentinelassam.com', 'nationalheraldindia.com', 'deccanherald.com', 'freepressjournal.in',

In [34]:
# Invert the mapping: token -> domains it occurs under. A domain is appended
# once per occurrence, so the lists contain repeats at this stage.
keywords2domain = defaultdict(list)
for dom_name, dom_tokens in domain_dict.items():
    for token in dom_tokens:
        keywords2domain[token].append(dom_name)

In [38]:
# Inspect the pooled tokens of one domain.
# NOTE: `domain_dict` is a defaultdict, so indexing a missing key would insert
# it with an empty list; the full list is displayed, which can be very long.
domain_dict['ichowk.in']

['पाकिस्तान',
 'इमरान',
 'खान',
 'बस',
 'चले',
 'भारतइजरायल',
 'संबंधों',
 'खराब',
 'मौका',
 'चूकें',
 'दोनों',
 'देशों',
 'मजबूत',
 'संबंधों',
 'मेरे',
 'आलेख',
 'न्यूज_कवरेज',
 'आई',
 'चौक',
 'सबों',
 'समक्ष',
 'शाहरुख',
 'खान',
 'पत्नी',
 'गौरी',
 'खान',
 'बेटे',
 'आर्यन_खान',
 'लालनपालन',
 'सही',
 'ताकि',
 'बेहतर',
 'नागरिक',
 'बन',
 'उभरे',
 'इतनी',
 'भर',
 'उम्मीद',
 'जरूर',
 'कानून',
 'देंगे',
 'तालिबानी',
 'लड़ाकों',
 'अफगानिस्तान',
 'कब्जा',
 'लिया',
 'इस्लामिक',
 'मुल्कों',
 'दिए',
 'खादपानी',
 'उपज',
 'तालिबान',
 'वर्तमान',
 'समय',
 'अफगानिस्तान',
 'स्थिति',
 'बदतर',
 'दी',
 'महिला',
 'पुरुष',
 'हॉकी',
 'टीमों',
 'टोक्यो_ओलंपिक',
 'खेलों',
 'चमत्कारी',
 'प्रदर्शन',
 'गर्व',
 'महसूस',
 'दोनों',
 'टीमों',
 'अधिकतर',
 'खिलाड़ियों',
 'जिनका',
 'संबंध',
 'छोटेछोटे',
 'शहरों',
 'कस्बों',
 'गांवों',
 'कमाल',
 'दिखाया',
 'सरकारों',
 'संकटकाल',
 'आरोपप्रत्यारोप',
 'समय',
 'गंवाए',
 'बिना',
 'कारगर',
 'स्वास्थ्य',
 'सेवाएं',
 'उपलब्ध',
 'करवाने',
 'चाहिए',
 'गांधीनेहरू',
 'परिवार',
 '

In [39]:
# Inspect the domains recorded for one keyword (one entry per occurrence).
# NOTE: `keywords2domain` is a defaultdict, so indexing a missing key would
# insert it with an empty list.
keywords2domain['पाकिस्तान']

['bhaskar.com',
 'bhaskar.com',
 'bhaskar.com',
 'bhaskar.com',
 'bhaskar.com',
 'bhaskar.com',
 'indiatimes.com',
 'indiatimes.com',
 'indiatimes.com',
 'indiatimes.com',
 'indianexpress.com',
 'indianexpress.com',
 'indianexpress.com',
 'indianexpress.com',
 'patrika.com',
 'altnews.in',
 'altnews.in',
 'altnews.in',
 'altnews.in',
 'altnews.in',
 'altnews.in',
 'oneindia.com',
 'oneindia.com',
 'oneindia.com',
 'oneindia.com',
 'oneindia.com',
 'indiatoday.in',
 'thewirehindi.com',
 'thewirehindi.com',
 'thewirehindi.com',
 'thewirehindi.com',
 'aajtak.in',
 'aajtak.in',
 'aajtak.in',
 'aajtak.in',
 'aajtak.in',
 'aajtak.in',
 'aajtak.in',
 'aajtak.in',
 'aajtak.in',
 'aajtak.in',
 'aajtak.in',
 'aajtak.in',
 'aajtak.in',
 'aajtak.in',
 'aajtak.in',
 'aajtak.in',
 'aajtak.in',
 'aajtak.in',
 'aajtak.in',
 'aajtak.in',
 'aajtak.in',
 'ndtv.com',
 'ndtv.com',
 'ndtv.com',
 'ndtv.com',
 'ndtv.com',
 'ndtv.com',
 'republicworld.com',
 'republicworld.com',
 'republicworld.com',
 'republi

In [40]:
# Collapse each keyword's domain list to its distinct domains. Only existing
# keys are reassigned, so the dict does not change size while being iterated.
for kw, kw_domains in keywords2domain.items():
    keywords2domain[kw] = list(set(kw_domains))

### Build the domain network from shared keywords

In [41]:
# Build the domain co-occurrence counts from keywords used under more than
# five distinct domains:
#   nodes[domain]  = number of such keywords the domain uses
#   edges[(a, b)]  = number of such keywords shared by domains a and b (a < b)
# Every domain of a qualifying keyword is counted exactly once per keyword.
# Previously only the last domain in arbitrary set order was always counted and
# the others at most once overall, so node sizes varied from run to run.
nodes = defaultdict(int)
edges = defaultdict(int)
for kw_domains in keywords2domain.values():
    # Sorting makes each (a, b) edge key canonical and the result deterministic.
    domainlist = sorted(set(kw_domains))
    if len(domainlist) > 5:
        for pos, dom_a in enumerate(domainlist):
            nodes[dom_a] += 1
            for dom_b in domainlist[pos + 1:]:
                edges[(dom_a, dom_b)] += 1

In [49]:
import networkx as nx

G = nx.Graph()

# Domains already added to the graph as nodes.
ndlist = []

# Keep only pairs whose shared-keyword count exceeds 50; a domain becomes a
# node (sized by its `nodes` count) the first time it appears in a kept pair.
for pair, shared in edges.items():
    if shared <= 50:
        continue
    for endpoint in pair:
        if endpoint not in ndlist:
            G.add_node(endpoint, size=nodes[endpoint])
            ndlist.append(endpoint)

    G.add_edge(pair[0], pair[1], weight=shared)

In [50]:
# Write the graph to a GEXF file (UTF-8) in the current working directory.
nx.write_gexf(G, "./Jan24_domainkeywords.gexf", encoding='utf-8')