# Import Libraries and Data

In [1]:
import pandas as pd
import re

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('Demonatization_tweets.csv', encoding = 'unicode_escape', engine ='python')
df = df.set_index(df.columns[0])
df.head()

Unnamed: 0_level_0,text
Unnamed: 0,Unnamed: 1_level_1
1,RT @rssurjewala: Critical question: Was PayTM ...
2,RT @Hemant_80: Did you vote on #Demonetization...
3,"RT @roshankar: Former FinSec, RBI Dy Governor,..."
4,RT @ANI_news: Gurugram (Haryana): Post office ...
5,RT @satishacharya: Reddy Wedding! @mail_today ...


# Get Top 50 Mentions by Count

In [3]:
mentions_list = list()

for i in range(len(df)):
    text = df['text'].iloc[i]
    mentions = re.findall('@\w+', text)
    for m in mentions:
        mentions_list.append(m)

mentions_list[0:3]

['@rssurjewala', '@Hemant_80', '@roshankar']

In [4]:
mentions_df = pd.DataFrame(columns=['person', 'count'])
mentions_df['person'] = pd.Series(mentions_list).unique()
for i in range(len(mentions_df)):
    mentions_df['count'][i] = mentions_list.count(mentions_df['person'][i])
    
mentions_df = mentions_df.sort_values(by='count', ascending = False)
mentions_df[:51]

Unnamed: 0,person,count
1051,@evanspiegel,1311
1050,@URautelaForever,1273
11,@narendramodi,1138
8,@gauravcsawant,541
45,@ModiBharosa,540
113,@DrKumarVishwas,358
1072,@5Forty3,286
0,@rssurjewala,284
871,@rahulroushan,284
38,@PMOIndia,246


# Get Top 50 Hashtags by Count

In [5]:
hashtags_list = list()

for i in range(len(df)):
    text = df['text'].iloc[i]
    hashtags = re.findall('#\w+', text)
    for h in hashtags:
        hashtags_list.append(h.lower())

hashtags_list[0:10]

['#demonetization',
 '#demonetization',
 '#demonetization',
 '#demonetization',
 '#demonetization',
 '#reddywedding',
 '#demonetization',
 '#blackmoney',
 '#demonetization',
 '#corruptionfreeindia']

In [6]:
hashtags_df = pd.DataFrame(columns=['hashtag', 'count'])
hashtags_df['hashtag'] = pd.Series(hashtags_list).unique()
for i in range(len(hashtags_df)):
    hashtags_df['count'][i] = hashtags_list.count(hashtags_df['hashtag'][i])
    
hashtags_df = hashtags_df.sort_values(by='count', ascending = False)
hashtags_df[:51]

Unnamed: 0,hashtag,count
0,#demonetization,8652
28,#india,378
20,#nitishkumar,258
813,#demo,180
879,#glvmi,145
2,#blackmoney,121
3,#corruptionfreeindia,103
48,#modi,97
797,#vijaymallya,87
85,#demonetisation,86


# Find Sentences Mentioning 'Prime Minister'

In [7]:
pms_sent_list = list()

for i in range(len(df)):
    text = df['text'].iloc[i]
    
    sent = re.findall('Prime Minister | \
                    Prime Minister\s\w+ | \
                    prime minister | \
                    prime minister\s\w+', text)
    
    if len(sent) > 0:
        pms_sent_list.append(text.lower())

len(pms_sent_list)

61

There are 61 sentences mentioning 'prime minister'

In [8]:
pms_sent_list[0]

'open message of #indian #child to prime minister #modi\r\n@narendramodi #demonetization #demonetisation https://t.co/xkf8ijnouc'

# Use the prepositions to extract relevant information from the tweets

A preposition is a word - almost always a small, common word - that shows direction, location, or time, or that introduces an object.

- I sent a letter TO you -> to is a preposition showing direction
- Someone is AT the door -> at is a preposition showing location
- We will arrive BY noon -> by is a preposition showing time

I'll start by viewing the POS tags for individual tweets, to see what tag corresponds to this type of word.

In [9]:
df = pd.read_csv('Demonatization_tweets.csv', encoding = 'unicode_escape', engine ='python')
df = df.set_index(df.columns[0])

Removing unwanted text:

In [10]:
for i in range(1, len(df)+1):
    df['text'][i] = re.sub('@\w+', "", df['text'][i]) # mentions
    df['text'][i] = re.sub('#\w+', "", df['text'][i]) # hastags
    df['text'][i] = re.sub('&amp;', "", df['text'][i]) # ampersands
    df['text'][i] = re.sub('RT\s:\s', "", df['text'][i]) # RT
    df['text'][i] = re.sub(r'http\S+', '', df['text'][i]) # links
    df['text'][i] = re.sub(r'http:\S+', '', df['text'][i]) # links
    df['text'][i] = re.sub("[^a-zA-Z]", " ", df['text'][i]) # symbols
    df['text'][i] = re.sub("\s\s", " ", df['text'][i]) # double spaces
    df['text'][i] = df['text'][i].lower() # lower case

In [11]:
df.head()

Unnamed: 0_level_0,text
Unnamed: 0,Unnamed: 1_level_1
1,critical question was paytm informed about edi...
2,did you vote on on modi survey app
3,former finsec rbi dy governor cbdt chair harv...
4,gurugram haryana post office employees provid...
5,reddy wedding cartoon


In [12]:
import spacy
from spacy import displacy
nlp=spacy.load('en_core_web_sm')

In [13]:
text = df['text'][1]
doc=nlp(text)
for token in doc:
    print(token.text,'=>',token.pos_,'=>',token.tag_)

critical => ADJ => JJ
question => NOUN => NN
was => AUX => VBD
paytm => PROPN => NNP
informed => VERB => VBN
about => ADP => IN
edict => PROPN => NNP
by => ADP => IN
pm => NOUN => NN
it => PRON => PRP
s => VERB => VBZ
clearly => ADV => RB
fishy => ADJ => JJ
and => CCONJ => CC
requires => VERB => VBZ
full => ADJ => JJ
disclosure => NOUN => NN


The below shows that prepositional words like 'by' and 'about' have a POS label of 'ADP' (tag of 'IN'). It also shows how these words have a relationship to nouns (the noun follows).

In [14]:
displacy.render(doc,jupyter=True)

Using the first tweet, we see that:
- 'about' is a preposition followed by the word 'edict'
- 'by' is a preposition followed by the word 'pm'

In [18]:
text = df['text'][1]
print(text)
doc=nlp(text)

counter = 1

for token in doc:
    if token.pos_ == 'ADP':
        print(token.text, token.pos_, token.tag_)
        print(doc[counter])
    counter = counter + 1

critical question was paytm informed about edict by pm it s clearly fishy and requires full disclosure 
about ADP IN
edict
by ADP IN
pm


We can create a list of words following prepositions...

In [16]:
object_list = []

for i in range(1,len(df)+1):
    text = df['text'][i]
    doc=nlp(text)
    counter = 1
    for token in doc:
        if token.pos_ == 'ADP':
            try:
                object_list.append(doc[counter])
            except:
                pass
        counter = counter + 1

In [17]:
objects_df = pd.DataFrame(columns=['entity', 'count'])
objects_df['entity'] = pd.Series(object_list).unique()

for i in range(len(objects_df)):
    objects_df['count'][i] = object_list.count(objects_df['entity'][i])

objects_df[objects_df['entity'] != ""].head()

Unnamed: 0,entity,count
0,edict,1
1,pm,1
2,on,1
3,modi,1
4,aam,1
