# Import Libraries and Data

In [1]:
import pandas as pd
import re

import spacy
from spacy.lang.en.stop_words import STOP_WORDS
nlp = spacy.load('en_core_web_sm')

In [2]:
df = pd.read_csv('tweets.csv')
print(df.shape)
df.head(2)

(5157, 14)


Unnamed: 0,text,favorited,favoriteCount,replyToSN,created,truncated,replyToSID,id,replyToUID,statusSource,screenName,retweetCount,isRetweet,retweeted
0,RT @rssurjewala: Critical question: Was PayTM ...,False,0.0,,2016-11-23 18:40:30,False,,8.014957e+17,,"<a href=""http://twitter.com/download/android"" ...",HASHTAGFARZIWAL,331.0,True,False
1,RT @Hemant_80: Did you vote on #Demonetization...,False,0.0,,2016-11-23 18:40:29,False,,8.014957e+17,,"<a href=""http://twitter.com/download/android"" ...",PRAMODKAUSHIK9,66.0,True,False


In [3]:
# reduce to text column
df.drop(df.columns[1:], axis=1, inplace=True)

In [4]:
df.loc[512,'text']

'RT @smita_muk: BREAKING NEWS\r\nPMapps result amnounced!\r\n90% Indians support #demonetization\r\n<ed><U+00A0><U+00BD><ed><U+00B1><U+008F><ed><U+00A0><U+00BD><ed><U+00B1><U+008F><ed><U+00A0><U+00BD><ed><U+00B1><U+008F><ed><U+00A0><U+00BD><ed><U+00B1><U+008F><ed><U+00A0><U+00BD><ed><U+00B1><U+008F><ed><U+00A0><U+00BD><ed><U+00B1><U+008F><U+270C><U+270C><U+270C><U+270C><U+270C><U+270C><ed><U+00A0><U+00BD><ed><U+00B1><U+0086><ed><U+00A0><U+00BD><ed><U+00B1><U+0086><ed><U+00A0><U+00BD><ed><U+00B1><U+0086><ed><U+00A0><U+00BD><ed><U+00B1><U+0086><ed><U+00A0><U+00BD><ed><U+00B1><U+0086><ed><U+00A0><U+00BD><ed><U+00B1><U+0086>\r\n@narendramodi Zindabad!'

In [5]:
def preprocess(text):

    # Remove unicode characters
    text = re.sub(r"<U\+[A-Z0-9]+>|<ed>", "", text)
    # Remove newline and rawstring characters
    text = re.sub(r"\n|\r", " ", text)
    text = re.sub(r"  ", " ", text)
    text = re.sub(r"   ", " ", text)
    text = re.sub(r"    ", " ", text)

    return text

In [6]:
df['text'] = df['text'].apply(preprocess)

In [7]:
df.loc[512,'text']

'RT @smita_muk: BREAKING NEWS PMapps result amnounced! 90% Indians support #demonetization  @narendramodi Zindabad!'

# Number of Mentions

In [9]:
def mentions(text):
    mentions = re.findall('@\w+', text)
    return len(mentions)

In [10]:
df['mentions_count'] = df['text'].apply(mentions)
df.head()

Unnamed: 0,text,mentions_count
0,RT @rssurjewala: Critical question: Was PayTM ...,1
1,RT @Hemant_80: Did you vote on #Demonetization...,1
2,"RT @roshankar: Former FinSec, RBI Dy Governor,...",1
3,RT @ANI_news: Gurugram (Haryana): Post office ...,1
4,RT @satishacharya: Reddy Wedding! @mail_today ...,2


In [11]:
df['mentions_count'].describe()

count    5157.000000
mean        0.946481
std         1.195027
min         0.000000
25%         0.000000
50%         1.000000
75%         1.000000
max        13.000000
Name: mentions_count, dtype: float64

# Number of Hashtags

In [12]:
def hashtags(text):
    hashtags = re.findall('#\w+', text)
    return len(hashtags)

In [16]:
df['hashtags_count'] = df['text'].apply(hashtags)
df[df['mentions_count'] != df['hashtags_count']].head(2)

Unnamed: 0,text,mentions_count,hashtags_count
7,RT @Joydeep_911: Calling all Nationalists to j...,1,2
8,RT @sumitbhati2002: Many opposition leaders ar...,2,1


In [17]:
df['hashtags_count'].describe()

count    5157.000000
mean        1.125461
std         1.362941
min         0.000000
25%         0.000000
50%         1.000000
75%         1.000000
max        10.000000
Name: hashtags_count, dtype: float64

# Number of Name Titles

In [18]:
def title(text):
    count = re.findall('Mr\.|Mrs\.|Dr\.|Miss\s*', text)
    return len(count)

In [25]:
df['text'].apply(title)

0       0
1       0
2       0
3       0
4       0
       ..
5152    0
5153    0
5154    0
5155    0
5156    0
Name: text, Length: 5157, dtype: int64

# Word Count

In [26]:
df['word_count'] = [len(i.split()) for i in df['text']]
df.head(2)

Unnamed: 0,text,mentions_count,hashtags_count,word_count
0,RT @rssurjewala: Critical question: Was PayTM ...,1,1,20
1,RT @Hemant_80: Did you vote on #Demonetization...,1,1,11


In [27]:
df['word_count'].describe()

count    5157.000000
mean       16.461703
std         4.752193
min         1.000000
25%        13.000000
50%        17.000000
75%        20.000000
max        29.000000
Name: word_count, dtype: float64

# Number of Characters

In [28]:
df['character_count'] = [len(i) for i in df['text']]
df.head(2)

Unnamed: 0,text,mentions_count,hashtags_count,word_count,character_count
0,RT @rssurjewala: Critical question: Was PayTM ...,1,1,20,144
1,RT @Hemant_80: Did you vote on #Demonetization...,1,1,11,66


In [40]:
df['character_count'].describe()

count    5157.000000
mean      122.200310
std        24.099436
min         9.000000
25%       110.000000
50%       134.000000
75%       140.000000
max       149.000000
Name: character_count, dtype: float64

# Number of Characters Without Spaces

In [34]:
def remove_spaces(text):
    text = re.sub(r" ", "", text)
    return text

In [35]:
nospace_df = pd.DataFrame()
nospace_df['text'] = df['text'].apply(remove_spaces)

In [36]:
nospace_df['character_count'] = [len(i) for i in nospace_df['text']]
nospace_df.head(2)

Unnamed: 0,text,character_count
0,RT@rssurjewala:Criticalquestion:WasPayTMinform...,125
1,RT@Hemant_80:Didyouvoteon#DemonetizationonModi...,56


# Average Word Length

In [37]:
def avg_word_len(text):
    word_lens = 0
    for token in text.split():
        word_lens += len(token)
    word_count = text.split()
    return word_lens/len(word_count)

In [39]:
df['avg_word_len'] = df['text'].apply(avg_word_len)
df.head(2)

Unnamed: 0,text,mentions_count,hashtags_count,word_count,character_count,avg_word_len
0,RT @rssurjewala: Critical question: Was PayTM ...,1,1,20,144,6.2
1,RT @Hemant_80: Did you vote on #Demonetization...,1,1,11,66,5.090909


# Count of Stopwords

In [41]:
def stopwords(text):
    doc = nlp(text)
    count = 0
    for token in doc:
        if token.is_stop == True:
            count += 1
    return count

In [42]:
df['stopwords'] = df['text'].apply(stopwords)

In [43]:
df.head(2)

Unnamed: 0,text,mentions_count,hashtags_count,word_count,character_count,avg_word_len,stopwords
0,RT @rssurjewala: Critical question: Was PayTM ...,1,1,20,144,6.2,7
1,RT @Hemant_80: Did you vote on #Demonetization...,1,1,11,66,5.090909,4


# Count of POS Tags

In [44]:
def pos(text):
    doc = nlp(text)
    count = 0
    for token in doc:
        if token.pos_ in ["NOUN","ADP","ADJ"]:
            count += 1
    return count

In [45]:
df['pos'] = df['text'].apply(pos)
df.head(2)

Unnamed: 0,text,mentions_count,hashtags_count,word_count,character_count,avg_word_len,stopwords,pos
0,RT @rssurjewala: Critical question: Was PayTM ...,1,1,20,144,6.2,7,9
1,RT @Hemant_80: Did you vote on #Demonetization...,1,1,11,66,5.090909,4,4


# Named Entity Recognition

In [46]:
def ner(text):
    doc = nlp(text)
    count = 0
    for ent in doc.ents:
        if ent.label_:
            count += 1
    return count

In [47]:
df['ner'] = df['text'].apply(ner)
df.head(2)

Unnamed: 0,text,mentions_count,hashtags_count,word_count,character_count,avg_word_len,stopwords,pos,ner
0,RT @rssurjewala: Critical question: Was PayTM ...,1,1,20,144,6.2,7,9,3
1,RT @Hemant_80: Did you vote on #Demonetization...,1,1,11,66,5.090909,4,4,1


# Assignment Questions

1. Get the count of digits in the documents

In [48]:
def digits(text):
    digits = re.findall('\d', text)
    return len(digits)

In [49]:
df['digits_count'] = df['text'].apply(digits)

In [50]:
df.head(2)

Unnamed: 0,text,mentions_count,hashtags_count,word_count,character_count,avg_word_len,stopwords,pos,ner,digits_count
0,RT @rssurjewala: Critical question: Was PayTM ...,1,1,20,144,6.2,7,9,3,0
1,RT @Hemant_80: Did you vote on #Demonetization...,1,1,11,66,5.090909,4,4,1,2


2. Get the most frequently used NER

In [69]:
ents_list = []

for i in range(len(df['text'])):
    text = df['text'][i]
    doc = nlp(text)
    for ent in doc.ents:
        ents_list.append(ent)

In [88]:
ents_df = pd.DataFrame()
ents_df['entity_text'] = ents_list
ents_df.head(20)

Unnamed: 0,entity_text
0,(RT)
1,(PayTM)
2,"(about, #, Demonetization)"
3,"(#, Demonetization)"
4,(FinSec)
5,(RBI)
6,"(CBDT, Chair, +, Harvard)"
7,"(Aam, Aadmi)"
8,(Gurugram)
9,(@mail_today)


In [95]:
ents_list_2 = []
for i in range(len(df['text'])):
    text = df['text'][i]
    for j in text.split(sep=" "):
        ents_list_2.append(j)

In [97]:
ents_df_2 = pd.DataFrame()
ents_df_2['entity'] = ents_list_2
ents_df_2.head(2)

Unnamed: 0,entity
0,RT
1,@rssurjewala:


In [98]:
ents_df_2.value_counts()

entity 
to         1652
the        1583
of         1582
RT         1367
in         1330
           ... 
4)            1
4%.           1
bu            1
bubble.       1
bst           1
Name: count, Length: 19844, dtype: int64

3. Get the most frequently used POS tag

In [99]:
pos_list = []

for i in range(len(df['text'])):
    text = df['text'][i]
    doc = nlp(text)
    for token in doc:
        pos_list.append(token.pos_)

In [100]:
pos_df = pd.DataFrame()
pos_df['pos'] = pos_list
pos_df.head(2)

Unnamed: 0,pos
0,PROPN
1,PROPN


In [101]:
pos_df.value_counts()

pos  
NOUN     22680
PROPN    17374
PUNCT    10553
VERB      9566
ADP       9254
ADJ       5275
PRON      4446
AUX       4387
DET       4166
SYM       3689
ADV       2890
SPACE     2388
PART      2307
CCONJ     1720
NUM       1608
SCONJ     1461
X         1057
INTJ       376
Name: count, dtype: int64