In [89]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
import spacy
import matplotlib as mpl
import seaborn as sns
import warnings


warnings.filterwarnings('ignore')
plt.style.use('dark_background')
mpl.rcParams['axes.prop_cycle'] = plt.cycler(color=['blue'])

In [90]:
#get encoding of the data:
import chardet

with open('./data/train.csv' , 'rb') as f:
    result = chardet.detect(f.read(100000))
    # print(result)

df = pd.read_csv('./data/train.csv' , encoding = result['encoding'])
df.columns = ['itemid' , 'sentiment' , 'text']
#lower the tweets:
df['text'] = df['text'].str.lower()
df.head()

Unnamed: 0,itemid,sentiment,text
0,1,0,"@railminindia my pnr is 8348062961, i am in wa..."
1,2,0,@sureshpprabhu @railminindia ac not working in...
2,3,0,@railminindia i'm traveling to chennai by trai...
3,4,5,@railminindia irctc is not responding at the t...
4,5,7,@drmbhopal @railminindia @sanjaygupta2012 @drm...


In [91]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1366 entries, 0 to 1365
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   itemid     1366 non-null   int64 
 1   sentiment  1366 non-null   int64 
 2   text       1366 non-null   object
dtypes: int64(2), object(1)
memory usage: 32.1+ KB


# Clearning and feature engineering:


In [92]:
df.head()

Unnamed: 0,itemid,sentiment,text
0,1,0,"@railminindia my pnr is 8348062961, i am in wa..."
1,2,0,@sureshpprabhu @railminindia ac not working in...
2,3,0,@railminindia i'm traveling to chennai by trai...
3,4,5,@railminindia irctc is not responding at the t...
4,5,7,@drmbhopal @railminindia @sanjaygupta2012 @drm...


### punctutation removal and lower-text


In [93]:
import string

transtable = str.maketrans("" , "" , string.punctuation)
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

### User mention extration:

create a column that has a list of all the @handles mentioned in the text.


In [94]:
import regex as re
samplet = df['text'].iloc[4]
pattern = r'@\w+'
print(f"{samplet=}")
np.unique(re.findall(pattern , samplet))

samplet='@drmbhopal @railminindia @sanjaygupta2012 @drmncrald matter notified to concerned official @bhusavaldivn'


array(['@bhusavaldivn', '@drmbhopal', '@drmncrald', '@railminindia',
       '@sanjaygupta2012'], dtype='<U16')

In [95]:
df['mentions'] = df['text'].apply(lambda x : re.findall(pattern , x))
df.head()

Unnamed: 0,itemid,sentiment,text,mentions
0,1,0,"@railminindia my pnr is 8348062961, i am in wa...",[@railminindia]
1,2,0,@sureshpprabhu @railminindia ac not working in...,"[@sureshpprabhu, @railminindia]"
2,3,0,@railminindia i'm traveling to chennai by trai...,[@railminindia]
3,4,5,@railminindia irctc is not responding at the t...,[@railminindia]
4,5,7,@drmbhopal @railminindia @sanjaygupta2012 @drm...,"[@drmbhopal, @railminindia, @sanjaygupta2012, ..."


### hashtag extraction:

- Lowered text for hastags


In [96]:
#regex pattern with negative look-behind:
hashpattern = r"(?<!\w)#\w+"
samples = df['text'].iloc[:500]

for i , tweet in enumerate(samples):
    if(re.search(hashpattern , tweet)):
        print(f"{i=} , {tweet=} , {re.findall(hashpattern , tweet)}")

i=8 , tweet='enter to win $150 amazon gift card! #books #pnr #urbanfantasy #romance https://t.co/jbfxzde7p2' , ['#books', '#pnr', '#urbanfantasy', '#romance']
i=11 , tweet='@railminindia #swatch bharat abhiyaan# train stops at kozhikod(calicut) but no one comes in 12218 coacha1 #no water# https://t.co/h5uudjuvel' , ['#swatch', '#no']
i=54 , tweet='@rpfcrsur @railminindia how many months or should i say years should i wait for the conscience of #corrupt #rottenâ€šÃ¤Â¶ https://t.co/2h0v7kwzgy' , ['#corrupt', '#rotten']
i=71 , tweet='rt @karailway: provide 1min stoppage for karnataka sampark kranthi at #haveri @drmmys @gmswr @railminindia @piyushgoyaloffcâ€šÃ¤Â¶ ' , ['#haveri']
i=103 , tweet='@railminindia got double tatkal tickets cnfrmd through same login costing 12k technical issue costed double[irctc #3515189] @drmsecunderabad' , ['#3515189']
i=108 , tweet='@sureshpprabhu @railminindia @gmner_gkp @drm_asn train 15047  light are not working passengers are suffering #help https://t.co/6

In [97]:
df['hashtags'] = df['text'].apply(lambda x : re.findall(hashpattern , x))
df.iloc[:20]

Unnamed: 0,itemid,sentiment,text,mentions,hashtags
0,1,0,"@railminindia my pnr is 8348062961, i am in wa...",[@railminindia],[]
1,2,0,@sureshpprabhu @railminindia ac not working in...,"[@sureshpprabhu, @railminindia]",[]
2,3,0,@railminindia i'm traveling to chennai by trai...,[@railminindia],[]
3,4,5,@railminindia irctc is not responding at the t...,[@railminindia],[]
4,5,7,@drmbhopal @railminindia @sanjaygupta2012 @drm...,"[@drmbhopal, @railminindia, @sanjaygupta2012, ...",[]
5,6,6,@railminindia if you can't give justice to gra...,[@railminindia],[]
6,7,4,@sureshpprabhu @railminindiaplz wrkout smthng ...,"[@sureshpprabhu, @railminindiaplz]",[]
7,8,0,@railminindia @mumbairailusers dirty water flo...,"[@railminindia, @mumbairailusers]",[]
8,9,6,enter to win $150 amazon gift card! #books #pn...,[],"[#books, #pnr, #urbanfantasy, #romance]"
9,10,3,@railminindia still vendors are selling local ...,[@railminindia],[]


### Extract urls:


In [98]:
urlpattern = r"https?://\S+|www\.|S+"
for i , tweet in enumerate(samples):
    if(re.search(hashpattern , tweet)):
        print(f"{i=} , {tweet=} , {re.findall(hashpattern , tweet)}")

i=8 , tweet='enter to win $150 amazon gift card! #books #pnr #urbanfantasy #romance https://t.co/jbfxzde7p2' , ['#books', '#pnr', '#urbanfantasy', '#romance']
i=11 , tweet='@railminindia #swatch bharat abhiyaan# train stops at kozhikod(calicut) but no one comes in 12218 coacha1 #no water# https://t.co/h5uudjuvel' , ['#swatch', '#no']
i=54 , tweet='@rpfcrsur @railminindia how many months or should i say years should i wait for the conscience of #corrupt #rottenâ€šÃ¤Â¶ https://t.co/2h0v7kwzgy' , ['#corrupt', '#rotten']
i=71 , tweet='rt @karailway: provide 1min stoppage for karnataka sampark kranthi at #haveri @drmmys @gmswr @railminindia @piyushgoyaloffcâ€šÃ¤Â¶ ' , ['#haveri']
i=103 , tweet='@railminindia got double tatkal tickets cnfrmd through same login costing 12k technical issue costed double[irctc #3515189] @drmsecunderabad' , ['#3515189']
i=108 , tweet='@sureshpprabhu @railminindia @gmner_gkp @drm_asn train 15047  light are not working passengers are suffering #help https://t.co/6

In [100]:
def processUrl(text):
    return pd.Series((re.findall(urlpattern , text) , re.sub(urlpattern , "" , text)))
df[['urls' ,'text']] = df['text'].apply(processUrl)
df.head(20)

Unnamed: 0,itemid,sentiment,text,mentions,hashtags,urls
0,1,0,"@railminindia my pnr is 8348062961, i am in wa...",[@railminindia],[],[]
1,2,0,@sureshpprabhu @railminindia ac not working in...,"[@sureshpprabhu, @railminindia]",[],[]
2,3,0,@railminindia i'm traveling to chennai by trai...,[@railminindia],[],[]
3,4,5,@railminindia irctc is not responding at the t...,[@railminindia],[],[]
4,5,7,@drmbhopal @railminindia @sanjaygupta2012 @drm...,"[@drmbhopal, @railminindia, @sanjaygupta2012, ...",[],[]
5,6,6,@railminindia if you can't give justice to gra...,[@railminindia],[],[]
6,7,4,@sureshpprabhu @railminindiaplz wrkout smthng ...,"[@sureshpprabhu, @railminindiaplz]",[],[]
7,8,0,@railminindia @mumbairailusers dirty water flo...,"[@railminindia, @mumbairailusers]",[],[https://t.co/mbzuofklxq]
8,9,6,enter to win $150 amazon gift card! #books #pn...,[],"[#books, #pnr, #urbanfantasy, #romance]",[https://t.co/jbfxzde7p2]
9,10,3,@railminindia still vendors are selling local ...,[@railminindia],[],[https://t.co/wdfyllmmek]


In [103]:
emojit = "Hey xoxoxoxoxo ðŸ’ªðŸ™ˆðŸ˜˜"

In [104]:
import emoji

emoji.demojize(emojit)

'Hey xoxoxoxoxo :flexed_biceps::see-no-evil_monkey::face_blowing_a_kiss:'

### Translate emojis to text:


In [107]:
df['etext'] = df['text'].apply(lambda x : emoji.demojize(x))
df.head(20)

Unnamed: 0,itemid,sentiment,text,mentions,hashtags,urls,etext
0,1,0,"@railminindia my pnr is 8348062961, i am in wa...",[@railminindia],[],[],"@railminindia my pnr is 8348062961, i am in wa..."
1,2,0,@sureshpprabhu @railminindia ac not working in...,"[@sureshpprabhu, @railminindia]",[],[],@sureshpprabhu @railminindia ac not working in...
2,3,0,@railminindia i'm traveling to chennai by trai...,[@railminindia],[],[],@railminindia i'm traveling to chennai by trai...
3,4,5,@railminindia irctc is not responding at the t...,[@railminindia],[],[],@railminindia irctc is not responding at the t...
4,5,7,@drmbhopal @railminindia @sanjaygupta2012 @drm...,"[@drmbhopal, @railminindia, @sanjaygupta2012, ...",[],[],@drmbhopal @railminindia @sanjaygupta2012 @drm...
5,6,6,@railminindia if you can't give justice to gra...,[@railminindia],[],[],@railminindia if you can't give justice to gra...
6,7,4,@sureshpprabhu @railminindiaplz wrkout smthng ...,"[@sureshpprabhu, @railminindiaplz]",[],[],@sureshpprabhu @railminindiaplz wrkout smthng ...
7,8,0,@railminindia @mumbairailusers dirty water flo...,"[@railminindia, @mumbairailusers]",[],[https://t.co/mbzuofklxq],@railminindia @mumbairailusers dirty water flo...
8,9,6,enter to win $150 amazon gift card! #books #pn...,[],"[#books, #pnr, #urbanfantasy, #romance]",[https://t.co/jbfxzde7p2],enter to win $150 amazon gift card! #books #pn...
9,10,3,@railminindia still vendors are selling local ...,[@railminindia],[],[https://t.co/wdfyllmmek],@railminindia still vendors are selling local ...


### Tokenization and comparision b/w tokenization approaches for nltk and spacy:


In [109]:
from nltk.tokenize import word_tokenize , sent_tokenize
import spacy
nlp = spacy.load("en_core_web_sm")

# get 100 random tweets
samples = df['etext'].sample(100)
samples


852     @sureshpprabhu @railminindia pnr no : 41200383...
336     @railminindia @soumyaz007 sir, attended u r mo...
820     @railminindia meals on wheels horrible food de...
578     @railminindia hats off @sureshpprabhu and your...
335     @ankulaagarwal @drmbct @railminindia @ir_edmec...
                              ...                        
899     @railminindia sir tea shop at platform no.2 sa...
1358    @railminindia travelling from nagpur to newdel...
200     @railminindia train 17211 on 8 june from gnt t...
1189    @sureshpprabhu @railminindia @westernrly train...
1170    @railminindia @sureshpprabhu sir, i hv mistype...
Name: etext, Length: 100, dtype: object

In [None]:
# number of unique words for nltk and spacy:
nltkavg = 0
lwords = []
for tweet in samples:
    words = [word for word in word_tokenize(tweet)]
    nltkavg += len(words)
    lwords+= [*words]
nltkavg /= 100
print(f"{len(np.unique(lwords))=} , {nltkavg=}")


len(np.unique(lwords))=897 , nltkavg=23.77


In [124]:
# token analysis using spacy:
savg = 0
swords = []

for tweet in samples:
    words = [token.text for token in nlp(tweet)]
    savg += len(words)
    swords += [*words]

savg /= 100
print(f"{len(np.unique(swords))=} , {savg=}")

len(np.unique(swords))=904 , savg=21.67


### Stem text using potter stemmer


In [None]:
%%timeit
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

def stemtweet(tweet):
    #tokenize tweet:
    words = [token.text for token in nlp(tweet)]
    #stem the tokens:
    swords = [stemmer.stem(word) for word in words]
    #return stemmed tweet:
    return [*swords]
# for tweet in samples:
#     print(f"{tweet= } \n, {stemtweet(tweet)}")

def lemmatizetweet(tweet):
    lwords = [word.lemma_ for word in nlp(tweet) if not word.is_stop]
    return lwords

def getpos(tweet):
    tags = [word.tag_ for word in nlp(tweet) if not word.is_stop]
    return tags

df['stemmed_text'] = df['text'].apply(lambda x : stemtweet(x))
df['lemmatized_text'] = df['text'].apply(lambda x : lemmatizetweet(x))
df['pos_tags'] = df['text'].apply(lambda x : getpos(x))
# df.head()