In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [3]:
# Load the tweet dataset from a CSV file (path is relative to the working
# directory) and preview the first rows.
df = pd.read_csv("Twitter Sentiments.csv")
df.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [4]:
# Summarise the columns: dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31962 entries, 0 to 31961
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      31962 non-null  int64 
 1   label   31962 non-null  int64 
 2   tweet   31962 non-null  object
dtypes: int64(2), object(1)
memory usage: 749.2+ KB


In [5]:
# Count missing values per column.
df.isna().sum()

id       0
label    0
tweet    0
dtype: int64

## Preprocessing the text data

In [22]:
def remove_pattern(input_text, pattern):
    """Return ``input_text`` with every match of the regex ``pattern`` deleted.

    Parameters
    ----------
    input_text : str
        Text to clean.
    pattern : str or compiled pattern
        Regular expression whose matches are removed.

    Returns
    -------
    str
        ``input_text`` with all non-overlapping matches of ``pattern``
        replaced by the empty string.
    """
    return re.sub(pattern, "", input_text)

In [30]:
# Remove twitter handles (@user) from each tweet and store the result in a new
# `clean_tweet` column. The pattern is a raw string so that `\w` reaches the
# regex engine verbatim instead of being parsed as an (invalid) Python string
# escape, which newer interpreters warn about.
df['clean_tweet'] = np.vectorize(remove_pattern)(df['tweet'], r'@[\w]*')

In [31]:
# Preview the frame to compare the raw `tweet` column with `clean_tweet`
# after handle removal.
df.head()

Unnamed: 0,id,label,tweet,clean_tweet
0,1,0,@user when a father is dysfunctional and is s...,when a father is dysfunctional and is so sel...
1,2,0,@user @user thanks for #lyft credit i can't us...,thanks for #lyft credit i can't use cause th...
2,3,0,bihday your majesty,bihday your majesty
3,4,0,#model i love u take with u all the time in ...,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation,factsguide: society now #motivation


In [32]:
# Remove special characters, numbers and punctuation: every character that is
# not an ASCII letter or '#' is replaced with a space.
# `regex=True` is passed explicitly because pandas >= 2.0 treats the pattern as
# a literal string by default, in which case the character class would never
# match and the column would be left unchanged.
df['clean_tweet'] = df['clean_tweet'].str.replace(r"[^a-zA-Z#]", " ", regex=True)

In [33]:
# Preview `clean_tweet` after the character-replacement step.
df.head()

Unnamed: 0,id,label,tweet,clean_tweet
0,1,0,@user when a father is dysfunctional and is s...,when a father is dysfunctional and is so sel...
1,2,0,@user @user thanks for #lyft credit i can't us...,thanks for #lyft credit i can t use cause th...
2,3,0,bihday your majesty,bihday your majesty
3,4,0,#model i love u take with u all the time in ...,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation,factsguide society now #motivation


In [34]:
# Drop short words: keep only whitespace-separated tokens longer than two
# characters and re-join them with single spaces.
df['clean_tweet'] = df['clean_tweet'].apply(
    lambda text: ' '.join(token for token in text.split() if len(token) > 2)
)

In [35]:
# Preview `clean_tweet` after dropping words of one or two characters.
df.head()

Unnamed: 0,id,label,tweet,clean_tweet
0,1,0,@user when a father is dysfunctional and is s...,when father dysfunctional and selfish drags hi...
1,2,0,@user @user thanks for #lyft credit i can't us...,thanks for #lyft credit can use cause they don...
2,3,0,bihday your majesty,bihday your majesty
3,4,0,#model i love u take with u all the time in ...,#model love take with all the time
4,5,0,factsguide: society now #motivation,factsguide society now #motivation


In [36]:
# Tokenization: split every cleaned tweet on whitespace into a list of words.
tokenized_tweet = df['clean_tweet'].apply(str.split)

In [37]:
# Inspect the per-tweet token lists.
tokenized_tweet

0        [when, father, dysfunctional, and, selfish, dr...
1        [thanks, for, #lyft, credit, can, use, cause, ...
2                                  [bihday, your, majesty]
3               [#model, love, take, with, all, the, time]
4                  [factsguide, society, now, #motivation]
                               ...                        
31957                              [ate, isz, that, youuu]
31958    [see, nina, turner, the, airwaves, trying, wra...
31959    [listening, sad, songs, monday, morning, otw, ...
31960    [#sikh, #temple, vandalised, #calgary, #wso, c...
31961                       [thank, you, for, you, follow]
Name: clean_tweet, Length: 31962, dtype: object

In [38]:
# Stemming
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

In [40]:
# Stem every token, keeping one list of stems per tweet.
tokenized_tweet = tokenized_tweet.apply(lambda tokens: list(map(stemmer.stem, tokens)))

In [41]:
# Inspect the token lists after stemming.
tokenized_tweet

0        [when, father, dysfunct, and, selfish, drag, h...
1        [thank, for, #lyft, credit, can, use, caus, th...
2                                  [bihday, your, majesti]
3               [#model, love, take, with, all, the, time]
4                        [factsguid, societi, now, #motiv]
                               ...                        
31957                              [ate, isz, that, youuu]
31958    [see, nina, turner, the, airwav, tri, wrap, he...
31959    [listen, sad, song, monday, morn, otw, work, sad]
31960    [#sikh, #templ, vandalis, #calgari, #wso, cond...
31961                       [thank, you, for, you, follow]
Name: clean_tweet, Length: 31962, dtype: object

In [None]:
# Combining the words back to sentences.
# Join each token list with single spaces in one pass over the Series. This
# avoids a Python loop of per-row label-based assignments and does not assume
# that the index labels are exactly 0..n-1.
tokenized_tweet = tokenized_tweet.apply(' '.join)