## **Case study on Sentiment analysis**
# The goal will be to build a system that can accurately classify the new tweets sentiments. You can divide the data into train and test. The Evaluation metric you should use is the accuracy.

In [1]:
import pandas as pd
import numpy as np

In [3]:
#from google.colab import drive
#drive.mount('/content/drive')

In [4]:
data = pd.read_csv('tweets.csv')

In [5]:
data.shape

(7920, 3)

In [6]:
data.head()

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,2,0,Finally a transparant silicon case ^^ Thanks t...
2,3,0,We love this! Would you go? #talk #makememorie...
3,4,0,I'm wired I know I'm George I was made that wa...
4,5,1,What amazing service! Apple won't even talk to...


In [7]:
data.label.value_counts()

0    5894
1    2026
Name: label, dtype: int64

In [8]:
data.columns

Index(['id', 'label', 'tweet'], dtype='object')

The label '1' denotes the tweet is negative and label '0' denotes the tweet is positive.

In [9]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [10]:
#Removing the html strips
def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

#Removing the square brackets
def remove_between_square_brackets(text):
    return re.sub('\[[^]]*\]', '', text)

#Removing the noisy text
def denoise_text(text):
    text = strip_html(text)
    text = remove_between_square_brackets(text)
    return text

In [11]:
from bs4 import BeautifulSoup
import re

In [12]:
#Apply function on review column
data['tweet'] = data['tweet'].apply(denoise_text)

  soup = BeautifulSoup(text, "html.parser")


In [13]:
data.head()

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,2,0,Finally a transparant silicon case ^^ Thanks t...
2,3,0,We love this! Would you go? #talk #makememorie...
3,4,0,I'm wired I know I'm George I was made that wa...
4,5,1,What amazing service! Apple won't even talk to...


Removing special characters from the tweets

In [14]:
#Define function for removing special characters
def remove_special_characters(text, remove_digits=True):
    pattern=r'[^a-zA-z0-9\s]'
    text=re.sub(pattern,'',text)
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"I'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\^ ^", "", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    return text

In [15]:
#Apply function on review column
data['tweet'] = data['tweet'].apply(remove_special_characters)

In [16]:
data.head()

Unnamed: 0,id,label,tweet
0,1,0,fingerprint Pregnancy Test httpsgooglh1MfQV an...
1,2,0,Finally a transparant silicon case ^ ^ Thanks ...
2,3,0,We love this Would you go talk makememories un...
3,4,0,Im wired I know Im George I was made that way ...
4,5,1,What amazing service Apple wont even talk to m...


Thus, the special characters are removed.Now we need to do stemming - to minimize the confusion around words that have similar meanings

In [17]:
#Stemming the text
def simple_stemmer(text):
    ps=nltk.porter.PorterStemmer()
    text= ' '.join([ps.stem(word) for word in text.split()])
    return text

In [18]:
#Apply function on review column
data['tweet']= data['tweet'].apply(simple_stemmer)

In [19]:
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [20]:
#set stopwords to english
stop=set(stopwords.words('english'))
print(stop)

{'then', 'more', 'ma', 'am', 'above', 'ours', 'by', 'these', 've', 'them', 'with', 'any', 'been', 'and', 'have', 'into', 'doing', 'too', "mustn't", 'than', 'from', 'against', 'theirs', 'both', 'yourself', 'below', 'nor', 'needn', 'few', 'but', 'at', "hasn't", 'they', 'she', 'do', "won't", 'shan', 'during', "it's", 'y', 'we', 'once', 'won', 'about', 'aren', 'those', 'had', 'was', 'didn', 'mustn', "shan't", 'under', 'up', 'our', 'over', 'd', 'weren', 'each', "wasn't", 'no', 'her', 'has', 'very', "wouldn't", 'hadn', 'were', 'itself', 'your', 'out', 'you', 'before', 'because', 'himself', 'through', 'if', "weren't", 'further', 'as', 'whom', 'all', 'it', 'to', 'this', "that'll", 'll', 'its', 'my', 'mightn', 'isn', "you're", "you'd", 'after', 's', 'did', 'who', "she's", 'some', "you've", 'in', 'yourselves', 'herself', 'most', "aren't", 'haven', 'own', 'does', 'him', 'not', 'ourselves', 'should', 'how', 'having', 'o', 'yours', 'only', "doesn't", 'themselves', 'his', "shouldn't", 'hers', 'where

In [21]:
from nltk.tokenize.toktok import ToktokTokenizer

Tokenizing the text

In [22]:

tokenizer1=ToktokTokenizer()

In [23]:
#Setting English stopwords
stopword_list=nltk.corpus.stopwords.words('english')

In [24]:
#removing the stopwords
def remove_stopwords(text, is_lower_case=False):
    tokens = tokenizer1.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text

In [25]:
#Apply function on review column
data['tweet'] = data['tweet'].apply(remove_stopwords)

Now, the cleaning and preprocessing part got completed

**Feature engineering in text:**

In [26]:
data['tweet']

0       fingerprint pregnanc test httpsgooglh1mfqv and...
1       final transpar silicon case ^ ^ thank uncl yay...
2       love thi would go talk makememori unplug relax...
3       im wire know im georg wa made way iphon cute d...
4       amaz servic appl wont even talk question unles...
                              ...                        
7915    live loud lol liveoutloud selfi smile soni mus...
7916    would like wish amaz day make everi minut coun...
7917    help love 90 year old neighbor ipad thi morn h...
7918    final got smart pocket wifi stay connect anyti...
7919    appl barcelona appl store bcn barcelona travel...
Name: tweet, Length: 7920, dtype: object

In [27]:
all_words = " ".join(data['tweet'])

In [28]:
nltk.download("punkt")
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [29]:
all_words = word_tokenize(all_words)

In [30]:
from nltk import FreqDist

In [31]:
dist = FreqDist(all_words)

In [32]:
dist

FreqDist({'iphon': 3810, 'appl': 2895, 'samsung': 1414, 'new': 1184, 'phone': 1022, 'follow': 890, 'soni': 850, 'thi': 680, 'rt': 528, 'ipad': 525, ...})

In [33]:
num_unique_word = len(dist)

In [34]:
num_unique_word

20587

In [35]:
r_len = []

for text in data['tweet']:
  word = word_tokenize(text)
  l = len(word)
  r_len.append(l)

In [36]:
import numpy as np
MAX_TWEET_LEN = np.max(r_len)

In [37]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [38]:
tokenizer = Tokenizer(num_words = num_unique_word)

In [39]:
### fit_on_texts
tokenizer.fit_on_texts(list(data['tweet']))

In [40]:
data.head()

Unnamed: 0,id,label,tweet
0,1,0,fingerprint pregnanc test httpsgooglh1mfqv and...
1,2,0,final transpar silicon case ^ ^ thank uncl yay...
2,3,0,love thi would go talk makememori unplug relax...
3,4,0,im wire know im georg wa made way iphon cute d...
4,5,1,amaz servic appl wont even talk question unles...


In [41]:
data["tweet"] = tokenizer.texts_to_sequences(data['tweet'])

In [42]:
data

Unnamed: 0,id,label,tweet
0,1,0,"[2005, 5639, 997, 5640, 14, 15, 21, 22, 104, 7..."
1,2,0,"[55, 2006, 2007, 20, 36, 2384, 212, 7, 254, 36..."
2,3,0,"[11, 8, 81, 50, 531, 5642, 2858, 372, 1, 92, 2..."
3,4,0,"[25, 1428, 123, 25, 3673, 84, 202, 271, 1, 22,..."
4,5,1,"[62, 296, 2, 156, 120, 531, 922, 2859, 319, 56..."
...,...,...,...
7915,7916,0,"[99, 1189, 48, 20577, 69, 41, 7, 35, 234, 2057..."
7916,7917,0,"[81, 12, 138, 62, 13, 40, 96, 115, 218, 246, 2..."
7917,7918,0,"[290, 11, 1183, 91, 179, 20580, 10, 8, 164, 77..."
7918,7919,0,"[55, 34, 567, 1324, 295, 793, 532, 20581, 10, ..."


In [43]:
from tensorflow.keras.preprocessing import sequence

In [44]:
X = sequence.pad_sequences(data['tweet'], 38)
y = data['label']

In [45]:
from sklearn.model_selection import train_test_split

In [46]:
x_train, x_test, y_train , y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [47]:
from tensorflow.keras.models import Sequential

In [48]:
from tensorflow.keras.layers import LSTM, Dense, Dropout, Embedding

In [49]:
model = Sequential()
model.add(Embedding(input_dim = 20587, output_dim = 150, input_length = 38))

model.add(LSTM(128, dropout = 0.2 ))

model.add(Dense(1, activation = "sigmoid"))

In [52]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 38, 150)           3088050   
                                                                 
 lstm (LSTM)                 (None, 128)               142848    
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 3,231,027
Trainable params: 3,231,027
Non-trainable params: 0
_________________________________________________________________


In [53]:
model.compile(loss = "binary_crossentropy", optimizer = "adam", metrics = ['accuracy'])

In [54]:
history1 = model.fit(x_train, y_train, epochs = 3, batch_size = 32)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [55]:
model.evaluate(x_test, y_test)



[0.3198358714580536, 0.872474730014801]

# **Thus, the accuracy of the model is seen as 87.25%**