# Main Links
- [Till Preprocessing](https://www.kaggle.com/code/poojag718/sentiment-analysis-machine-learning-approach)
- [Word2Vec Flavours](https://towardsdatascience.com/nlp-101-word2vec-skip-gram-and-cbow-93512ee24314#:~:text=Continuous%20Bag%20of%20Words%20Model%20(CBOW)%20and%20Skip%2Dgram&text=In%20the%20CBOW%20model%2C%20the,used%20to%20predict%20the%20context%20.)
- [Kaggle 1](https://www.kaggle.com/code/rahulabrsl/sentiment-analysis-using-rnn)
- [Kaggle 2](https://www.kaggle.com/code/barirahzainal/my-project-nlp-sentiment-analysis-dataset)
- [Kaggle 3](https://www.kaggle.com/code/hakim11/deep-text-analysis-with-lstm-and-keras-tuner)

- From aniruth code
```python
tqdm.pandas(desc="Calculating embeddings")
x_w2v_train = X_train.apply((lambda x: calculate_mean_embedding(X_train, word2vec_model)))
```



In [1]:
import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from string import punctuation

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import LancasterStemmer
from nltk.stem.wordnet import WordNetLemmatizer


import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('./resources/Sentiment Analysis/training.1600000.processed.noemoticon.csv', delimiter=',', encoding='ISO-8859-1')
df.columns = ['Sentiment','id','date','query','user','text']
df.head()

Unnamed: 0,Sentiment,id,date,query,user,text
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [3]:
df = df[['Sentiment','text']]

In [4]:
df.Sentiment.value_counts()

Sentiment
0    799996
4    248576
Name: count, dtype: int64

In [5]:
# 0 Represents Negative sentiment, 1 represents positive sentiments
df['Sentiment'] = df['Sentiment'].replace({4:1})

## Downsampling the Dataset

In [6]:
from sklearn.utils import resample

In [7]:
## majority class 0
df_majority = df[df['Sentiment']==0]
## minority class 1
df_minority = df[df['Sentiment']==1]

In [8]:
df_minority.shape

(248576, 2)

In [9]:
# downsample the majority class
df_majority_downsampled = resample(df_majority, 
                                 replace=False,   
                                 n_samples=len(df_minority),    
                                 random_state=1234) 

In [10]:
df = pd.concat([df_majority_downsampled, df_minority], ignore_index=True)

df.head()

Unnamed: 0,Sentiment,text
0,0,Wow slept for almost 12hours. Sleepy me!! Uni ...
1,0,gets bored with an idea too easily ... like tw...
2,0,To my girls - sorry i've been a homebody latel...
3,0,BK once again for the weekend...If it wasnt fo...
4,0,@DonnieWahlberg Now why didn't you do that las...


## Preprocessing
1. removing stop words
1. removing punctuations
1. Lemmatizing
1. removing tags
1. removing special characters
1. lowercase conversion

In [None]:
# def pre_processing(tweet: str):
    
#     # Remove Leading Blank Spaces
#     tweet = tweet.strip()
    
#     # Lower Case
#     tweet = tweet.lower()
    
#     # Remove URLS 
#     url_pattern = re.compile(r"https?://\S+|www\.\S+")
#     tweet = re.sub(url_pattern, "", tweet)
    
#     # Remove UserName
#     username_pattern = re.compile(r"@\w+")
#     tweet = re.sub(username_pattern, "", tweet)
    
#     # Remove Hashtags
#     hashtag_pattern = re.compile(r"#\w+")
#     tweet = re.sub(hashtag_pattern, "", tweet)
    
#     # Character normalization // todaaaaay -> today
#     tweet = re.sub(r"([a-zA-Z])\1{2,}", r'\1', tweet)
    
#     # Remove Special Characters
#     tweet = re.sub(r'[^a-zA-Z\s]', "", tweet)
    
#     # Word Tokenizer
#     tweet = nltk.word_tokenize(tweet)
    
#     # Remove Stop Words 
#     stop_words = set([re.sub(r'[^a-zA-Z\s]', "", word) for word in nltk.corpus.stopwords.words("english")])
#     tweet = [word for word in tweet if word not in stop_words]
    
#     # lemmatization
#     def get_pos(word):
#         tag = nltk.pos_tag([word])[0][1][0].upper()
#         tag_dict = {"N": "n", "V": "v", "R": "r", "J": "a"}
#         return tag_dict.get(tag, "n")
    
#     lemma = nltk.stem.WordNetLemmatizer()
#     tweet = [lemma.lemmatize(word, pos=get_pos(word)) for word in tweet]
    
#     return tweet

In [11]:
## remove stopwords and punctuation marks
stuff_to_be_removed = list(stopwords.words('english'))+list(punctuation)
stemmer = LancasterStemmer()

corpus = df['text'].tolist()
print(len(corpus))
print(corpus[0])

497152
Wow slept for almost 12hours. Sleepy me!! Uni now, boo! I wanna stay home, drink tea and watch house... 


In [12]:
%time
final_corpus = []
final_corpus_joined = []
for i in df.index:
    text = re.sub('[^a-zA-Z]', ' ', df['text'][i])
    #Convert to lowercase
    text = text.lower()
    #remove tags
    text=re.sub("&lt;/?.*?&gt;"," &lt;&gt; ",text)
    
    # remove special characters and digits
    text=re.sub("(\\d|\\W)+"," ",text)
    
    ##Convert to list from string
    text = text.split()

    #Lemmatisation
    lem = WordNetLemmatizer()
    text = [lem.lemmatize(word) for word in text 
            if not word in stuff_to_be_removed] 
    text1 = " ".join(text)
    final_corpus.append(text)
    final_corpus_joined.append(text1)

CPU times: user 1 µs, sys: 1e+03 ns, total: 2 µs
Wall time: 3.1 µs


In [13]:
# Storing it separately

data_cleaned = pd.DataFrame()
data_cleaned["text"] = final_corpus_joined
data_cleaned["Sentiment"] = df["Sentiment"].values

In [14]:
data_cleaned['Sentiment'].value_counts()

Sentiment
0    248576
1    248576
Name: count, dtype: int64

In [15]:
data_cleaned.head()

Unnamed: 0,text,Sentiment
0,wow slept almost hour sleepy uni boo wanna sta...,0
1,get bored idea easily like twitter,0
2,girl sorry homebody lately dont feel well does...,0
3,bk weekend wasnt puppy stay as,0
4,donniewahlberg last night atlanta,0


## EDA 
- [Done here](https://www.kaggle.com/code/poojag718/sentiment-analysis-machine-learning-approach)
- Inferences such as most commonly used words are derived from here


## Generating Representations

In [16]:
%pip install tensorflow
import tensorflow as tf
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

Note: you may need to restart the kernel to use updated packages.


In [17]:
# BoW
vectorizer = CountVectorizer(stop_words='english')
X_BOW = vectorizer.fit_transform(data_cleaned["text"])

In [18]:
X_BOW.shape

(497152, 261354)

In [19]:
# TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(data_cleaned["text"])

X_tfidf.shape

(497152, 261574)

In [20]:
# Continuous Bag of Words
from gensim.models import Word2Vec

corpus = [row.split() for row in data_cleaned["text"]]
model = Word2Vec(corpus, vector_size=100, window=5, min_count=1, workers=4)

model.wv['good']

array([-1.1182606 ,  1.7777096 ,  0.7973752 ,  0.8141941 ,  1.3995241 ,
       -0.79540646,  1.1742413 ,  2.8262644 , -0.12120201, -0.3273218 ,
       -0.04284912, -0.22610478,  0.41366115,  0.10061679,  1.317079  ,
        0.5151939 ,  0.9813977 , -0.6533208 , -2.0191126 ,  0.96022767,
       -1.1635702 , -1.0577912 ,  1.3365839 , -2.6485252 ,  2.274607  ,
       -0.2931893 , -1.7153697 , -1.4721566 , -3.6930425 , -1.1394417 ,
        0.05420652, -1.2849993 , -0.83042616, -2.0884793 ,  0.47499567,
        2.9425147 , -0.04272367, -0.83591   ,  0.8721052 ,  0.53756064,
        0.9883547 , -1.3672761 , -1.0058049 ,  1.2677329 ,  0.79487175,
       -1.1289707 ,  1.1040866 , -0.45718995, -0.7899515 ,  1.0882237 ,
        3.9268768 , -0.34488592, -1.421561  , -1.2979196 , -2.804344  ,
       -0.08888749,  1.4201676 ,  1.3335626 , -0.1557181 ,  0.14437312,
       -0.27882034,  0.912197  ,  0.26403153, -1.1179705 , -1.4956924 ,
        0.31003264,  0.66143256,  1.4282879 ,  0.11695191,  0.33

In [21]:
# Skip Gram

from gensim.models import Word2Vec

corpus = [row.split() for row in data_cleaned["text"]]
model_sg = Word2Vec(corpus, vector_size=100, window=5, min_count=1, sg=1, workers=4)

model_sg.wv['good']

array([-3.29547286e-01,  4.69685733e-01,  2.63183057e-01,  1.51043385e-01,
        4.86368954e-01, -2.26075500e-01,  3.54234129e-01,  6.94557786e-01,
       -2.22389713e-01, -3.68814170e-01, -1.77476704e-01, -6.83002889e-01,
        2.24974096e-01,  3.09925973e-01,  2.67965406e-01, -5.84631741e-01,
        1.99876234e-01, -3.61189991e-02, -3.31291929e-02, -7.46793151e-01,
        6.73167631e-02,  4.32450354e-01,  4.45160270e-01, -5.71976364e-01,
        1.78891644e-01,  4.80847239e-01, -8.28774795e-02, -4.06122237e-01,
       -6.81171298e-01,  9.30287838e-02,  3.53204429e-01,  6.39024097e-03,
       -3.08226109e-01, -3.34249884e-01, -1.13148771e-01,  6.36421919e-01,
       -3.34780157e-01, -4.88316149e-01,  1.83805823e-01, -2.85217285e-01,
        2.78409183e-01, -5.44831872e-01,  1.99248996e-02, -2.35316738e-01,
        4.40954566e-01, -3.74710888e-01,  2.30027661e-01, -2.47032166e-01,
       -8.13237764e-03,  1.02975607e+00,  2.16338798e-01, -7.27806628e-01,
       -8.78275186e-03, -

## Model Building

In [22]:
from keras.utils import to_categorical
from keras.layers import Dense,SimpleRNN,Embedding,Flatten
from keras.models import Sequential

from keras.utils import pad_sequences

In [23]:
# Create embeddings for training data
embeddings = []

for row in data_cleaned["text"]:
    row_embedding = []
    for word in row.split():
        if word in model_sg.wv:
            row_embedding.append(model_sg.wv[word])
    embeddings.append(row_embedding)

embeddings[0]

[array([-0.41702428,  0.7543085 ,  0.23013362,  0.09798618, -0.05113993,
        -0.5643958 , -0.00301535,  0.35979074, -0.17433064, -0.702858  ,
        -0.31191343,  0.12797348, -0.03602573, -0.12030561,  0.2502345 ,
        -0.11915765,  0.2952049 , -0.47085777,  0.14797251, -0.67680556,
         0.37875062, -0.07424723,  0.01022182,  0.01705857, -0.2500005 ,
         0.14489642, -0.18030508,  0.2522615 , -0.6190076 , -0.0309769 ,
        -0.26808903, -0.3745586 ,  0.298401  , -0.02193589,  0.13528177,
         0.6138308 , -0.44820544, -0.33143553, -0.58981663, -0.5356574 ,
         0.46984562, -0.35260516,  0.2403222 , -0.29928496,  0.33920068,
        -0.27917305, -0.39974925, -0.35565653,  0.6241776 ,  0.07282512,
         0.06566047, -0.44675037, -0.45237282, -0.46944106, -0.5150804 ,
         0.41656467,  0.34487677, -0.06763136, -0.3050555 , -0.24599636,
        -0.21111742,  0.0976947 , -0.39384472, -0.31348228, -0.4690299 ,
         0.7010563 ,  0.04475857,  0.12481832, -0.7

In [24]:
# Pad Sequences

max_length = 20

X_pad = pad_sequences(embeddings, maxlen=max_length, padding='post')

X_pad.shape

(497152, 20, 100)

In [25]:
y = data_cleaned["Sentiment"]

y.shape

(497152,)

In [26]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(X_pad, y, test_size=0.2, random_state=42)

X_train.shape

(397721, 20, 100)

In [27]:
y_train = to_categorical(y_train, 3)
y_test = to_categorical(y_test, 3)

In [28]:
X_train[0]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int32)

In [29]:
# Simple RNN - NOTE THAT WE ARE NOT USING AN EMBEDDING LAYER HERE

model = Sequential()
model.add(SimpleRNN(128, input_shape=(X_train.shape[1],X_train.shape[2]), return_sequences=True))
model.add(Flatten())
model.add(Dense(3, activation='softmax'))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

model.fit(X_train, y_train, epochs=5, batch_size=64, validation_data=(X_test, y_test))

model.summary()

Epoch 1/5
[1m6215/6215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 5ms/step - accuracy: 0.6157 - loss: 0.6450 - val_accuracy: 0.6365 - val_loss: 0.6259
Epoch 2/5
[1m6215/6215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 6ms/step - accuracy: 0.6363 - loss: 0.6262 - val_accuracy: 0.6374 - val_loss: 0.6233
Epoch 3/5
[1m6215/6215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 6ms/step - accuracy: 0.6419 - loss: 0.6220 - val_accuracy: 0.6432 - val_loss: 0.6155
Epoch 4/5
[1m6215/6215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 6ms/step - accuracy: 0.6566 - loss: 0.6078 - val_accuracy: 0.6557 - val_loss: 0.6067
Epoch 5/5
[1m6215/6215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 5ms/step - accuracy: 0.6639 - loss: 0.5989 - val_accuracy: 0.6575 - val_loss: 0.6038


In [30]:
# Metrics
from sklearn.metrics import classification_report, confusion_matrix

y_pred = model.predict(X_test)

y_pred = np.argmax(y_pred, axis=1)

y_test = np.argmax(y_test, axis=1)

print(classification_report(y_test, y_pred))

[1m3108/3108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1ms/step
              precision    recall  f1-score   support

           0       0.64      0.72      0.68     49666
           1       0.68      0.59      0.63     49765

    accuracy                           0.66     99431
   macro avg       0.66      0.66      0.66     99431
weighted avg       0.66      0.66      0.66     99431



In [32]:
# Input Sentiment
input = "What is not to like about this product"

# Clean the input
input = re.sub('[^a-zA-Z]', ' ', input)
input = input.lower()

# Tokenize the input
input = word_tokenize(input)

# Create embeddings for the input
input_embedding = []
for word in input:
    if word in model_sg.wv:
        input_embedding.append(model_sg.wv[word])

# Pad the input
input_pad = pad_sequences([input_embedding], maxlen=max_length, padding='post')

# Predict
output = model.predict(input_pad)

output

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step


array([[3.6847892e-01, 6.3152087e-01, 1.6416880e-07]], dtype=float32)