Embeddings Tutorial - https://www.kaggle.com/code/rajmehra03/a-detailed-explanation-of-keras-embedding-layer

In [2]:
import numpy as np
import pandas as pd

# import nltk
# from nltk.tokenize import word_tokenize
# nltk.download('punkt_tab')

from sklearn.feature_extraction.text import CountVectorizer

In [3]:
df = pd.read_csv("../dataset/final_dataset/sentiment_analysis_dataset.csv")
df.head()

Unnamed: 0,text,label
0,matt forte steve smith cin,negative
1,but not possible others nice try modi what goo...,positive
2,let complete visionary mms that denied this an...,positive
3,dont see much longevity for govts rajasthan ka...,positive
4,pappu may ask evidence for this too jai modi,negative


In [4]:
df["label"] = df["label"].replace({
    "negative":0,
    "positive":1
})

  df["label"] = df["label"].replace({


In [5]:
df.head()

Unnamed: 0,text,label
0,matt forte steve smith cin,0
1,but not possible others nice try modi what goo...,1
2,let complete visionary mms that denied this an...,1
3,dont see much longevity for govts rajasthan ka...,1
4,pappu may ask evidence for this too jai modi,0


# Data Splitting

In [6]:
X = df["text"]
y = df["label"]

In [7]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=41)

# Text Vectorization

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# binary is true because we want to know if certain hate words orccurs in a text data or not rather 
# than calculating the frequency.

tfidf = TfidfVectorizer(max_features=500)
X_train_trf = tfidf.fit_transform(X_train)
X_test_trf = tfidf.transform(X_test)

In [79]:
X_train_trf.toarray()

array([[0.10965012, 0.        , 0.        , ..., 0.05717945, 0.        ,
        0.08202022],
       [0.        , 0.        , 0.        , ..., 0.38512108, 0.        ,
        0.        ],
       [0.        , 0.        , 0.47978385, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.32342432, 0.        ,
        0.        ]])

# Model Building

In [93]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score,confusion_matrix,f1_score,precision_score,recall_score,classification_report

In [81]:
nb = GaussianNB()
nb.fit(X_train_trf.toarray(),y_train)
y_pred = nb.predict(X_test_trf.toarray())


In [82]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.72      0.76      0.74      1067
           1       0.71      0.67      0.69       933

    accuracy                           0.72      2000
   macro avg       0.72      0.71      0.71      2000
weighted avg       0.72      0.72      0.72      2000



# Deep Learning

In [73]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense,SimpleRNN,Flatten,Embedding, LSTM, Bidirectional, Input
from tensorflow.keras.models import Model

In [11]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df["text"])

In [12]:
# count no of sentences or rows
tokenizer.document_count

263794

In [13]:
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)


In [24]:
max_len = 0
for i in X_train_sequences:
    max_len = max(max_len,len(i))
max_len

2493

In [96]:
# vocab_size = len(tokenizer.word_index) + 1

# input_word = Input(shape=(50,))


# word_embedding = Embedding(input_dim=vocab_size,output_dim=1,input_length=50)(input_word)
# word_vec = Flatten()(word_embedding)
# embed_model = Model(input_word,word_vec)

# embed_model.compile(optimizer="adam",loss='binary_crossentropy',metrics=['acc'])

In [25]:
X_train_padded_sequences = pad_sequences(sequences=X_train_sequences,padding="post",maxlen=50)
X_test_padded_sequences = pad_sequences(sequences=X_test_sequences,padding="post",maxlen=50)

In [26]:
X_train_padded_sequences.shape,X_test_padded_sequences.shape


((211035, 50), (52759, 50))

In [61]:
vocab_size = len(tokenizer.word_index) + 1
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=10, input_length=50))
model.add(Bidirectional(LSTM(units=50, return_sequences=True)))
model.add(Bidirectional(LSTM(units=80, return_sequences=True)))
model.add(Bidirectional(LSTM(units=100, return_sequences=True)))
model.add(Bidirectional(LSTM(units=10,return_sequences=False)))
model.add(Dense(units=2, activation='softmax'))


In [62]:
model.build(input_shape=(None, 50))
model.summary()

In [37]:
model.compile(loss='sparse_categorical_crossentropy',optimizer='adam',metrics=['accuracy'])

In [32]:
model.fit(X_train_padded_sequences,y_train,epochs=5,validation_data=(X_test_padded_sequences,y_test),batch_size=64)

Epoch 1/5
[1m3298/3298[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1183s[0m 354ms/step - accuracy: 0.8075 - loss: 0.4022 - val_accuracy: 0.9197 - val_loss: 0.1958
Epoch 2/5
[1m3298/3298[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1183s[0m 359ms/step - accuracy: 0.9414 - loss: 0.1482 - val_accuracy: 0.9292 - val_loss: 0.1797
Epoch 3/5
[1m 637/3298[0m [32m━━━[0m[37m━━━━━━━━━━━━━━━━━[0m [1m14:07[0m 319ms/step - accuracy: 0.9663 - loss: 0.0913

KeyboardInterrupt: 

In [65]:
X_train_padded_sequences.shape

(211035, 50)

In [72]:
 # Define model with only an Embedding layer
vocab_size = len(tokenizer.word_index) + 1  # Adjust vocab_size to your tokenizer
# Define model with Embedding and Flatten layer
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=50, input_length=50))
model.add(Flatten())  # Flatten the output of the embedding layer
model.add(Dense(10, activation="linear"))

# Compile the model
model.compile(optimizer="adam", loss='mse')

# Train the model with the same input as target
model.fit(X_train_padded_sequences, X_train_padded_sequences, epochs=10, verbose=1)


Epoch 1/10


ValueError: Dimensions must be equal, but are 50 and 10 for '{{node compile_loss/mse/sub}} = Sub[T=DT_FLOAT](compile_loss/mse/Cast, sequential_21_1/dense_15_1/Add)' with input shapes: [?,50], [?,10].

In [95]:
gnb = GaussianNB()
gnb.fit(embeddings,y_train)
y_pred = gnb.predict(embeddings_test)
print(classification_report(y_test,y_pred))


              precision    recall  f1-score   support

           0       0.59      0.69      0.64     28813
           1       0.54      0.43      0.48     23946

    accuracy                           0.57     52759
   macro avg       0.57      0.56      0.56     52759
weighted avg       0.57      0.57      0.57     52759

