<a href="https://colab.research.google.com/github/preetamjumech/LLM/blob/main/SentimentClassification_TensorflowHub_01_09_2024.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip -q install tensorflow==2.15.0 tensorflow-hub keras==2.15.0

In [2]:
#use keras 2 (keras 3 will support this code)

# import keras
import tensorflow as tf
import tensorflow_hub as hub
import pandas as pd

from tensorflow.keras import layers, losses, Sequential, optimizers, metrics

In [3]:
hub.__version__

'0.16.1'

In [7]:
columns = ["id", "country", "Label", "Text"]

tweets_data = pd.read_csv("twitter_training.csv", names = columns)
print(tweets_data.shape)
tweets_data = tweets_data.sample(frac=0.001)
print(tweets_data.shape)

tweets_data.sample(5)

(74682, 4)
(75, 4)


Unnamed: 0,id,country,Label,Text
35588,8111,Microsoft,Negative,@ Microsoft @ PlayStation at some point of my ...
35351,8071,Microsoft,Neutral,net Cracks Infrastructure For Infamous Server ...
16328,3195,Dota2,Negative,"Due to a lack of players, I have to play Dota ..."
35274,8059,Microsoft,Neutral,Microsoft Teams Suffers Outage As Demand Surge...
60781,4814,GrandTheftAuto(GTA),Positive,I think about how awesome it would be to die i...


In [8]:
tweets_data = tweets_data.drop(columns = ["id", "country"])

tweets_data.dropna(inplace = True, axis = 0 )

tweets_data = tweets_data.drop_duplicates()

tweets_data.shape

(74, 2)

In [9]:
tweets_data["Label"] = tweets_data["Label"].replace({"Negative": 0, "Neutral": 1, "Positive": 2, "Irrelevant": 3})

tweets_data.sample(5)

  tweets_data["Label"] = tweets_data["Label"].replace({"Negative": 0, "Neutral": 1, "Positive": 2, "Irrelevant": 3})


Unnamed: 0,Label,Text
30333,0,WTF
35351,1,net Cracks Infrastructure For Infamous Server ...
11328,0,Dumbasses
40041,3,So.. Streaming on Facebook - played two full h...
28757,0,@PlayApex speak as bitch


In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test = train_test_split(
    tweets_data, test_size = 0.2, stratify = tweets_data["Label"], random_state = 123)
X_train, X_val = train_test_split(
    X_train, test_size = 0.1, stratify = X_train["Label"], random_state = 123)

X_train.shape, X_val.shape, X_test.shape

((53, 2), (6, 2), (15, 2))

In [11]:
BATCH_SIZE = 128

raw_train_ds = tf.data.Dataset.from_tensor_slices(
    (X_train["Text"].values, X_train["Label"].values)).shuffle(10000).batch(batch_size = BATCH_SIZE)

raw_val_ds = tf.data.Dataset.from_tensor_slices(
    (X_val["Text"].values, X_val["Label"].values)).batch(batch_size = BATCH_SIZE)

raw_test_ds = tf.data.Dataset.from_tensor_slices(
    (X_test["Text"].values, X_test["Label"].values)).batch(batch_size = BATCH_SIZE)

In [12]:
train_examples_batch, train_labels_batch = next(iter(raw_train_ds))

train_examples_batch[:5]

<tf.Tensor: shape=(5,), dtype=string, numpy=
array([b'Top 4:. League of Legends. Counter Strike - 1.6 e GO. Runescape. Fifa',
       b"The latest 'Apex Legends' hero is a master thief named Loba dlvr.it/RVnFXS https://t.co/ezLjauS691",
       b'i My love call of duty : infinite warfare',
       b'Due to a lack of players, I have to play Dota 2 again to represent PUPCOL.',
       b'i have only ever seen three walls of The Home Depot. this leads people to believe there is not a fourth wall, only more door depot'],
      dtype=object)>

In [13]:
train_labels_batch[:5]

<tf.Tensor: shape=(5,), dtype=int64, numpy=array([2, 1, 2, 0, 2])>

In [14]:
embedding = "https://tfhub.dev/google/nnlm-en-dim50/2" # dicrectly feed the raw text , dont need to do any preprocessing

# embedding = "https://tfhub.dev/google/nnlm-en-dim128/2"
# embedding = "https://tfhub.dev/google/nnlm-en-dim128-with-normalization/2" #removes punctuations

hub_layer = hub.KerasLayer(
    embedding, input_shape = [],
    dtype = tf.string, trainable = True
)

hub_layer(train_examples_batch[:3]).shape

TensorShape([3, 50])

In [15]:
model = Sequential()

model.add(hub_layer)
model.add(layers.Dense(32, activation = "relu"))
model.add(layers.Dense(4))

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 keras_layer (KerasLayer)    (None, 50)                48190600  
                                                                 
 dense (Dense)               (None, 32)                1632      
                                                                 
 dense_1 (Dense)             (None, 4)                 132       
                                                                 
Total params: 48192364 (183.84 MB)
Trainable params: 48192364 (183.84 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [16]:
model.compile(
    optimizer = optimizers.Adam(learning_rate = 0.001),
    loss = losses.SparseCategoricalCrossentropy(from_logits = True),
    metrics = ['accuracy']
)

In [17]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

callback_list = [
    EarlyStopping(
        patience = 2,
        monitor = "val_accuracy"
    ),
ReduceLROnPlateau(
        patience = 1,
        factor = 0.1,
    )
]

In [18]:
EPOCHS = 3

history = model.fit(
    raw_train_ds,
    validation_data = raw_val_ds,
    epochs = EPOCHS,
    callbacks = callback_list
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [19]:
loss, accuracy = model.evaluate(raw_test_ds)

print("Loss: ", loss)
print("Accuracy: ", accuracy)

Loss:  1.3609062433242798
Accuracy:  0.4000000059604645
