In [1]:
#import necessary libraries
import pandas as pd
import numpy as np
import nltk
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords

In [2]:
#import data
data = pd.read_csv('twitter_training.csv')
data.head(2)

Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...


In [3]:
data.describe()

Unnamed: 0,2401
count,74681.0
mean,6432.640149
std,3740.423819
min,1.0
25%,3195.0
50%,6422.0
75%,9601.0
max,13200.0


In [4]:
#rename columns
data.columns = (['ID', 'Game', 'Sentiment', 'Post'])

In [5]:
data[:1]

Unnamed: 0,ID,Game,Sentiment,Post
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...


## DATA PROCESSING

In [7]:
#drop null values
data.dropna(inplace = True) 

In [8]:
#check if nyul values
data.isna().sum()

ID           0
Game         0
Sentiment    0
Post         0
dtype: int64

#### Handling Target Data

In [9]:
data['Sentiment'] = data['Sentiment'].map({'Negative':0,'Positive':1, 
                                           'Neutral':2, 'Irrelevant':3})

In [10]:
#assignment of variables
x,y = data['Post'], data['Sentiment']

In [11]:
#split into training &  testing
x_train, x_test, y_train, y_test = train_test_split(x,y, train_size = 0.8, random_state = 49)

In [12]:
data['Sentiment'].value_counts()

Sentiment
0    22358
1    20654
2    18108
3    12875
Name: count, dtype: int64

In [15]:
from tensorflow.keras.utils import to_categorical  
# Convert labels to one-hot encoding
train_labels = to_categorical(y_train, num_classes=4)
test_labels = to_categorical(y_test, num_classes=4) 

In [17]:
#tokenize text
max_vocab = 10000 #maximum number of words
max_len = 20   #maximum sequence(sentence) length

tokenizer = Tokenizer(num_words = max_vocab)
tokenizer.fit_on_texts(x)
#transforms sequences of texts to sequence of integers, 
train_sequences = tokenizer.texts_to_sequences(x_train) 
test_sequences = tokenizer.texts_to_sequences(x_test)


In [18]:
# Pad sequences
train_padded = pad_sequences(train_sequences, maxlen=max_len, padding='post', truncating='post')
test_padded = pad_sequences(test_sequences, maxlen=max_len, padding='post', truncating='post')

In [19]:
#observe sample of padded data and text sequences
test_padded[:1], test_sequences[:1]

(array([[ 191, 1349,  138,    2,   50, 1746, 2998,  131, 1958,    3, 1045,
         1499,   17,    5,  149, 1165,  134,  108,    0,    0]]),
 [[191,
   1349,
   138,
   2,
   50,
   1746,
   2998,
   131,
   1958,
   3,
   1045,
   1499,
   17,
   5,
   149,
   1165,
   134,
   108]])

In [23]:
# #removing stop words
# stop_words = stopwords
# def cleaning(text):
#     if not isinstance(text, str):  # Convert non-string values to empty strings
#         return ""
#     text = text.lower()
#     text = "".join(word for word in text.split() if word not in stopwords) #remmove stopwords
#     return text


# data['Post'] = data['Post'].apply(cleaning)

In [27]:
#checking if train and test data are in the right format
test_padded[:1], test_labels[:1]

(array([[ 191, 1349,  138,    2,   50, 1746, 2998,  131, 1958,    3, 1045,
         1499,   17,    5,  149, 1165,  134,  108,    0,    0]]),
 array([[0., 1., 0., 0.]]))

#### Train Model

In [24]:
import tensorflow as tf
embedding_dim = 100 #word embedding dimension
hidden_dim = 128   #GRU hidden size
dropout_rate = 0.5
modelGRU = tf.keras.Sequential([
    tf.keras.layers.Embedding(max_vocab, embedding_dim, input_length=max_len),
    tf.keras.layers.Bidirectional(tf.keras.layers.GRU(hidden_dim, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.GRU(hidden_dim)),
    tf.keras.layers.Dropout(dropout_rate),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(dropout_rate),
    tf.keras.layers.Dense(4, activation='softmax')  # Sigmoid for multinomial classification
]
    
)




In [25]:
modelGRU.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


In [26]:
#cross checking if model possess values we parsed to it 
modelGRU.summary()

In [28]:
modelGRU.fit(train_padded, train_labels, epochs = 5,
             batch_size = None, 
             validation_data = (test_padded, test_labels))

Epoch 1/5
[1m1850/1850[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m274s[0m 129ms/step - accuracy: 0.5066 - loss: 1.1246 - val_accuracy: 0.7330 - val_loss: 0.7038
Epoch 2/5
[1m1850/1850[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m225s[0m 121ms/step - accuracy: 0.7911 - loss: 0.5758 - val_accuracy: 0.7989 - val_loss: 0.5360
Epoch 3/5
[1m1850/1850[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m242s[0m 131ms/step - accuracy: 0.8731 - loss: 0.3533 - val_accuracy: 0.8271 - val_loss: 0.4855
Epoch 4/5
[1m1850/1850[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m215s[0m 116ms/step - accuracy: 0.9021 - loss: 0.2693 - val_accuracy: 0.8408 - val_loss: 0.4800
Epoch 5/5
[1m1850/1850[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m222s[0m 120ms/step - accuracy: 0.9202 - loss: 0.2165 - val_accuracy: 0.8486 - val_loss: 0.5035


<keras.src.callbacks.history.History at 0x18652340810>

#### Model's performance is efficient judging by the validation score

In [29]:
#make predictions and compare to true value
np.argmax(modelGRU.predict(test_padded[4:5])),  data['Sentiment'][4:5]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step


(1,
 4    1
 Name: Sentiment, dtype: int64)

In [32]:
# creating a function to test model on manually inputed data 

def tester():
    data = str(input("enter comment please  "))
    
    # data = data.lower()
    # data = "".join(word for word in tester_sequences.split() if word not in stop_words)
    
    tester_sequences = tokenizer.texts_to_sequences([data])  #tokenize data
    tester_padded = pad_sequences(tester_sequences, maxlen = max_len, padding ='post', truncating = 'post') #pad data
    
    predicted_aggregates = modelGRU.predict(tester_padded)  #using aggregate to output raw value
    value = np.argmax(predicted_aggregates)
    print(value)


    if value == 0:
        print(f"Sentiment : NEGATIVE")
    elif value == 1:
        print(f"Sentiment : POSITIVE")
    elif value == 2:
        print(f"Sentiment : NEUTRAL")
    elif value == 3:
        print(f"Sentiment : IRRELVANT")



In [31]:
tester()

enter comment please   i dislike this game. it is very boring


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 86ms/step
0
Sentiment : NEGATIVE


In [33]:
tester()

enter comment please   Oh mine! this game is so fascinating.  


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 89ms/step
1
Sentiment : POSITIVE
