In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import random
import numpy as np
import tensorflow as tf
seed_value = 42
np.random.seed(seed_value)
tf.random.set_seed(seed_value)
random.seed(seed_value)

In [None]:
import pandas as pd
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Hsoub/L9/data/Arabic Sentiment Analysis Dataset - SS2030.csv',sep=';')

In [None]:
import nltk
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
import string

def cleanText(text):
    numbers="0123456789"
    arabic_punctuation='''`÷×؛<>_()*^ـ،/:"؟.,'~¦+|!”…“–ـ'''
    english_punctuation=string.punctuation
    del_chars=english_punctuation+arabic_punctuation+numbers
    for char in del_chars:
        text = text.replace(char, "")
    text = text.replace('\n', ' ')
    text = text.strip(' ')
    listStopwords = stopwords.words('arabic')
    tokens_list=word_tokenize(text)
    filtered = []
    for txt in tokens_list:
        if txt not in listStopwords:
            filtered.append(txt)
    return filtered

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
data['text'] = data['text'].apply(cleanText)

In [None]:
texts = data['text'].values
labels = data['Sentiment'].values

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()

tokenizer.fit_on_texts(texts)

sequences = tokenizer.texts_to_sequences(texts)
sequences

[[31, 3, 3406, 9278],
 [2, 1956, 31, 3, 377, 9279],
 [2,
  9280,
  9281,
  5041,
  893,
  9282,
  9283,
  3407,
  5042,
  2503,
  5043,
  31,
  3,
  1151,
  2504,
  9284,
  9285],
 [2, 9286, 31, 3, 9287, 118, 256, 9288],
 [2,
  9289,
  40,
  149,
  499,
  18,
  40,
  499,
  18,
  40,
  65,
  3408,
  9290,
  500,
  754,
  620,
  9291,
  283,
  40,
  3409,
  18,
  65,
  33,
  32,
  86,
  9292,
  755,
  1152,
  1324,
  756,
  182,
  284,
  165,
  31,
  81,
  5044,
  183,
  5,
  1595,
  5045,
  9293,
  3410,
  9294],
 [2,
  9295,
  16,
  5046,
  2505,
  2506,
  93,
  77,
  240,
  5047,
  5048,
  5049,
  2507,
  814,
  5050,
  5051,
  539,
  49,
  5052,
  621,
  18,
  212,
  5053,
  1325,
  31,
  3,
  29,
  5054,
  815,
  622,
  77,
  686,
  5055,
  687,
  34,
  3411,
  2508,
  2509,
  5056,
  257,
  1326,
  5057,
  9296],
 [9297, 5058, 9298, 31, 3, 1957, 9299, 2510],
 [9300,
  9301,
  9302,
  9303,
  9304,
  9305,
  9306,
  9307,
  9308,
  3412,
  5059,
  2511,
  9309,
  9310,
  9311,
  50

In [None]:
from keras.preprocessing.sequence import pad_sequences
maxlen = max([len(seq) for seq in sequences])
sequences_padded = pad_sequences(sequences, maxlen=maxlen)
sequences_padded

array([[    0,     0,     0, ...,     3,  3406,  9278],
       [    0,     0,     0, ...,     3,   377,  9279],
       [    0,     0,     0, ...,  2504,  9284,  9285],
       ...,
       [    0,     0,     0, ...,    45,  9276, 27737],
       [    0,     0,     0, ...,   572,  1063,   280],
       [    0,     0,     0, ...,   775,   489, 27738]], dtype=int32)

In [None]:
vocab_size = len(tokenizer.word_index )+1
vocab_size

27739

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(sequences_padded, labels, test_size=0.3, random_state=42)
X_test, X_val, y_test, y_val = train_test_split( X_test, y_test, test_size=0.5, random_state=42)

In [None]:
from keras.layers import Input, Embedding, LSTM, Dense
from keras.models import Sequential

model = Sequential()
model.add(Input (shape=(maxlen,)))
embedding_dim = 300
model.add(Embedding(input_dim= vocab_size,
                    input_length=maxlen,
                    output_dim= embedding_dim,
                    ))
model.add(LSTM(64))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])



In [None]:
epochs=5
batch_size=16
model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_val, y_val))

Epoch 1/5
[1m186/186[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 138ms/step - accuracy: 0.6766 - loss: 0.5760 - val_accuracy: 0.8417 - val_loss: 0.3391
Epoch 2/5
[1m186/186[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 139ms/step - accuracy: 0.9706 - loss: 0.0952 - val_accuracy: 0.8542 - val_loss: 0.4630
Epoch 3/5
[1m186/186[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 142ms/step - accuracy: 0.9989 - loss: 0.0042 - val_accuracy: 0.8511 - val_loss: 0.5176
Epoch 4/5
[1m186/186[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 143ms/step - accuracy: 1.0000 - loss: 0.0014 - val_accuracy: 0.8542 - val_loss: 0.6348
Epoch 5/5
[1m186/186[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 139ms/step - accuracy: 1.0000 - loss: 4.3898e-04 - val_accuracy: 0.8542 - val_loss: 0.7190


<keras.src.callbacks.history.History at 0x7855fc23cd30>

In [None]:
y_pred_probs = model.predict(X_test)

y_pred = (y_pred_probs > 0.5).astype(int)
from sklearn.metrics import accuracy_score
accuracy = round(100*accuracy_score(y_test, y_pred),2)
print(f'Test Accuracy: {accuracy:.2f}')

[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 53ms/step
Test Accuracy: 85.74
