In [4]:
import re
import numpy as np
import pandas as pd
import gensim
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense, Dropout

# 1. Load the pretrained Word2Vec model
# binary=True is passed because the vectors are read from a .bin file.
w2v_model = gensim.models.KeyedVectors.load_word2vec_format(
    './model/GoogleNews-vectors-negative300.bin',
    binary=True,
)

# 2. Load and preprocess the dataset (as before)
# Read the tab-separated corpus and discard every row with a missing value.
df = pd.read_csv('urdu-sentiment-corpus-v1.tsv', sep='\t').dropna()

# Preprocess text (cleaning, tokenization)
def clean_text(text):
    """Return *text* without punctuation and without surrounding whitespace.

    Every character that is neither a word character (``\\w``) nor
    whitespace (``\\s``) is removed; the result is then stripped at both
    ends. Whitespace inside the text is left untouched.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', text)
    return without_punctuation.strip()

df['text'] = df['text'].apply(clean_text)

# 3. Tokenize the text data
MAX_VOCAB_SIZE = 10_000
MAX_SEQUENCE_LENGTH = 100

# Learn the vocabulary from the cleaned corpus, then encode every document
# as a list of integer word indices.
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(df['text'])
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(df['text'])

# Pad / truncate so that every row has MAX_SEQUENCE_LENGTH entries.
padded_sequences = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

# 4. Create the embedding matrix using the pretrained Word2Vec model
embedding_dim = 300  # Since the GoogleNews vectors have 300 dimensions

# The tokenizer was created with num_words=MAX_VOCAB_SIZE, so the encoded
# sequences only ever contain indices below MAX_VOCAB_SIZE (0 is the padding
# value). Size the embedding table to that range rather than to the full
# word_index, so no rows are allocated for tokens the model can never see.
vocab_size = min(MAX_VOCAB_SIZE, len(word_index) + 1)
embedding_matrix = np.zeros((vocab_size, embedding_dim))

# Copy the pretrained vector of every in-range word that the Word2Vec model
# knows; all other rows stay zero.
# NOTE(review): because the layer below is frozen, zero rows never change.
# The vector file name says GoogleNews while the corpus file name says Urdu -
# confirm how many corpus words are actually found in w2v_model.
for word, i in word_index.items():
    if i >= vocab_size:
        continue
    if word in w2v_model.key_to_index:
        embedding_matrix[i] = w2v_model[word]

# 5. Prepare data (train/test split)
X = padded_sequences
y = df['label'].astype(int)

# Hold out 20% of the rows; random_state fixes the shuffle for repeatability.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 6. Build the GRU model
model = Sequential()
model.add(Embedding(input_dim=vocab_size,  # must match embedding_matrix rows
                    output_dim=embedding_dim,
                    weights=[embedding_matrix],
                    input_length=MAX_SEQUENCE_LENGTH,
                    trainable=False))  # Set trainable=False to keep the embeddings frozen
model.add(GRU(128, return_sequences=False))  # only the final state is passed on
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))  # single probability for the binary label

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# 7. Train the model
# Fit for 10 epochs in mini-batches of 64. The held-out split is passed as
# validation data, so its metrics are recorded per epoch in `history`.
history = model.fit(
    X_train,
    y_train,
    batch_size=64,
    epochs=10,
    validation_data=(X_test, y_test),
)

# 8. Evaluate the model
# The result is unpacked as the loss followed by the compiled accuracy metric.
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

# Plotting the training process
import matplotlib.pyplot as plt

# Draw the per-epoch training and validation accuracy curves on one figure.
for metric_key, curve_label in (('accuracy', 'Train Acc'), ('val_accuracy', 'Val Acc')):
    plt.plot(history.history[metric_key], label=curve_label)

plt.title('Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()


ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject