In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import ast
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import re

In [None]:
# Notebook display configuration
from IPython.display import HTML, display
def set_css():
    display(HTML('''
    <style>
      pre { white-space: pre-wrap; }
    </style>
    '''))
get_ipython().events.register('pre_run_cell', set_css)

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
df_path = '/content/drive/MyDrive/Artificial Intelligence/Class: Natural Language Processing NLP/Project /Phase 1/Output/QuranDF_wTokensTags.xlsx'
QURAN_DF = pd.read_excel(df_path)
QURAN_DF['Tokens'] = QURAN_DF['Tokens'].apply(ast.literal_eval)

# **Data Preparation for Modeling**

1.   **Data Consolidation**

      **- Generate Input (X) and Target (Y) Sequences**

In [None]:
# Prepare Input (X) and Target (Y) Sequences
X = []
Y = []
for tokens_list in QURAN_DF['Tokens']:
    for i in range(1, len(tokens_list)):
        X.append(' '.join(tokens_list[:i]))
        Y.append(tokens_list[i])

In [None]:
print(f"X: {X[:5]}")
print(f"Y: {Y[:5]}")

X: ['بِسْمِ', 'بِسْمِ اللَّهِ', 'بِسْمِ اللَّهِ الرَّحْمَـٰنِ', 'الْحَمْدُ', 'الْحَمْدُ لِلَّهِ']
Y: ['اللَّهِ', 'الرَّحْمَـٰنِ', 'الرَّحِيمِ', 'لِلَّهِ', 'رَبِّ']


In [None]:
# Testing
# Display the first few sequences and their corresponding targets
for i in range(5):
    print(f"X[{i}]: {X[i]}")
    print(f"Y[{i}]: {Y[i]}\n")

X[0]: بِسْمِ
Y[0]: اللَّهِ

X[1]: بِسْمِ اللَّهِ
Y[1]: الرَّحْمَـٰنِ

X[2]: بِسْمِ اللَّهِ الرَّحْمَـٰنِ
Y[2]: الرَّحِيمِ

X[3]: الْحَمْدُ
Y[3]: لِلَّهِ

X[4]: الْحَمْدُ لِلَّهِ
Y[4]: رَبِّ



**Tokenizer and Sequences Preparation**



In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Instantiate the tokenizer
tokenizer = Tokenizer()

# Since 'X' is a list of lists (each inner list being a sequence of tokens),
# and 'Y' is a list of next tokens, you need to flatten 'X' for fitting
# and prepare 'Y' as a list of texts as well.
all_sequences = [' '.join(seq) for seq in X] + [' '.join([y]) for y in Y]  # Convert Y tokens to strings for uniformity

# Fit the tokenizer on your sequences
tokenizer.fit_on_texts(all_sequences)

# Convert tokens to sequences
X_seq = tokenizer.texts_to_sequences([' '.join(seq) for seq in X])
Y_seq = tokenizer.texts_to_sequences([' '.join([y]) for y in Y])

# Since 'Y' is essentially a single token (next word) for each sequence,
# and we've converted it to a sequence of integers (which will be just one integer),
# we need to flatten 'Y_seq' to be a 1D array.
Y_seq = [seq[0] for seq in Y_seq]  # Flatten Y_seq

# Pad sequences for 'X'
X_padded = pad_sequences(X_seq, padding='post')

# 'Y' doesn't need padding since it's a single token per sequence,
# but make sure it's correctly shaped for training.

print(f"Shape of X_padded: {X_padded.shape}")
print(f"Length of Y_seq: {len(Y_seq)}")

Shape of X_padded: (5875, 1049)
Length of Y_seq: 5875


In [None]:
# Display the first few sequences
for i in range(3):
    print(f"Original X[{i}]: {X[i]}")
    print(f"Encoded X_seq[{i}]: {X_seq[i]}")
    print(f"Padded X_padded[{i}]: {X_padded[i]}")
    print(f"Original Y[{i}]: {Y[i]}")
    print(f"Encoded Y_seq[{i}]: {Y_seq[i]}\n")

Original X[0]: بِسْمِ
Encoded X_seq[0]: [14, 3, 21, 5, 7, 3]
Padded X_padded[0]: [14  3 21 ...  0  0  0]
Original Y[0]: اللَّهِ
Encoded Y_seq[0]: 49

Original X[1]: بِسْمِ اللَّهِ
Encoded X_seq[1]: [14, 3, 21, 5, 7, 3, 2, 4, 4, 10, 1, 12, 3]
Padded X_padded[1]: [14  3 21 ...  0  0  0]
Original Y[1]: الرَّحْمَـٰنِ
Encoded Y_seq[1]: 447

Original X[2]: بِسْمِ اللَّهِ الرَّحْمَـٰنِ
Encoded X_seq[2]: [14, 3, 21, 5, 7, 3, 2, 4, 4, 10, 1, 12, 3, 2, 4, 15, 10, 1, 27, 5, 7, 1, 23, 18, 9, 3]
Padded X_padded[2]: [14  3 21 ...  0  0  0]
Original Y[2]: الرَّحِيمِ
Encoded Y_seq[2]: 305



In [None]:
from sklearn.model_selection import train_test_split
import numpy as np
import torch

# Assuming X_padded contains your input sequences and Y_seq the corresponding next tokens
# Splitting the dataset into training and validation sets
X_train, X_val, Y_train, Y_val = train_test_split(X_padded, Y_seq, test_size=0.1, random_state=42)

# Convert to PyTorch tensors
X_train = torch.tensor(X_train)
Y_train = torch.tensor(Y_train)
X_val = torch.tensor(X_val)
Y_val = torch.tensor(Y_val)

**Creating Data Loaders**

In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

batch_size = 10  # Adjust based on your GPU memory

# Assuming Y_train and Y_val contain the index of the next token to predict
def expand_labels_for_mlm(input_ids, labels_indices, ignore_index=-100):
    # Create a labels tensor filled with ignore_index
    labels = torch.full(input_ids.shape, ignore_index)
    for i, label_index in enumerate(labels_indices):
        # Set the label for the position we want to predict
        if label_index < input_ids.shape[1]:  # Ensure the index is within the sequence length
            labels[i, label_index] = input_ids[i, label_index]
    return labels

# Example usage
labels_train = expand_labels_for_mlm(X_train, Y_train)
labels_val = expand_labels_for_mlm(X_val, Y_val)


# Create Tensor datasets
train_data = TensorDataset(X_train, Y_train)
val_data = TensorDataset(X_val, Y_val)

# Create DataLoaders
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

In [None]:
import re

# Collect all word tokens
All_Tokens = list(QURAN_DF['Tokens'])
All_vocab_list = []

for token_list in All_Tokens:
    for token in token_list:
        # Remove non-Arabic characters from the token
        cleaned_token = re.sub(r'[^\u0600-\u06FF\s]', '', token)
        # Check if the cleaned token is not empty
        if cleaned_token.strip():
            if cleaned_token not in All_vocab_list:
                All_vocab_list.append(cleaned_token)

vocab_size = len(All_vocab_list)
print(f"Vocab Size: {vocab_size}")

Vocab Size: 2757


# **Build Baseline Model**

In [None]:
import torch
import torch.nn as nn

class LSTMBaselineModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(LSTMBaselineModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        # Embed the input sequence
        embeds = self.embedding(x)
        # Pass through the LSTM
        lstm_out, _ = self.lstm(embeds)
        # Get the last hidden state
        last_hidden = lstm_out[:, -1, :]
        # Pass the last hidden state through the linear layer
        output = self.linear(last_hidden)
        return output

embedding_dim = 100
hidden_dim = 128

model = LSTMBaselineModel(vocab_size, embedding_dim, hidden_dim)

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Move the model to the GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

LSTMBaselineModel(
  (embedding): Embedding(2757, 100)
  (lstm): LSTM(100, 128, batch_first=True)
  (linear): Linear(in_features=128, out_features=2757, bias=True)
)

In [None]:
# Assuming your DataLoader is set up and called `data_loader`
for epoch in range(1):
    model.train()  # Set model to training mode
    total_loss = 0
    for batch in train_dataloader :
        inputs, targets = batch
        inputs, targets = inputs.to(device), targets.to(device)

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward + backward + optimize
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f'Epoch {epoch+1}, Loss: {total_loss/len(train_dataloader )}')

Epoch 1, Loss: 7.702776829551884


In [None]:
import torch
import numpy as np

# Function to calculate accuracy
def calculate_accuracy(y_true, y_pred):
    correct = (y_true == y_pred).float()  # convert into float for division
    acc = correct.sum() / len(correct)
    return acc

# Function for making predictions and evaluating model
def evaluate_model(model, data_loader, device):
    model.eval()  # Set model to evaluation mode
    total_accuracy = 0
    total_count = 0

    with torch.no_grad():
        for batch in data_loader:
            inputs, labels = batch
            inputs, labels = inputs.to(device), labels.to(device)

            # Forward pass
            outputs = model(inputs)

            # Convert outputs to probabilities and predict class with highest probability
            probabilities = torch.softmax(outputs, dim=1)
            predictions = torch.argmax(probabilities, dim=1)

            # Calculate accuracy
            accuracy = calculate_accuracy(labels, predictions)
            total_accuracy += accuracy.item()
            total_count += 1

    # Calculate average accuracy across all batches
    avg_accuracy = total_accuracy / total_count
    return avg_accuracy

# Assuming you have a validation DataLoader called `val_dataloader`
val_accuracy = evaluate_model(model, val_dataloader, device)
print(f'Validation Accuracy: {val_accuracy:.4f}')


Validation Accuracy: 0.0051
