In [None]:
# Install the third-party packages used by this notebook (run once per fresh runtime).
# NOTE(review): versions are not pinned, so a later re-run may resolve to different
# releases — consider pinning once a working set is known.
!pip install transformers[torch]
!pip install accelerate
!pip install pytorch_model_summary
!pip install torchmetrics
!pip install apex.pytorch

In [None]:
# wrap text to the viewing window for this notebook
from IPython.display import HTML, display

def set_css(info=None):
  """Emit a <style> element that makes <pre> output wrap instead of scrolling sideways.

  Args:
    info: Execution-info object that IPython supplies to ``pre_run_cell``
      callbacks. It is not used; the parameter exists so the hook can be
      invoked both with and without that argument.
  """
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))

# Run the hook before every cell executes.
get_ipython().events.register('pre_run_cell', set_css)

In [None]:
# Imports
import ast
import math
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch.optim as optim
from torch.optim import AdamW
from collections import defaultdict
from gensim.models import Word2Vec

import torch
import torch.nn as nn
from pytorch_model_summary import summary
from torchmetrics import Accuracy, F1Score, Precision, Recall
from torch.utils.data import Dataset, TensorDataset, DataLoader, RandomSampler, SequentialSampler, random_split
from accelerate import Accelerator
from transformers import AutoModelForMaskedLM, BertGenerationEncoder, BertGenerationConfig, BertForMaskedLM, AutoTokenizer, BertTokenizer, AdamW, AutoModel
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from torch.nn.utils.rnn import pad_sequence

import spacy
from spacy.lang.ar import Arabic
from spacy.lang.ar.examples import sentences

from google.colab import drive

In [None]:
# Mount Google Drive at /content/drive; the dataset path used below lives under this mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Absolute path of the annotated spreadsheet inside the mounted Drive.
# NOTE(review): the 'Project ' folder in this path ends with a space — confirm it matches
# the Drive folder name exactly, and consider moving the path to a config cell.
df_path = '/content/drive/MyDrive/Artificial Intelligence/Class: Natural Language Processing NLP/Project /Phase 1/Output/QuranDF_wTokensTags.xlsx'
# Read the spreadsheet into a DataFrame.
QURAN_DF = pd.read_excel(df_path)
# Preview the first rows.
QURAN_DF.head()

Unnamed: 0,Sura Number,Ayah Number,Text,Cleaned Text,Tokens,Token_POS,POS_Tags,Token_POS_Sura_Ayah
0,1,1,بِسْمِ ٱللَّهِ ٱلرَّحْمَـٰنِ ٱلرَّحِيمِ,بِسْمِ اللَّهِ الرَّحْمَـٰنِ الرَّحِيمِ,"['بِسْمِ', 'اللَّهِ', 'الرَّحْمَـٰنِ', 'الرَّح...","{'بِسْمِ': 'P+N', 'اللَّهِ': 'PN', 'الرَّحْمَـ...","['P+N', 'PN', 'DET+ADJ', 'DET+ADJ']","[('بِسْمِ', 'P+N', 1, 1), ('اللَّهِ', 'PN', 1,..."
1,1,2,ٱلْحَمْدُ لِلَّهِ رَبِّ ٱلْعَـٰلَمِينَ,الْحَمْدُ لِلَّهِ رَبِّ الْعَـٰلَمِينَ,"['الْحَمْدُ', 'لِلَّهِ', 'رَبِّ', 'الْعَـٰلَمِ...","{'الْحَمْدُ': 'DET+N', 'لِلَّهِ': 'P+PN', 'رَب...","['DET+N', 'P+PN', 'N', 'DET+N']","[('الْحَمْدُ', 'DET+N', 1, 2), ('لِلَّهِ', 'P+..."
2,1,3,ٱلرَّحْمَـٰنِ ٱلرَّحِيمِ,الرَّحْمَـٰنِ الرَّحِيمِ,"['الرَّحْمَـٰنِ', 'الرَّحِيمِ']","{'الرَّحْمَـٰنِ': 'DET+ADJ', 'الرَّحِيمِ': 'DE...","['DET+ADJ', 'DET+ADJ']","[('الرَّحْمَـٰنِ', 'DET+ADJ', 1, 3), ('الرَّحِ..."
3,1,4,مَـٰلِكِ يَوْمِ ٱلدِّينِ,مَـٰلِكِ يَوْمِ الدِّينِ,"['مَـٰلِكِ', 'يَوْمِ', 'الدِّينِ']","{'مَـٰلِكِ': 'N', 'يَوْمِ': 'N', 'الدِّينِ': '...","['N', 'N', 'DET+N']","[('مَـٰلِكِ', 'N', 1, 4), ('يَوْمِ', 'N', 1, 4..."
4,1,5,إِيَّاكَ نَعْبُدُ وَإِيَّاكَ نَسْتَعِينُ,اِيَّاكَ نَعْبُدُ وَاِيَّاكَ نَسْتَعِينُ,"['اِيَّاكَ', 'نَعْبُدُ', 'وَاِيَّاكَ', 'نَسْتَ...","{'اِيَّاكَ': 'PRON', 'نَعْبُدُ': 'V', 'وَاِيَّ...","['PRON', 'V', 'CONJ+PRON', 'V']","[('اِيَّاكَ', 'PRON', 1, 5), ('نَعْبُدُ', 'V',..."


In [None]:
# The 'Tokens' column stores each token list as text; parse every cell back into a Python list.
tokenized_text = QURAN_DF['Tokens'].map(ast.literal_eval)
# Preview the parsed lists as a one-column frame.
tokenized_text.head().to_frame()

Unnamed: 0,Tokens
0,"[بِسْمِ, اللَّهِ, الرَّحْمَـٰنِ, الرَّحِيمِ]"
1,"[الْحَمْدُ, لِلَّهِ, رَبِّ, الْعَـٰلَمِينَ]"
2,"[الرَّحْمَـٰنِ, الرَّحِيمِ]"
3,"[مَـٰلِكِ, يَوْمِ, الدِّينِ]"
4,"[اِيَّاكَ, نَعْبُدُ, وَاِيَّاكَ, نَسْتَعِينُ]"


In [None]:
# Number of neighbouring tokens taken on each side of the target word
window_size = 5

# Build (context, target) pairs: for every token, the surrounding tokens that fall
# inside the window (within the same token list) form the context.
training_samples = []
for tokens in tokenized_text:
    n_tokens = len(tokens)
    for i, target_word in enumerate(tokens):
        lo = max(0, i - window_size)
        hi = min(n_tokens, i + window_size + 1)
        context_words = [tokens[j] for j in range(lo, hi) if j != i]
        # A token with no neighbours (single-token list) yields no sample
        if not context_words:
            continue
        training_samples.append((context_words, target_word))

# Print the first few pairs as a sanity check
for context, target in training_samples[:5]:
    print(f"Context: {context}, Target: {target}")

Context: ['اللَّهِ', 'الرَّحْمَـٰنِ', 'الرَّحِيمِ'], Target: بِسْمِ
Context: ['بِسْمِ', 'الرَّحْمَـٰنِ', 'الرَّحِيمِ'], Target: اللَّهِ
Context: ['بِسْمِ', 'اللَّهِ', 'الرَّحِيمِ'], Target: الرَّحْمَـٰنِ
Context: ['بِسْمِ', 'اللَّهِ', 'الرَّحْمَـٰنِ'], Target: الرَّحِيمِ
Context: ['لِلَّهِ', 'رَبِّ', 'الْعَـٰلَمِينَ'], Target: الْحَمْدُ


In [None]:
# Train a skip-gram (sg=1) Word2Vec model on the token lists.
# A fixed seed together with a single worker thread keeps the learned vectors
# repeatable between runs; multi-threaded training is not deterministic.
# NOTE(review): repeatability across interpreter launches may additionally need
# PYTHONHASHSEED to be set — see the gensim Word2Vec documentation.
word2vec_model = Word2Vec(
    sentences=tokenized_text,
    vector_size=100,
    window=5,
    min_count=1,
    sg=1,
    seed=42,
    workers=1,
)
# Save the trained model
word2vec_model.save("./word2vec_model.bin")

In [None]:
# Split the (context, target) pairs into two parallel lists: inputs and labels
X = [pair[0] for pair in training_samples]
y = [pair[1] for pair in training_samples]

# Show the first five entries of each list, one list per line
print(X[:5], y[:5], sep="\n")

# Both lists must have the same number of entries
print(len(X), len(y), sep="\n")

[['اللَّهِ', 'الرَّحْمَـٰنِ', 'الرَّحِيمِ'], ['بِسْمِ', 'الرَّحْمَـٰنِ', 'الرَّحِيمِ'], ['بِسْمِ', 'اللَّهِ', 'الرَّحِيمِ'], ['بِسْمِ', 'اللَّهِ', 'الرَّحْمَـٰنِ'], ['لِلَّهِ', 'رَبِّ', 'الْعَـٰلَمِينَ']]
['بِسْمِ', 'اللَّهِ', 'الرَّحْمَـٰنِ', 'الرَّحِيمِ', 'الْحَمْدُ']
6168
6168


In [None]:
# Building the vocabulary from both X and Y
all_words = [word for context, target in training_samples for word in context + [target]]

# Sort the unique words before numbering them: iteration order of a set of strings
# can differ between interpreter runs, which would silently change every word id
# (and make saved weights / embedding rows meaningless after a kernel restart).
# Ids start at 1 so that 0 stays reserved for padding.
vocab = {word: i for i, word in enumerate(sorted(set(all_words)), start=1)}
vocab_size = len(vocab) + 1  # +1 for padding token
word_to_ix = vocab  # For readability
ix_to_word = {ix: word for word, ix in vocab.items()}  # For readability


print(list(ix_to_word.keys())[:10])
print(list(ix_to_word.values())[:10])
print(vocab_size)

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
['الْكِتَـٰبُ', 'قَرْضًا', 'فَاحْذَرُوهُ', 'اَمْوَٰتًۭا', 'قِبْلَتَكَ', 'كُلَّمَا', 'وَاِنَّهُۥ', 'كَمَا', 'عُقْدَةَ', 'تَيَمَّمُوا۟']
2758


In [None]:
# Define a custom dataset class for Quranic text data
class QuranDataset(Dataset):
    """Map-style dataset yielding ``(padded context ids, target id)`` tensor pairs.

    Args:
        X: Sequence of contexts. Each context is either a list of words or a
            whitespace-separated string of words.
        Y: Sequence of target words, aligned index-by-index with ``X``.
        vocab: Mapping from word to integer id. Padding uses the value 0, so
            real word ids are expected to be non-zero.
        max_len: Optional fixed length of every returned context tensor. When
            omitted, the longest context in ``X`` (counted in words) is used.
            Pass the same value to several datasets (e.g. train and test) to
            give them identical tensor widths. Longer contexts are truncated.

    Raises:
        KeyError: From ``__getitem__`` when a word is missing from ``vocab``.
    """

    def __init__(self, X, Y, vocab, max_len=None):
        self.X = X
        self.Y = Y
        self.vocab = vocab
        if max_len is None:
            # Count words, not characters, so string contexts are measured the
            # same way __getitem__ encodes them; default=0 tolerates empty X.
            max_len = max((len(self._words(context)) for context in X), default=0)
        self.max_len = max_len

    @staticmethod
    def _words(context):
        """Return the context as a sequence of words (splitting it if it is a string)."""
        return context.split() if isinstance(context, str) else context

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        ids = [self.vocab[word] for word in self._words(self.X[idx])]
        # Only relevant when an explicit, shorter max_len was requested
        ids = ids[:self.max_len]
        # Right-pad with the padding id 0 up to the common length
        padded = ids + [0] * (self.max_len - len(ids))
        target = self.vocab[self.Y[idx]]
        return torch.tensor(padded, dtype=torch.long), torch.tensor(target, dtype=torch.long)


# Hold out 20% of the samples for evaluation; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Wrap each split in a dataset that encodes words through the shared vocabulary
train_dataset, test_dataset = (
    QuranDataset(contexts, targets, vocab)
    for contexts, targets in ((X_train, y_train), (X_test, y_test))
)

# Batches of 32; only the training batches are shuffled
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [None]:
#tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Encode every sample as the ids of its context words followed by the id of its target
sequences = [
    [vocab[word] for word in (*context, target)]
    for context, target in training_samples
]

# Bring all sequences to the length of the longest one, padding at the end
max_length = max(map(len, sequences))
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')

# Two tensors built from the same padded matrix: one keeping the array's own dtype,
# one explicitly converted to int64
input_ids = torch.tensor(padded_sequences)
tensor_sequences = torch.tensor(padded_sequences, dtype=torch.long)


In [None]:
def initialize_weights(model):
    """Re-initialise the parameters of a single layer in place, chosen by layer type.

    * ``nn.Linear``: Xavier-uniform weight, zero bias (when a bias exists).
    * ``nn.Embedding``: normal(0, 0.02) weight; the ``padding_idx`` row, if any,
      is reset to zeros afterwards so padding keeps a zero vector.
    * ``nn.LayerNorm``: weight set to 1 and bias set to 0, each only when present
      (they are ``None`` for layers created without affine parameters).

    Any other module type is left untouched.

    Args:
        model: The module to initialise.

    Returns:
        None. The module is modified in place.
    """
    if isinstance(model, nn.Linear):
        nn.init.xavier_uniform_(model.weight)
        if model.bias is not None:
            nn.init.constant_(model.bias, 0)
    elif isinstance(model, nn.Embedding):
        nn.init.normal_(model.weight, mean=0, std=0.02)
        if model.padding_idx is not None:
            # normal_ overwrote the padding row; restore its zero vector
            with torch.no_grad():
                model.weight[model.padding_idx].fill_(0)
    elif isinstance(model, nn.LayerNorm):
        if model.weight is not None:
            nn.init.constant_(model.weight, 1)
        if model.bias is not None:
            nn.init.constant_(model.bias, 0)

In [None]:
# Load the trained Word2Vec model
# Despite the "_dir" suffix this is a file path: the same "./word2vec_model.bin"
# file written by the earlier word2vec_model.save(...) call.
word2vec_model_dir = "./word2vec_model.bin"
# Rebinds `word2vec_model` to the model read back from disk.
word2vec_model = Word2Vec.load(word2vec_model_dir)

In [None]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to batch-first embeddings.

    Args:
        d_model: Size of the last (feature) dimension of the inputs.
        max_len: Number of positions for which encodings are precomputed.

    The table is stored in the non-trainable buffer ``pe`` with shape
    ``(max_len, 1, d_model)``.
    """

    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.d_model = d_model
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        # With an odd d_model there is one cosine column fewer than sine columns
        pe[:, 1::2] = torch.cos(position * div_term)[:, : d_model // 2]
        # (max_len, d_model) -> (max_len, 1, d_model)
        self.register_buffer('pe', pe.unsqueeze(1))

    def forward(self, x):
        """Return ``x`` plus the encoding of each position.

        Args:
            x: Tensor of shape ``(batch, seq_len, d_model)``.

        Returns:
            Tensor with the same shape as ``x``.

        Raises:
            ValueError: If ``seq_len`` exceeds the number of precomputed positions.
        """
        seq_len = x.size(1)
        if seq_len > self.pe.size(0):
            raise ValueError(
                f"Sequence length {seq_len} exceeds the {self.pe.size(0)} precomputed positions"
            )
        # Take the first seq_len positions and move the singleton axis to the
        # front so the table broadcasts over the batch: (1, seq_len, d_model)
        return x + self.pe[:seq_len].transpose(0, 1)

In [None]:
class BERTLikeModel(nn.Module):
    """Transformer encoder on top of pretrained Word2Vec embeddings.

    Args:
        word2vec_model: Trained gensim Word2Vec model; its ``wv.vectors`` matrix
            provides the (frozen) embedding weights.
        num_layers: Number of stacked transformer encoder layers.
        embedding_dim: Model width (``d_model``). When it differs from the
            Word2Vec vector size, a learned linear projection maps the
            embeddings to this width.
        num_heads: Number of attention heads; must divide ``embedding_dim``.
        ff_dim: Hidden size of each layer's feed-forward sub-network.
        dropout: Dropout probability inside the encoder layers.
        vocab: Optional mapping word -> integer id describing the ids found in
            ``input_ids``. When given, the embedding matrix is laid out by these
            ids (words unknown to Word2Vec keep a zero vector) and id 0 is
            treated as padding unless a word owns it. When omitted, rows follow
            Word2Vec's own index order and no padding id is assumed.
        num_classes: Optional number of output classes. When given, ``forward``
            returns one logit vector per sequence; otherwise it returns the
            encoded sequence.
    """

    def __init__(self, word2vec_model, num_layers=6, embedding_dim=96, num_heads=12, ff_dim=3072, dropout=0.1,
                 vocab=None, num_classes=None):
        super(BERTLikeModel, self).__init__()
        self.embedding_dim = embedding_dim
        self.num_heads = num_heads
        self.num_layers = num_layers
        # Ensure embedding dimension is divisible by the number of attention heads
        assert self.embedding_dim % self.num_heads == 0, "Embedding dimension must be divisible by the number of attention heads"

        # Pretrained vectors, copied so the gensim model's array is never shared
        pretrained = torch.tensor(word2vec_model.wv.vectors, dtype=torch.float)
        if vocab is None:
            weights = pretrained
            self.padding_idx = None
        else:
            # One row per id used by `vocab`; rows without a pretrained vector stay zero
            weights = torch.zeros(max(vocab.values(), default=0) + 1, pretrained.size(1))
            key_to_index = word2vec_model.wv.key_to_index
            for word, ix in vocab.items():
                row = key_to_index.get(word)
                if row is not None:
                    weights[ix] = pretrained[row]
            self.padding_idx = None if 0 in vocab.values() else 0
        self.embedding = nn.Embedding.from_pretrained(weights, padding_idx=self.padding_idx)

        # Bridge the Word2Vec vector size and the transformer width when they differ
        word_dim = weights.size(1)
        if word_dim != embedding_dim:
            self.input_proj = nn.Linear(word_dim, embedding_dim)
        else:
            self.input_proj = nn.Identity()

        # Sinusoidal position table
        self.pos_encoder = PositionalEncoding(embedding_dim)

        # nn.TransformerEncoder clones ONE template layer num_layers times,
        # so it must be given a single layer rather than a list of layers
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=self.embedding_dim, nhead=self.num_heads, dim_feedforward=ff_dim, dropout=dropout
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        # Optional prediction head over the pooled sequence representation
        self.classifier = nn.Linear(embedding_dim, num_classes) if num_classes is not None else None

    def forward(self, input_ids):
        """Encode a batch of id sequences.

        Args:
            input_ids: Integer tensor of shape ``(batch, seq_len)``.

        Returns:
            ``(batch, num_classes)`` logits when the model was built with
            ``num_classes``; otherwise the encoder output in
            ``(seq_len, batch, embedding_dim)`` layout.
        """
        hidden = self.input_proj(self.embedding(input_ids))  # (batch, seq_len, embedding_dim)

        # Read the sinusoid table straight from the encoder's `pe` buffer. The buffer
        # holds a (max_len, d_model) table plus one singleton axis; flattening that
        # axis away lets the table be indexed by position and broadcast over the batch.
        pos_table = self.pos_encoder.pe.reshape(-1, self.embedding_dim)
        hidden = hidden + pos_table[: hidden.size(1)].unsqueeze(0)

        # True where a position is padding, so attention ignores it
        pad_mask = input_ids.eq(self.padding_idx) if self.padding_idx is not None else None

        # The encoder layers use the default sequence-first layout
        encoded = self.encoder(hidden.transpose(0, 1), src_key_padding_mask=pad_mask)
        if self.classifier is None:
            return encoded

        encoded = encoded.transpose(0, 1)  # back to (batch, seq_len, embedding_dim)
        if pad_mask is None:
            pooled = encoded.mean(dim=1)
        else:
            # Average over the real (non-padding) positions only
            keep = (~pad_mask).unsqueeze(-1).to(encoded.dtype)
            pooled = (encoded * keep).sum(dim=1) / keep.sum(dim=1).clamp(min=1.0)
        return self.classifier(pooled)

# Adjust the embedding dimension or number of attention heads
num_layers = 6  # Example: Choose the number of layers
embedding_dim = 96  # Example: Choose an embedding dimension divisible by 12
num_heads = 12

# Initialize the BERT-like model. `vocab` aligns the embedding rows with the ids the
# datasets produce, and `num_classes=vocab_size` gives one logit per word id (incl. padding id 0).
model = BERTLikeModel(
    word2vec_model,
    num_layers=num_layers,
    embedding_dim=embedding_dim,
    num_heads=num_heads,
    vocab=vocab,
    num_classes=vocab_size,
)


In [None]:
# Move model to device (GPU if available) before building the optimizer
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=2e-5)

# Define number of epochs
num_epochs = 3

# Training loop
for epoch in range(num_epochs):
    model.train()  # Set model to training mode
    total_loss = 0.0
    progress_bar = tqdm(train_dataloader, desc=f'Epoch {epoch + 1}/{num_epochs}', leave=False)

    # `step` counts the batches processed so far in this epoch (1-based)
    for step, (input_ids, labels) in enumerate(progress_bar, start=1):
        # Move batch to device
        input_ids = input_ids.to(device)
        labels = labels.to(device)

        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(input_ids)

        # Flatten to (N, C) scores and (N,) class ids, the layout CrossEntropyLoss expects.
        # reshape (unlike view) also accepts non-contiguous tensors.
        outputs = outputs.reshape(-1, outputs.shape[-1])
        labels = labels.reshape(-1)

        # Compute loss
        loss = criterion(outputs, labels)

        # Backward pass and optimization step
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        # Show the running mean over the batches seen so far, not over the whole epoch
        progress_bar.set_postfix({'Loss': total_loss / step})

    # Print average loss for the epoch (max(...) guards against an empty loader)
    print(f'Epoch {epoch + 1}/{num_epochs}, Average Loss: {total_loss / max(1, len(train_dataloader))}')

print('Training complete!')




RuntimeError: The size of tensor a (100) must match the size of tensor b (96) at non-singleton dimension 2