In [None]:
!pip install transformers==4.28.0


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers==4.28.0
  Downloading transformers-4.28.0-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m50.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0 (from transformers==4.28.0)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m19.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.28.0)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m97.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transform

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import tensorflow_hub as hub
import tensorflow as tf
import pandas as pd
from transformers import GPT2LMHeadModel, GPT2Tokenizer, pipeline

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Load models

#### Load aragpt model

In [None]:
!unzip -q "/content/drive/MyDrive/aragpt_final_one.zip" -d "/content/"

mkdir: cannot create directory ‘’: No such file or directory


In [None]:
aragpt_folder_path = '/content/aragpt_model'
!mkdir -p "$aragpt_folder_path"

In [None]:
aragpt_tokenizer = GPT2Tokenizer.from_pretrained(
    'aubmindlab/aragpt2-base'
    )
aragpt_model = GPT2LMHeadModel.from_pretrained(
    aragpt_folder_path,
    pad_token_id = aragpt_tokenizer.eos_token_id
    )

In [None]:
aragpt_generator_ = pipeline(
    'text-generation',
    model = aragpt_folder_path,
    tokenizer='aubmindlab/aragpt2-base'
    )

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/4.52M [00:00<?, ?B/s]

### Load the gpt model

In [None]:
!unzip -q "/content/drive/MyDrive/gpt_for_poems (2).zip" -d "/content/"

In [None]:
folder_path = '/content/gpt_model'
!mkdir -p "$folder_path"

In [None]:
gpt_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
gpt_model = GPT2LMHeadModel.from_pretrained(
    folder_path,
    pad_token_id = gpt_tokenizer.eos_token_id
    )

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [None]:
english_generator = pipeline(
    'text-generation',
    model = folder_path,
    tokenizer = gpt_tokenizer
    )

### Load the "Uneversal Sentence Encoder (USE)" model for embedding.

In [None]:
#Load the pre-trained model#
embed = hub.load(
    "https://tfhub.dev/google/universal-sentence-encoder-large/5"
    )

# Define some usefull functions

In [None]:
def calculate_similarity(embed1, embed2):
    # Define two vectors
    vector1 = np.array(embed1)
    vector2 = np.array(embed2)

    # Reshape the vectors to have a 2D shape
    vector1 = vector1.reshape(1, -1)
    vector2 = vector2.reshape(1, -1)

    # Calculate the cosine similarity
    similarity = cosine_similarity(vector1, vector2)

    return similarity

In [None]:
def generate_embedding_for_prediction(seed_line, generator_):

    #Generate text based on a line
    predicted_line = generator_(seed_line, max_length = 20)[0]['generated_text']
    predicted_embedding = embed( [predicted_line] )

    return [predicted_embedding]

In [None]:
def generate_embedding_for_arabic_prediction(seed_line: str):
    return generate_embedding_for_prediction(seed_line, aragpt_generator_)

def generate_embedding_for_english_prediction(seed_line: str):
    return generate_embedding_for_prediction(seed_line, english_generator)

# Evaluate the English one.

### Get embedding for the verses in the test dataset because each one except the first one will be the ground truth prediction for the verses before it.

### Read the test data

In [None]:
# Read the text file
with open('test_dataset_en.txt', 'r') as file:
    lines = file.readlines()

# Create a DataFrame with each row as a sample
df = pd.DataFrame({'text': lines})


df['text'] = df['text'].astype(str)

# Remove empty rows
df = df[df['text'] != '']

df = df.head(150)

### Get embeddings for the ground truth

In [None]:
english_embeddings = embed(
    df['text'].to_list()
)

In [None]:
df['embeddings'] = [embedded_sentence for embedded_sentence in english_embeddings]

In [None]:
df

Unnamed: 0,text,embeddings
0,dark storms his genial powers controul\n,"(tf.Tensor(0.030426197, shape=(), dtype=float3..."
1,and willingly i close the deep ensanguin'd scr...,"(tf.Tensor(-0.006629959, shape=(), dtype=float..."
2,years follow years slow rolling on\n,"(tf.Tensor(0.054826934, shape=(), dtype=float3..."
3,no daring deeds of valour done\n,"(tf.Tensor(-0.051109336, shape=(), dtype=float..."
4,till on this spot a monarch's fateh\n,"(tf.Tensor(0.017863087, shape=(), dtype=float3..."
...,...,...
145,whatever crowns the hill or smiles along the v...,"(tf.Tensor(0.045052286, shape=(), dtype=float3..."
146,outcast of nature man the wretched thrall\n,"(tf.Tensor(0.058078557, shape=(), dtype=float3..."
147,of bitterdropping sweat of sweltry pain\n,"(tf.Tensor(-0.013565971, shape=(), dtype=float..."
148,of cares that eat away thy heart with gall\n,"(tf.Tensor(-0.02529077, shape=(), dtype=float3..."


### Get embeddings for the predicted text

In [None]:
df['predicted_embedding'] = df['text'].map(generate_embedding_for_english_prediction)

Input length of input_ids is 17, but `max_length` is set to 15. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.
Input length of input_ids is 16, but `max_length` is set to 15. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


In [None]:
df

Unnamed: 0,text,embeddings,predicted_embedding
0,dark storms his genial powers controul\n,"(tf.Tensor(0.030426197, shape=(), dtype=float3...","[((tf.Tensor(0.04336138, shape=(), dtype=float..."
1,and willingly i close the deep ensanguin'd scr...,"(tf.Tensor(-0.006629959, shape=(), dtype=float...","[((tf.Tensor(-0.00882976, shape=(), dtype=floa..."
2,years follow years slow rolling on\n,"(tf.Tensor(0.054826934, shape=(), dtype=float3...","[((tf.Tensor(0.025598433, shape=(), dtype=floa..."
3,no daring deeds of valour done\n,"(tf.Tensor(-0.051109336, shape=(), dtype=float...","[((tf.Tensor(-0.023306172, shape=(), dtype=flo..."
4,till on this spot a monarch's fateh\n,"(tf.Tensor(0.017863087, shape=(), dtype=float3...","[((tf.Tensor(0.032662664, shape=(), dtype=floa..."
...,...,...,...
145,whatever crowns the hill or smiles along the v...,"(tf.Tensor(0.045052286, shape=(), dtype=float3...","[((tf.Tensor(0.034978043, shape=(), dtype=floa..."
146,outcast of nature man the wretched thrall\n,"(tf.Tensor(0.058078557, shape=(), dtype=float3...","[((tf.Tensor(0.09407078, shape=(), dtype=float..."
147,of bitterdropping sweat of sweltry pain\n,"(tf.Tensor(-0.013565971, shape=(), dtype=float...","[((tf.Tensor(-0.060619242, shape=(), dtype=flo..."
148,of cares that eat away thy heart with gall\n,"(tf.Tensor(-0.02529077, shape=(), dtype=float3...","[((tf.Tensor(-0.050496154, shape=(), dtype=flo..."


### Find the cosine similiraty between embeddings.


In [None]:
df['similarity'] = [i-i for i in range(df.shape[0])]
df = df[df['predicted_embedding'].notna()]
df.shape[0]


150

In [None]:
for i in range(1, df.shape[0] - 1):
    predicted_embedding = df['predicted_embedding'].iloc[i][0]
    ground_truth_embedding = df['predicted_embedding'].iloc[i + 1][0]

    df['similarity'].iloc[i] = calculate_similarity(predicted_embedding, ground_truth_embedding)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['similarity'].iloc[i] = calculate_similarity(predicted_embedding, ground_truth_embedding)


#### get some stats about similarities between the ground truth with the predicted embeddings.

In [None]:
df['similarity'].describe()

count    150.000000
mean       0.171373
std        0.095159
min       -0.040577
25%        0.109182
50%        0.165519
75%        0.231312
max        0.526495
Name: similarity, dtype: float64

In [None]:
print(
    english_generator(
        'dark storms his genial powers controul\n',
        max_length = 80
        )[0]['generated_text']
      )

dark storms his genial powers controul
and in a moment when my heart felt warm'd his influence unvary'd
all my fears were borne away with a calm roar
and when our friendship expired the gilded bell
then i saw the wreaths of virgil's flame flying
and at our parting to each other sojourn'd in the air
a thousand times


## Model Enhancement:
  1. fine-tune the model on more data.
  2. Increase number of epochs during the tunning process.

# Evaluate the Arabic one.

In [None]:
print(
    aragpt_generator_(
        'لا  قلت شعرا ولا سمعت غنا ولا',
        max_length = 80)[0]['generated_text']
      )

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


لا  قلت شعرا ولا سمعت غنا ولا شربت الخمر
إذا ما جئته لم تغن عنه ولم تطعني عن شرب الخمر
ألم تر أن الله لا يفارقه وإن كان في الأرض من خمر
فما أنت إلا خلائق قد عفت وما أنت إلا سقيم
ولقد علمت أني لست أدري إن كنت صادقا أو كاذبا
وأعلم أنني إذا جئت


### Read the test data

In [None]:
# Read the text file
with open('test_dataset.txt', 'r') as file:
    lines = file.readlines()

# Create a DataFrame with each row as a sample
df = pd.DataFrame({'text': lines})


df['text'] = df['text'].astype(str)

# Remove empty rows
df = df[df['text'] != '']


In [None]:
df = df.head(150)
df

Unnamed: 0,text
0,إن أنت لم تلق لي المودة في صدر حبيبي وأنت مقتدر\n
1,لا قلت شعرا ولا سمعت غنا ولا جرى في مفاصلي الس...
2,ولا أزال القرآن أدرسه أروح في درسه وأبتكر\n
3,وألزم الصوم والصلاة ولا أزال دهري بالخير آتمر\n
4,فما مضت بعد ذاك ثالثة حتى أتاني الحبيب يعتذر\n
...,...
145,فقد جنيت الهموم منه وقد خليت قلبي يعوم في الفكر\n
146,لا أسعد القلب في هواه ولا يطمع في عزتي ولا خوري\n
147,عف ضميري وطيب خبري ولذتي في الحديث والنظر\n
148,هارون يا خير الخلائف كلهم ممن مضى فيهم وهذا ال...


### Generate embeddings for the ground truth

In [None]:
embeddings = embed(
    df['text'].to_list()
)

In [None]:
df['embeddings'] = [embedded_sentence for embedded_sentence in embeddings]
df['predicted_embedding'] = [i-i for i in range(df.shape[0])]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['embeddings'] = [embedded_sentence for embedded_sentence in embeddings]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['predicted_embedding'] = [i-i for i in range(df.shape[0])]


In [None]:
for i in range(150):
    df['predicted_embedding'].iloc[i + 1] = generate_embedding_for_arabic_prediction(df['text'].iloc[i])

In [None]:
df

Unnamed: 0,text,embeddings,predicted_embedding
0,إن أنت لم تلق لي المودة في صدر حبيبي وأنت مقتدر\n,"(tf.Tensor(-0.06311622, shape=(), dtype=float3...",0
1,لا قلت شعرا ولا سمعت غنا ولا جرى في مفاصلي الس...,"(tf.Tensor(0.002743538, shape=(), dtype=float3...","[((tf.Tensor(-0.06398336, shape=(), dtype=floa..."
2,ولا أزال القرآن أدرسه أروح في درسه وأبتكر\n,"(tf.Tensor(-0.02488508, shape=(), dtype=float3...","[((tf.Tensor(-0.01238151, shape=(), dtype=floa..."
3,وألزم الصوم والصلاة ولا أزال دهري بالخير آتمر\n,"(tf.Tensor(-0.070586234, shape=(), dtype=float...","[((tf.Tensor(-0.047599357, shape=(), dtype=flo..."
4,فما مضت بعد ذاك ثالثة حتى أتاني الحبيب يعتذر\n,"(tf.Tensor(-0.09535104, shape=(), dtype=float3...","[((tf.Tensor(-0.05444303, shape=(), dtype=floa..."
...,...,...,...
145,فقد جنيت الهموم منه وقد خليت قلبي يعوم في الفكر\n,"(tf.Tensor(-0.035766374, shape=(), dtype=float...","[((tf.Tensor(-0.032759454, shape=(), dtype=flo..."
146,لا أسعد القلب في هواه ولا يطمع في عزتي ولا خوري\n,"(tf.Tensor(0.0029054228, shape=(), dtype=float...","[((tf.Tensor(-0.046809435, shape=(), dtype=flo..."
147,عف ضميري وطيب خبري ولذتي في الحديث والنظر\n,"(tf.Tensor(-0.032108683, shape=(), dtype=float...","[((tf.Tensor(-0.017058348, shape=(), dtype=flo..."
148,هارون يا خير الخلائف كلهم ممن مضى فيهم وهذا ال...,"(tf.Tensor(-0.092846386, shape=(), dtype=float...","[((tf.Tensor(-0.013523326, shape=(), dtype=flo..."


### Finding similarity between the textual embedding.

In [None]:
df['similarity'] = [i-i for i in range(df.shape[0])]
df = df[df['predicted_embedding'].notna()]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['similarity'] = [i-i for i in range(df.shape[0])]


In [None]:
for i in range(1, 149):
    predicted_embedding = df['predicted_embedding'].iloc[i][0]
    ground_truth_embedding = df['predicted_embedding'].iloc[i + 1][0]

    df['similarity'].iloc[i] = calculate_similarity(predicted_embedding, ground_truth_embedding)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['similarity'].iloc[i] = calculate_similarity(predicted_embedding, ground_truth_embedding)


In [None]:
df

Unnamed: 0,text,embeddings,predicted_embedding,similarity
0,إن أنت لم تلق لي المودة في صدر حبيبي وأنت مقتدر\n,"(tf.Tensor(-0.06311622, shape=(), dtype=float3...",0,0.000000
1,لا قلت شعرا ولا سمعت غنا ولا جرى في مفاصلي الس...,"(tf.Tensor(0.002743538, shape=(), dtype=float3...","[((tf.Tensor(-0.06398336, shape=(), dtype=floa...",0.818250
2,ولا أزال القرآن أدرسه أروح في درسه وأبتكر\n,"(tf.Tensor(-0.02488508, shape=(), dtype=float3...","[((tf.Tensor(-0.01238151, shape=(), dtype=floa...",0.918899
3,وألزم الصوم والصلاة ولا أزال دهري بالخير آتمر\n,"(tf.Tensor(-0.070586234, shape=(), dtype=float...","[((tf.Tensor(-0.047599357, shape=(), dtype=flo...",0.872685
4,فما مضت بعد ذاك ثالثة حتى أتاني الحبيب يعتذر\n,"(tf.Tensor(-0.09535104, shape=(), dtype=float3...","[((tf.Tensor(-0.05444303, shape=(), dtype=floa...",0.822680
...,...,...,...,...
145,فقد جنيت الهموم منه وقد خليت قلبي يعوم في الفكر\n,"(tf.Tensor(-0.035766374, shape=(), dtype=float...","[((tf.Tensor(-0.032759454, shape=(), dtype=flo...",0.842943
146,لا أسعد القلب في هواه ولا يطمع في عزتي ولا خوري\n,"(tf.Tensor(0.0029054228, shape=(), dtype=float...","[((tf.Tensor(-0.046809435, shape=(), dtype=flo...",0.795328
147,عف ضميري وطيب خبري ولذتي في الحديث والنظر\n,"(tf.Tensor(-0.032108683, shape=(), dtype=float...","[((tf.Tensor(-0.017058348, shape=(), dtype=flo...",0.869164
148,هارون يا خير الخلائف كلهم ممن مضى فيهم وهذا ال...,"(tf.Tensor(-0.092846386, shape=(), dtype=float...","[((tf.Tensor(-0.013523326, shape=(), dtype=flo...",0.781546


In [None]:
df['similarity'].describe()

count    150.000000
mean       0.714752
std        0.168792
min        0.000000
25%        0.654878
50%        0.768025
75%        0.822542
max        0.958033
Name: similarity, dtype: float64

**Comparing this result with results I had from the english transformer, Aragpt-2 performed better than it.**