<a href="https://colab.research.google.com/github/Rama-Has/Poem_Generator_Rama_Hasiba_12010022/blob/main/Models_Evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
!pip install transformers==4.28.0 


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers==4.28.0
  Downloading transformers-4.28.0-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m42.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0 (from transformers==4.28.0)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.28.0)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m61.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transform

In [5]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import tensorflow_hub as hub
import tensorflow as tf
import pandas as pd 
from transformers import GPT2LMHeadModel, GPT2Tokenizer, pipeline

### Load models

#### Load aragpt model

In [None]:
!unzip -q "/content/aragpt_final_one (1).zip" -d "/content/"

aragpt_folder_path = '/content/aragpt_model'
!mkdir -p "$folder_path"


In [None]:
# Tokenizer comes from the public AraGPT2 base checkpoint; the model weights
# come from the local fine-tuned folder.
aragpt_tokenizer = GPT2Tokenizer.from_pretrained(
    'aubmindlab/aragpt2-base'
    )
aragpt_model = GPT2LMHeadModel.from_pretrained(
    aragpt_folder_path,
    # Pad with the AraGPT2 tokenizer's own EOS id. The original referenced
    # `gpt_tokenizer`, which is not created until the GPT-2 section, so this
    # cell raised NameError on a fresh kernel.
    pad_token_id = aragpt_tokenizer.eos_token_id
    )

In [None]:
# Text-generation pipeline for the Arabic model: weights are read from the
# local fine-tuned folder, the tokenizer from the public base checkpoint.
aragpt_generator_ = pipeline(
    task='text-generation',
    tokenizer='aubmindlab/aragpt2-base',
    model=aragpt_folder_path,
)

### Load the gpt model

In [None]:
!unzip -q "/content/gpt_for_poems (2).zip" -d "/content/"

folder_path = '/content/gpt_model'
!mkdir -p "$folder_path"

In [None]:
# Stock GPT-2 tokenizer; its EOS id doubles as the padding id for the
# fine-tuned model loaded from the local folder.
gpt_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
gpt_load_options = {'pad_token_id': gpt_tokenizer.eos_token_id}
gpt_model = GPT2LMHeadModel.from_pretrained(folder_path, **gpt_load_options)

In [None]:
# Text-generation pipeline for the English model, built from the local
# fine-tuned folder and the tokenizer object loaded above.
english_generator = pipeline(
    task='text-generation',
    tokenizer=gpt_tokenizer,
    model=folder_path,
)

### Load the "Universal Sentence Encoder (USE)" model for embedding.

In [None]:
#Load the pre-trained model#
embed = hub.load(
    "https://tfhub.dev/google/universal-sentence-encoder-large/5"
    )

# Define some useful functions

In [None]:
def calculate_similarity(embed1, embed2):
    """Return the cosine similarity between two embeddings.

    Each argument is converted with ``np.array`` and flattened into a single
    row, so any array-like holding one embedding is accepted.

    Returns:
        The result of sklearn's ``cosine_similarity`` on the two single-row
        matrices, i.e. a 2-D array with one row and one column.
    """
    # cosine_similarity expects 2-D input, hence the (1, -1) reshape.
    first_row, second_row = (
        np.array(embedding).reshape(1, -1) for embedding in (embed1, embed2)
    )
    return cosine_similarity(first_row, second_row)

In [None]:
def generate_embedding_for_prediction(seed_line, generator_):
    """Generate text from ``seed_line`` and embed the generated text.

    Args:
        seed_line: Prompt passed to the generator.
        generator_: Callable invoked as ``generator_(seed_line, max_length=60)``
            whose first result exposes a ``'generated_text'`` entry.

    Returns:
        A one-element list wrapping the value returned by the module-level
        ``embed`` for the single generated string.
    """
    generations = generator_(seed_line, max_length=60)
    generated_text = generations[0]['generated_text']
    return [embed([generated_text])]

In [None]:
def generate_embedding_for_arabbic_prediction(seed_line: str):
    """Embed the text the Arabic pipeline generates from ``seed_line``."""
    arabic_result = generate_embedding_for_prediction(
        seed_line,
        generator_=aragpt_generator_,
    )
    return arabic_result

def generate_embedding_for_english_prediction(seed_line: str):
    """Embed the text the English pipeline generates from ``seed_line``."""
    english_result = generate_embedding_for_prediction(
        seed_line,
        generator_=english_generator,
    )
    return english_result

# Evaluate the English one.

### Embed the verses in the test dataset: every verse except the first serves as the ground-truth continuation of the verse before it.

### Read the test data

In [None]:
# Read the text file
with open('test_dataset.txt', 'r') as file:
    lines = file.readlines()

# Create a DataFrame with each row as a sample
df = pd.DataFrame({'text': lines})


df['text'] = df['text'].astype(str) 

# Remove empty rows  
df = df[df['text'] != '']

df = df.head(150)

### Get embeddings for the ground truth

In [None]:
# Embed all ground-truth verses in a single call to the encoder.
ground_truth_texts = df['text'].tolist()
english_embeddings = embed(ground_truth_texts)

In [None]:
# Store one embedding per row: iterating the encoder output yields one item
# per input verse.
df['embeddings'] = list(english_embeddings)

In [None]:
# Display the frame now that the 'embeddings' column has been added.
df

### Get embeddings for the predicted text

In [None]:
# For every verse, generate a continuation with the English model and keep
# the embedding of the generated text.
df['predicted_embedding'] = [
    generate_embedding_for_english_prediction(verse) for verse in df['text']
]

In [None]:
# Display the frame now that the 'predicted_embedding' column has been added.
df

### Find the cosine similarity between embeddings.


In [None]:
# Start the similarity column as float NaN, not integer zeros: rows that are
# never scored then stay out of the summary statistics instead of counting as
# a similarity of 0, and the column already has a float dtype.
df['similarity'] = np.nan
# Keep only rows that have a prediction. copy() makes the result an
# independent frame so later writes do not target a slice.
df = df[df['predicted_embedding'].notna()].copy()
df.shape[0]


In [None]:
# Score the text generated from verse i against the verse that actually
# follows it (row i + 1 of the ground-truth 'embeddings' column). The original
# loop compared each prediction with the *next prediction*, skipped row 0, and
# wrote through chained indexing (df['similarity'].iloc[i] = ...), which is
# not guaranteed to update the frame.
n_rows = df.shape[0]
similarities = [np.nan] * n_rows  # the last verse has no following verse
for i in range(n_rows - 1):
    # Each cell holds a one-element list wrapping the encoder output.
    predicted_embedding = df['predicted_embedding'].iloc[i][0]
    ground_truth_embedding = df['embeddings'].iloc[i + 1]

    score = calculate_similarity(predicted_embedding, ground_truth_embedding)
    # calculate_similarity returns a 1x1 array; keep the scalar.
    similarities[i] = float(score[0][0])

# Replace the column in one step on a new frame.
df = df.assign(similarity=similarities)


#### Get summary statistics for the similarities between the ground-truth and predicted embeddings.

In [None]:
# Summary statistics of the English similarity scores.
df['similarity'].describe()

In [None]:
# Print one sample continuation from the English model. The original called
# `generator_`, which exists only as a function parameter name and is
# undefined at notebook level (NameError); the English pipeline object is
# `english_generator`.
english_sample = english_generator(
    'dark storms his genial powers controul\n',
    max_length = 50
    )
print(english_sample[0]['generated_text'])

## Model Enhancement:
  1. Fine-tune the model on more data.
  2. Increase the number of epochs during the tuning process.

# Evaluate the Arabic one.

In [None]:

# Print one sample continuation from the Arabic model for a fixed seed line.
arabic_seed = 'لا  قلت شعرا ولا سمعت غنا ولا'
arabic_sample = aragpt_generator_(arabic_seed, max_length = 60)
print(arabic_sample[0]['generated_text'])

### Read the test data

In [None]:
# Read the text file
with open('test_dataset.txt', 'r') as file:
    lines = file.readlines()

# Create a DataFrame with each row as a sample
df = pd.DataFrame({'text': lines})


df['text'] = df['text'].astype(str) 

# Remove empty rows  
df = df[df['text'] != '']


In [None]:
# generate_embedding_for_arabbic_prediction

In [None]:
# Embed every ground-truth verse. The original read an undefined name,
# `embeddings`, and raised NameError on a fresh kernel; the embeddings are
# computed here from the frame's own text column.
arabic_embeddings = embed(df['text'].to_list())
df['embeddings'] = list(arabic_embeddings)
# Placeholder column (all zeros) that the next cell fills with predictions.
df['predicted_embedding'] = [0] * df.shape[0]


In [None]:
# Generate a continuation for each of the first 150 verses and store its
# embedding on the FOLLOWING row, so row i + 1 holds the prediction made from
# verse i. Results are collected in a plain list and attached in one step:
# the original wrote list values through chained indexing
# (df['predicted_embedding'].iloc[i + 1] = ...) into an integer column.
predictions = df['predicted_embedding'].tolist()
# Never step past the last row when the file has 150 verses or fewer.
n_seeds = min(150, df.shape[0] - 1)
for i in range(n_seeds):
    predictions[i + 1] = generate_embedding_for_arabbic_prediction(df['text'].iloc[i])
df = df.assign(predicted_embedding=predictions)

In [None]:
# Keep the first 150 rows as an independent copy, then display the frame.
df = df.iloc[:150].copy()
df

### Finding the similarity between the text embeddings.

In [None]:
# Start the similarity column as float NaN, not integer zeros: rows that are
# never scored then stay out of the summary statistics instead of counting as
# a similarity of 0.
df['similarity'] = np.nan
# Keep only rows whose prediction is not missing. copy() makes the result an
# independent frame so later writes do not target a slice.
df = df[df['predicted_embedding'].notna()].copy()

In [None]:
# In this section the prediction made from verse i is stored on row i + 1, so
# each row's prediction is scored against that same row's ground-truth
# embedding. The original loop compared each prediction with the *next
# prediction*, read iloc[i + 1] one past the end of the 150-row frame on its
# last iteration, and wrote through chained indexing.
similarities = []
for i in range(df.shape[0]):
    prediction = df['predicted_embedding'].iloc[i]
    # Rows without a real prediction (e.g. the first verse, which still holds
    # its placeholder) are left unscored.
    if not isinstance(prediction, list):
        similarities.append(np.nan)
        continue

    predicted_embedding = prediction[0]
    ground_truth_embedding = df['embeddings'].iloc[i]

    score = calculate_similarity(predicted_embedding, ground_truth_embedding)
    # calculate_similarity returns a 1x1 array; keep the scalar.
    similarities.append(float(score[0][0]))

# Replace the column in one step on a new frame.
df = df.assign(similarity=similarities)

In [None]:
# Display the frame with the Arabic similarity scores filled in.
df

In [None]:
# Summary statistics of the Arabic similarity scores.
df['similarity'].describe()

**Comparing this result with the results from the English transformer, AraGPT2 performed better.**