## Importing Libraries

In [None]:
!pip install transformers==4.28.0
# !pip install git+https://github.com/huggingface/transformers
# !pip install --upgrade transformers
# !pip install transformers

In [None]:
!pip install tensorflow

In [None]:
from google.colab import files
import io
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from google.colab import drive
from sklearn.model_selection import train_test_split
from transformers import Trainer, TrainingArguments, AutoModelWithLMHead, TextDataset, DataCollatorForLanguageModeling, pipeline, GPT2LMHeadModel, GPT2Tokenizer#take the input text then encode it from text to numbers
import shutil

# Reading Data
-------------------

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive so the dataset
# stored on Drive can be read like a local file.
drive.mount('/content/drive')

In [None]:
# Load the merged dataset from the mounted Drive.
# NOTE(review): the path is hard-coded to this Drive layout; adjust it when
# running from another account or environment.
csv_data = pd.read_csv('/content/drive/MyDrive/merged_data.csv')

In [None]:
# Display the raw frame to inspect its columns and size.
csv_data

The dataset consists of 4 columns; we are interested only in the "Verse" column. It contains 199002 samples (verses). I decided to train the model on 16000 samples because of the limited resources available to me.

In [None]:
# Distinct values of 'Meter' within the first 16000 rows — the same number of
# rows that is later taken as the training corpus.
csv_data['Meter'].head(16000).unique()

In [None]:
# Keep only the 'Verse' column (as a one-column DataFrame).
# NOTE: this rebinds csv_data, so 'Meter' is gone afterwards and the cell
# above can no longer be re-run without reloading the CSV.
csv_data = csv_data[['Verse']]

In [None]:
# Display the frame again to confirm only 'Verse' remains.
csv_data

## Preparing the Corpus

In [None]:
# Take the first 16000 verses as a plain Python list of strings.
corpus = csv_data['Verse'].iloc[:16000].tolist()

In [None]:
# Peek at the first four verses.
corpus[:4]

Get the length of each verse, where length is the number of words in that verse.


In [None]:
# Number of words in each verse. str.split() with no argument splits on any
# run of whitespace, so doubled, leading or trailing spaces do not produce
# empty "words" that would inflate the count (as split(' ') does).
verse_length = [len(line.split()) for line in corpus]
verse_length[0: 5]

In [None]:
# Box plot of the per-verse word counts.
fig, ax = plt.subplots()
ax.boxplot(verse_length)
ax.set_title('Verse length distribution')
plt.show()

In [None]:
# Summarise the word-count distribution: maximum, minimum and truncated mean.
longest = max(verse_length)
shortest = min(verse_length)
average = int(np.mean(verse_length))

report = ''.join((
    f'Longest verse has {longest} word.',
    f'\nShorter one has {shortest}.',
    f'\n{average} is the mean number of words per verse.\n',
))
print(report)

In [None]:
# Build a word cloud from all verses joined into one string.
all_verses_text = " ".join(corpus)
wordcloud = WordCloud(
    max_font_size=50,
    max_words=80,
    background_color="black",
)
wordcloud.generate(all_verses_text)

# Render the cloud without axes, save it as a PNG, then show it.
cloud_fig = plt.figure(figsize=(8, 4))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
cloud_fig.savefig("WordCloud.png")
plt.show()

## Text Preprocessing

Make the samples that I'll use in training consistent by converting all letters to lower case.

> 1. Convert to lower case

In [None]:
# Lower-case every verse so the training text is case-consistent.
corpus = list(map(str.lower, corpus))

-----------------------------
# GPT-2 Fine-Tuning

In [None]:
# Wrap the cleaned verses in a single-column DataFrame.
df = pd.DataFrame({'verse': corpus})
df

#### Write the verses to a text file so they can be used during training by the generator.

In [None]:
# Write one verse per line to full_text.txt.
# mode='w' overwrites the file: with append mode every re-run of this cell
# added another full copy of the corpus to the file.
df.to_csv(
    r'full_text.txt',
    header=None,
    index=None,
    sep=' ',
    mode='w'
    )

#### Load the GPT-2 model with its tokenizer

In [None]:
# Load the pretrained 'gpt2' tokenizer and language-model head.
# The tokenizer's end-of-sequence id is passed as the model's padding id.
gpt_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
gpt_model = GPT2LMHeadModel.from_pretrained('gpt2', pad_token_id = gpt_tokenizer.eos_token_id)

In [None]:
# Show the text form of the end-of-sequence token that is used for padding.
gpt_tokenizer.decode(gpt_tokenizer.eos_token_id)

In [None]:
# Peek at the first five (now lower-cased) verses.
corpus[:5]

### Splitting the data into train and test sets without shuffling.

In [None]:
# Absolute locations of the train/test text files used by the cells below.
# NOTE(review): '/content' is hard-coded — presumably the Colab working
# directory; confirm before running elsewhere.
train_path = '/content/train_dataset.txt'
test_path = '/content/test_dataset.txt'

In [None]:
# Hold out 15% of the verses for testing; shuffle=False keeps the original
# row order, so the split is deterministic.
train, test = train_test_split(df, shuffle=False, test_size=0.15)

Write the train and test data to two separate text files.

In [None]:
# Write each split to its own text file, one verse per line.
# - The files go to train_path / test_path, the same locations the later cells
#   read from, instead of relative names that only match when the working
#   directory happens to be /content.
# - mode='w' overwrites the file; append mode duplicated the data on every
#   re-run of this cell.
# With sep=' ' pandas wraps verses that contain a space in double quotes.
for split_frame, split_path in ((train, train_path), (test, test_path)):
    split_frame.to_csv(
        split_path,
        header=None,
        index=None,
        sep=' ',
        mode='w'
        )

#### Remove the double quotes that were added to the verses when they were written to the text files.

In [None]:
def remove_double_qoutes(file_path: str, encoding: str = 'utf-8') -> None:
    """Strip every double-quote character from a text file, in place.

    Args:
        file_path: Path of the text file to rewrite.
        encoding: Encoding used for both reading and writing. Defaults to
            UTF-8 (the encoding pandas uses when writing) so the result does
            not depend on the platform's default encoding.

    Returns:
        None. The file at ``file_path`` is overwritten with the cleaned text.
    """
    # Read the whole file into memory.
    with open(file_path, 'r', encoding=encoding) as file:
        text = file.read()

    # Drop all double quotes.
    text_without_quotes = text.replace('"', '')

    # Write the cleaned text back over the original file.
    with open(file_path, 'w', encoding=encoding) as file:
        file.write(text_without_quotes)

In [None]:
# Strip the double quotes from the train and test text files.
for dataset_path in (train_path, test_path):
    remove_double_qoutes(dataset_path)

### Define the `load_dataset` function that loads and prepares the datasets for the model.

In [None]:
def load_dataset(train_path, test_path, tokenizer, block_size=128):
    """Build the train/test datasets and the collator for causal-LM training.

    Args:
        train_path: Path of the training text file.
        test_path: Path of the test text file.
        tokenizer: Tokenizer handed to both datasets and to the collator.
        block_size: Maximum sequence length passed to each ``TextDataset``.
            Defaults to 128.

    Returns:
        A tuple ``(train_dataset, test_dataset, data_collator)``.
    """
    # Use the tokenizer argument, not a notebook-level global, so the function
    # works with whichever tokenizer the caller passes in.
    train_dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=train_path,
        block_size=block_size,
    )

    test_dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=test_path,
        block_size=block_size,
    )

    # mlm=False: the collator is configured without masked-language-modeling.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
    )

    return train_dataset, test_dataset, data_collator

In [None]:
# Build the datasets and the collator from the train/test files and the GPT-2 tokenizer.
train_dataset, test_dataset, data_collator = load_dataset(train_path, test_path, gpt_tokenizer)

In [None]:
# !pip install --upgrade accelerate

In [None]:
training_args = TrainingArguments(
    # Directory where checkpoints are written
    output_dir = "./gpt2-Poems",
    # Overwrite the content of the output directory
    overwrite_output_dir = True,
    # Number of training epochs
    num_train_epochs = 15,
    # Batch size per device for training
    per_device_train_batch_size=32,
    # Batch size per device for evaluation
    per_device_eval_batch_size=64,
    # Evaluate every `eval_steps` update steps. Without this the evaluation
    # strategy stays at its default ("no"), eval_steps is ignored and
    # eval_dataset is never used during training.
    evaluation_strategy = "steps",
    # Number of update steps between two evaluations
    eval_steps = 400,
    # A checkpoint is saved every `save_steps` update steps
    save_steps=800,
    # Number of warmup steps for the learning-rate scheduler
    warmup_steps=500,
    )

# Initialize the Trainer that fine-tunes the GPT-2 model on the verse datasets
# with the settings above.
trainer = Trainer(
    model=gpt_model,
    args=training_args,
    data_collator=data_collator,  # collates dataset items into batches
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

### Train the model

In [None]:
# Run the fine-tuning loop with the arguments configured above.
trainer.train()

## Save the model

In [None]:
# trainer.save_model('./gpt_for_poems')

# # Compress the folder into a zip file
# shutil.make_archive("/content/gpt_for_poems", "zip", "/content/gpt_for_poems")


In [None]:
# `df` is created in this notebook with a single 'verse' column, so indexing
# 'similarity' unconditionally raises KeyError on a fresh Restart & Run All.
# Summarise the column only when an earlier step has actually added it.
df['similarity'].describe() if 'similarity' in df.columns else "No 'similarity' column in df"