In [None]:
# Show the GPU(s) visible to this session before any heavy work starts.
!nvidia-smi

In [None]:
import datetime

# Wall-clock start of the run; the last cell subtracts this to report the duration.
now1 = datetime.datetime.now()
print(f'Started at  {now1}')

In [None]:
!pip install -U -q transformers
!pip install -U -q simpletransformers

In [None]:
import csv
import gc
import re

import pandas as pd

In [None]:
# Location of the poetry CSV, relative to the notebook's working directory.
POETRY_CSV = '../input/arabic-poetry/Arabic_poetry_dataset.csv'

dfap = pd.read_csv(POETRY_CSV)
dfap.head()

In [None]:
# Values of the 'category' column whose poems are kept.
cats = [
    'العصر العباسي',
    'العصر المملوكي',
    'العصر الايوبي',
    'العصر العثماني',
    'العصر الاموي',
    'العصر الأندلسي',
    'عصر المخضرمون',
    'العصر الجاهلي',
    'العصر الإسلامي',
]

# Plain reset_index() renumbers the rows from 0 and keeps the previous row labels
# in a new 'index' column.
in_selected_category = dfap['category'].isin(cats)
dfap_s = dfap.loc[in_selected_category].reset_index()
len(dfap_s)

In [None]:
# Shuffle every row with a fixed seed, then renumber from 0 so that the positional
# slices taken later follow the shuffled order.
dfap_s = (
    dfap_s
    .sample(frac=1.0, random_state=42)
    .reset_index(drop=True)
)
dfap_s.head()

In [None]:
# Look at one raw poem (row position 4) to see the text layout before cleaning.
poem = dfap_s['poem_text'].iloc[4]
poem

In [None]:
accents = re.compile(r'[\u064b-\u0652\u0640]')  # harakaat and tatweel


def poemify(txt):
    """Turn one poem into "<line> z <next line>" training rows.

    Harakaat and tatweel are removed, the text is split on newlines, and the
    lines are paired two by two (1st with 2nd, 3rd with 4th, ...). Each pair
    becomes one output row of the form "<a> z <b> " followed by a newline.
    A final line that has no partner is dropped.

    NOTE(review): the " z " marker is what the later read_csv(sep='z') call
    splits on, so a Latin "z" inside the poem text itself would produce extra
    fields there.

    Args:
        txt: Poem text. Anything that is not a string (for example the float
            NaN pandas uses for a missing cell) is treated as an empty poem
            instead of raising TypeError.

    Returns:
        The concatenated rows, or "" when there is no complete pair.
    """
    if not isinstance(txt, str):
        return ""
    p = accents.sub('', txt).split('\n')
    # join() assembles the result in one pass instead of re-copying a growing string.
    return "".join(f"{p[i]} z {p[i+1]} \n" for i in range(0, len(p) - 1, 2))

In [None]:
# Number of shuffled poems that go to train.txt; the remainder goes to test.txt.
N_TRAIN_POEMS = 35000

# The encoding is given explicitly because the text is Arabic and open() otherwise
# falls back to the platform default, which is not UTF-8 everywhere.
with open("train.txt", "w", encoding="utf-8") as f:
    for poem in dfap_s['poem_text'][:N_TRAIN_POEMS]:
        f.write(poemify(poem))

with open("test.txt", "w", encoding="utf-8") as f:
    for poem in dfap_s['poem_text'][N_TRAIN_POEMS:]:
        f.write(poemify(poem))

In [None]:
# Load the pair files back as two-column frames: column 0 is the text before the
# 'z' marker, column 1 the text after it.
# quoting=csv.QUOTE_NONE makes the parser treat '"' as ordinary text. With the
# default quoting, a row that starts with '"' opens a quoted field that can
# swallow the separator and the rows that follow.
train_df = pd.read_csv('train.txt', sep='z', header=None, quoting=csv.QUOTE_NONE)
test_df = pd.read_csv('test.txt', sep='z', header=None, quoting=csv.QUOTE_NONE)
len(train_df), len(test_df)

In [None]:
# Spot-check ten parsed rows (positions 90-99) of the training pairs.
train_df.iloc[90:100]

In [None]:
def _with_task_prefix(pairs):
    """Return a frame with 'prefix', 'input_text' and 'target_text' columns.

    Column 0 of `pairs` becomes 'input_text', column 1 becomes 'target_text',
    and every row receives the same 'prefix' string.
    """
    return pd.DataFrame({
        'prefix': ["poetry line completion"] * len(pairs),
        'input_text': pairs[0],
        'target_text': pairs[1],
    })


train_df = _with_task_prefix(train_df)
test_df = _with_task_prefix(test_df)

In [None]:
# Preview the first rows of the prefix / input_text / target_text frame.
train_df.head()

In [None]:
# Row counts of the two splits before subsampling.
len(train_df), len(test_df)

In [None]:
# Keep a seeded random subset of each split: half of the training pairs and a
# quarter of the test pairs. Re-running this cell samples again from the
# already reduced frames.
TRAIN_FRACTION, TEST_FRACTION = 0.50, 0.25

train_df = train_df.sample(frac=TRAIN_FRACTION, random_state=42)
test_df = test_df.sample(frac=TEST_FRACTION, random_state=42)
len(train_df), len(test_df)

In [None]:
from simpletransformers.t5 import T5Model

# Settings handed to T5Model through its `args` parameter.
model_args = dict(
    # Sequence length and batch sizes. Author's notes: max_seq_length was 196,
    # 12 was used for poetry, ~24 may be worth trying (see the length stats in
    # the frame); train batch size was 16; eval batch size was 64.
    max_seq_length=15,
    train_batch_size=12,
    eval_batch_size=12,
    num_train_epochs=2,

    # Evaluation while training.
    evaluate_during_training=True,
    evaluate_during_training_steps=15000,
    evaluate_during_training_verbose=True,

    # Runtime switches.
    use_multiprocessing=False,
    fp16=False,

    # Checkpoint saving.
    save_steps=-1,
    save_eval_checkpoints=False,
    save_model_every_epoch=False,

    # Cache and output-directory handling.
    reprocess_input_data=True,
    overwrite_output_dir=True,
)

In [None]:
# Create the T5Model with model type "mt5" and the "google/mt5-large" checkpoint
# name, configured by model_args. Alternatives noted by the author: type "t5",
# checkpoint "mt5-base".
model = T5Model("mt5","google/mt5-large", args=model_args) # t5, mt5-base

In [None]:
# Fine-tune on train_df; test_df is supplied as the evaluation data.
model.train_model(train_df, eval_data=test_df)

In [None]:
# Save the fine-tuned model: model.model is passed explicitly to save_model, with
# 'mt5p' as the target (presumably the output directory -- confirm against the
# simpletransformers API).
model.save_model('mt5p',model=model.model)

In [None]:
# Opening lines to complete. Each one is sent to the model as
# "poetry line completion: <line>".
completion_prefix = "poetry line completion"
opening_lines = [
    "صنت نفسي عما يدنس نفسي",
    "اختلاف النهار والليل ينسي",
    "لا تشتر العبد إلا والعصا معه",
    "إذا بلغ الفطام لنا رضيع",
    "وطني لو شغلت بالخلد عنه",
    "واحر قلباه ممن قلبه شبم",
    "وللحرية الحمراء باب",
    "ستبدي لك الأيام ما كنت جاهلا",
    "على قلب قسطنطين منه تعجب",
    "هل غادر الشعراء من متردم",
    "سأحمل روحي على راحتي",
]

test_pred = model.predict(
    [f"{completion_prefix}: {line}" for line in opening_lines]
)
test_pred

In [None]:
# Report the finish time and the wall-clock duration measured from now1.
now2 = datetime.datetime.now()
elapsed = now2 - now1
print(f'Ended at {now2} and took {elapsed}')