# model 1

## Loading data

In [2]:
import pandas as pd
# Load the training split. The path is an absolute Windows location (F: drive), so it has to be
# adjusted before this cell can run on another machine.
train=pd.read_csv(r"F:\programming\fatsai\Google Patent Phrase Similarity Dataset\train.csv")

In [3]:
# Load the validation split (absolute Windows path; adjust before running on another machine).
validation=pd.read_csv(r"F:\programming\fatsai\Google Patent Phrase Similarity Dataset\validation.csv")

In [4]:
# Load the test split (absolute Windows path; adjust before running on another machine).
test=pd.read_csv(r"F:\programming\fatsai\Google Patent Phrase Similarity Dataset\test.csv")


## Preprocessing data

In [5]:
# Fold the three text fields of every row into one string so the model receives a single
# input sequence: context first, then target, then anchor, each behind its own tag.
train['input'] = (
    'TEXT1: ' + train['context']
    + '; TEXT2: ' + train['target']
    + '; ANC1: ' + train['anchor']
)
validation['input'] = (
    'TEXT1: ' + validation['context']
    + '; TEXT2: ' + validation['target']
    + '; ANC1: ' + validation['anchor']
)
test['input'] = (
    'TEXT1: ' + test['context']
    + '; TEXT2: ' + test['target']
    + '; ANC1: ' + test['anchor']
)

In [6]:
# Preview the first few combined input strings.
train['input'].head()

0     TEXT1: C09; TEXT2: cholesterol; ANC1: aralkynyl
1         TEXT1: C09; TEXT2: aralkyl; ANC1: aralkynyl
2     TEXT1: C09; TEXT2: heterocycle; ANC1: aralkynyl
3            TEXT1: C09; TEXT2: acyl; ANC1: aralkynyl
4    TEXT1: C09; TEXT2: heterocyclic; ANC1: aralkynyl
Name: input, dtype: object

In [7]:
from datasets import Dataset #using datasel library (hugging face) to run model

# Convert each pandas DataFrame into a Hugging Face Dataset:
# ds1 = training split, ds2 = validation split, ds3 = test split.
ds1 = Dataset.from_pandas(train)
ds2 = Dataset.from_pandas(validation)
ds3 = Dataset.from_pandas(test)


In [8]:
# Checkpoint to fine-tune. It is chosen before tokenizing because every model has its own tokenizer.
model_nm = 'microsoft/deberta-v3-small'

In [9]:
from transformers import AutoModelForSequenceClassification,AutoTokenizer
tokz = AutoTokenizer.from_pretrained(model_nm)  # tokenizer that matches the chosen checkpoint



In [10]:
# Example: uncommon words (here a name) are split into several sub-word pieces.
tokz.tokenize("hello guys, im sajjad ashrafin!")

['▁hello', '▁guys', ',', '▁im', '▁saj', 'jad', '▁as', 'hra', 'fin', '!']

In [11]:
# Common words stay whole (uncommon ones are split into pieces); ▁ marks the start of a new word.
tokz.tokenize("the weather is so hot !")

['▁the', '▁weather', '▁is', '▁so', '▁hot', '▁!']

In [12]:
def tok_func(x):
    """Tokenize the ``input`` text of ``x`` (one row or a batch) with the global tokenizer ``tokz``."""
    texts = x["input"]
    return tokz(texts)

# batched=True hands tok_func many rows per call; the tokenizer's outputs (such as
# input_ids) are added to each dataset as new columns.
tok_ds1, tok_ds2, tok_ds3 = [
    split.map(tok_func, batched=True) for split in (ds1, ds2, ds3)
]




Map:   0%|          | 0/36473 [00:00<?, ? examples/s]

Map:   0%|          | 0/2843 [00:00<?, ? examples/s]

Map:   0%|          | 0/9232 [00:00<?, ? examples/s]

In [13]:
row = tok_ds1[0]
# Show the raw text next to the token ids the tokenizer produced for it.
print(row['input'], row['input_ids'])

# The Trainer reads the target from a column named 'labels', so rename 'score'.
# The membership checks keep this cell re-runnable: rename_columns raises once
# 'score' has already been renamed by a previous execution of the cell.
if 'score' in tok_ds1.column_names:
    tok_ds1 = tok_ds1.rename_columns({'score':'labels'})
if 'score' in tok_ds2.column_names:
    tok_ds2 = tok_ds2.rename_columns({'score':'labels'})
if 'score' in tok_ds3.column_names:
    tok_ds3 = tok_ds3.rename_columns({'score':'labels'})

TEXT1: C09; TEXT2: cholesterol; ANC1: aralkynyl [1, 54453, 435, 294, 716, 4505, 346, 54453, 445, 294, 9888, 346, 23702, 435, 294, 266, 17226, 9593, 63791, 2]


## Run and validation

In [14]:
import numpy as np # we use correlation as our metric 
def corr(x,y):
    """Return the Pearson correlation coefficient between ``x`` and ``y``."""
    # np.corrcoef returns the 2x2 correlation matrix; entry [0][1] is x versus y.
    return np.corrcoef(x,y)[0][1]

def corr_d(eval_pred):
    """Metric callback for ``Trainer(compute_metrics=...)``.

    Parameters
    ----------
    eval_pred : pair ``(predictions, labels)``
        Model outputs and the matching targets.

    Returns
    -------
    dict
        ``{'pearson': r}`` with the Pearson correlation of predictions and labels.
    """
    predictions, labels = eval_pred
    predictions = np.asarray(predictions)

    # Some single-output heads return predictions as an (N, 1) column. np.corrcoef cannot
    # pair that with (N,) labels and raises, so drop the trailing axis first.
    if predictions.ndim > 1 and predictions.shape[1] == 1:
        predictions = predictions.squeeze(axis=1)

    return {'pearson': corr(predictions, labels)}


In [15]:
from transformers import TrainingArguments,Trainer #we need trainer to build model
bs = 128  # training batch size per device
epochs = 4  # number of training epochs
lr = 8e-5  # peak learning rate (reached after the warmup phase)
args = TrainingArguments('outputs',  # directory where checkpoints/weights are written
                        learning_rate=lr,  # peak learning rate
                        warmup_ratio=0.1,  # fraction of training steps spent ramping the learning rate up
                        lr_scheduler_type='cosine',  # after warmup, decay the learning rate along a cosine curve
                        fp16=True,  # 16-bit mixed-precision training
                        evaluation_strategy="epoch",  # evaluate on the eval set after every epoch
                        per_device_train_batch_size=bs,  # training batch size
                        per_device_eval_batch_size=bs*2,  # evaluation uses twice the training batch size
                        num_train_epochs=epochs,  # total number of training epochs
                        weight_decay=0.01,  # weight decay (regularization) applied by the optimizer
                        report_to='none')  # do not report to any logging integration

In [16]:
# num_labels=1 gives the model a single output value, used here as the predicted similarity score.
model = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels=1)
trainer = Trainer(model,  # model to fine-tune
                args,  # TrainingArguments built in the previous cell
                train_dataset=tok_ds1,  # tokenized training split
                eval_dataset=tok_ds2,  # tokenized validation split (not the test split)
                tokenizer=tokz,  # tokenizer matching the model
                compute_metrics=corr_d)  # reports the Pearson correlation at every evaluation

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:

trainer.train();  # run fine-tuning; the trailing ';' keeps the returned value out of the cell output

  0%|          | 0/1140 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

{'eval_loss': 0.02567780390381813, 'eval_pearson': 0.7861627477043353, 'eval_runtime': 26.5973, 'eval_samples_per_second': 106.89, 'eval_steps_per_second': 0.451, 'epoch': 1.0}
{'loss': 0.034, 'grad_norm': 0.6738039255142212, 'learning_rate': 5.516578043170591e-05, 'epoch': 1.75}


  0%|          | 0/12 [00:00<?, ?it/s]

{'eval_loss': 0.026195835322141647, 'eval_pearson': 0.7923502239694946, 'eval_runtime': 26.4853, 'eval_samples_per_second': 107.343, 'eval_steps_per_second': 0.453, 'epoch': 2.0}


  0%|          | 0/12 [00:00<?, ?it/s]

{'eval_loss': 0.02681119740009308, 'eval_pearson': 0.7957460309786399, 'eval_runtime': 26.4809, 'eval_samples_per_second': 107.36, 'eval_steps_per_second': 0.453, 'epoch': 3.0}
{'loss': 0.0143, 'grad_norm': 0.1502688080072403, 'learning_rate': 3.6193470242834415e-06, 'epoch': 3.51}


  0%|          | 0/12 [00:00<?, ?it/s]

{'eval_loss': 0.02654223144054413, 'eval_pearson': 0.7952388016668152, 'eval_runtime': 26.4923, 'eval_samples_per_second': 107.314, 'eval_steps_per_second': 0.453, 'epoch': 4.0}
{'train_runtime': 4875.4978, 'train_samples_per_second': 29.924, 'train_steps_per_second': 0.234, 'train_loss': 0.022605083072394654, 'epoch': 4.0}


In [18]:
# Persist the fine-tuned model to the local directory "my_deberta_model".
trainer.save_model("my_deberta_model")

# model 2

## Data preprocessing

In [19]:
# Second checkpoint to fine-tune; set before tokenizing because every model has its own tokenizer.
model_nm = 'microsoft/deberta-v3-base'

In [20]:
tokz = AutoTokenizer.from_pretrained(model_nm)  # tokenizer that matches the chosen checkpoint

In [21]:
# Sanity-check the new tokenizer on a sample sentence.
tokz.tokenize("what a cool day is today!")

['▁what', '▁a', '▁cool', '▁day', '▁is', '▁today', '!']

In [22]:
def tok_func(x):
    """Tokenize the ``input`` text of ``x`` (one row or a batch) with the global tokenizer ``tokz``."""
    texts = x["input"]
    return tokz(texts)

# Tokenize every split in batches, then expose the target column under the name 'labels'.
tok_ds1, tok_ds2, tok_ds3 = [
    split.map(tok_func, batched=True).rename_columns({'score': 'labels'})
    for split in (ds1, ds2, ds3)
]

Map:   0%|          | 0/36473 [00:00<?, ? examples/s]

Map:   0%|          | 0/2843 [00:00<?, ? examples/s]

Map:   0%|          | 0/9232 [00:00<?, ? examples/s]

In [23]:
bs = 8  # batch size per device
epochs = 2  # number of training epochs
lr = 8e-5  # peak learning rate (reached after the warmup phase)
args = TrainingArguments('outputs',  # directory where checkpoints/weights are written
                        learning_rate=lr,  # peak learning rate
                        warmup_ratio=0.1,  # fraction of training steps spent ramping the learning rate up
                        lr_scheduler_type='cosine',  # after warmup, decay the learning rate along a cosine curve
                        fp16=True,  # 16-bit mixed-precision training
                        evaluation_strategy="epoch",  # evaluate on the eval set after every epoch
                        per_device_train_batch_size=bs,  # training batch size
                        per_device_eval_batch_size=bs,  # evaluation batch size
                        gradient_accumulation_steps=2,  # accumulate gradients over 2 batches before each optimizer step
                        num_train_epochs=epochs,  # total number of training epochs
                        weight_decay=0.01,  # weight decay (regularization) applied by the optimizer
                        report_to='none')  # do not report to any logging integration

## Run and validation

In [24]:
# num_labels=1 gives the model a single output value, used here as the predicted similarity score.
model = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels=1)
trainer = Trainer(model,  # model to fine-tune
                args,  # TrainingArguments built in the previous cell
                train_dataset=tok_ds1,  # tokenized training split
                eval_dataset=tok_ds2,  # tokenized validation split (not the test split)
                tokenizer=tokz,  # tokenizer matching the model
                compute_metrics=corr_d)  # reports the Pearson correlation at every evaluation

In [25]:
trainer.train();  # run fine-tuning; the trailing ';' keeps the returned value out of the cell output

  0%|          | 0/4560 [00:00<?, ?it/s]

{'loss': 0.0161, 'grad_norm': 0.7374388575553894, 'learning_rate': 7.997731288189876e-05, 'epoch': 0.22}
{'loss': 0.0187, 'grad_norm': 0.2470538467168808, 'learning_rate': 7.658156206974005e-05, 'epoch': 0.44}
{'loss': 0.0172, 'grad_norm': 0.34760668873786926, 'learning_rate': 6.789187204375981e-05, 'epoch': 0.66}
{'loss': 0.0156, 'grad_norm': 0.3444696366786957, 'learning_rate': 5.516578043170591e-05, 'epoch': 0.88}


  0%|          | 0/356 [00:00<?, ?it/s]

{'eval_loss': 0.029392264783382416, 'eval_pearson': 0.7609570315423655, 'eval_runtime': 72.004, 'eval_samples_per_second': 39.484, 'eval_steps_per_second': 4.944, 'epoch': 1.0}
{'loss': 0.0132, 'grad_norm': 0.27112889289855957, 'learning_rate': 4.0244956960401305e-05, 'epoch': 1.1}
{'loss': 0.0103, 'grad_norm': 0.29238006472587585, 'learning_rate': 2.5288684285044283e-05, 'epoch': 1.32}
{'loss': 0.0112, 'grad_norm': 0.44043195247650146, 'learning_rate': 1.2461375129665934e-05, 'epoch': 1.54}
{'loss': 0.0116, 'grad_norm': 0.1736825406551361, 'learning_rate': 3.6193470242834415e-06, 'epoch': 1.75}
{'loss': 0.012, 'grad_norm': 0.21496210992336273, 'learning_rate': 4.3600933994007996e-08, 'epoch': 1.97}


  0%|          | 0/356 [00:00<?, ?it/s]

{'eval_loss': 0.026234453544020653, 'eval_pearson': 0.7914985209786293, 'eval_runtime': 71.9804, 'eval_samples_per_second': 39.497, 'eval_steps_per_second': 4.946, 'epoch': 2.0}
{'train_runtime': 6788.875, 'train_samples_per_second': 10.745, 'train_steps_per_second': 0.672, 'train_loss': 0.013981913722920836, 'epoch': 2.0}


In [26]:
# Save under a name that differs from the Hub id "microsoft/deberta-v3-base".
# from_pretrained() accepts a local directory as well as a Hub id, so a directory with
# exactly that name would make a later from_pretrained(model_nm) run from this working
# directory load the fine-tuned copy instead of the pretrained checkpoint.
trainer.save_model("my_deberta_v3_base_model")

# model 3

## Data preprocessing

In [27]:
# Third checkpoint to fine-tune; set before tokenizing because every model has its own tokenizer.
model_nm = "nreimers/MiniLM-L6-H384-uncased"

In [28]:
tokz = AutoTokenizer.from_pretrained(model_nm)  # tokenizer that matches the chosen checkpoint

In [29]:
# Sanity-check the new tokenizer on a sample sentence.
tokz.tokenize("lets explore this new model!")

['lets', 'explore', 'this', 'new', 'model', '!']

In [30]:
def tok_func(x):
    """Run the global tokenizer ``tokz`` over the ``input`` text of ``x`` (one row or a batch)."""
    raw_text = x["input"]
    return tokz(raw_text)

# Tokenize every split in batches, then expose the target column under the name 'labels'.
tok_ds1, tok_ds2, tok_ds3 = [
    part.map(tok_func, batched=True).rename_columns({'score': 'labels'})
    for part in (ds1, ds2, ds3)
]

Map:   0%|          | 0/36473 [00:00<?, ? examples/s]

Map:   0%|          | 0/2843 [00:00<?, ? examples/s]

Map:   0%|          | 0/9232 [00:00<?, ? examples/s]

## Train and validation

In [31]:
import numpy as np

def corr(x, y):
    """Pearson correlation coefficient of ``x`` against ``y``."""
    matrix = np.corrcoef(x, y)  # 2x2 correlation matrix
    return matrix[0][1]

def corr_d(eval_pred):
    """Metric callback for the Trainer: returns ``{'pearson': r}`` for ``(predictions, labels)``."""
    preds, targets = eval_pred

    # A single-output head can yield an (N, 1) column; collapse it to shape (N,).
    is_single_column = preds.ndim > 1 and preds.shape[1] == 1
    if is_single_column:
        preds = preds.squeeze(axis=1)

    return {'pearson': corr(preds, targets)}

In [32]:
from transformers import EarlyStoppingCallback
bs = 16  # training batch size per device
epochs = 8  # maximum number of training epochs
lr = 8e-5  # peak learning rate (reached after the warmup phase)
args = TrainingArguments('outputs',  # directory where checkpoints/weights are written
                        learning_rate=lr,  # peak learning rate
                        warmup_ratio=0.1,  # fraction of training steps spent ramping the learning rate up
                        lr_scheduler_type='cosine',  # after warmup, decay the learning rate along a cosine curve
                        fp16=True,  # 16-bit mixed-precision training
                        load_best_model_at_end=True,  # reload the best checkpoint once training finishes
                        evaluation_strategy="epoch",  # evaluate on the eval set after every epoch
                        save_strategy="epoch",  # save a checkpoint after every epoch, in step with evaluation
                        metric_for_best_model="pearson",  # "best" is judged by the Pearson metric from corr_d
                        per_device_train_batch_size=bs,  # training batch size
                        per_device_eval_batch_size=bs*2,  # evaluation uses twice the training batch size
                        gradient_accumulation_steps=2,  # accumulate gradients over 2 batches before each optimizer step
                        num_train_epochs=epochs,  # total number of training epochs
                        weight_decay=0.01,  # weight decay (regularization) applied by the optimizer
                        report_to='none')  # do not report to any logging integration

In [33]:
# num_labels=1 gives the model a single output value, used here as the predicted similarity score.
model = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels=1)
trainer = Trainer(model,  # model to fine-tune
                args,  # TrainingArguments built in the previous cell
                train_dataset=tok_ds1,  # tokenized training split
                eval_dataset=tok_ds2,  # tokenized validation split (not the test split)
                tokenizer=tokz,  # tokenizer matching the model
                compute_metrics=corr_d,  # reports the Pearson correlation at every evaluation
                callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]  # Stops after 2 evals without improvement
)

In [34]:
trainer.train();  # run fine-tuning; the trailing ';' keeps the returned value out of the cell output

  0%|          | 0/9120 [00:00<?, ?it/s]

{'loss': 0.0132, 'grad_norm': 0.3463342487812042, 'learning_rate': 4.385964912280702e-05, 'epoch': 0.44}
{'loss': 0.0144, 'grad_norm': 0.2563301622867584, 'learning_rate': 7.997731288189876e-05, 'epoch': 0.88}


  0%|          | 0/89 [00:00<?, ?it/s]

{'eval_loss': 0.03520682826638222, 'eval_pearson': 0.7109768359911046, 'eval_runtime': 6.0016, 'eval_samples_per_second': 473.71, 'eval_steps_per_second': 14.829, 'epoch': 1.0}
{'loss': 0.0133, 'grad_norm': 0.24565860629081726, 'learning_rate': 7.899126790831869e-05, 'epoch': 1.32}
{'loss': 0.0134, 'grad_norm': 0.4267203211784363, 'learning_rate': 7.658156206974005e-05, 'epoch': 1.75}


  0%|          | 0/89 [00:00<?, ?it/s]

{'eval_loss': 0.035955775529146194, 'eval_pearson': 0.7019910588983029, 'eval_runtime': 5.9874, 'eval_samples_per_second': 474.829, 'eval_steps_per_second': 14.865, 'epoch': 2.0}
{'loss': 0.0122, 'grad_norm': 0.1615705043077469, 'learning_rate': 7.283617926826647e-05, 'epoch': 2.19}
{'loss': 0.0109, 'grad_norm': 0.1913604587316513, 'learning_rate': 6.789187204375981e-05, 'epoch': 2.63}


  0%|          | 0/89 [00:00<?, ?it/s]

{'eval_loss': 0.03614410012960434, 'eval_pearson': 0.7116770502189527, 'eval_runtime': 6.1097, 'eval_samples_per_second': 465.328, 'eval_steps_per_second': 14.567, 'epoch': 3.0}
{'loss': 0.0103, 'grad_norm': 0.3467991352081299, 'learning_rate': 6.192916842410089e-05, 'epoch': 3.07}
{'loss': 0.0086, 'grad_norm': 0.30314546823501587, 'learning_rate': 5.516578043170591e-05, 'epoch': 3.51}
{'loss': 0.0089, 'grad_norm': 0.2180846929550171, 'learning_rate': 4.7848654916885446e-05, 'epoch': 3.95}


  0%|          | 0/89 [00:00<?, ?it/s]

{'eval_loss': 0.03363389894366264, 'eval_pearson': 0.7258992098634983, 'eval_runtime': 5.9542, 'eval_samples_per_second': 477.475, 'eval_steps_per_second': 14.947, 'epoch': 4.0}
{'loss': 0.0078, 'grad_norm': 0.24860285222530365, 'learning_rate': 4.0244956960401305e-05, 'epoch': 4.39}
{'loss': 0.0075, 'grad_norm': 0.1753646433353424, 'learning_rate': 3.2632315061931184e-05, 'epoch': 4.82}


  0%|          | 0/89 [00:00<?, ?it/s]

{'eval_loss': 0.03488587960600853, 'eval_pearson': 0.7216647231086074, 'eval_runtime': 5.9354, 'eval_samples_per_second': 478.989, 'eval_steps_per_second': 14.995, 'epoch': 5.0}
{'loss': 0.0081, 'grad_norm': 0.4730294346809387, 'learning_rate': 2.5288684285044283e-05, 'epoch': 5.26}
{'loss': 0.0081, 'grad_norm': 0.25007617473602295, 'learning_rate': 1.8482197478578303e-05, 'epoch': 5.7}


  0%|          | 0/89 [00:00<?, ?it/s]

{'eval_loss': 0.03555789589881897, 'eval_pearson': 0.7221306853107899, 'eval_runtime': 5.9537, 'eval_samples_per_second': 477.521, 'eval_steps_per_second': 14.949, 'epoch': 6.0}
{'train_runtime': 1607.5468, 'train_samples_per_second': 181.509, 'train_steps_per_second': 5.673, 'train_loss': 0.010390488062685693, 'epoch': 6.0}


In [35]:
# Save under a name that differs from the Hub id "nreimers/MiniLM-L6-H384-uncased".
# from_pretrained() accepts a local directory as well as a Hub id, so a directory with
# exactly that name would make a later from_pretrained(model_nm) run from this working
# directory load the fine-tuned copy instead of the pretrained checkpoint.
trainer.save_model("my_minilm_model")

# model 4

## Data preprocessing

In [36]:
# Fourth checkpoint to fine-tune; set before tokenizing because every model has its own tokenizer.
model_nm = "roberta-base"

In [37]:
tokz = AutoTokenizer.from_pretrained(model_nm)  # tokenizer that matches the chosen checkpoint

In [38]:
# Sanity-check the new tokenizer on a sample sentence.
tokz.tokenize("what a long work")

['what', 'Ġa', 'Ġlong', 'Ġwork']

In [39]:
def tok_func(x):
    """Tokenize the ``input`` text of ``x`` (one row or a batch) with the global tokenizer ``tokz``."""
    texts = x["input"]
    return tokz(texts)

# Tokenize every split in batches, then expose the target column under the name 'labels'.
tok_ds1, tok_ds2, tok_ds3 = [
    split.map(tok_func, batched=True).rename_columns({'score': 'labels'})
    for split in (ds1, ds2, ds3)
]

Map:   0%|          | 0/36473 [00:00<?, ? examples/s]

Map:   0%|          | 0/2843 [00:00<?, ? examples/s]

Map:   0%|          | 0/9232 [00:00<?, ? examples/s]

## Train and validation

In [40]:
bs = 8  # batch size per device
epochs = 2  # number of training epochs
lr = 8e-5  # peak learning rate (reached after the warmup phase)
args = TrainingArguments('outputs',  # directory where checkpoints/weights are written
                        learning_rate=lr,  # peak learning rate
                        warmup_ratio=0.1,  # fraction of training steps spent ramping the learning rate up
                        lr_scheduler_type='cosine',  # after warmup, decay the learning rate along a cosine curve
                        fp16=True,  # 16-bit mixed-precision training
                        evaluation_strategy="epoch",  # evaluate on the eval set after every epoch
                        per_device_train_batch_size=bs,  # training batch size
                        per_device_eval_batch_size=bs,  # evaluation batch size
                        gradient_accumulation_steps=2,  # accumulate gradients over 2 batches before each optimizer step
                        num_train_epochs=epochs,  # total number of training epochs
                        weight_decay=0.01,  # weight decay (regularization) applied by the optimizer
                        report_to='none')  # do not report to any logging integration

In [41]:
# num_labels=1 gives the model a single output value, used here as the predicted similarity score.
model = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels=1)
trainer = Trainer(model,  # model to fine-tune
                args,  # TrainingArguments built in the previous cell
                train_dataset=tok_ds1,  # tokenized training split
                eval_dataset=tok_ds2,  # tokenized validation split (not the test split)
                tokenizer=tokz,  # tokenizer matching the model
                compute_metrics=corr_d)  # reports the Pearson correlation at every evaluation

In [42]:
trainer.train();  # run fine-tuning; the trailing ';' keeps the returned value out of the cell output

  0%|          | 0/4560 [00:00<?, ?it/s]

{'loss': 0.0287, 'grad_norm': 0.9536291360855103, 'learning_rate': 7.997731288189876e-05, 'epoch': 0.22}
{'loss': 0.0337, 'grad_norm': 0.6044930815696716, 'learning_rate': 7.658156206974005e-05, 'epoch': 0.44}
{'loss': 0.0315, 'grad_norm': 0.47513195872306824, 'learning_rate': 6.789187204375981e-05, 'epoch': 0.66}
{'loss': 0.0282, 'grad_norm': 0.7719506025314331, 'learning_rate': 5.519410964445069e-05, 'epoch': 0.88}


  0%|          | 0/356 [00:00<?, ?it/s]

{'eval_loss': 0.03383300080895424, 'eval_pearson': 0.7201047753800074, 'eval_runtime': 44.1889, 'eval_samples_per_second': 64.337, 'eval_steps_per_second': 8.056, 'epoch': 1.0}
{'loss': 0.0236, 'grad_norm': 0.9697262048721313, 'learning_rate': 4.027557612291482e-05, 'epoch': 1.1}
{'loss': 0.019, 'grad_norm': 1.1644240617752075, 'learning_rate': 2.531716231303287e-05, 'epoch': 1.32}
{'loss': 0.0194, 'grad_norm': 0.7755067944526672, 'learning_rate': 1.2483590795369743e-05, 'epoch': 1.54}
{'loss': 0.0199, 'grad_norm': 0.9264175891876221, 'learning_rate': 3.6320853642843657e-06, 'epoch': 1.75}
{'loss': 0.0201, 'grad_norm': 0.7252911329269409, 'learning_rate': 4.3600933994007996e-08, 'epoch': 1.97}


  0%|          | 0/356 [00:00<?, ?it/s]

{'eval_loss': 0.03134845942258835, 'eval_pearson': 0.7445355318621882, 'eval_runtime': 44.2218, 'eval_samples_per_second': 64.29, 'eval_steps_per_second': 8.05, 'epoch': 2.0}
{'train_runtime': 3792.4665, 'train_samples_per_second': 19.234, 'train_steps_per_second': 1.202, 'train_loss': 0.024841662823108204, 'epoch': 2.0}


In [43]:
# Save under a name that differs from the Hub id "roberta-base".
# from_pretrained() accepts a local directory as well as a Hub id, so a directory with
# exactly that name would make a later from_pretrained(model_nm) run from this working
# directory load the fine-tuned copy instead of the pretrained checkpoint.
trainer.save_model("my_roberta_model")

# model 5


In [44]:
# Fifth checkpoint to fine-tune; set before tokenizing because every model has its own tokenizer.
model_nm = "albert-base-v2"

In [45]:
tokz = AutoTokenizer.from_pretrained(model_nm)  # tokenizer that matches the chosen checkpoint

In [46]:
# Sanity-check the new tokenizer on a sample sentence.
tokz.tokenize("I think this would be the last one!")

['▁i', '▁think', '▁this', '▁would', '▁be', '▁the', '▁last', '▁one', '!']

In [47]:
def tok_func(x):
    """Run the global tokenizer ``tokz`` over the ``input`` text of ``x`` (one row or a batch)."""
    raw_text = x["input"]
    return tokz(raw_text)

# Tokenize every split in batches, then expose the target column under the name 'labels'.
tok_ds1, tok_ds2, tok_ds3 = [
    part.map(tok_func, batched=True).rename_columns({'score': 'labels'})
    for part in (ds1, ds2, ds3)
]

Map:   0%|          | 0/36473 [00:00<?, ? examples/s]

Map:   0%|          | 0/2843 [00:00<?, ? examples/s]

Map:   0%|          | 0/9232 [00:00<?, ? examples/s]

In [48]:
import numpy as np

def corr(x, y):
    """Return the Pearson correlation between ``x`` and ``y``."""
    # Off-diagonal entry of the 2x2 correlation matrix.
    return np.corrcoef(x, y)[0, 1]

def corr_d(eval_pred):
    """Metric callback for the Trainer: maps ``(predictions, labels)`` to ``{'pearson': r}``."""
    predictions, labels = eval_pred
    # Collapse an (N, 1) column of predictions to shape (N,); leave other shapes untouched.
    needs_flatten = predictions.ndim > 1 and predictions.shape[1] == 1
    flat = predictions.squeeze(axis=1) if needs_flatten else predictions
    return dict(pearson=corr(flat, labels))

In [49]:
from transformers import EarlyStoppingCallback
bs = 16  # training batch size per device
epochs = 8  # maximum number of training epochs
lr = 8e-5  # peak learning rate (reached after the warmup phase)
args = TrainingArguments('outputs',  # directory where checkpoints/weights are written
                        learning_rate=lr,  # peak learning rate
                        warmup_ratio=0.1,  # fraction of training steps spent ramping the learning rate up
                        lr_scheduler_type='cosine',  # after warmup, decay the learning rate along a cosine curve
                        fp16=True,  # 16-bit mixed-precision training
                        load_best_model_at_end=True,  # reload the best checkpoint once training finishes
                        evaluation_strategy="epoch",  # evaluate on the eval set after every epoch
                        save_strategy="epoch",  # save a checkpoint after every epoch, in step with evaluation
                        metric_for_best_model="pearson",  # "best" is judged by the Pearson metric from corr_d
                        per_device_train_batch_size=bs,  # training batch size
                        per_device_eval_batch_size=bs*2,  # evaluation uses twice the training batch size
                        gradient_accumulation_steps=2,  # accumulate gradients over 2 batches before each optimizer step
                        num_train_epochs=epochs,  # total number of training epochs
                        weight_decay=0.01,  # weight decay (regularization) applied by the optimizer
                        report_to='none')  # do not report to any logging integration

In [50]:
# num_labels=1 gives the model a single output value, used here as the predicted similarity score.
model = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels=1)
trainer = Trainer(model,  # model to fine-tune
                args,  # TrainingArguments built in the previous cell
                train_dataset=tok_ds1,  # tokenized training split
                eval_dataset=tok_ds2,  # tokenized validation split (not the test split)
                tokenizer=tokz,  # tokenizer matching the model
                compute_metrics=corr_d,  # reports the Pearson correlation at every evaluation
                callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]  # Stops after 2 evals without improvement
)

In [51]:
trainer.train();  # run fine-tuning; the trailing ';' keeps the returned value out of the cell output

  0%|          | 0/9120 [00:00<?, ?it/s]

{'loss': 0.0057, 'grad_norm': 0.3648189604282379, 'learning_rate': 4.385964912280702e-05, 'epoch': 0.44}
{'loss': 0.0117, 'grad_norm': 0.8302813768386841, 'learning_rate': 7.997782552120674e-05, 'epoch': 0.88}


  0%|          | 0/89 [00:00<?, ?it/s]

{'eval_loss': 0.03788059949874878, 'eval_pearson': 0.7015678476159913, 'eval_runtime': 42.6373, 'eval_samples_per_second': 66.679, 'eval_steps_per_second': 2.087, 'epoch': 1.0}
{'loss': 0.0137, 'grad_norm': 0.7276374697685242, 'learning_rate': 7.899468161955669e-05, 'epoch': 1.32}
{'loss': 0.0134, 'grad_norm': 0.6421157121658325, 'learning_rate': 7.658775221045979e-05, 'epoch': 1.75}


  0%|          | 0/89 [00:00<?, ?it/s]

{'eval_loss': 0.03828788176178932, 'eval_pearson': 0.7026296752754126, 'eval_runtime': 42.5978, 'eval_samples_per_second': 66.741, 'eval_steps_per_second': 2.089, 'epoch': 2.0}
{'loss': 0.0116, 'grad_norm': 0.4416433870792389, 'learning_rate': 7.285365556448004e-05, 'epoch': 2.19}
{'loss': 0.0096, 'grad_norm': 0.5355353355407715, 'learning_rate': 6.791381161493047e-05, 'epoch': 2.63}


  0%|          | 0/89 [00:00<?, ?it/s]

{'eval_loss': 0.03270522132515907, 'eval_pearson': 0.7234392620530207, 'eval_runtime': 42.623, 'eval_samples_per_second': 66.701, 'eval_steps_per_second': 2.088, 'epoch': 3.0}
{'loss': 0.0096, 'grad_norm': 0.352676659822464, 'learning_rate': 6.195477020602767e-05, 'epoch': 3.07}
{'loss': 0.0069, 'grad_norm': 0.3146379888057709, 'learning_rate': 5.519410964445069e-05, 'epoch': 3.51}
{'loss': 0.0073, 'grad_norm': 0.6050491333007812, 'learning_rate': 4.787867719573825e-05, 'epoch': 3.95}


  0%|          | 0/89 [00:00<?, ?it/s]

{'eval_loss': 0.032055843621492386, 'eval_pearson': 0.7331759243321564, 'eval_runtime': 42.6051, 'eval_samples_per_second': 66.729, 'eval_steps_per_second': 2.089, 'epoch': 4.0}
{'loss': 0.0051, 'grad_norm': 0.42970868945121765, 'learning_rate': 4.027557612291482e-05, 'epoch': 4.39}
{'loss': 0.005, 'grad_norm': 0.24761943519115448, 'learning_rate': 3.266241313206286e-05, 'epoch': 4.82}


  0%|          | 0/89 [00:00<?, ?it/s]

{'eval_loss': 0.031195566058158875, 'eval_pearson': 0.736784596561627, 'eval_runtime': 42.6241, 'eval_samples_per_second': 66.699, 'eval_steps_per_second': 2.088, 'epoch': 5.0}
{'loss': 0.0037, 'grad_norm': 0.26316943764686584, 'learning_rate': 2.531716231303287e-05, 'epoch': 5.26}
{'loss': 0.0032, 'grad_norm': 0.07472123205661774, 'learning_rate': 1.8508015666127043e-05, 'epoch': 5.7}


  0%|          | 0/89 [00:00<?, ?it/s]

{'eval_loss': 0.03233282268047333, 'eval_pearson': 0.7282306199217221, 'eval_runtime': 42.6147, 'eval_samples_per_second': 66.714, 'eval_steps_per_second': 2.088, 'epoch': 6.0}
{'loss': 0.003, 'grad_norm': 0.10132621973752975, 'learning_rate': 1.2483590795369743e-05, 'epoch': 6.14}
{'loss': 0.002, 'grad_norm': 0.20898258686065674, 'learning_rate': 7.463853308003739e-06, 'epoch': 6.58}


  0%|          | 0/89 [00:00<?, ?it/s]

{'eval_loss': 0.03227576985955238, 'eval_pearson': 0.7320897854417371, 'eval_runtime': 42.6604, 'eval_samples_per_second': 66.643, 'eval_steps_per_second': 2.086, 'epoch': 7.0}
{'train_runtime': 12704.5515, 'train_samples_per_second': 22.967, 'train_steps_per_second': 0.718, 'train_loss': 0.007115774704400161, 'epoch': 7.0}


In [52]:
# Save under a name that differs from the Hub id "albert-base-v2".
# from_pretrained() accepts a local directory as well as a Hub id, so a directory with
# exactly that name would make a later from_pretrained(model_nm) run from this working
# directory load the fine-tuned copy instead of the pretrained checkpoint.
trainer.save_model("my_albert_model")