In [None]:
!pip install transformers
!pip install evaluate
!pip install peft

In [None]:
# Notebook-level settings.
# NOTE(review): neither name is read by any other cell visible in this
# notebook (the training arguments hard-code their own batch sizes) — confirm
# whether they are still needed.
dataset_path: str = "/kaggle/input/rostan/final"
batch_size: int = 16

In [None]:
def tokenize_function(tokenizer, data, max_length=512, eos_token='</s>'):
    """Tokenize the source code of one example into a fixed-length encoding.

    Args:
        tokenizer: Callable tokenizer. It is invoked with the text as its only
            positional argument plus the ``padding``, ``truncation`` and
            ``max_length`` keyword arguments.
        data: Mapping holding the code as a string under the ``'code'`` key.
        max_length: Length every encoding is padded or truncated to.
        eos_token: Marker appended to the code before it is tokenized. The
            default matches the literal previously hard-coded here; pass
            ``tokenizer.eos_token`` when using a tokenizer with a different
            end-of-sequence string.

    Returns:
        Whatever ``tokenizer`` returns for the call.
    """
    text = data['code'] + eos_token
    return tokenizer(
        text,
        padding='max_length',
        truncation=True,
        max_length=max_length,
    )

In [None]:
import os

# Set the WANDB_DISABLED environment variable for this process; this has to
# run before the Trainer is created further down so that the setting is
# already in place when the trainer is built.
os.environ.update(WANDB_DISABLED="true")

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
# Checkpoint identifier; the tokenizer and the sequence-classification model
# are both loaded from it.
model_url = "TinyLlama/TinyLlama-1.1B-step-50K-105b"
tokenizer = AutoTokenizer.from_pretrained(model_url)
model = AutoModelForSequenceClassification.from_pretrained(model_url)

# Reuse the end-of-sequence token as the padding token
# (presumably because this tokenizer ships without one — confirm).
tokenizer.pad_token = tokenizer.eos_token
# Truncate from the left so that the end of an over-long input is what is kept.
tokenizer.truncation_side = 'left'

In [None]:
import datasets

# Load the parquet file through the generic "parquet" loader; the result is
# indexed by split name in the next cell.
dataset = datasets.load_dataset(
    path="parquet",
    data_files="/kaggle/input/updated/dataset.parquet",
)

In [None]:
# Keep only the 'train' split.  The isinstance guard makes this cell safe to
# re-run: once `dataset` has been narrowed to a single split, indexing it with
# 'train' again would be treated as a column lookup and fail.
if isinstance(dataset, datasets.DatasetDict):
    dataset = dataset['train']

In [None]:
# Build two views of the same rows:
#   * dataset_generated drops 'code' together with the metadata columns,
#   * dataset_human drops 'generated_code' together with the metadata columns.
dataset_generated = dataset.remove_columns(
    ['code', 'size', 'caption', 'language']
)
dataset_human = dataset.remove_columns(
    ['size', 'caption', 'language', 'generated_code']
)

def label(x, label):
    """Store ``label`` under the ``'labels'`` key of ``x`` and return ``x``.

    The mapping is modified in place and the very same object is returned.
    Inside the body the ``label`` parameter shadows this function's own name.
    """
    x.update(labels=label)
    return x

# Attach the class id to every row: 1 for the generated view, 0 for the human
# view.  The generated view's text column is then renamed to 'code' so both
# views expose their text under the same column name.
dataset_generated = (
    dataset_generated
    .map(lambda row: label(row, 1))
    .rename_column('generated_code', 'code')
)
dataset_human = dataset_human.map(lambda row: label(row, 0))

In [None]:
# Stack the generated and human rows into one dataset, then tokenize it one
# example at a time (tokenize_function expects a single example, not a batch).
final_dataset = datasets.concatenate_datasets(
    [dataset_generated, dataset_human]
).map(
    lambda example: tokenize_function(tokenizer, example)
)

In [None]:
# Fixed seed so that the shuffle and the train/test partition are identical on
# every run; without it each execution trains and evaluates on a different
# split and the reported accuracy is not reproducible.
SPLIT_SEED = 42

# Shuffle, split into 'train' / 'test' (library default proportions), and drop
# the raw text column now that the tokenized fields are present.
splitted_final_dataset = (
    final_dataset
    .shuffle(seed=SPLIT_SEED)
    .train_test_split(seed=SPLIT_SEED)
    .remove_columns(['code'])
)

In [None]:
import numpy as np
import evaluate

# NOTE(review): this rebinds `dataset`, replacing the parquet-backed dataset
# loaded earlier with a different on-disk dataset.  No cell visible in this
# notebook reads `dataset` after this point, and re-running the earlier
# preprocessing cells after this one would operate on the wrong data —
# confirm whether this load is still needed or should use its own name.
dataset = datasets.load_from_disk("/kaggle/input/coments/staqc_man_python_codegen_comments.hf")

# Accuracy metric object used by compute_metrics below.
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score a batch of evaluation predictions with the module-level ``metric``.

    Args:
        eval_pred: Object that unpacks into a ``(logits, labels)`` pair.

    Returns:
        The result of ``metric.compute`` for the predicted class ids, which
        are taken as the arg-max over the last axis of the logits.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(
        predictions=predicted_classes,
        references=references,
    )

In [None]:
from transformers import Trainer, TrainingArguments
import peft

# Turn on gradient checkpointing on the base model before it is wrapped.
model.gradient_checkpointing_enable()


# LoRA adapter settings.
# task_type=SEQ_CLS makes PEFT use its sequence-classification wrapper, which
# keeps the newly initialised classification head trainable alongside the LoRA
# weights.  Without a task type the generic wrapper freezes every non-LoRA
# parameter, so the randomly initialised head would never be updated.
peft_config = peft.LoraConfig(
    task_type=peft.TaskType.SEQ_CLS,
    r=8,
    lora_alpha=16,
)

# Wrap only once so that re-running this cell does not stack a second adapter
# wrapper on top of an already wrapped model.
if not isinstance(model, peft.PeftModel):
    model = peft.get_peft_model(model, peft_config)

In [None]:
# Store the tokenizer's padding-token id as a plain attribute on the model
# object.  NOTE(review): the setting the model itself consults is expected to
# be the one on `model.config`, which the following cell assigns — confirm
# whether this extra attribute is needed at all.
model.pad_token_id = tokenizer.pad_token_id

In [None]:
# Tell the model configuration which token id is padding.  The id is read
# straight from the tokenizer so this cell does not depend on an ad-hoc
# attribute having been attached to the model object by another cell.
model.config.pad_token_id = tokenizer.pad_token_id

In [None]:
import inspect

# The per-epoch evaluation switch is called `evaluation_strategy` in older
# transformers releases and `eval_strategy` in newer ones.  The packages are
# installed unpinned, so pick whichever keyword the installed
# TrainingArguments constructor actually accepts.
eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments).parameters
    else "evaluation_strategy"
)

# NOTE(review): the batch sizes are literals (8 for training, 9 for
# evaluation); the `batch_size` constant defined near the top of the notebook
# is not used here — confirm which values are intended.
training_args = TrainingArguments(
    output_dir="test_trainer",
    logging_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=9,
    log_level='debug',
    # Name the label column explicitly and keep every dataset column
    # (presumably required because the PEFT wrapper hides the base model's
    # forward signature from the Trainer — confirm).
    label_names=["labels"],
    remove_unused_columns=False,
    # Evaluate once per epoch.
    **{eval_strategy_key: "epoch"},
)


# Trainer wired to the train/test split produced earlier; compute_metrics
# turns the evaluation logits into an accuracy score.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=splitted_final_dataset['train'],
    eval_dataset=splitted_final_dataset['test'],
    compute_metrics=compute_metrics,
)

In [None]:
# Run the training loop.  Kept as the last expression of the cell so that the
# value returned by train() is shown as the cell's output.
trainer.train()