<a href="https://colab.research.google.com/github/rjenez/W266-final-project/blob/main/notebooks/Plagiarism_with_LongFormer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Plagiarism with Longformer

> Indented block


**Author:** Ricardo Jenez, heavily modified from examples in HuggingFace
**Description:** NLP code to detect plagiarism in code.

## Introduction

This is a preliminary model for doing code plagiarism detection. The idea is to identify when students in a class have plagiarized a coding example.

### References

* [Longformer](https://arxiv.org/abs/2004.05150)
* [Plagiarism Detection in Computer Programming Using Feature Extraction From Ultra-Fine-Grained Repositories](https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=9097285)

## Setup

Note: install HuggingFace `transformers` via `pip install transformers` (version >= 2.11.0).

In [None]:
%%capture
!pip3 install transformers
!pip3 install sentence_transformers
!pip3 install imbalanced-learn
!pip3 install datasets
#!pip3 install wandb

In [None]:
import torch
import datasets
import transformers
import pandas as pd
import numpy as np
from transformers import BertTokenizer, \
BertForSequenceClassification, Trainer, TrainingArguments,EvalPrediction, \
AutoTokenizer,  BertTokenizerFast,AutoModelForSequenceClassification
from torch.utils.data import Dataset, DataLoader
#import wandb
import random
from imblearn.over_sampling import RandomOverSampler
import pprint
from transformers.utils import logging
logging.set_verbosity_error()


In [None]:
#!gsutil cp gs://w266finalproject/plagA20162017.tar plag2.tar
!gsutil cp gs://w266finalproject/plag2.tar plag2.tar

Copying gs://w266finalproject/plag2.tar...
- [1 files][ 77.8 MiB/ 77.8 MiB]                                                
Operation completed over 1 objects/77.8 MiB.                                     


In [None]:
#!echo Y | gcloud auth login
#gcloud auth login --remote-bootstrap="https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=32555940559.apps.googleusercontent.com&scope=openid+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fuserinfo.email+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcloud-platform+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fappengine.admin+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcompute+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Faccounts.reauth&state=a8PApkxcVY96cI0kqcyYRSLEz0yZP0&access_type=offline&code_challenge=nXI-Bvrdyh7AWh0n8iFt8UFtKFTjS2iN8gvUtQeilWE&code_challenge_method=S256&token_usage=remote"

In [None]:

!tar xvf plag2.tar
!ls -l
# !mv trainA*.csv train.csv
# !mv testA*.csv test.csv
!mv train2.csv train.csv
!mv test2.csv test.csv

alldata2.csv
groundtruth2.csv
test2.csv
train2.csv
total 237872
-rw-r--r--  1 jupyter jupyter  1114619 Mar 16 08:22 alldata2.csv
drwxr-xr-x  2 jupyter jupyter    12288 Mar 24 06:21 data
-rw-r--r--  1 jupyter jupyter   203396 Mar 16 08:19 groundtruth2.csv
-rw-r--r--  1 jupyter jupyter 81619968 Mar 24 19:51 plag2.tar
drwxr-xr-x 14 jupyter jupyter     4096 Mar 24 06:08 results
drwxr-xr-x  2 jupyter jupyter     4096 Mar 24 04:24 resultsBERT03242022_042423_512
drwxr-xr-x  6 jupyter jupyter     4096 Mar 24 04:34 resultsBERT03242022_042500_512
drwxr-xr-x  3 jupyter jupyter     4096 Mar 24 02:43 src
-rw-r--r--  1 jupyter jupyter 15819857 Mar 16 08:22 test.csv
-rw-r--r--  1 jupyter jupyter 15819857 Mar 16 08:22 test2.csv
-rw-r--r--  1 jupyter jupyter 64478135 Mar 16 08:22 train.csv
-rw-r--r--  1 jupyter jupyter 64478135 Mar 16 08:22 train2.csv
drwxr-xr-x  6 jupyter jupyter     4096 Mar 24 02:43 tutorials


In [None]:
# Read the train and test CSV files.
full_train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# Keep the first 80% of the training rows for training and hold out the
# trailing 20% (in file order, no shuffling) for validation.
split_at = int(len(full_train_df) * 0.8)
train_df = full_train_df.iloc[:split_at]
valid_df = full_train_df.iloc[split_at:]

In [None]:
print("Train Target Distribution")
print(train_df.plagiarized.value_counts())

Train Target Distribution
0    10595
1      463
Name: plagiarized, dtype: int64


In [None]:
# Balance the classes by randomly duplicating minority-class rows.
# NOTE(review): the validation and test splits are resampled as well, so the
# metrics computed on them describe a balanced class mix rather than the
# distribution found in the raw files.
oversample = RandomOverSampler(sampling_strategy='minority', random_state=1234)


def _resample_and_report(frame, heading):
    """Oversample ``frame`` on its ``plagiarized`` column and print the new class counts."""
    balanced, targets = oversample.fit_resample(frame, frame.plagiarized)
    print(heading)
    print(balanced.plagiarized.value_counts())
    return balanced, targets


train_over, y_train_over = _resample_and_report(train_df, "Train Target Distribution")
valid_over, y_valid_over = _resample_and_report(valid_df, "Valid Target Distribution")
test_over, y_test_over = _resample_and_report(test_df, "Test Target Distribution")

Train Target Distribution
0    10595
1    10595
Name: plagiarized, dtype: int64
Valid Target Distribution
0    2654
1    2654
Name: plagiarized, dtype: int64
Test Target Distribution
0    3294
1    3294
Name: plagiarized, dtype: int64


In [None]:


# Convert each (oversampled) pandas frame into a Hugging Face Dataset.
train_data, valid_data, test_data = (
    datasets.Dataset.from_pandas(frame)
    for frame in (train_over, valid_over, test_over)
)

In [None]:
print(len(train_data),type(train_data),train_data)

21190 <class 'datasets.arrow_dataset.Dataset'> Dataset({
    features: ['label', 'filename0', 'filename1', 'source0', 'source1', 'percent', 'percent0', 'percent1', 'lines', 'plagiarized'],
    num_rows: 21190
})


In [None]:
# Use the first GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Pretrained Longformer checkpoint configured for two output labels;
# downloaded files are cached under ./data.
model = AutoModelForSequenceClassification.from_pretrained(
    "allenai/longformer-base-4096",
    num_labels=2,
    cache_dir='data',
    return_dict=True,
)
model = model.to(device)

# Tokenizer belonging to the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    "allenai/longformer-base-4096",
    max_length=4096,
    cache_dir='data',
)

def tokenization(batched_text):
    """Tokenize a batch of (source0, source1) pairs.

    Every pair is padded or truncated to exactly 4096 tokens.
    """
    first_sources = batched_text['source0']
    second_sources = batched_text['source1']
    return tokenizer(
        first_sources,
        second_sources,
        padding='max_length',
        truncation=True,
        max_length=4096,
    )



Downloading:   0%|          | 0.00/694 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

In [None]:
# Tokenize every split in batches of 128 rows.
map_kwargs = dict(batched=True, batch_size=128)
train_data = train_data.map(tokenization, **map_kwargs)
valid_data = valid_data.map(tokenization, **map_kwargs)
test_data = test_data.map(tokenization, **map_kwargs)

  0%|          | 0/166 [00:00<?, ?ba/s]

  0%|          | 0/42 [00:00<?, ?ba/s]

  0%|          | 0/52 [00:00<?, ?ba/s]

In [None]:
def _with_torch_labels(split):
    """Copy ``plagiarized`` into ``label`` and expose only the model inputs as torch tensors."""
    split = split.map(lambda examples: {'label': examples['plagiarized']}, batched=True)
    split.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
    return split


train_data = _with_torch_labels(train_data)
valid_data = _with_torch_labels(valid_data)
test_data = _with_torch_labels(test_data)

  0%|          | 0/22 [00:00<?, ?ba/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/7 [00:00<?, ?ba/s]

In [None]:
# define accuracy metrics
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
def compute_metrics(pred):
    """Compute accuracy, F1, precision and recall for one evaluation pass.

    Args:
        pred: object exposing ``label_ids`` (reference labels) and
            ``predictions`` (per-class scores; the arg-max over the last
            axis is taken as the predicted class).

    Returns:
        dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    # Result tuple is (precision, recall, f1, support).
    scores = precision_recall_fscore_support(y_true, y_pred, average='binary')
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': scores[2],
        'precision': scores[0],
        'recall': scores[1],
    }

In [None]:
# Training configuration: one sample per device per step with 32-step gradient
# accumulation, evaluated and checkpointed once per epoch.
training_args = TrainingArguments(
    output_dir='results',
    logging_dir='logs',
    # optimisation schedule
    num_train_epochs=4,
    learning_rate=1e-5,
    warmup_steps=160,
    weight_decay=0.01,
    fp16=True,
    # batching
    per_device_train_batch_size=1,
    gradient_accumulation_steps=32,
    per_device_eval_batch_size=16,
    dataloader_num_workers=0,
    # evaluation / checkpointing: reload the checkpoint with the highest eval F1
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model='eval_f1',
    greater_is_better=True,
    # progress reporting
    logging_steps=4,
    disable_tqdm=False,
)

# Wire the model, metric function and datasets into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=valid_data,
    compute_metrics=compute_metrics,
)

# Show which device type is available (the last expression is the cell output).
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device


Using amp half precision backend


'cuda'

In [None]:
logging.set_verbosity_error()
# see how the basic model would perform
trainer.evaluate()



{'eval_loss': 0.69614177942276,
 'eval_accuracy': 0.48342125094197436,
 'eval_f1': 0.64417337139891,
 'eval_precision': 0.4912905779889153,
 'eval_recall': 0.9351921627731725,
 'eval_runtime': 107.9344,
 'eval_samples_per_second': 49.178,
 'eval_steps_per_second': 0.769}

In [None]:
!nvidia-smi -L 

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
GPU 0: Tesla V100-SXM2-16GB (UUID: GPU-c4b5bfe8-6b06-2db9-3ae4-08dd3baf694a)
GPU 1: Tesla V100-SXM2-16GB (UUID: GPU-1fcb827f-e5b3-805e-99cc-711ad7c0d1c8)
GPU 2: Tesla V100-SXM2-16GB (UUID: GPU-97a89097-4c02-136d-5954-4e083dc9d19f)
GPU 3: Tesla V100-SXM2-16GB (UUID: GPU-5d40b3fa-9579-22ab-8696-cd7d22dad613)


In [None]:
torch.cuda.empty_cache()
import gc
gc.collect()

46

In [None]:
# train the model
logging.set_verbosity_error()
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
0,0.6398,0.672005,0.594386,0.642062,0.574531,0.727581
1,0.3471,0.708942,0.698568,0.61941,0.84,0.49058
2,0.1803,0.828389,0.723813,0.664684,0.845751,0.547476
3,0.1795,1.057344,0.678033,0.564142,0.872928,0.416729




TrainOutput(global_step=660, training_loss=0.3952386198621808, metrics={'train_runtime': 9137.7291, 'train_samples_per_second': 9.276, 'train_steps_per_second': 0.072, 'total_flos': 2.2251505196187648e+17, 'train_loss': 0.3952386198621808, 'epoch': 4.0})

In [None]:
# Evaluate the results
trainer.evaluate()



{'eval_loss': 0.8283891677856445,
 'eval_accuracy': 0.7238131122833459,
 'eval_f1': 0.6646843549862763,
 'eval_precision': 0.8457508731082655,
 'eval_recall': 0.5474755086661642,
 'eval_runtime': 100.2595,
 'eval_samples_per_second': 52.943,
 'eval_steps_per_second': 0.828,
 'epoch': 4.0}

In [None]:
logging.set_verbosity_error()
results = trainer.predict(test_data)
pprint.pprint(results.metrics)



{'test_accuracy': 0.7140255009107468,
 'test_f1': 0.6521418020679468,
 'test_loss': 0.9575977325439453,
 'test_precision': 0.8322337417530632,
 'test_recall': 0.5361262902246509,
 'test_runtime': 124.5725,
 'test_samples_per_second': 52.885,
 'test_steps_per_second': 0.827}


In [None]:
!gsutil cp -r results gs://w266finalproject/result03192022

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Copying file://results/checkpoint-164/training_args.bin [Content-Type=application/octet-stream]...
AccessDeniedException: 403 Access denied.


In [None]:
!gsutil ls gs://w266finalproject/result03192022/

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
gs://w266finalproject/result03192022/checkpoint-331/
gs://w266finalproject/result03192022/results/


In [None]:
!gsutil cp -r result gs://w266finalproject

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
CommandException: No URLs matched: result


In [None]:
!ls -al results/checkpoint-*


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
results/checkpoint-164:
total 1460968
drwxr-xr-x  2 jupyter jupyter      4096 Mar 24 05:40 .
drwxr-xr-x 18 jupyter jupyter      4096 Mar 24 22:28 ..
-rw-r--r--  1 jupyter jupyter       769 Mar 24 05:40 config.json
-rw-r--r--  1 jupyter jupyter 997293661 Mar 24 05:40 optimizer.pt
-rw-r--r--  1 jupyter jupyter 498675053 Mar 24 05:40 pytorch_model.bin
-rw-r--r--  1 jupyter jupyter     17563 Mar 24 05:40 rng_state.pth
-rw-r--r--  1 jupyter jupyter       559 Mar 24 05:40 scaler.pt
-rw-r--r--  1 jupyter jupyter       623 Mar 24 05:40 scheduler.pt
-rw-r--r--  1 jupyter jupyter      5698 Mar 24 05:40 trainer_state.json
-rw-r--r--  1 jupyter jupyter      2927 Mar 24 05:40 training_args.bin

results/checkpoint-165:
to

### Do main import of all appropriate libraries for Longformer.

## Configuration

## Load the Data

Dataset Overview:

- source0: Homework assignment for 1st student.
- source1: Homework assignment for 2nd student.
- label: This is the label chosen for plagiarized content

Here are the "similarity" label values in our dataset:

- 0: no similarity
- 1: similarity

Let's look at one sample from the dataset:

## Preprocessing

Distribution of our validation targets.

One-hot encode training, validation, and test labels.

## Keras Custom Data Generator

## Build the model.

Create train and validation data generators

## Train the Model

Training is done only for the top layers to perform "feature extraction",
which will allow the model to use the representations of the pretrained model.

## Fine-tuning

This step must only be performed after the feature extraction model has
been trained to convergence on the new data.

This is an optional last step where `bert_model` is unfrozen and retrained
with a very low learning rate. This can deliver meaningful improvement by
incrementally adapting the pretrained features to the new data.

# Train the entire model end-to-end.

## Evaluate model on the test set

## Inference on custom sentences

In [None]:
!ls /usr


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
bin  games    lib    lib64    libx32	local  share
etc  include  lib32  libexec  licenses	sbin   src


In [None]:

def check_similarity(source0, source1, max_length=4096):
  """Run the fine-tuned classifier on a single pair of source files.

  Args:
    source0: Code of the first submission (converted with ``str``).
    source1: Code of the second submission (converted with ``str``).
    max_length: Token length the pair is padded/truncated to. Defaults to
      4096, the same value ``tokenization`` uses for the training data.

  Returns:
    Whatever ``trainer.predict`` returns for the one-row dataset; it is also
    printed before being returned.
  """
  # The tokenizer takes two parallel lists (first texts, second texts); each
  # list holds exactly one entry because a single pair is scored. Indexing a
  # (1, 2) array with [1] instead raises IndexError.
  encoded = tokenizer(
      [str(source0)],
      [str(source1)],
      padding='max_length',
      truncation=True,
      max_length=max_length,
  )
  # trainer.predict needs a dataset it can index row by row, so wrap the
  # encoding in a datasets.Dataset formatted like the training splits
  # (minus the label column, which is unknown at inference time).
  pair_dataset = datasets.Dataset.from_dict(
      {key: encoded[key] for key in ('input_ids', 'attention_mask')}
  )
  pair_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])
  test_results = trainer.predict(pair_dataset)
  print(test_results)
  return test_results


Check results on some example code pairs.

In [None]:
source0 = """int obrni(int broj)
{
        int cifra,nova=0;
        while(broj>0) {
                cifra=broj%10;
                nova=nova*10+cifra;
                broj/=10;
        }
        return nova;
}
"""
source1 = """int okreni_cifre(int broj)
{
        int cifra;
        int nova=0;
        while(broj>0) {
                cifra=broj%10;
                nova=nova*10+cifra;
                broj=broj/10;
        }
        return nova;
}"""
check_similarity(source0, source1)

IndexError: ignored