# Plagiarism with BERT
**Author:** Ricardo Jenez, heavily modified from examples in HuggingFace
**Description:** NLP code to detect plagiarism in code.

## Introduction

This is a preliminary model for code plagiarism detection. The idea is to identify when students in a class have plagiarized a coding assignment.

### References

* [BigBird](https://arxiv.org/abs/2007.14062)
* [Plagiarism Detection in Computer Programming Using Feature Extraction From Ultra-Fine-Grained Repositories](https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=9097285)

## Setup

Note: install HuggingFace `transformers` via `pip install transformers` (version >= 2.11.0).

In [1]:
!pip3 install antlr4-python3-runtime

Collecting antlr4-python3-runtime
  Downloading antlr4-python3-runtime-4.9.3.tar.gz (117 kB)
[?25l[K     |██▉                             | 10 kB 33.1 MB/s eta 0:00:01[K     |█████▋                          | 20 kB 19.6 MB/s eta 0:00:01[K     |████████▍                       | 30 kB 16.2 MB/s eta 0:00:01[K     |███████████▏                    | 40 kB 14.8 MB/s eta 0:00:01[K     |██████████████                  | 51 kB 7.3 MB/s eta 0:00:01[K     |████████████████▉               | 61 kB 8.6 MB/s eta 0:00:01[K     |███████████████████▋            | 71 kB 8.6 MB/s eta 0:00:01[K     |██████████████████████▍         | 81 kB 8.8 MB/s eta 0:00:01[K     |█████████████████████████▏      | 92 kB 9.8 MB/s eta 0:00:01[K     |████████████████████████████    | 102 kB 8.2 MB/s eta 0:00:01[K     |██████████████████████████████▉ | 112 kB 8.2 MB/s eta 0:00:01[K     |████████████████████████████████| 117 kB 8.2 MB/s 
[?25hBuilding wheels for collected packages: antlr4-python3-run

In [2]:
import antlr4
from antlr4 import *

In [3]:
import CPP14Parser
import CPP14Lexer

def antlr(source):
  """Lex source text into one lowercase, space-separated token string.

  The text is fed through the generated CPP14 lexer, each token's text is
  lower-cased, and the pieces are joined with single spaces. The final entry
  of the filled token stream (presumably the end-of-file marker) is left out.
  """
  char_stream = antlr4.InputStream(source)
  token_stream = antlr4.CommonTokenStream(CPP14Lexer.CPP14Lexer(char_stream))
  token_stream.fill()

  # Keep every token except the trailing one.
  kept_tokens = token_stream.tokens[:-1]
  return " ".join(tok.text.lower() for tok in kept_tokens)

# sanity check
with open("test_cpp.c", "r") as f:
    code = f.read()
    print(antlr(code))

#include <stdio.h> int main ( ) { double a , b , c , d , e , uk1 , uk2 , uk3 , y , x , v , n , m , q , w , r , t , u ; int oc1 , oc2 , oc3 , znak = 0 ; printf ( "unesite bodove za tarika:" ) ; printf ( "\ni parcijalni ispit: " ) ; scanf ( "%lf" , & a ) ; if ( a < 0 || a > 20 ) { printf ( "neispravan broj bodova" ) ; return 0 ; } printf ( "ii parcijalni ispit: " ) ; scanf ( "%lf" , & b ) ; if ( b < 0 || b > 20 ) { printf ( "neispravan broj bodova" ) ; return 0 ; } printf ( "prisustvo: " ) ; scanf ( "%lf" , & c ) ; if ( c < 0 || c > 10 ) { printf ( "neispravan broj bodova" ) ; return 0 ; } printf ( "zadace: " ) ; scanf ( "%lf" , & d ) ; if ( d < 0 || d > 10 ) { printf ( "neispravan broj bodova" ) ; return 0 ; } printf ( "zavrsni ispit: " ) ; scanf ( "%lf" , & e ) ; if ( e < 0 || e > 40 ) { printf ( "neispravan broj bodova" ) ; return 0 ; } uk1 = a + b + c + d + e ; printf ( "unesite bodove za bojana: " ) ; printf ( "\ni parcijalni ispit: " ) ; scanf ( "%lf" , & y ) ; if ( y < 0 || y > 20

In [4]:
%%capture
!pip3 install transformers
!pip3 install sentence_transformers
!pip3 install imbalanced-learn
!pip3 install datasets
#!pip3 install wandb

In [5]:
import torch
import datasets
import transformers
import pandas as pd
import numpy as np
from transformers import BertTokenizer, \
BertForSequenceClassification, Trainer, TrainingArguments,EvalPrediction, \
AutoTokenizer,  BertTokenizerFast, RobertaTokenizer, \
RobertaForSequenceClassification
from torch.utils.data import Dataset, DataLoader
#import wandb
import random
import datetime
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, confusion_matrix
import pprint


In [6]:
#!gsutil cp gs://w266finalproject/plagA20162017.tar plag2.tar
!gsutil cp gs://w266finalproject/plag2.tar plag2.tar

Copying gs://w266finalproject/plag2.tar...
- [1 files][ 77.8 MiB/ 77.8 MiB]                                                
Operation completed over 1 objects/77.8 MiB.                                     


In [7]:
!nvidia-smi -L 

GPU 0: Tesla P100-PCIE-16GB (UUID: GPU-05ad01bf-4578-c9a9-45aa-c1375c717fcb)


In [8]:
# !gcloud auth login --no-browser

In [9]:

!tar xvf plag2.tar
!ls -l
# !mv trainA*.csv train.csv
# !mv testA*.csv test.csv
!mv train2.csv train.csv
!mv test2.csv test.csv

alldata2.csv
groundtruth2.csv
test2.csv
train2.csv
total 160396
-rw-r--r-- 1  501 staff  1114619 Mar 16 08:22 alldata2.csv
-rw-r--r-- 1 root root      5683 Mar 31 17:48 CPP14Lexer.g4
-rw-r--r-- 1 root root     53791 Mar 31 17:48 CPP14Lexer.interp
-rw-r--r-- 1 root root     64725 Mar 31 17:48 CPP14Lexer.py
-rw-r--r-- 1 root root      3074 Mar 31 17:48 CPP14Lexer.tokens
-rw-r--r-- 1 root root     19414 Mar 31 17:48 CPP14Parser.g4
-rw-r--r-- 1 root root     83384 Mar 31 17:48 CPP14Parser.interp
-rw-r--r-- 1 root root     61212 Mar 31 17:48 CPP14ParserListener.py
-rw-r--r-- 1 root root    676188 Mar 31 17:48 CPP14Parser.py
-rw-r--r-- 1 root root      3074 Mar 31 17:48 CPP14Parser.tokens
-rw-r--r-- 1  501 staff   203396 Mar 16 08:19 groundtruth2.csv
-rw-r--r-- 1 root root  81619968 Mar 31 17:49 plag2.tar
drwxr-xr-x 2 root root      4096 Mar 31 17:49 __pycache__
drwxr-xr-x 1 root root      4096 Mar 23 14:22 sample_data
-rw-r--r-- 1  501 staff 15819857 Mar 16 08:22 test2.csv
-rw-r--r-- 1 root

In [10]:
# Load the training pairs and replace both source columns with their lexed,
# lower-cased token strings.
train_df = pd.read_csv("train.csv")
for col in ("source0", "source1"):
    train_df[col] = train_df[col].apply(antlr)

# Same normalisation for the held-out test pairs.
test_df = pd.read_csv("test.csv")
for col in ("source0", "source1"):
    test_df[col] = test_df[col].apply(antlr)

# The first 80% of the rows stay in the training split; the remaining rows
# become the validation split (no shuffling is done here).
split_at = int(len(train_df) * 0.8)
valid_df = train_df[split_at:]
train_df = train_df[:split_at]

line 539:71 token recognition error at: ''
line 539:73 token recognition error at: ''
line 560:13 token recognition error at: ''
line 560:15 token recognition error at: ''
line 571:13 token recognition error at: ''
line 571:15 token recognition error at: ''
line 585:13 token recognition error at: ''
line 585:15 token recognition error at: ''
line 597:13 token recognition error at: ''
line 597:15 token recognition error at: ''
line 247:32 token recognition error at: '\\n'
line 247:32 token recognition error at: '\\n'
line 124:45 token recognition error at: ''
line 124:47 token recognition error at: ''
line 247:32 token recognition error at: '\\n'
line 539:71 token recognition error at: ''
line 539:73 token recognition error at: ''
line 560:13 token recognition error at: ''
line 560:15 token recognition error at: ''
line 571:13 token recognition error at: ''
line 571:15 token recognition error at: ''
line 585:13 token recognition error at: ''
line 585:15 token recognit

In [11]:
train_df.head()

Unnamed: 0,label,filename0,filename1,source0,source1,percent,percent0,percent1,lines,plagiarized
0,46,A2016/Z3/Z2/student4780,A2016/Z3/Z2/student9160,#include <stdio.h> #include <stdlib.h> void ub...,#include <stdio.h> #include<math.h> #include<s...,46,40,46,33,0
1,16,A2016/Z5/Z1/student4082,A2016/Z5/Z1/student7258,#include <stdio.h> #include <stdlib.h> #includ...,#include <stdio.h> struct student { char ime [...,16,16,13,16,0
2,11,A2016/Z4/Z4/student3116,A2016/Z4/Z4/student1738,"#include <stdio.h> void dajbroj ( char * pt , ...",#include <stdio.h> #include <math.h> #define ...,11,11,11,12,0
3,95,A2016/Z1/Z2/student8540,A2016/Z1/Z2/student5403,#include <stdio.h> #include <math.h> #define e...,#include <stdio.h> #include <math.h> #define e...,95,95,95,14,0
4,16,A2016/Z4/Z2/student5744,A2016/Z4/Z2/student9188,"#include <stdio.h> char * kodiraj ( char * s ,...",#include <stdio.h> #include <stdlib.h> char * ...,16,16,11,9,0


In [12]:
print("Train Target Distribution")
print(train_df.plagiarized.value_counts())

Train Target Distribution
0    10595
1      463
Name: plagiarized, dtype: int64


In [13]:
# y_train = tf.keras.utils.to_categorical(train_df.plagiarized, num_classes=2)
# y_val = tf.keras.utils.to_categorical(valid_df.plagiarized, num_classes=2)
# y_test = tf.keras.utils.to_categorical(test_df.plagiarized, num_classes=2)

# Define the oversampling strategy: randomly duplicate rows of the minority
# class (fixed seed for reproducibility) and print the class counts of each
# resampled split.
# NOTE(review): the validation and test splits are resampled as well, so any
# metric computed on valid_over / test_over reflects a balanced class mix
# rather than the original distribution -- confirm this is intended.
oversample = RandomOverSampler(sampling_strategy='minority',random_state=1234)
train_over, y_train_over = oversample.fit_resample(train_df, train_df.plagiarized)
print("Train Target Distribution")
print(train_over.plagiarized.value_counts())

# Validation split, resampled the same way.
valid_over, y_valid_over = oversample.fit_resample(valid_df, valid_df.plagiarized)
print("Valid Target Distribution")
print(valid_over.plagiarized.value_counts())

# Test split, resampled the same way.
test_over, y_test_over = oversample.fit_resample(test_df, test_df.plagiarized)
print("Test Target Distribution")
print(test_over.plagiarized.value_counts())

Train Target Distribution
0    10595
1    10595
Name: plagiarized, dtype: int64
Valid Target Distribution
0    2654
1    2654
Name: plagiarized, dtype: int64
Test Target Distribution
0    3294
1    3294
Name: plagiarized, dtype: int64


In [14]:
# train_data, test_data = datasets.load_dataset('imdb', split =['train', 'test'], 
#                                              cache_dir='/media/data_files/github/website_tutorials/data')

# train_data, test_data = datasets.load_dataset('csv',  split =['train', 'test'], data_files={'train': 'train.csv',
#                                               'test': 'test.csv'},cache_dir='data')

train_data = datasets.Dataset.from_pandas(train_over)
valid_data = datasets.Dataset.from_pandas(valid_over)
test_data = datasets.Dataset.from_pandas(test_over)

In [15]:
print(len(train_data),type(train_data),train_data)

21190 <class 'datasets.arrow_dataset.Dataset'> Dataset({
    features: ['label', 'filename0', 'filename1', 'source0', 'source1', 'percent', 'percent0', 'percent1', 'lines', 'plagiarized'],
    num_rows: 21190
})


In [16]:
transformers.logging.set_verbosity_error()

# load model and tokenizer and define length of the text sequence
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
# model = BertForSequenceClassification.from_pretrained("bert-large-uncased-whole-word-masking",
#                 # gradient_checkpointing=False,
#                 num_labels = 2,
#                 cache_dir='data',
#                 return_dict=True).to(device)
model = RobertaForSequenceClassification.from_pretrained("microsoft/codebert-base",
                                     num_labels = 2,
                                     cache_dir='data',
                                     return_dict=True).to(device)     

# tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking", 
#                                           max_length = 512,
#                                           cache_dir='data',)
# tokenizer = BertTokenizerFast.from_pretrained("bert-large-uncased-whole-word-masking")
tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")

def tokenization(batched_text):
    """Encode a batch of source pairs with the module-level `tokenizer`.

    `source0` entries are passed as the first sequence and `source1` entries
    as the second; every pair is truncated/padded to exactly 512 tokens.
    """
    first_sources = batched_text['source0']
    second_sources = batched_text['source1']
    return tokenizer(
        first_sources,
        second_sources,
        padding='max_length',
        truncation=True,
        max_length=512,
    )
train_data = train_data.map(tokenization, batched = True, batch_size = 256) #len(train_data))
valid_data = valid_data.map(tokenization, batched = True, batch_size = 256) #len(valid_data))
test_data = test_data.map(tokenization, batched = True, batch_size = 256) #len(test_data))

Downloading:   0%|          | 0.00/498 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/476M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/498 [00:00<?, ?B/s]

  0%|          | 0/83 [00:00<?, ?ba/s]

  0%|          | 0/21 [00:00<?, ?ba/s]

  0%|          | 0/26 [00:00<?, ?ba/s]

In [17]:
train_data = train_data.map(lambda examples: {'label': examples['plagiarized']}, batched=True)
valid_data = valid_data.map(lambda examples: {'label': examples['plagiarized']}, batched=True)
test_data = test_data.map(lambda examples: {'label': examples['plagiarized']}, batched=True)
# train_data = train_data.map(lambda examples: {'labels': examples['plagiarized']}, batched=True)
# valid_data = valid_data.map(lambda examples: {'labels': examples['plagiarized']}, batched=True)
# test_data = test_data.map(lambda examples: {'labels': examples['plagiarized']}, batched=True)
# train_data.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
# valid_data.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
# test_data.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
train_data.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
valid_data.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
test_data.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

  0%|          | 0/22 [00:00<?, ?ba/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/7 [00:00<?, ?ba/s]

In [18]:
# define accuracy metrics

def compute_metrics(pred):
    """Return accuracy, F1, precision and recall for a prediction object.

    `pred.label_ids` holds the reference labels and `pred.predictions` the
    per-class scores; the predicted class is the arg-max over the last axis.
    Precision, recall and F1 use binary averaging.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    # Tuple layout: (precision, recall, f-score, support).
    prf = precision_recall_fscore_support(y_true, y_pred, average='binary')
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': prf[2],
        'precision': prf[0],
        'recall': prf[1],
    }

In [19]:
# Run parameters: one timestamp, combined with the token limit, names this
# run's cache, output and log directories.
today = datetime.datetime.now()
date_time = today.strftime("%m%d%Y_%H%M%S")
token_max_length = 512
train_batch_size = 2  # 1 for 4096
run_suffix = f"{date_time}_{token_max_length}"
cachedir = f"data{run_suffix}"
outputdir = f"resultsBERT{run_suffix}"
logsdir = f"logs{run_suffix}"

In [34]:
# Define the training arguments. Run-specific values (output directory,
# per-device batch size, log directory) come from the parameter cell so they
# are set in one place instead of being repeated here as literals.
training_args = TrainingArguments(
    output_dir = outputdir,
    num_train_epochs = 4,
    per_device_train_batch_size = train_batch_size,
    # Gradients from 32 micro-batches are accumulated per optimizer step.
    gradient_accumulation_steps = 32,
    per_device_eval_batch_size = 16,
    # Evaluate and checkpoint once per epoch; the two strategies must match
    # for load_best_model_at_end to work.
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    disable_tqdm = False,
    # After training, reload the checkpoint with the highest validation F1.
    load_best_model_at_end = True,
    metric_for_best_model = 'eval_f1',
    greater_is_better = True,
    warmup_steps = 160,
    weight_decay = 0.01,
    logging_steps = 4,
    learning_rate = 5e-6,
    fp16 = True,
    logging_dir = logsdir,
    dataloader_num_workers = 0,
#    run_name = 'bigbird_classification_1e5'
)
# Instantiate the trainer: it trains on the oversampled training set and
# evaluates on the validation set with compute_metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_data,
    eval_dataset=valid_data
)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using amp half precision backend


'cuda'

In [35]:
# see how the basic model would perform
trainer.evaluate()

The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: source1, lines, percent, plagiarized, filename1, percent0, percent1, filename0, source0. If source1, lines, percent, plagiarized, filename1, percent0, percent1, filename0, source0 are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 5308
  Batch size = 16


{'eval_accuracy': 0.5,
 'eval_f1': 0.6666666666666666,
 'eval_loss': 0.7071086168289185,
 'eval_precision': 0.5,
 'eval_recall': 1.0,
 'eval_runtime': 109.4305,
 'eval_samples_per_second': 48.506,
 'eval_steps_per_second': 3.034}

In [36]:
!nvidia-smi -L 

GPU 0: Tesla P100-PCIE-16GB (UUID: GPU-05ad01bf-4578-c9a9-45aa-c1375c717fcb)


In [37]:
torch.cuda.empty_cache()
import gc
gc.collect()

80

In [38]:
# train the model
trainer.train()

The following columns in the training set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: source1, lines, percent, plagiarized, filename1, percent0, percent1, filename0, source0. If source1, lines, percent, plagiarized, filename1, percent0, percent1, filename0, source0 are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 21190
  Num Epochs = 4
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 32
  Total optimization steps = 1324


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
0,0.2951,0.443043,0.827054,0.84243,0.773644,0.924642
1,0.2124,0.383965,0.862283,0.866533,0.840595,0.894122
2,0.2223,0.312457,0.890731,0.895004,0.861324,0.931424
3,0.197,0.312507,0.89205,0.896139,0.86343,0.931424


The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: source1, lines, percent, plagiarized, filename1, percent0, percent1, filename0, source0. If source1, lines, percent, plagiarized, filename1, percent0, percent1, filename0, source0 are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 5308
  Batch size = 16


Saving model checkpoint to resultsBERT03312022_180553_512/checkpoint-331
Configuration saved in resultsBERT03312022_180553_512/checkpoint-331/config.json
Model weights saved in resultsBERT03312022_180553_512/checkpoint-331/pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: source1, lines, percent, plagiarized, filename1, percent0, percent1, filename0, source0. If source1, lines, percent, plagiarized, filename1, percent0, percent1, filename0, source0 are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 5308
  Batch size = 16
Saving model checkpoint to resultsBERT03312022_180553_512/checkpoint-662
Configuration saved in resultsBERT03312022_180553_512/checkpoint-662/config.json
Model weights saved in resultsBERT03312022_180553_512/checkpoint-662/pytorch_model.bin
The following co

TrainOutput(global_step=1324, training_loss=0.2989650309491014, metrics={'train_runtime': 6048.3006, 'train_samples_per_second': 14.014, 'train_steps_per_second': 0.219, 'total_flos': 2.229971438598144e+16, 'train_loss': 0.2989650309491014, 'epoch': 4.0})

In [39]:
# Evaluate the results
trainer.evaluate()

The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: source1, lines, percent, plagiarized, filename1, percent0, percent1, filename0, source0. If source1, lines, percent, plagiarized, filename1, percent0, percent1, filename0, source0 are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 5308
  Batch size = 16


{'epoch': 4.0,
 'eval_accuracy': 0.8920497362471741,
 'eval_f1': 0.8961392060902664,
 'eval_loss': 0.31250712275505066,
 'eval_precision': 0.8634299685644429,
 'eval_recall': 0.9314242652599849,
 'eval_runtime': 109.2714,
 'eval_samples_per_second': 48.576,
 'eval_steps_per_second': 3.038}

In [40]:
results = trainer.predict(test_data)
pprint.pprint(results.metrics)

The following columns in the test set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: source1, lines, percent, plagiarized, filename1, percent0, percent1, filename0, source0. If source1, lines, percent, plagiarized, filename1, percent0, percent1, filename0, source0 are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 6588
  Batch size = 16


{'test_accuracy': 0.8633879781420765,
 'test_f1': 0.8674521354933726,
 'test_loss': 0.4366402328014374,
 'test_precision': 0.842391304347826,
 'test_recall': 0.8940497874924105,
 'test_runtime': 135.6204,
 'test_samples_per_second': 48.577,
 'test_steps_per_second': 3.038}


In [42]:
!zip -r codebert_antlr_lower_4.zip resultsBERT03312022_180553_512/checkpoint-1324 

  adding: resultsBERT03312022_180553_512/checkpoint-1324/ (stored 0%)
  adding: resultsBERT03312022_180553_512/checkpoint-1324/config.json (deflated 50%)
  adding: resultsBERT03312022_180553_512/checkpoint-1324/optimizer.pt (deflated 35%)
  adding: resultsBERT03312022_180553_512/checkpoint-1324/rng_state.pth (deflated 27%)
  adding: resultsBERT03312022_180553_512/checkpoint-1324/scheduler.pt (deflated 49%)
  adding: resultsBERT03312022_180553_512/checkpoint-1324/scaler.pt (deflated 55%)
  adding: resultsBERT03312022_180553_512/checkpoint-1324/pytorch_model.bin (deflated 7%)
  adding: resultsBERT03312022_180553_512/checkpoint-1324/training_args.bin (deflated 48%)
  adding: resultsBERT03312022_180553_512/checkpoint-1324/trainer_state.json (deflated 85%)


In [27]:
!gsutil ls gs://w266finalproject/resultsBERT03242022_082713_512/checkpoint-656/

gs://w266finalproject/resultsBERT03242022_082713_512/checkpoint-656/config.json
gs://w266finalproject/resultsBERT03242022_082713_512/checkpoint-656/optimizer.pt
gs://w266finalproject/resultsBERT03242022_082713_512/checkpoint-656/pytorch_model.bin
gs://w266finalproject/resultsBERT03242022_082713_512/checkpoint-656/rng_state.pth
gs://w266finalproject/resultsBERT03242022_082713_512/checkpoint-656/scaler.pt
gs://w266finalproject/resultsBERT03242022_082713_512/checkpoint-656/scheduler.pt
gs://w266finalproject/resultsBERT03242022_082713_512/checkpoint-656/trainer_state.json
gs://w266finalproject/resultsBERT03242022_082713_512/checkpoint-656/training_args.bin


In [28]:
!gsutil cp -r $outputdir gs://w266finalproject/

Copying file://resultsBERT03312022_180553_512/checkpoint-246/config.json [Content-Type=application/json]...
/ [0 files][    0.0 B/  769.0 B]                                                ServiceException: 401 Anonymous caller does not have storage.objects.create access to the Google Cloud Storage object.


In [29]:
!ls -al 


total 160428
drwxr-xr-x 1 root root      4096 Mar 31 18:07 .
drwxr-xr-x 1 root root      4096 Mar 31 17:47 ..
-rw-r--r-- 1  501 staff  1114619 Mar 16 08:22 alldata2.csv
drwxr-xr-x 1 root root      4096 Mar 23 14:21 .config
-rw-r--r-- 1 root root      5683 Mar 31 17:48 CPP14Lexer.g4
-rw-r--r-- 1 root root     53791 Mar 31 17:48 CPP14Lexer.interp
-rw-r--r-- 1 root root     64725 Mar 31 17:48 CPP14Lexer.py
-rw-r--r-- 1 root root      3074 Mar 31 17:48 CPP14Lexer.tokens
-rw-r--r-- 1 root root     19414 Mar 31 17:48 CPP14Parser.g4
-rw-r--r-- 1 root root     83384 Mar 31 17:48 CPP14Parser.interp
-rw-r--r-- 1 root root     61212 Mar 31 17:48 CPP14ParserListener.py
-rw-r--r-- 1 root root    676188 Mar 31 17:48 CPP14Parser.py
-rw-r--r-- 1 root root      3074 Mar 31 17:48 CPP14Parser.tokens
drwxr-xr-x 2 root root      4096 Mar 31 17:58 data
-rw-r--r-- 1  501 staff   203396 Mar 16 08:19 groundtruth2.csv
drwxr-xr-x 3 root root      4096 Mar 31 19:37 logs
-rw-r--r-- 1 root root  81619968 Mar 31 17:

In [30]:
!mkdir saved_model

In [31]:
!gsutil cp -R gs://w266finalproject/resultsBERT03242022_082713_512/checkpoint-656/* saved_model

Copying gs://w266finalproject/resultsBERT03242022_082713_512/checkpoint-656/config.json...
Copying gs://w266finalproject/resultsBERT03242022_082713_512/checkpoint-656/optimizer.pt...
Copying gs://w266finalproject/resultsBERT03242022_082713_512/checkpoint-656/pytorch_model.bin...
Copying gs://w266finalproject/resultsBERT03242022_082713_512/checkpoint-656/rng_state.pth...
| [4 files][  1.2 GiB/  1.2 GiB]   84.4 MiB/s                                   
==> NOTE: You are performing a sequence of gsutil operations that may
run significantly faster if you instead use gsutil -m cp ... Please
see the -m section under "gsutil help options" for further information
about when gsutil -m can be advantageous.

Copying gs://w266finalproject/resultsBERT03242022_082713_512/checkpoint-656/scaler.pt...
Copying gs://w266finalproject/resultsBERT03242022_082713_512/checkpoint-656/scheduler.pt...
Copying gs://w266finalproject/resultsBERT03242022_082713_512/checkpoint-656/trainer_state.json...
Copying gs://w2

In [32]:
!ls saved_model

config.json   pytorch_model.bin  scaler.pt     trainer_state.json
optimizer.pt  rng_state.pth	 scheduler.pt  training_args.bin


In [33]:
# Reload a previously fine-tuned classifier from the local ./saved_model
# directory and move it to the GPU when one is available; then load the
# tokenizer used to encode inputs for it.
# NOTE(review): the checkpoint config printed on load reports
# `_name_or_path: bert-base-uncased`, while the tokenizer below is loaded from
# `bert-large-uncased-whole-word-masking` -- confirm both use the same vocabulary.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
model = BertForSequenceClassification.from_pretrained("./saved_model",
                gradient_checkpointing=False,
                num_labels = 2,
                cache_dir='data',
                return_dict=True).to(device)

tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking", 
                                          max_length = 512,
                                          cache_dir='data',)

loading configuration file ./saved_model/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.17.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file ./saved_model/pytorch_model.bin


KeyboardInterrupt: ignored

In [None]:

def compute_metrics(pred):
    """Compute accuracy, F1, precision and recall from a prediction object.

    Reference labels come from `pred.label_ids`; predicted classes are the
    arg-max of `pred.predictions` over the last axis. Precision, recall and
    F1 are computed with binary averaging.
    """
    references = pred.label_ids
    predicted = pred.predictions.argmax(-1)
    # Tuple layout: (precision, recall, f-score, support).
    scores = precision_recall_fscore_support(references, predicted, average='binary')
    return {
        'accuracy': accuracy_score(references, predicted),
        'f1': scores[2],
        'precision': scores[0],
        'recall': scores[1],
    }

In [None]:
# Define the arguments for a trainer wrapped around the reloaded model.
# Checkpoints and outputs go to "saved"; evaluation and saving are both set to
# run once per epoch, and the best checkpoint is selected by eval F1.
training_args = TrainingArguments(
    output_dir = "saved",
    num_train_epochs = 4,
    per_device_train_batch_size = 8,
    gradient_accumulation_steps = 32,    
    per_device_eval_batch_size= 16,
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    disable_tqdm = False, 
    load_best_model_at_end=True,
    metric_for_best_model='eval_f1',
    greater_is_better=True,
    warmup_steps=160,
    weight_decay=0.01,
    logging_steps = 4,
    learning_rate = 1e-5,
    fp16 = True,
    logging_dir='logs',
    dataloader_num_workers = 0,
#    run_name = 'bigbird_classification_1e5'
)
# Instantiate the trainer without train/eval datasets (both are commented out
# below); later cells only call trainer.predict on it.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    # train_dataset=train_data,
    # eval_dataset=valid_data
)

In [None]:
#!gsutil cp gs://w266finalproject/plagA20162017.tar plag2.tar
!gsutil cp gs://w266finalproject/plag2.tar plag2.tar
!tar xvf plag2.tar
!ls -l

!mv train2.csv train.csv
!mv test2.csv test.csv

# Rebuild the test set for the reloaded model: read the CSV, oversample the
# minority class, convert to a HuggingFace dataset, tokenize the source pairs,
# and expose the `plagiarized` column as `label` in torch format.
# NOTE(review): unlike the earlier loading cell, the sources are not passed
# through antlr() here -- confirm this matches how the saved model was trained.
test_df = pd.read_csv("test.csv")
oversample = RandomOverSampler(sampling_strategy='minority',random_state=1234)
test_over, y_test_over = oversample.fit_resample(test_df, test_df.plagiarized)
print(test_over.plagiarized.value_counts())
test_data = datasets.Dataset.from_pandas(test_over)
def tokenization(batched_text):
    # Encode each (source0, source1) pair, padded/truncated to 512 tokens.
    return tokenizer(batched_text['source0'],batched_text['source1'], padding = 'max_length', truncation=True, max_length = 512)
test_data = test_data.map(tokenization, batched = True, batch_size = 256)
test_data = test_data.map(lambda examples: {'label': examples['plagiarized']}, batched=True)
# Only the model inputs and the label are returned as tensors.
test_data.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

In [None]:
predictions = trainer.predict(test_data)
pprint.pprint(predictions.metrics)

In [None]:
preds = np.argmax(predictions.predictions, axis=-1)
print(preds)

In [None]:

# scikit-learn's signature is confusion_matrix(y_true, y_pred): rows are the
# true labels and columns the predictions. Passing the true labels first is
# what makes ravel() yield (tn, fp, fn, tp); with the arguments reversed the
# matrix is transposed and fp/fn are swapped.
# labels=[0, 1] keeps the matrix 2x2 even if one class never occurs.
cm = confusion_matrix(predictions.label_ids, preds, labels=[0, 1])
print(cm)

tn, fp, fn, tp = cm.ravel()
print(tn,fp,fn,tp)




In [None]:
sourcefalsepos = test_over[np.logical_and(preds == 1,predictions.label_ids==0)][['source0','source1','filename0','filename1']]

In [None]:
pp = pprint.PrettyPrinter(depth=6)

In [None]:
print(sourcefalsepos[['filename0','filename1']].iloc[0])

In [None]:
pp.pprint(sourcefalsepos['source0'].iloc[0])

In [None]:
pp.pprint(sourcefalsepos['source1'].iloc[0])

In [None]:
sourcefalseneg = test_over[np.logical_and(preds == 0,predictions.label_ids==1)][['source0','source1']]

In [None]:
pp.pprint(sourcefalseneg['source0'].iloc[0])

In [None]:
pp.pprint(sourcefalseneg['source1'].iloc[0])

In [None]:
sourcetruepos = test_over[np.logical_and(preds == 1,predictions.label_ids==1)][['source0','source1']]

In [None]:
pp.pprint(sourcetruepos['source0'].iloc[0])

In [None]:
pp.pprint(sourcetruepos['source1'].iloc[0])

In [None]:
sourcetrueneg = test_over[np.logical_and(preds == 0,predictions.label_ids==0)][['source0','source1']]

In [None]:
pp.pprint(sourcetrueneg['source0'].iloc[0])

In [None]:
pp.pprint(sourcetrueneg['source1'].iloc[0])

### Do main import of all appropriate libraries for BigBird.

## Configuration

## Load the Data

Dataset Overview:

- source0: Homework assignment for 1st student.
- source1: Homework assignment for 2nd student.
- label: This is the label chosen for plagiarized content

Here are the "similarity" label values in our dataset:

- 0: no similarity
- 1: similarity

Let's look at one sample from the dataset:

## Preprocessing

Distribution of our validation targets.

One-hot encode training, validation, and test labels.

## Keras Custom Data Generator

## Build the model.

Create train and validation data generators

## Train the Model

Training is done only for the top layers to perform "feature extraction",
which will allow the model to use the representations of the pretrained model.

## Fine-tuning

This step must only be performed after the feature extraction model has
been trained to convergence on the new data.

This is an optional last step where `bert_model` is unfrozen and retrained
with a very low learning rate. This can deliver meaningful improvement by
incrementally adapting the pretrained features to the new data.

# Train the entire model end-to-end.

## Evaluate model on the test set

## Inference on custom sentences

In [None]:
!ls /usr


In [None]:

def check_similarity(source0, source1):
  """Run the current `trainer` on a single pair of source strings.

  Args:
    source0: source text of the first program (converted with str()).
    source1: source text of the second program (converted with str()).

  Returns:
    The object returned by `trainer.predict` for the encoded pair; it is
    also printed.
  """
  # Encode the two programs as ONE sequence pair (a batch of size 1).
  # max_length is 512, the same limit the `tokenization` helper uses for the
  # training and test data in this notebook.
  # NOTE(review): the training cell lexes sources with antlr() before
  # tokenizing -- confirm whether the same normalisation is wanted here.
  encoded = tokenizer([str(source0)], [str(source1)], padding = 'max_length', truncation=True, max_length = 512)
  # trainer.predict iterates over a dataset of examples, so wrap the encoded
  # batch in a Dataset instead of passing the tokenizer output directly.
  test_dataset = datasets.Dataset.from_dict(dict(encoded))
  test_results = trainer.predict(test_dataset)
  print(test_results)
  return test_results
    # sentence_pairs = np.array([[str(source0), str(source1)]])
    # test_data = BertSemanticDataGenerator(
    #     sentence_pairs, labels=None, batch_size=1, shuffle=False, include_targets=False,
    # )
    # proba = model.predict(test_data[0])[0]
    # #proba = model.predict(test_data)[0]
    # idx = np.argmax(proba)
    # proba = f"{proba[idx]*100: .2f}%"
    # pred = labels[idx]
    # return pred, proba


Check results on some example code pairs.

In [None]:
# First sample program: reverses the digits of an integer.
source0 = """int obrni(int broj)
{
        int cifra,nova=0;
        while(broj>0) {
                cifra=broj%10;
                nova=nova*10+cifra;
                broj/=10;
        }
        return nova;
}
"""
source1 = """int okreni_cifre(int broj)
{
        int cifra;
        int nova=0;
        while(broj>0) {
                cifra=broj%10;
                nova=nova*10+cifra;
                broj=broj/10;
        }
        return nova;
}"""
check_similarity(source0, source1)