<a href="https://colab.research.google.com/github/rjenez/W266-final-project/blob/main/notebooks/Plagiarism_with_Bert_Trainer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Plagiarism with BERT
**Author:** Ricardo Jenez, heavily modified from HuggingFace examples
**Description:** NLP code to detect plagiarism in code.

## Introduction

This is a preliminary model for code plagiarism detection. The idea is to identify when students in a class have plagiarized a coding example.

### References

* [BERT](https://arxiv.org/pdf/1810.04805.pdf)
* [Plagiarism Detection in Computer Programming Using Feature Extraction From Ultra-Fine-Grained Repositories](https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=9097285)

## Setup

In [None]:
%%capture
# Install third-party dependencies; %%capture suppresses pip's console output.
# NOTE(review): versions are unpinned (the saved model config later in this
# notebook reports transformers 4.17.0) — consider pinning for reproducibility.
# NOTE(review): sentence_transformers is not imported in the visible cells —
# confirm it is still needed.
!pip3 install transformers
!pip3 install sentence_transformers
!pip3 install imbalanced-learn
!pip3 install datasets
#!pip3 install wandb

In [None]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import torch
import datasets
import transformers
import pandas as pd
import numpy as np
from transformers import BertTokenizer, \
BertForSequenceClassification, Trainer, TrainingArguments,EvalPrediction, \
AutoTokenizer,  BertTokenizerFast
from torch.utils.data import Dataset, DataLoader
#import wandb
import random
import datetime
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, confusion_matrix
import pprint


In [None]:
# Copy the dataset archive from the project's GCS bucket into the working
# directory as plag2.tar (the commented line is an alternative archive).
#!gsutil cp gs://w266finalproject/plagA20162017.tar plag2.tar
!gsutil cp gs://w266finalproject/plag2.tar plag2.tar

Copying gs://w266finalproject/plag2.tar...
/ [1 files][ 77.8 MiB/ 77.8 MiB]                                                
Operation completed over 1 objects/77.8 MiB.                                     


In [None]:
!nvidia-smi -L 

GPU 0: Tesla P100-PCIE-16GB (UUID: GPU-a981cbfd-8d03-51cf-79eb-ee0b5758bb2d)


In [None]:
#!gcloud auth login --no-browser

In [None]:

# Unpack the archive, list the working directory, then rename the extracted
# splits to train.csv / test.csv, the names read by the loading cell.
!tar xvf plag2.tar
!ls -l
# !mv trainA*.csv train.csv
# !mv testA*.csv test.csv
!mv train2.csv train.csv
!mv test2.csv test.csv

alldata2.csv
groundtruth2.csv
test2.csv
train2.csv
total 159428
-rw-r--r-- 1  501 staff  1114619 Mar 16 08:22 alldata2.csv
-rw-r--r-- 1  501 staff   203396 Mar 16 08:19 groundtruth2.csv
-rw-r--r-- 1 root root  81619968 Mar 29 18:22 plag2.tar
drwxr-xr-x 1 root root      4096 Mar 23 14:22 sample_data
-rw-r--r-- 1  501 staff 15819857 Mar 16 08:22 test2.csv
-rw-r--r-- 1  501 staff 64478135 Mar 16 08:22 train2.csv


In [None]:
# Read both CSV splits, then hold out the trailing 20% of the training rows
# (file order, no shuffling) as the validation set.
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
split_at = int(len(train_df) * 0.8)
train_df, valid_df = train_df.iloc[:split_at], train_df.iloc[split_at:]

In [None]:
# Class balance of the training split before any resampling.
print("Train Target Distribution", train_df["plagiarized"].value_counts(), sep="\n")

Train Target Distribution
0    10595
1      463
Name: plagiarized, dtype: int64


In [None]:
# Balance each split by randomly re-sampling minority-class rows until both
# classes have the same count.
# NOTE(review): the validation and test splits are resampled as well, so the
# metrics reported later are measured on balanced data containing duplicated
# rows rather than on the original class distribution — confirm this is intended.
oversample = RandomOverSampler(sampling_strategy='minority', random_state=1234)


def _balance_and_report(frame, title):
    """Oversample `frame` on its `plagiarized` column and print the new class counts."""
    balanced, targets = oversample.fit_resample(frame, frame.plagiarized)
    print(title)
    print(balanced.plagiarized.value_counts())
    return balanced, targets


train_over, y_train_over = _balance_and_report(train_df, "Train Target Distribution")

valid_over, y_valid_over = _balance_and_report(valid_df, "Valid Target Distribution")

test_over, y_test_over = _balance_and_report(test_df, "Test Target Distribution")

Train Target Distribution
0    10595
1    10595
Name: plagiarized, dtype: int64
Valid Target Distribution
0    2654
1    2654
Name: plagiarized, dtype: int64
Test Target Distribution
0    3294
1    3294
Name: plagiarized, dtype: int64


In [None]:
# Wrap the balanced pandas frames as HuggingFace `datasets.Dataset` objects.
train_data, valid_data, test_data = (
    datasets.Dataset.from_pandas(frame)
    for frame in (train_over, valid_over, test_over)
)

In [None]:
# Sanity check: row count, container type and column schema of the training set.
print(len(train_data),type(train_data),train_data)

21190 <class 'datasets.arrow_dataset.Dataset'> Dataset({
    features: ['label', 'filename0', 'filename1', 'source0', 'source1', 'percent', 'percent0', 'percent1', 'lines', 'plagiarized'],
    num_rows: 21190
})


In [None]:
# Load pretrained BERT with a 2-class sequence-classification head and move it
# to the GPU when one is available.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                gradient_checkpointing=False,
                num_labels = 2,
                cache_dir='data',
                return_dict=True).to(device)

# A single fast tokenizer is all that is needed: building a second tokenizer
# only to overwrite it is wasted download/initialisation work. The 512-token
# limit is applied where the tokenizer is called (padding/truncation
# arguments), not at load time.
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased", cache_dir='data')

def tokenization(batched_text):
    """Tokenize a batch of source pairs into fixed-length model inputs.

    The `source0` and `source1` columns are passed to the tokenizer together as
    a pair, with truncation and padding to 512 tokens.
    """
    encoding_options = dict(padding='max_length', truncation=True, max_length=512)
    return tokenizer(batched_text['source0'], batched_text['source1'], **encoding_options)
# Tokenize every split in batches of 256 rows.
map_options = dict(batched=True, batch_size=256)
train_data = train_data.map(tokenization, **map_options)
valid_data = valid_data.map(tokenization, **map_options)
test_data = test_data.map(tokenization, **map_options)

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

  0%|          | 0/83 [00:00<?, ?ba/s]

  0%|          | 0/21 [00:00<?, ?ba/s]

  0%|          | 0/26 [00:00<?, ?ba/s]

In [None]:
# Overwrite the existing `label` column with the binary `plagiarized` target,
# then expose only the model inputs and the label as torch tensors.
def _plagiarized_as_label(examples):
    """Return the batch's `plagiarized` values under the `label` key."""
    return {'label': examples['plagiarized']}


model_columns = ['input_ids', 'attention_mask', 'token_type_ids', 'label']

train_data = train_data.map(_plagiarized_as_label, batched=True)
valid_data = valid_data.map(_plagiarized_as_label, batched=True)
test_data = test_data.map(_plagiarized_as_label, batched=True)

# set_format mutates each dataset in place, so no reassignment is needed.
for split in (train_data, valid_data, test_data):
    split.set_format(type='torch', columns=model_columns)

  0%|          | 0/22 [00:00<?, ?ba/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/7 [00:00<?, ?ba/s]

In [None]:
# define accuracy metrics

def compute_metrics(pred):
    """Compute accuracy, F1, precision and recall for a batch of predictions.

    Args:
        pred: object exposing `predictions` (per-class scores, class axis last)
            and `label_ids` (true class ids).

    Returns:
        dict with the keys `accuracy`, `f1`, `precision` and `recall`.
    """
    labels = pred.label_ids
    # Predicted class = index of the highest score along the last axis.
    preds = pred.predictions.argmax(-1)
    # zero_division=0 keeps the 0.0 score when a denominator is empty (e.g. the
    # untrained model predicts no positives) without emitting a warning.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='binary', zero_division=0)
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [None]:
# Run settings: directory names are tagged with the start timestamp and the
# token limit.
today = datetime.datetime.now()
date_time = today.strftime("%m%d%Y_%H%M%S")
token_max_length = 512
train_batch_size = 2 # 1 for 4096
run_suffix = f"{date_time}_{token_max_length}"
cachedir = f"data{run_suffix}"
outputdir = f"resultsBERT{run_suffix}"
logsdir = f"logs{run_suffix}"

In [None]:
# define the training arguments
# Effective batch size is 8 samples x 32 accumulation steps = 256 per device.
# The best checkpoint (highest eval F1, evaluated once per epoch) is restored
# when training ends.
training_args = TrainingArguments(
    output_dir = outputdir,
    num_train_epochs = 4,
    per_device_train_batch_size = 8,
    gradient_accumulation_steps = 32,
    per_device_eval_batch_size= 16,
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    disable_tqdm = False,
    load_best_model_at_end=True,
    metric_for_best_model='eval_f1',
    greater_is_better=True,
    warmup_steps=160,
    weight_decay=0.01,
    logging_steps = 4,
    learning_rate = 1e-5,
    fp16 = True,
    # Use the per-run log directory so logs from different runs are not mixed
    # in one shared folder.
    logging_dir=logsdir,
    dataloader_num_workers = 0,
#    run_name = 'bigbird_classification_1e5'
)
# instantiate the trainer class and check for available devices
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_data,
    eval_dataset=valid_data
)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device


Using amp half precision backend


'cuda'

In [None]:
# see how the basic model would perform
# (baseline on the validation set, run before trainer.train() is called)
trainer.evaluate()

The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: plagiarized, filename0, percent0, source1, percent, percent1, lines, source0, filename1. If plagiarized, filename0, percent0, source1, percent, percent1, lines, source0, filename1 are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 5308
  Batch size = 16


{'eval_accuracy': 0.49981160512434064,
 'eval_f1': 0.0,
 'eval_loss': 0.6943948268890381,
 'eval_precision': 0.0,
 'eval_recall': 0.0,
 'eval_runtime': 24.0807,
 'eval_samples_per_second': 220.426,
 'eval_steps_per_second': 13.787}

In [None]:
!nvidia-smi -L 

GPU 0: Tesla V100-SXM2-16GB (UUID: GPU-55a0f7c9-489b-d7b9-2877-953ad288ba30)


In [None]:
# Release cached GPU memory and run a garbage-collection pass before training;
# the displayed value is gc.collect()'s return value.
torch.cuda.empty_cache()
import gc
gc.collect()

307

In [None]:
# train the model
# (evaluation and checkpointing run once per epoch, as set in training_args)
trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: plagiarized, filename0, percent0, source1, percent, percent1, lines, source0, filename1. If plagiarized, filename0, percent0, source1, percent, percent1, lines, source0, filename1 are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 21190
  Num Epochs = 4
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 256
  Gradient Accumulation steps = 32
  Total optimization steps = 328


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
0,0.6017,0.588298,0.670121,0.68422,0.656174,0.71477
1,0.44,0.630598,0.680482,0.748293,0.617287,0.949887
2,0.3309,0.454279,0.797099,0.805138,0.774452,0.838357
3,0.2605,0.441485,0.818011,0.823336,0.799929,0.848154


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: plagiarized, filename0, percent0, source1, percent, percent1, lines, source0, filename1. If plagiarized, filename0, percent0, source1, percent, percent1, lines, source0, filename1 are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 5308
  Batch size = 16


Saving model checkpoint to resultsBERT03292022_002726_512/checkpoint-82
Configuration saved in resultsBERT03292022_002726_512/checkpoint-82/config.json
Model weights saved in resultsBERT03292022_002726_512/checkpoint-82/pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: plagiarized, filename0, percent0, source1, percent, percent1, lines, source0, filename1. If plagiarized, filename0, percent0, source1, percent, percent1, lines, source0, filename1 are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 5308
  Batch size = 16
Saving model checkpoint to resultsBERT03292022_002726_512/checkpoint-164
Configuration saved in resultsBERT03292022_002726_512/checkpoint-164/config.json
Model weights saved in resultsBERT03292022_002726_512/checkpoint-164/pytorch_model.bin
The following columns in 

TrainOutput(global_step=328, training_loss=0.4483931031895847, metrics={'train_runtime': 1238.32, 'train_samples_per_second': 68.448, 'train_steps_per_second': 0.265, 'total_flos': 2.224919706335232e+16, 'train_loss': 0.4483931031895847, 'epoch': 3.99})

In [None]:
# Evaluate the results
# on the validation set passed to the Trainer as eval_dataset.
# NOTE(review): load_best_model_at_end=True, so this presumably scores the
# best-F1 checkpoint rather than the last step — confirm.
trainer.evaluate()

The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: plagiarized, filename0, percent0, source1, percent, percent1, lines, source0, filename1. If plagiarized, filename0, percent0, source1, percent, percent1, lines, source0, filename1 are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 5308
  Batch size = 16


{'epoch': 3.99,
 'eval_accuracy': 0.8180105501130369,
 'eval_f1': 0.8233357717629848,
 'eval_loss': 0.4414847493171692,
 'eval_precision': 0.7999289267945985,
 'eval_recall': 0.848153730218538,
 'eval_runtime': 24.0608,
 'eval_samples_per_second': 220.608,
 'eval_steps_per_second': 13.798}

In [None]:
# Score the fine-tuned model on the (oversampled) test split and print the
# resulting metrics dictionary.
results = trainer.predict(test_data)
pprint.pprint(results.metrics)

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: plagiarized, filename0, percent0, source1, percent, percent1, lines, source0, filename1. If plagiarized, filename0, percent0, source1, percent, percent1, lines, source0, filename1 are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 6588
  Batch size = 16


{'test_accuracy': 0.779143897996357,
 'test_f1': 0.7815643296802282,
 'test_loss': 0.5230265855789185,
 'test_precision': 0.7730917730917731,
 'test_recall': 0.7902246508803886,
 'test_runtime': 30.3457,
 'test_samples_per_second': 217.098,
 'test_steps_per_second': 13.577}


In [None]:
# Upload this run's output directory (checkpoints) to the project bucket.
# NOTE(review): the recorded run failed with "401 Anonymous caller" — the
# runtime must be authenticated with write access (e.g. via gcloud auth)
# before this cell can succeed.
!gsutil cp -r $outputdir gs://w266finalproject/

Copying file://resultsBERT03292022_002726_512/checkpoint-164/scaler.pt [Content-Type=application/octet-stream]...
/ [0 files][    0.0 B/  559.0 B]                                                ServiceException: 401 Anonymous caller does not have storage.objects.create access to the Google Cloud Storage object.


In [None]:
!ls -al 


total 159456
drwxr-xr-x 1 root root      4096 Mar 29 00:27 .
drwxr-xr-x 1 root root      4096 Mar 28 23:53 ..
-rw-r--r-- 1  501 staff  1114619 Mar 16 08:22 alldata2.csv
drwxr-xr-x 1 root root      4096 Mar 23 14:21 .config
drwxr-xr-x 2 root root      4096 Mar 29 00:26 data
-rw-r--r-- 1  501 staff   203396 Mar 16 08:19 groundtruth2.csv
drwxr-xr-x 3 root root      4096 Mar 29 00:48 logs
-rw-r--r-- 1 root root  81619968 Mar 29 00:25 plag2.tar
drwxr-xr-x 6 root root      4096 Mar 29 00:48 resultsBERT03292022_002726_512
drwxr-xr-x 1 root root      4096 Mar 23 14:22 sample_data
-rw-r--r-- 1  501 staff 15819857 Mar 16 08:22 test.csv
-rw-r--r-- 1  501 staff 64478135 Mar 16 08:22 train.csv


In [None]:
# -p keeps the cell re-runnable: plain mkdir errors when the directory exists.
!mkdir -p saved_model

In [None]:
# Write the trainer's current model (config + weights) to ./saved_model.
trainer.save_model('saved_model')

Saving model checkpoint to saved_model
Configuration saved in saved_model/config.json
Model weights saved in saved_model/pytorch_model.bin


In [None]:
#!gsutil cp -R gs://w266finalproject/resultsBERT03292022_002726_512/checkpoint-328/* saved_model

CommandException: No URLs matched: gs://w266finalproject/resultsBERT03292022_002726_512/checkpoint-328/*


In [None]:
!ls saved_model

config.json  optimizer.pt_.gstmp  pytorch_model.bin  training_args.bin


In [None]:
# Reload the fine-tuned classifier from ./saved_model together with the base
# bert-base-uncased tokenizer.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

model_options = dict(
    gradient_checkpointing=False,
    num_labels=2,
    cache_dir='data',
    return_dict=True,
)
model = BertForSequenceClassification.from_pretrained("./saved_model", **model_options)
model = model.to(device)

tokenizer = AutoTokenizer.from_pretrained(
    "bert-base-uncased",
    max_length=512,
    cache_dir='data',
)

loading configuration file ./saved_model/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.17.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file ./saved_model/pytorch_model.bin
All model checkpoint weights were used when initializing BertForSequenceClassification.

All the weights of BertForSequenceClassification were

In [None]:

def compute_metrics(pred):
    """Compute accuracy, F1, precision and recall for a batch of predictions.

    Redefined here so the reload-and-predict section has the metric function
    available next to the Trainer that uses it.

    Args:
        pred: object exposing `predictions` (per-class scores, class axis last)
            and `label_ids` (true class ids).

    Returns:
        dict with the keys `accuracy`, `f1`, `precision` and `recall`.
    """
    labels = pred.label_ids
    # Predicted class = index of the highest score along the last axis.
    preds = pred.predictions.argmax(-1)
    # zero_division=0 keeps the 0.0 score when a denominator is empty without
    # emitting an undefined-metric warning.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='binary', zero_division=0)
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [None]:
# Rebuild a Trainer around the reloaded model. No train/eval datasets are
# attached: in the cells below it is only used for `trainer.predict`.
prediction_settings = dict(
    output_dir="saved",
    logging_dir='logs',
    # schedule / optimisation (same values as the training run)
    num_train_epochs=4,
    learning_rate=1e-5,
    warmup_steps=160,
    weight_decay=0.01,
    fp16=True,
    # batching
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=32,
    dataloader_num_workers=0,
    # evaluation / checkpointing / logging cadence
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model='eval_f1',
    greater_is_better=True,
    logging_steps=4,
    disable_tqdm=False,
)
training_args = TrainingArguments(**prediction_settings)

trainer = Trainer(model=model, args=training_args, compute_metrics=compute_metrics)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using amp half precision backend


In [None]:
# Download and unpack the dataset archive again, then rename the extracted
# splits to train.csv / test.csv.
# NOTE(review): presumably repeated so this prediction section can run on a
# fresh runtime without the earlier data cells — confirm.
#!gsutil cp gs://w266finalproject/plagA20162017.tar plag2.tar
!gsutil cp gs://w266finalproject/plag2.tar plag2.tar
!tar xvf plag2.tar
!ls -l

!mv train2.csv train.csv
!mv test2.csv test.csv

# Rebuild the balanced, tokenized test set with the same steps used in the
# training section.
test_df = pd.read_csv("test.csv")
oversample = RandomOverSampler(sampling_strategy='minority', random_state=1234)
test_over, y_test_over = oversample.fit_resample(test_df, test_df["plagiarized"])
print(test_over["plagiarized"].value_counts())


def tokenization(batched_text):
    """Tokenize a batch of `source0`/`source1` pairs, padded/truncated to 512 tokens."""
    return tokenizer(
        batched_text['source0'],
        batched_text['source1'],
        padding='max_length',
        truncation=True,
        max_length=512,
    )


def _plagiarized_as_label(examples):
    """Return the batch's `plagiarized` values under the `label` key."""
    return {'label': examples['plagiarized']}


test_data = (
    datasets.Dataset.from_pandas(test_over)
    .map(tokenization, batched=True, batch_size=256)
    .map(_plagiarized_as_label, batched=True)
)
test_data.set_format(
    type='torch',
    columns=['input_ids', 'attention_mask', 'token_type_ids', 'label'],
)

Copying gs://w266finalproject/plag2.tar...
- [1 files][ 77.8 MiB/ 77.8 MiB]                                                
Operation completed over 1 objects/77.8 MiB.                                     
alldata2.csv
groundtruth2.csv
test2.csv
train2.csv
total 237864
-rw-r--r-- 1  501 staff  1114619 Mar 16 08:22 alldata2.csv
drwxr-xr-x 2 root root      4096 Mar 29 00:26 data
-rw-r--r-- 1  501 staff   203396 Mar 16 08:19 groundtruth2.csv
drwxr-xr-x 3 root root      4096 Mar 29 00:48 logs
-rw-r--r-- 1 root root  81619968 Mar 29 00:54 plag2.tar
drwxr-xr-x 6 root root      4096 Mar 29 00:48 resultsBERT03292022_002726_512
drwxr-xr-x 1 root root      4096 Mar 23 14:22 sample_data
drwxr-xr-x 2 root root      4096 Mar 29 00:54 saved
drwxr-xr-x 2 root root      4096 Mar 29 00:53 saved_model
-rw-r--r-- 1  501 staff 15819857 Mar 16 08:22 test2.csv
-rw-r--r-- 1  501 staff 15819857 Mar 16 08:22 test.csv
-rw-r--r-- 1  501 staff 64478135 Mar 16 08:22 train2.csv
-rw-r--r-- 1  501 staff 64478135 Mar 

  0%|          | 0/26 [00:00<?, ?ba/s]

  0%|          | 0/7 [00:00<?, ?ba/s]

In [None]:
# Run the reloaded model over the test set; `predictions` is reused by the
# error-analysis cells below.
predictions = trainer.predict(test_data)
pprint.pprint(predictions.metrics)

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: plagiarized, filename0, percent0, source1, percent, percent1, lines, source0, filename1. If plagiarized, filename0, percent0, source1, percent, percent1, lines, source0, filename1 are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 6588
  Batch size = 16


{'test_accuracy': 0.779143897996357,
 'test_f1': 0.7815643296802282,
 'test_loss': 0.5230265855789185,
 'test_precision': 0.7730917730917731,
 'test_recall': 0.7902246508803886,
 'test_runtime': 29.7856,
 'test_samples_per_second': 221.18,
 'test_steps_per_second': 13.832}


In [None]:
# Predicted class id per test row: index of the largest score on the last axis.
preds = predictions.predictions.argmax(axis=-1)
print(preds)

[0 1 0 ... 0 1 1]


In [None]:

# confusion_matrix expects (y_true, y_pred). With that order rows are the true
# classes and columns the predicted classes, so ravel() yields tn, fp, fn, tp.
# Passing the predictions first transposes the matrix and swaps fp with fn.
cm = confusion_matrix(predictions.label_ids, preds)
print(cm)

tn, fp, fn, tp = cm.ravel()
print(tn,fp,fn,tp)




[[2530  691]
 [ 764 2603]]
2530 691 764 2603


In [None]:

# False positives: true label 0, predicted 1; shown by ascending `percent`.
review_columns = ['percent', 'label', 'plagiarized', 'lines',
                  'filename0', 'filename1', 'source0', 'source1']
false_positive_mask = (preds == 1) & (predictions.label_ids == 0)
sourcefalsepos = test_over.loc[false_positive_mask, review_columns]
sourcefalsepos.sort_values(by=['percent'])

Unnamed: 0,percent,label,plagiarized,lines,filename0,filename1,source0,source1
2591,2,2,0,5,A2016/Z5/Z2/student2676,A2016/Z5/Z2/student3776,#include <stdio.h>\n#include <string.h>\n#incl...,"#include <stdio.h>\n\nvoid unesi(char niz[],in..."
1586,2,2,0,6,A2016/Z4/Z4/student8794,A2016/Z4/Z4/student7392,#include <stdio.h>\n#include <math.h>\n\n//Zad...,#include <stdio.h>\n#include <math.h>\n\n/* VR...
1756,3,3,0,4,A2016/Z5/Z2/student7392,A2016/Z5/Z2/student9823,#include <stdio.h>\n#include <stdlib.h>\n#incl...,#include <stdio.h>\n#include <ctype.h>\n#inclu...
1327,3,3,0,14,A2016/Z5/Z2/student3315,A2016/Z5/Z2/student5660,#include <stdio.h>\n#include <ctype.h>\n#inclu...,#include <stdio.h>\n#include <string.h>\n#incl...
1277,3,3,0,4,A2016/Z5/Z2/student7392,A2016/Z5/Z2/student6723,#include <stdio.h>\n#include <stdlib.h>\n#incl...,\n#include <stdio.h>\n#include <ctype.h>\n#inc...
...,...,...,...,...,...,...,...,...
2084,99,99,0,108,A2016/Z4/Z1/student5744,A2016/Z4/Z1/student2675,#include <stdio.h>\n\nchar* izbaci_najcescu(ch...,#include <stdio.h>\n\nchar* izbaci_najcescu(ch...
2087,99,99,0,77,A2016/Z3/Z3/student2111,A2016/Z3/Z3/student7165,#include <stdio.h>\nint main() {\n\tint nesto=...,#include <stdio.h>\nint main() {\n\tint nesto=...
2097,99,99,0,71,A2016/Z4/Z2/student5957,A2016/Z4/Z2/student6550,"#include <stdio.h>\n\nvoid unesi(char niz[], i...","#include <stdio.h>\n\nvoid unesi(char niz[], i..."
1949,99,99,0,22,A2016/Z1/Z4/student9949,A2016/Z1/Z4/student4814,"#include <stdio.h>\nint main() {\n\tint i,j,n;...","#include <stdio.h>\n\nint main() {\n\tint i,j,..."


In [None]:
# Wide pretty-printer (200 columns, nesting depth 6).
# NOTE(review): `pp` is not used in the visible cells — confirm it is still needed.
pp = pprint.PrettyPrinter(depth=6,width=200)

In [None]:
# False negatives: true label 1, predicted 0; shown by descending `percent`.
review_columns = ['percent', 'label', 'plagiarized', 'lines',
                  'filename0', 'filename1', 'source0', 'source1']
false_negative_mask = (preds == 0) & (predictions.label_ids == 1)
sourcefalseneg = test_over.loc[false_negative_mask, review_columns]
sourcefalseneg.sort_values(by=['percent'], ascending=False)

Unnamed: 0,percent,label,plagiarized,lines,filename0,filename1,source0,source1
4832,99,99,1,136,A2016/Z1/Z1/student5512,A2016/Z1/Z1/student4852,"#include <stdio.h>\nint main() {\n\tdouble Tb,...","#include <stdio.h>\nint main() {\n\tdouble BT,..."
3781,99,99,1,32,A2016/Z1/Z4/student6054,A2016/Z1/Z4/student7341,"#include <stdio.h>\n\nint main() {\n\tint red,...","#include <stdio.h>\n\nint main() {\n\tint n, i..."
4758,99,99,1,136,A2016/Z1/Z1/student5512,A2016/Z1/Z1/student4852,"#include <stdio.h>\nint main() {\n\tdouble Tb,...","#include <stdio.h>\nint main() {\n\tdouble BT,..."
3970,99,99,1,32,A2016/Z1/Z4/student6054,A2016/Z1/Z4/student7341,"#include <stdio.h>\n\nint main() {\n\tint red,...","#include <stdio.h>\n\nint main() {\n\tint n, i..."
4797,99,99,1,32,A2016/Z1/Z4/student6054,A2016/Z1/Z4/student7341,"#include <stdio.h>\n\nint main() {\n\tint red,...","#include <stdio.h>\n\nint main() {\n\tint n, i..."
...,...,...,...,...,...,...,...,...
3840,34,34,1,29,A2016/Z4/Z1/student2908,A2016/Z4/Z1/student2210,#include <stdio.h>\n\nchar* izbaci_najcescu (c...,"#include <stdio.h>\nvoid unesi(char niz[], int..."
3560,34,34,1,29,A2016/Z4/Z1/student2908,A2016/Z4/Z1/student2210,#include <stdio.h>\n\nchar* izbaci_najcescu (c...,"#include <stdio.h>\nvoid unesi(char niz[], int..."
3680,34,34,1,29,A2016/Z4/Z1/student2908,A2016/Z4/Z1/student2210,#include <stdio.h>\n\nchar* izbaci_najcescu (c...,"#include <stdio.h>\nvoid unesi(char niz[], int..."
5220,34,34,1,29,A2016/Z4/Z1/student2908,A2016/Z4/Z1/student2210,#include <stdio.h>\n\nchar* izbaci_najcescu (c...,"#include <stdio.h>\nvoid unesi(char niz[], int..."


In [None]:
# True positives: true label 1, predicted 1; shown by descending `percent`.
review_columns = ['percent', 'label', 'plagiarized', 'lines',
                  'filename0', 'filename1', 'source0', 'source1']
true_positive_mask = (preds == 1) & (predictions.label_ids == 1)
sourcetruepos = test_over.loc[true_positive_mask, review_columns]
sourcetruepos.sort_values(by=['percent'], ascending=False)

Unnamed: 0,percent,label,plagiarized,lines,filename0,filename1,source0,source1
5947,99,99,1,82,A2016/Z3/Z3/student4420,A2016/Z3/Z3/student4661,#include <stdio.h>\n \n \nint main() {\n\tint ...,#include <stdio.h>\n\n\nint main()\n{\n\tint m...
4665,99,99,1,64,A2016/Z3/Z3/student8430,A2016/Z3/Z3/student3517,#include <stdio.h>\n\nint main()\n{\n\n int...,#include <stdio.h>\n\nint main() {\n\tint matr...
6256,99,99,1,82,A2016/Z3/Z3/student4420,A2016/Z3/Z3/student4661,#include <stdio.h>\n \n \nint main() {\n\tint ...,#include <stdio.h>\n\n\nint main()\n{\n\tint m...
5052,99,99,1,47,A2016/Z1/Z3/student6776,A2016/Z1/Z3/student8357,#include <stdio.h>\n\nint main() {\n\tfloat B=...,#include <stdio.h>\n#include <math.h>\n\nint m...
5763,99,99,1,99,A2016/Z1/Z1/student1915,A2016/Z1/Z1/student4959,#include <stdio.h>\n\nint main() {\n\tdouble p...,#include <stdio.h>\n\nint main() {\n\tdouble p...
...,...,...,...,...,...,...,...,...
5665,17,17,1,49,A2016/Z5/Z3/student4082,A2016/Z5/Z3/student5378,#include <stdio.h>\n#include<string.h>\n#defin...,#include <stdio.h>\n#include <string.h>\n#defi...
2275,17,17,1,49,A2016/Z5/Z3/student4082,A2016/Z5/Z3/student5378,#include <stdio.h>\n#include<string.h>\n#defin...,#include <stdio.h>\n#include <string.h>\n#defi...
4068,17,17,1,49,A2016/Z5/Z3/student4082,A2016/Z5/Z3/student5378,#include <stdio.h>\n#include<string.h>\n#defin...,#include <stdio.h>\n#include <string.h>\n#defi...
4125,17,17,1,49,A2016/Z5/Z3/student4082,A2016/Z5/Z3/student5378,#include <stdio.h>\n#include<string.h>\n#defin...,#include <stdio.h>\n#include <string.h>\n#defi...


In [None]:

# True negatives: true label 0, predicted 0; shown by descending `percent`.
review_columns = ['percent', 'label', 'plagiarized', 'lines',
                  'filename0', 'filename1', 'source0', 'source1']
true_negative_mask = (preds == 0) & (predictions.label_ids == 0)
sourcetrueneg = test_over.loc[true_negative_mask, review_columns]
sourcetrueneg.sort_values(by=['percent'], ascending=False)

Unnamed: 0,percent,label,plagiarized,lines,filename0,filename1,source0,source1
1924,99,99,0,34,A2016/Z1/Z4/student6547,A2016/Z1/Z4/student8317,"#include <stdio.h>\n\nint main() {\n\tint n=0,...","#include <stdio.h>\n\nint main() {\n\tint n=0,..."
1866,99,99,0,149,A2016/Z4/Z3/student3900,A2016/Z4/Z3/student7802,#include <stdio.h>\n#include <stdlib.h>\nint b...,#include <stdio.h>\n#include <stdlib.h>\nint b...
2707,99,99,0,31,A2016/Z1/Z4/student9352,A2016/Z1/Z4/student5649,"#include <stdio.h>\n\nint main()\n{\n\tint i,j...","#include <stdio.h>\n\nint main() {\n\tint n, j..."
1255,99,99,0,52,A2016/Z2/Z3/student8864,A2016/Z2/Z3/student2547,"#include <stdio.h>\n\nint main() {\n\tint i,j,...","#include <stdio.h>\n\nint main() {\n\tint i,j,..."
3193,99,99,0,30,A2016/Z1/Z4/student2645,A2016/Z1/Z4/student8576,#include <stdio.h>\n\nint main() {\n\t\n\tint ...,#include <stdio.h>\n\nint main() {\n\t\n\tint ...
...,...,...,...,...,...,...,...,...
2396,2,2,0,3,A2016/Z5/Z3/student7823,A2016/Z5/Z3/student9897,#include <stdio.h>\n#include <string.h>\n#defi...,#include <stdio.h>\n#include <string.h>\n#defi...
2972,2,2,0,9,A2016/Z5/Z2/student9315,A2016/Z5/Z2/student3219,#include <stdio.h>\n#include <string.h>\n#incl...,#include <stdio.h>\n#include <string.h>\n\nint...
1305,2,2,0,9,A2016/Z5/Z2/student3219,A2016/Z5/Z2/student4766,#include <stdio.h>\n#include <string.h>\n\nint...,#include <stdio.h>\n#include <string.h>\n#incl...
2743,2,2,0,9,A2016/Z5/Z2/student3219,A2016/Z5/Z2/student2068,#include <stdio.h>\n#include <string.h>\n\nint...,#include <stdio.h>\n#include <stdlib.h>\n#incl...
