Author: Louis Owen (https://louisowen6.github.io/)

In [39]:
from pdfparser import pdf_to_text
from preprocessor import preprocessing

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
factory = StemmerFactory()
stemmer = factory.create_stemmer()

import pylcs
from nltk import ngrams
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

from itertools import combinations

In [51]:
import pandas as pd
import numpy as np
import re
import pickle

# Experiment

## PDF to Text

In [2]:
# Extract the text of every sample essay; names on the left line up with the
# paths on the right, in order.
(essay_1_doc,
 essay_1_similar_doc,
 essay_2_doc,
 essay_2_similar_doc,
 essay_3_doc) = map(pdf_to_text, (
    '../data/essay_1.pdf',
    '../data/essay_1_similar.pdf',
    '../data/essay_2.pdf',
    '../data/essay_2_similar.pdf',
    '../data/essay_3.pdf',
))

# Document name -> extracted text. The names are placeholders that will
# later be replaced by each author's name.
doc_dict = dict(
    essay_1=essay_1_doc,
    essay_1_similar=essay_1_similar_doc,
    essay_2=essay_2_doc,
    essay_2_similar=essay_2_similar_doc,
    essay_3=essay_3_doc,
)

## Candidate Document Retrieval

In [4]:
def extract_ngram_similarity(sentence_1,sentence_2,n=4):
    """Return the Jaccard similarity between the word n-gram sets of two texts.

    Each text is split on whitespace and every run of ``n`` consecutive
    words becomes one n-gram. The score is the size of the intersection of
    the two n-gram sets divided by the size of their union.

    Args:
        sentence_1: First text.
        sentence_2: Second text.
        n: Number of words per n-gram; must be at least 1.

    Returns:
        A float between 0 and 1. ``0.0`` is returned when neither text
        yields any n-gram (for example, both have fewer than ``n`` words).

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be a positive integer")

    def word_ngrams(text):
        # Zipping the token list against itself shifted by 0..n-1 positions
        # yields every tuple of n consecutive words (the same tuples
        # nltk.ngrams produces for n >= 1).
        tokens = text.split()
        return set(zip(*(tokens[i:] for i in range(n))))

    ngrams_set_1 = word_ngrams(sentence_1)
    ngrams_set_2 = word_ngrams(sentence_2)

    # An empty union is the only way the ratio can fail, so test for it
    # explicitly instead of hiding it behind a bare ``except``.
    union_size = len(ngrams_set_1 | ngrams_set_2)
    if union_size == 0:
        return 0.0

    return len(ngrams_set_1 & ngrams_set_2) / union_size

In [5]:
# Every unordered pair of document names (iterating a dict yields its keys).
doc_pair_combinations = list(combinations(doc_dict, 2))

In [6]:
# Keep only the document pairs whose word-level (1-gram) Jaccard similarity
# over the whole text exceeds 0.25.
candidate_pairs = []

for pair1, pair2 in doc_pair_combinations:
    full_text_1 = ' '.join(doc_dict[pair1])
    full_text_2 = ' '.join(doc_dict[pair2])
    sim = extract_ngram_similarity(full_text_1, full_text_2, 1)

    print(pair1,pair2)
    print('-----------',sim)

    if not sim > 0.25:
        continue
    candidate_pairs.append((pair1,pair2))

essay_1 essay_1_similar
----------- 0.6457399103139013
essay_1 essay_2
----------- 0.08991228070175439
essay_1 essay_2_similar
----------- 0.09544787077826726
essay_1 essay_3
----------- 0.11764705882352941
essay_1_similar essay_2
----------- 0.088139281828074
essay_1_similar essay_2_similar
----------- 0.09143686502177069
essay_1_similar essay_3
----------- 0.12583892617449666
essay_2 essay_2_similar
----------- 0.554531490015361
essay_2 essay_3
----------- 0.05795454545454545
essay_2_similar essay_3
----------- 0.06220839813374806


In [7]:
# Show the document pairs that passed the similarity threshold.
candidate_pairs

[('essay_1', 'essay_1_similar'), ('essay_2', 'essay_2_similar')]

## Paragraph Filtering

In [8]:
# For every document that appears in a candidate pair, keep only the
# paragraphs containing at least two '.' characters.
candidate_doc_dict = {}
for candidate_pair in candidate_pairs:
    for doc_name in candidate_pair:
        candidate_doc_dict[doc_name] = [
            paragraph for paragraph in doc_dict[doc_name] if paragraph.count('.') > 1
        ]

In [9]:
# List the documents that survived candidate retrieval.
candidate_doc_dict.keys()

dict_keys(['essay_1', 'essay_1_similar', 'essay_2', 'essay_2_similar'])

In [10]:
# Inspect the filtered (not yet preprocessed) paragraphs of one document.
candidate_doc_dict['essay_1']

['Musim kemarau di Indonesia menyebabkan angka produksi padi menurun, penurunan sebesar 4,6 juta ton terjadi di tahun 2019 (1). Gambar 1 menunjukan luas fase persiapan lahan terjadi penurunan pada bulan April — Agustus yang sesuai dengan musim kemarau di Indonesia. Provinsi yang memiliki luas fase persiapan terbesar diantaranya Jawa Barat,  Jawa Tengah, Jawa Timur, Sulawesi Selatan, dan Sumatera Selatan.',
 'Selain itu, provinsi tersebut memiliki tingkat petani skala kecil yang hanya memiliki luas sawah 0,16 hektar dengan persentase tertinggi di Indonesia. Salah satu indikator tujuan pembangunan berkelanjutan pada sektor pertanian yaitu nilai produksi per hektar, 90Y6 lahan pertanian di Jawa Barat, Jawa Timur, dan Nusa Tenggara Barat (NTB) dikategorikan lahan pertanian tidak berkelanjutan (21. Hal ini disebabkan karena tidak ada sumber air saat musim kemarau, bantuan dari pemerintah berupa pompa air tetapi sumber listrik di lokasi belum memadai. Mahalnya harga dan jauhnya akses untuk m

## Paragraph Preprocessing

In [11]:
# Replace every paragraph with its preprocessed form.
# NOTE: this overwrites candidate_doc_dict in place, so running the cell a
# second time preprocesses text that has already been preprocessed.
for doc_name, paragraphs in candidate_doc_dict.items():
    candidate_doc_dict[doc_name] = [preprocessing(paragraph, stemmer) for paragraph in paragraphs]

In [12]:
# Inspect the same document after preprocessing.
candidate_doc_dict['essay_1']

['musim kemarau angka produksi padi turun turun 4 6 juta ton 2019 1 gambar 1 luas fase lahan turun april agustus musim kemarau provinsi milik luas fase jawa barat jawa jawa timur sulawesi selatan sumatera selatan',
 'provinsi milik tingkat tani skala milik luas sawah 0 16 hektar persentase salah indikator tuju bangun sektor tani nilai produksi hektar 90y6 lahan tani jawa barat jawa timur nusa tenggara barat ntb kategori lahan tani 21 sumber musim kemarau bantu perintah pompa sumber lokasi mahal harga akses beli bahan bakar minyak bbm tani milik sawah desa sanggup bbm salah bangkit suplai pompa potensi radiasi matahari panel surya opsi bangkit milik harga ekonomis panel surya luas lahan tani akibat turun produksi salah solusi tingkat persentase system of rice intensification sri sumber pompa panel',
 'metode butuh tanam terputus- putus intermittent sri kelola tanam tingkat produktivitas tanam 30-100 y4 31 tanam 100 sawah sistem sri 467 mm 4 67 mm konversi 0 116 liter detik ha produktivi

## Paragraph Pairing

In [24]:
def generate_bigram(arr):
    """Join each element with its successor, separated by a single space.

    For ``n`` elements the result has ``n - 1`` entries; inputs with fewer
    than two elements give an empty list.
    """
    return [current + ' ' + following for current, following in zip(arr, arr[1:])]

In [25]:
# Build every cross-document paragraph pairing for each candidate pair:
# first single paragraphs ("uni-paragraph"), then two consecutive paragraphs
# joined together ("bi-paragraph").
paired_dict = {'cleaned_paragraph_1':[],'cleaned_paragraph_2':[],'type':[],'pairs':[]}

for pair1,pair2 in candidate_pairs:
    paragraphs_1 = candidate_doc_dict[pair1]
    paragraphs_2 = candidate_doc_dict[pair2]

    pairing_groups = (
        ('uni-paragraph', paragraphs_1, paragraphs_2),
        ('bi-paragraph', generate_bigram(paragraphs_1), generate_bigram(paragraphs_2)),
    )

    for pairing_type, left_texts, right_texts in pairing_groups:
        for left_text in left_texts:
            for right_text in right_texts:
                paired_dict['cleaned_paragraph_1'].append(left_text)
                paired_dict['cleaned_paragraph_2'].append(right_text)
                paired_dict['type'].append(pairing_type)
                paired_dict['pairs'].append((pair1,pair2))

In [27]:
# One row per paragraph pairing; the dict keys become the column names.
df_test = pd.DataFrame(data=paired_dict)
df_test

Unnamed: 0,cleaned_paragraph_1,cleaned_paragraph_2,type,pairs
0,musim kemarau angka produksi padi turun turun ...,musim kemarau angka produksi padi turun penuru...,uni-paragraph,"(essay_1, essay_1_similar)"
1,musim kemarau angka produksi padi turun turun ...,provinsi milik tingkat tani skala milik luas s...,uni-paragraph,"(essay_1, essay_1_similar)"
2,musim kemarau angka produksi padi turun turun ...,metode butuh tanam terputus- putus intermitten...,uni-paragraph,"(essay_1, essay_1_similar)"
3,musim kemarau angka produksi padi turun turun ...,sistem kontrol atur arah arus pompa baterai si...,uni-paragraph,"(essay_1, essay_1_similar)"
4,musim kemarau angka produksi padi turun turun ...,rancang panel surya kapasitas 4 kvv sawar 1 he...,uni-paragraph,"(essay_1, essay_1_similar)"
...,...,...,...,...
203,to actualization their knowledge it needs hand...,transisi hadap tantang bijak aspek kunci cepat...,bi-paragraph,"(essay_2, essay_2_similar)"
204,to actualization their knowledge it needs hand...,transisi hadap tantang bijak aspek kunci cepat...,bi-paragraph,"(essay_2, essay_2_similar)"
205,to actualization their knowledge it needs hand...,tingkat tajam sumber utama kait emisi dioksida...,bi-paragraph,"(essay_2, essay_2_similar)"
206,to actualization their knowledge it needs hand...,platform kolaborasi anak libat multi sektor bu...,bi-paragraph,"(essay_2, essay_2_similar)"


## Feature Engineering

In [34]:
# Load the training pairs from the TSV file. Every value is kept as a string.
train_columns = ['category','paragraph_1','paragraph_2','len_paragraph_1','is_plagiarism','plagiarism_type','cleaned_paragraph_1','cleaned_paragraph_2']

# Collect the rows first and build the frame once: appending with
# ``df.loc[i] = ...`` re-allocates the frame on every row.
train_rows = []
with open('../data/plagiarism_data_train.tsv','r') as f_in:
    next(f_in, None)  # skip the header line
    for row_number, line in enumerate(f_in, start=1):
        fields = line.split('\t')
        fields[-1] = fields[-1].replace('\n','')
        if len(fields) != len(train_columns):
            raise ValueError(
                f"line {row_number + 1} has {len(fields)} fields, expected {len(train_columns)}"
            )
        train_rows.append(fields)

# Index labels start at 1 so they match the 0-based file line numbers,
# exactly as the row-by-row version labelled them.
df_train = pd.DataFrame(train_rows, columns=train_columns, index=pd.RangeIndex(1, len(train_rows) + 1))

In [48]:
def _lcs_ratio(text_1, text_2):
    """Return ``pylcs.lcs2`` of the two texts divided by the longer text's length (0.0 if both are empty)."""
    longest = max(len(text_1), len(text_2))
    if longest == 0:
        return 0.0
    return pylcs.lcs2(text_1, text_2) / longest


def feature_engineering(df_train,df_test):
    """Build the model's feature table for the paragraph pairs in ``df_test``.

    The vectorizers are fitted on the cleaned paragraphs of ``df_train`` and
    then applied to ``df_test``.

    Args:
        df_train: Frame with ``cleaned_paragraph_1`` and
            ``cleaned_paragraph_2`` columns; used only to fit the bigram
            ``CountVectorizer``, the ``TfidfVectorizer`` and the LSA model.
        df_test: Frame with ``cleaned_paragraph_1``, ``cleaned_paragraph_2``,
            ``type`` and ``pairs`` columns.

    Returns:
        A DataFrame indexed like ``df_test`` whose columns are, in order:
        ``word_pairs_<i>`` (bigram counts of the two paragraphs joined by a
        space), ``words_similarity`` (1-gram Jaccard), ``fingerprint_similarity``
        (4-gram Jaccard), ``lcs``, ``lsa_similarity`` (cosine similarity in
        LSA space), then ``type`` and ``pairs`` copied from ``df_test``.
    """
    corpus = df_train['cleaned_paragraph_1'].to_list() + df_train['cleaned_paragraph_2'].to_list()

    # Word Pairs
    bigram_vectorizer = CountVectorizer(analyzer='word', ngram_range=(2, 2), min_df=0.002)
    bigram_vectorizer.fit(corpus)

    transformed_array_test = bigram_vectorizer.transform(df_test['cleaned_paragraph_1'] + ' '+ df_test['cleaned_paragraph_2']).toarray()
    num_feats = transformed_array_test.shape[1]

    # Reuse df_test's index: every column assigned from df_test below is
    # aligned on index labels, so a fresh 0..n-1 index would silently
    # produce NaN whenever df_test is not default-indexed.
    df_preprocessed_test = pd.DataFrame(
        transformed_array_test,
        columns=[f'word_pairs_{i}' for i in range(num_feats)],
        index=df_test.index,
    )

    # Words Similarity
    df_preprocessed_test['words_similarity'] = df_test.apply(lambda x: extract_ngram_similarity(x['cleaned_paragraph_1'],x['cleaned_paragraph_2'],n=1), axis=1)

    # Fingerprints Similarity
    df_preprocessed_test['fingerprint_similarity'] = df_test.apply(lambda x: extract_ngram_similarity(x['cleaned_paragraph_1'],x['cleaned_paragraph_2'],n=4), axis=1)

    # Longest common sequence ratio.
    # NOTE(review): pylcs documents lcs2 as the longest common *substring*
    # length (lcs is the subsequence variant) - confirm which one the model
    # was trained with before changing this call.
    df_preprocessed_test['lcs'] = df_test.apply(lambda x: _lcs_ratio(x['cleaned_paragraph_1'],x['cleaned_paragraph_2']), axis=1)

    # LSA Similarity
    num_component = 200
    tfidf_vectorizer = TfidfVectorizer(use_idf=True,smooth_idf=True)
    lsa = TruncatedSVD(num_component, algorithm = 'randomized',random_state=0)

    tfidf_vectorizer.fit(corpus)
    dtm = tfidf_vectorizer.transform(corpus)
    lsa.fit(dtm)

    dtm_lsa_test_1 = lsa.transform(tfidf_vectorizer.transform(df_test['cleaned_paragraph_1'].to_list()))
    dtm_lsa_test_2 = lsa.transform(tfidf_vectorizer.transform(df_test['cleaned_paragraph_2'].to_list()))

    # Row-wise cosine similarity between the two LSA projections.
    cosine_sim_test = [
        cosine_similarity([vector_1],[vector_2])[0][0]
        for vector_1, vector_2 in zip(dtm_lsa_test_1, dtm_lsa_test_2)
    ]

    df_preprocessed_test['lsa_similarity'] = cosine_sim_test

    # Add Supporting Features
    df_preprocessed_test[['type','pairs']] = df_test[['type','pairs']]

    return df_preprocessed_test

In [81]:
# Turn the paragraph pairings into the model's feature table.
df_preprocessed_test = feature_engineering(df_train=df_train, df_test=df_test)

## Model Inference

In [83]:
def predict(df_preprocessed_test,model):
    """Return ``model.predict_proba`` for the engineered feature rows.

    The bookkeeping columns ``type`` and ``pairs`` are dropped first; every
    remaining column is handed to the model, and the model's result is
    returned unchanged. The input frame itself is not modified.
    """
    feature_frame = df_preprocessed_test.drop(columns=['type','pairs'])
    return model.predict_proba(feature_frame)

In [84]:
# Load the trained classifier. The context manager closes the file handle,
# which the bare ``open(...)`` call left open.
# NOTE(security): unpickling can execute arbitrary code - only load model
# files that come from a trusted source.
with open('../model/classic_ml_model.pkl', 'rb') as model_file:
    classic_ml_model = pickle.load(model_file)

In [85]:
# Attach the two class probabilities and the arg-max label to the feature table.
# NOTE: this adds columns in place. predict() only drops 'type' and 'pairs',
# so re-running this cell would feed the new columns back into the model.
class_probabilities = predict(df_preprocessed_test,classic_ml_model)
df_preprocessed_test[['negative_prob','positive_prob']] = class_probabilities
df_preprocessed_test['prediction'] = df_preprocessed_test[['negative_prob','positive_prob']].values.argmax(axis=1)
df_preprocessed_test

Unnamed: 0,word_pairs_0,word_pairs_1,word_pairs_2,word_pairs_3,word_pairs_4,word_pairs_5,word_pairs_6,word_pairs_7,word_pairs_8,word_pairs_9,...,word_pairs_203,words_similarity,fingerprint_similarity,lcs,lsa_similarity,type,pairs,negative_prob,positive_prob,prediction
0,0,0,0,0,0,0,0,0,0,0,...,0,0.896552,0.511111,0.380531,0.986783,uni-paragraph,"(essay_1, essay_1_similar)",0.00,1.00,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0.132530,0.000000,0.025994,0.340639,uni-paragraph,"(essay_1, essay_1_similar)",0.71,0.29,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0.046154,0.000000,0.024221,0.096824,uni-paragraph,"(essay_1, essay_1_similar)",1.00,0.00,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0.041667,0.000000,0.032710,0.022834,uni-paragraph,"(essay_1, essay_1_similar)",1.00,0.00,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0.079545,0.000000,0.032397,0.142074,uni-paragraph,"(essay_1, essay_1_similar)",0.99,0.01,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203,0,0,0,0,0,0,0,0,0,0,...,0,0.018072,0.000000,0.008593,0.092465,bi-paragraph,"(essay_2, essay_2_similar)",0.98,0.02,0
204,0,0,0,0,0,0,0,0,0,0,...,0,0.011364,0.000000,0.008895,0.091897,bi-paragraph,"(essay_2, essay_2_similar)",1.00,0.00,0
205,0,0,0,0,0,0,0,0,0,0,...,0,0.022099,0.000000,0.012121,0.066489,bi-paragraph,"(essay_2, essay_2_similar)",1.00,0.00,0
206,0,0,0,0,0,0,0,0,0,0,...,0,0.026144,0.000000,0.012706,0.061532,bi-paragraph,"(essay_2, essay_2_similar)",1.00,0.00,0


## Score Aggregation

In [122]:
def calculate_plagiarism_score(df_preprocessed_pairs_i,candidate_doc_dict,pairs):
    """Aggregate paragraph-level predictions into one score for a document pair.

    The score is the mean ``positive_prob`` of the rows predicted positive,
    multiplied by a frequency term: the number of positive rows divided by
    the total count of paragraphs and consecutive-paragraph pairs in both
    documents, capped at 1.

    Args:
        df_preprocessed_pairs_i: Prediction rows of a single document pair;
            must contain ``prediction`` and ``positive_prob`` columns.
        candidate_doc_dict: Mapping of document name to its paragraph list.
        pairs: Two-element sequence holding the names of both documents.

    Returns:
        The aggregated score, or ``0`` when no row is predicted positive.
    """
    positive_rows = df_preprocessed_pairs_i[df_preprocessed_pairs_i['prediction']==1]

    # Without any positive row the mean is NaN and ``value_counts().loc[1]``
    # raises KeyError, so report "no plagiarism" explicitly.
    if positive_rows.empty:
        return 0

    avg_pos_prob = positive_rows['positive_prob'].mean()

    n_paragraphs_1 = len(candidate_doc_dict[pairs[0]])
    n_paragraphs_2 = len(candidate_doc_dict[pairs[1]])
    uni_par_length = n_paragraphs_1 + n_paragraphs_2
    # A document with k paragraphs has k - 1 consecutive-paragraph pairs.
    bi_par_length = (n_paragraphs_1 - 1) + (n_paragraphs_2 - 1)
    freq = min(1, len(positive_rows) / (uni_par_length + bi_par_length))

    return avg_pos_prob * freq

In [124]:
# Aggregate the paragraph-level predictions into one score per document pair.
score_dict = {}
for pairs in df_preprocessed_test['pairs'].unique():
    # Select the rows belonging to this document pair.
    # NOTE(review): this compares the column against a tuple; confirm the
    # installed pandas treats the tuple as a single value rather than as a
    # list-like to be matched element by element.
    df_preprocessed_pairs_i = df_preprocessed_test[df_preprocessed_test['pairs']==pairs]
    
    score = calculate_plagiarism_score(df_preprocessed_pairs_i,candidate_doc_dict,pairs)
    
    print(pairs,score)
    score_dict[pairs] = score

('essay_1', 'essay_1_similar') 0.8031818181818182
('essay_2', 'essay_2_similar') 0.7652631578947368


In [125]:
# Final score for every candidate document pair.
score_dict

{('essay_1', 'essay_1_similar'): 0.8031818181818182,
 ('essay_2', 'essay_2_similar'): 0.7652631578947368}

# ---------------------------------------------------------------------------------------------------------------

# Full Pipeline

In [1]:
import sys
# Add the parent directory to the import path; presumably this is what lets
# the `scripts` package imported in the next cell resolve.
sys.path.append("..")

In [2]:
import pickle
from scripts.docxparser import docx_to_pdf
from scripts.pdfparser import pdf_to_text
from scripts.pipeline import plagiarism_pipeline

### Source: PDF

In [3]:
# Extract the text of every sample essay; names on the left line up with the
# paths on the right, in order.
(essay_1_doc,
 essay_1_similar_doc,
 essay_2_doc,
 essay_2_similar_doc,
 essay_3_doc) = map(pdf_to_text, (
    '../data/essay_1.pdf',
    '../data/essay_1_similar.pdf',
    '../data/essay_2.pdf',
    '../data/essay_2_similar.pdf',
    '../data/essay_3.pdf',
))

# Document name -> extracted text. The names are placeholders that will
# later be replaced by each author's name.
doc_dict = dict(
    essay_1=essay_1_doc,
    essay_1_similar=essay_1_similar_doc,
    essay_2=essay_2_doc,
    essay_2_similar=essay_2_similar_doc,
    essay_3=essay_3_doc,
)

In [4]:
# Load the trained classifier. The context manager closes the file handle,
# which the bare ``open(...)`` call left open.
# NOTE(security): unpickling can execute arbitrary code - only load model
# files that come from a trusted source.
with open('../model/classic_ml_model.pkl', 'rb') as model_file:
    classic_ml_model = pickle.load(model_file)

In [5]:
# Run the packaged pipeline on the PDF-sourced documents.
# NOTE(review): candidate_doc_thres is presumably the document-level
# similarity cut-off (the Experiment section uses the same 0.25) - confirm
# in scripts.pipeline.
score_dict = plagiarism_pipeline(doc_dict,classic_ml_model,
                                candidate_doc_thres=0.25)

Candidate Document Retrieval: 100%|██████████| 10/10 [00:00<00:00, 1181.66it/s]
Paragraph Filtering: 100%|██████████| 2/2 [00:00<00:00, 1840.82it/s]
Paragraph Preprocessing: 100%|██████████| 4/4 [00:32<00:00,  8.15s/it]
Paragraph Pairing: 100%|██████████| 2/2 [00:00<00:00, 7612.17it/s]
Feature Engineering: 2013it [00:07, 269.32it/s]


('essay_1', 'essay_1_similar') 0.8031818181818182
('essay_2', 'essay_2_similar') 0.7652631578947368


In [6]:
# Check all document pairs: with the threshold at 0 the output below lists a
# score for every one of the 10 pairs, not just the two candidate pairs.
score_dict = plagiarism_pipeline(doc_dict,classic_ml_model,
                                 candidate_doc_thres=0)

Candidate Document Retrieval: 100%|██████████| 10/10 [00:00<00:00, 1165.80it/s]
Paragraph Filtering: 100%|██████████| 10/10 [00:00<00:00, 18859.28it/s]
Paragraph Preprocessing: 100%|██████████| 5/5 [00:03<00:00,  1.29it/s]
Paragraph Pairing: 100%|██████████| 10/10 [00:00<00:00, 16396.81it/s]
Feature Engineering: 2013it [00:07, 276.45it/s]


('essay_1', 'essay_1_similar') 0.8031818181818182
('essay_1', 'essay_2') 0
('essay_1', 'essay_2_similar') 0
('essay_1', 'essay_3') 0.04576923076923077
('essay_1_similar', 'essay_2') 0
('essay_1_similar', 'essay_2_similar') 0
('essay_1_similar', 'essay_3') 0.024090909090909093
('essay_2', 'essay_2_similar') 0.7652631578947368
('essay_2', 'essay_3') 0
('essay_2_similar', 'essay_3') 0


### Source: DOCX

In [3]:
# Convert every sample .docx essay to PDF so it can go through pdf_to_text.
# NOTE(review): the first argument looks like the output directory and the
# return value like the path of the generated PDF (see the displayed value
# of the last call) - confirm against scripts.docxparser.
docx_to_pdf('../data/docx_to_pdf','../data/docx_files/essay_1.docx')
docx_to_pdf('../data/docx_to_pdf','../data/docx_files/essay_1_similar.docx')
docx_to_pdf('../data/docx_to_pdf','../data/docx_files/essay_2.docx')
docx_to_pdf('../data/docx_to_pdf','../data/docx_files/essay_2_similar.docx')
docx_to_pdf('../data/docx_to_pdf','../data/docx_files/essay_3.docx')

'/mnt/c/users/User/Desktop/practisee_plagiarism/data/docx_to_pdf/essay_3.pdf'

In [4]:
# Extract the text of every PDF converted from .docx; names on the left line
# up with the paths on the right, in order.
(essay_1_doc,
 essay_1_similar_doc,
 essay_2_doc,
 essay_2_similar_doc,
 essay_3_doc) = map(pdf_to_text, (
    '../data/docx_to_pdf/essay_1.pdf',
    '../data/docx_to_pdf/essay_1_similar.pdf',
    '../data/docx_to_pdf/essay_2.pdf',
    '../data/docx_to_pdf/essay_2_similar.pdf',
    '../data/docx_to_pdf/essay_3.pdf',
))

# Document name -> extracted text. The names are placeholders that will
# later be replaced by each author's name.
doc_dict = dict(
    essay_1=essay_1_doc,
    essay_1_similar=essay_1_similar_doc,
    essay_2=essay_2_doc,
    essay_2_similar=essay_2_similar_doc,
    essay_3=essay_3_doc,
)

In [5]:
# Load the trained classifier. The context manager closes the file handle,
# which the bare ``open(...)`` call left open.
# NOTE(security): unpickling can execute arbitrary code - only load model
# files that come from a trusted source.
with open('../model/classic_ml_model.pkl', 'rb') as model_file:
    classic_ml_model = pickle.load(model_file)

In [6]:
# Run the packaged pipeline on the documents converted from .docx.
# NOTE(review): candidate_doc_thres is presumably the document-level
# similarity cut-off - confirm in scripts.pipeline.
score_dict = plagiarism_pipeline(doc_dict,classic_ml_model,
                                candidate_doc_thres=0.25)

Candidate Document Retrieval: 100%|██████████| 10/10 [00:00<00:00, 912.78it/s]
Paragraph Filtering: 100%|██████████| 2/2 [00:00<00:00, 4662.93it/s]
Paragraph Preprocessing: 100%|██████████| 4/4 [00:32<00:00,  8.22s/it]
Paragraph Pairing: 100%|██████████| 2/2 [00:00<00:00, 5870.26it/s]
Feature Engineering: 2013it [00:08, 242.11it/s]


('essay_1', 'essay_1_similar') 0.7445
('essay_2', 'essay_2_similar') 0.8047222222222222


In [7]:
# Check all document pairs: with the threshold at 0 the output below lists a
# score for every one of the 10 pairs.
score_dict = plagiarism_pipeline(doc_dict,classic_ml_model,
                                 candidate_doc_thres=0)

Candidate Document Retrieval: 100%|██████████| 10/10 [00:00<00:00, 420.51it/s]
Paragraph Filtering: 100%|██████████| 10/10 [00:00<00:00, 13285.73it/s]
Paragraph Preprocessing: 100%|██████████| 5/5 [00:04<00:00,  1.19it/s]
Paragraph Pairing: 100%|██████████| 10/10 [00:00<00:00, 12850.20it/s]
Feature Engineering: 2013it [00:08, 246.73it/s]


('essay_1', 'essay_1_similar') 0.7445
('essay_1', 'essay_2') 0
('essay_1', 'essay_2_similar') 0
('essay_1', 'essay_3') 0.07050000000000001
('essay_1_similar', 'essay_2') 0
('essay_1_similar', 'essay_2_similar') 0
('essay_1_similar', 'essay_3') 0.027307692307692307
('essay_2', 'essay_2_similar') 0.8047222222222222
('essay_2', 'essay_3') 0
('essay_2_similar', 'essay_3') 0


### Source: DOC

In [8]:
# Convert every sample .doc essay to PDF; the same docx_to_pdf helper is
# used for the legacy .doc files.
# NOTE(review): the first argument looks like the output directory and the
# return value like the path of the generated PDF (see the displayed value
# of the last call) - confirm against scripts.docxparser.
docx_to_pdf('../data/doc_to_pdf','../data/doc_files/essay_1.doc')
docx_to_pdf('../data/doc_to_pdf','../data/doc_files/essay_1_similar.doc')
docx_to_pdf('../data/doc_to_pdf','../data/doc_files/essay_2.doc')
docx_to_pdf('../data/doc_to_pdf','../data/doc_files/essay_2_similar.doc')
docx_to_pdf('../data/doc_to_pdf','../data/doc_files/essay_3.doc')

'/mnt/c/users/User/Desktop/practisee_plagiarism/data/doc_to_pdf/essay_3.pdf'

In [9]:
# Extract the text of every PDF converted from .doc; names on the left line
# up with the paths on the right, in order.
(essay_1_doc,
 essay_1_similar_doc,
 essay_2_doc,
 essay_2_similar_doc,
 essay_3_doc) = map(pdf_to_text, (
    '../data/doc_to_pdf/essay_1.pdf',
    '../data/doc_to_pdf/essay_1_similar.pdf',
    '../data/doc_to_pdf/essay_2.pdf',
    '../data/doc_to_pdf/essay_2_similar.pdf',
    '../data/doc_to_pdf/essay_3.pdf',
))

# Document name -> extracted text. The names are placeholders that will
# later be replaced by each author's name.
doc_dict = dict(
    essay_1=essay_1_doc,
    essay_1_similar=essay_1_similar_doc,
    essay_2=essay_2_doc,
    essay_2_similar=essay_2_similar_doc,
    essay_3=essay_3_doc,
)

In [10]:
# Load the trained classifier. The context manager closes the file handle,
# which the bare ``open(...)`` call left open.
# NOTE(security): unpickling can execute arbitrary code - only load model
# files that come from a trusted source.
with open('../model/classic_ml_model.pkl', 'rb') as model_file:
    classic_ml_model = pickle.load(model_file)

In [11]:
# Run the packaged pipeline on the documents converted from .doc.
# NOTE(review): candidate_doc_thres is presumably the document-level
# similarity cut-off - confirm in scripts.pipeline.
score_dict = plagiarism_pipeline(doc_dict,classic_ml_model,
                                candidate_doc_thres=0.25)

Candidate Document Retrieval: 100%|██████████| 10/10 [00:00<00:00, 773.40it/s]
Paragraph Filtering: 100%|██████████| 2/2 [00:00<00:00, 5133.79it/s]
Paragraph Preprocessing: 100%|██████████| 4/4 [00:01<00:00,  2.04it/s]
Paragraph Pairing: 100%|██████████| 2/2 [00:00<00:00, 6186.29it/s]
Feature Engineering: 2013it [00:11, 181.75it/s]


('essay_1', 'essay_1_similar') 0.785
('essay_2', 'essay_2_similar') 0.8038888888888889


In [12]:
# Check all document pairs: with the threshold at 0 the output below lists a
# score for every one of the 10 pairs.
score_dict = plagiarism_pipeline(doc_dict,classic_ml_model,
                                 candidate_doc_thres=0)

Candidate Document Retrieval: 100%|██████████| 10/10 [00:00<00:00, 480.87it/s]
Paragraph Filtering: 100%|██████████| 10/10 [00:00<00:00, 9086.45it/s]
Paragraph Preprocessing: 100%|██████████| 5/5 [00:00<00:00, 51.53it/s]
Paragraph Pairing: 100%|██████████| 10/10 [00:00<00:00, 10866.07it/s]
Feature Engineering: 2013it [00:09, 208.21it/s]


('essay_1', 'essay_1_similar') 0.785
('essay_1', 'essay_2') 0
('essay_1', 'essay_2_similar') 0
('essay_1', 'essay_3') 0.0695
('essay_1_similar', 'essay_2') 0
('essay_1_similar', 'essay_2_similar') 0
('essay_1_similar', 'essay_3') 0.027307692307692307
('essay_2', 'essay_2_similar') 0.8038888888888889
('essay_2', 'essay_3') 0
('essay_2_similar', 'essay_3') 0
