In [1]:
pip install -U scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [29]:
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import cdist
import numpy as np

In [10]:
def read_input_csv():
    df = pd.read_csv('./highcos-plagiarism_included.csv', sep=',')
    df1 = df[['essay']]
    df1_dict = df1.to_dict('dict')
    return df1, df1_dict
    
vectorize = lambda Text: TfidfVectorizer().fit_transform(Text).toarray()

similarity = lambda doc1, doc2: cosine_similarity([doc1, doc2])

In [9]:
df1, df1_dict = read_input_csv()

In [17]:
strList = [df1['essay'][0]]
vectors_0 = vectorize(strList)

strList = [df1['essay'][1]]
vectors_1 = vectorize(strList)

In [18]:
vectors_0

array([[0.21016277, 0.02627035, 0.02627035, 0.02627035, 0.02627035,
        0.10508139, 0.02627035, 0.02627035, 0.02627035, 0.05254069,
        0.02627035, 0.10508139, 0.02627035, 0.07881104, 0.05254069,
        0.02627035, 0.02627035, 0.02627035, 0.02627035, 0.02627035,
        0.05254069, 0.02627035, 0.02627035, 0.10508139, 0.02627035,
        0.13135173, 0.02627035, 0.02627035, 0.18389243, 0.05254069,
        0.02627035, 0.02627035, 0.02627035, 0.02627035, 0.05254069,
        0.02627035, 0.02627035, 0.02627035, 0.02627035, 0.02627035,
        0.05254069, 0.02627035, 0.02627035, 0.02627035, 0.02627035,
        0.02627035, 0.02627035, 0.02627035, 0.02627035, 0.13135173,
        0.02627035, 0.02627035, 0.02627035, 0.02627035, 0.02627035,
        0.05254069, 0.02627035, 0.05254069, 0.10508139, 0.02627035,
        0.02627035, 0.05254069, 0.05254069, 0.02627035, 0.02627035,
        0.07881104, 0.05254069, 0.10508139, 0.02627035, 0.05254069,
        0.13135173, 0.02627035, 0.13135173, 0.05

In [26]:
vectorList = [vectors_0,vectors_1]
cosine_similarity_score = 1. - cdist(vectors_0, vectors_1, 'cosine')



ValueError: XA and XB must have the same number of columns (i.e. feature dimension.)

In [30]:
x = np.random.rand(1000,1000)
x

array([[0.5309894 , 0.89839163, 0.61568104, ..., 0.12699193, 0.54183309,
        0.82100317],
       [0.56247366, 0.03596714, 0.1747988 , ..., 0.80268252, 0.62655048,
        0.69097281],
       [0.86097381, 0.0902278 , 0.43230032, ..., 0.84211979, 0.49874214,
        0.06625627],
       ...,
       [0.4817236 , 0.86571865, 0.69551452, ..., 0.39294491, 0.8314818 ,
        0.1285262 ],
       [0.85533976, 0.19008564, 0.35569371, ..., 0.11591191, 0.51426235,
        0.96456571],
       [0.61714911, 0.24049539, 0.65343188, ..., 0.13088412, 0.34357541,
        0.71192107]])

# Fixing the Unequal Vectors Problem

* When we compare the cosine similarity between two essays, we are really comparing the cosine similarity between vectors. Cosine similarity can only be calculated on equal length vectors.
* The way that the MPNet fine-tuned language model, all-MiniLM-L6-v2 deals with this problem, is that input text gets truncated down to 256 words.
* If an input text were to be less than 256 words, then 0's need to be appended to the end of the vector.

Note - the length equality restriction does not apply to the vectorization() function, but it does apply to the cosine_similarity function. Hence, we don't apply length similarity prior to vectorization to the text, but we do apply it to the vector produced by the text.

In [161]:
def read_input_csv():
    df = pd.read_csv('./highcos-plagiarism_included.csv', sep=',')
    df1 = df[['essay']]
    df1_dict = df1.to_dict('dict')
    return df1, df1_dict

def truncate_256(df1_dict):
    truncated_dict = {}
    final_truncated_dict = {}
    for each in df1_dict['essay']:
        text = df1_dict['essay'][each]
        textTruncatedList = text.split()[0:256]
        textTruncated = " ".join(textTruncatedList)
        truncated_dict[each] = textTruncated
        
    # add 0's in array to even out
    for each in truncated_dict:
        template = list(0 for i in range(0, 256))
        if len(truncated_dict[each].split()) < 256:
            shortList = truncated_dict[each].split()
            template[:len(shortList)] = shortList
            outputText=[]
            for item in template:
                outputText.append(str(item))
                
            textExtended = " ".join(outputText)
            truncated_dict[each] = textExtended
            
    final_truncated_dict['essay']=truncated_dict
         
    return final_truncated_dict

def check_equallength(truncated_dict):
    for item in truncated_dict['essay']:
        if len(truncated_dict['essay'][item].split()) == 256:
            print("Yes! ")
        else:
            print("No! ", item.split() - 256)

In [163]:
df1, df1_dict = read_input_csv()
truncated_dict = truncate_256(df1_dict)
# check_equallength(truncated_dict)


In [165]:
strList = [truncated_dict['essay'][0]]
vectors_0 = vectorize(strList)
print(len(vectors_0[0]))

strList = [truncated_dict['essay'][1]]
vectors_1 = vectorize(strList)
print(len(vectors_1[0]))

# cosine_similarity_score = 1. - cdist(vectors_0, vectors_1, 'cosine')
# cosine_similarity_score

138
91
