In [184]:
import pandas as pd
import time
from sentence_transformers import SentenceTransformer, util
    
def read_input_csv():
    df = pd.read_csv('./highcos-plagiarism_included.csv', sep=',')
    df1 = df[['essay']]
    df1_dict = df1.to_dict('dict')
    return df1, df1_dict

# calculate all sentence embeddings, put into dict
def create_embedding_dict(df_dict):
    embedding_dict = {}
    model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
    for key in df_dict['essay']:
        embedding_n = model.encode(df_dict['essay'][key], convert_to_tensor=True)
        embedding_dict[key]=embedding_n
        
    return embedding_dict

# cosine similarity of all sentence embeddings against each other, put into dataframe
def create_embedding_cosine_matrix(embedding_dict):
    # setup empty dataframe
    df_cosine_sim = pd.DataFrame()
    df_cosine_sim

    # setup empty meas_dict
    meas_dict = {}

    for key in embedding_dict:
        # first embedding
        embedding_1 = embedding_dict[key]
        # compare to every other essay except self
        i=0
        for item in embedding_dict:
            if key == item:
                meas_dict[i] = 1
                # print("for key: ",key," and item: ",item," we set meas: ",1)
                i=i+1
            else:
                embedding_2 = embedding_dict[item]
                meas = util.pytorch_cos_sim(embedding_1, embedding_2)
                # print("for key: ",key," and item: ",item," we found meas: ",meas.item())
                meas_dict[i] = meas.item()
                i=i+1

        # show dict status
        df_cosine_sim_col = pd.DataFrame.from_dict(meas_dict,orient='index')
        df_cosine_sim_col.columns = [key]
        # print(df_cosine_sim_col)
        df_cosine_sim = pd.concat([df_cosine_sim, df_cosine_sim_col], axis=1)
        
    return df_cosine_sim

def get_cosine_sim_matrix():
    df, df_dict = read_input_csv()
    embedding_dict1 = create_embedding_dict(df_dict)
    df_cosine_sim = create_embedding_cosine_matrix(embedding_dict1)
    return df_cosine_sim

def get_largest_for_column(df,column):
    n = 2
    topN = df.nlargest(n,[column])[column]
    largest = topN.min()
    idx = topN.idxmin()
    return largest,idx

def get_ranked_n_for_column(df,column):
    n = 2
    topN = df.nlargest(n,[column])
    
    # nth_series_idx[[0]]
    nth_series = df_cosine_sim.nlargest(3,[0])
    df_nth = nth_series[[0]]
    df_nth
    
    largest = topTwo.min()
    idx = topTwo.idxmin()
    return largest,idx

def get_largest_all_columns(df_cosine_sim,df_with_essays):
    df_out = pd.DataFrame()
    for col in df_cosine_sim.columns:
        largest,idx = get_largest_for_column(df_cosine_sim,col)            
        assigned_row = col
        match_column = "largestMatchIndex"
        measurement_column = "cosineSimilarity"
        value_column = "essay"
        df_out.at[assigned_row,match_column] = str(int(idx))
        df_out.at[assigned_row,measurement_column] = largest
        df_out.at[assigned_row,value_column] = df_with_essays['essay'][idx]
    return df_out

# def get_most_likely_plagerism():

def do_complete_analysis():
    df1, df1_dict = read_input_csv()
    df_cosine_sim = get_cosine_sim_matrix()
    df_out = get_largest_all_columns(df1,df_cosine_sim)
    return df_out

In [185]:
df_with_essays, df1_dict = read_input_csv()
df_cosine_sim = get_cosine_sim_matrix()
df_out = get_largest_all_columns(df_cosine_sim,df_with_essays)

df_out

Unnamed: 0,largestMatchIndex,cosineSimilarity,essay
0,0,1.000000,"Dear local newspaper, I think effects computer..."
1,108,0.865651,"Dear Local Newspaper, People all around the gl..."
2,111,0.829053,Dear I believe that computers are a good sourc...
3,84,0.827972,"Dear Local Newspaper, I am awair that more peo..."
4,107,0.857072,Dear NewsPaper Have you ever taken into consid...
...,...,...,...
109,76,0.824389,"Dear Newspaper, Computers have a huge effect o..."
110,72,0.814387,"Dear editor, I think that computers are a good..."
111,99,0.834396,I believe that computers are a great invention...
112,0,1.000000,"Dear local newspaper, I think effects computer..."


In [212]:
# get the nth value and index 
n = 1
nth_series = df_cosine_sim.nlargest(n,[0])
df_nth = nth_series[[0]]
df_nth_array = df_nth.values
print(df_nth_array[n-1][0])
print(df_nth.index[n-1])

1.0
0


In [233]:
n=5
df_cosine_sim_columns = df_cosine_sim.shape[1]
for col in range(0,df_cosine_sim_columns):
    print(col)
    nth_series = df_cosine_sim.nlargest(n,[col])
    df_nth = nth_series[[2]]
    df_nth_array = df_nth.values
    print(df_nth_array)

0
[[0.71262622]
 [0.71262622]
 [0.7238093 ]
 [0.70107645]
 [0.81167567]]
1
[[0.73553067]
 [0.73065042]
 [0.77323937]
 [0.76228631]
 [0.71907735]]
2
[[1.        ]
 [0.82905275]
 [0.81167567]
 [0.80556685]
 [0.79618633]]
3
[[0.67159188]
 [0.7378273 ]
 [0.65506387]
 [0.7176075 ]
 [0.78689897]]
4
[[0.66308618]
 [0.72918832]
 [0.7502054 ]
 [0.70107645]
 [0.71176457]]
5
[[0.63197857]
 [0.71102101]
 [0.78689897]
 [0.65506387]
 [0.66079926]]
6
[[0.74702054]
 [0.77718532]
 [0.71176457]
 [0.74426937]
 [0.64506894]]
7
[[0.64046276]
 [0.6685096 ]
 [0.64727247]
 [0.71390647]
 [0.72892177]]
8
[[0.71247679]
 [0.78689897]
 [0.70263809]
 [0.7095468 ]
 [0.71176457]]
9
[[0.66633308]
 [0.66079926]
 [0.74475896]
 [0.68793094]
 [0.65506387]]
10
[[0.72601569]
 [0.76584721]
 [0.71247679]
 [0.72753525]
 [0.68030179]]
11
[[0.7502054 ]
 [0.73433256]
 [0.66308618]
 [0.72918832]
 [0.70107645]]
12
[[0.67489874]
 [0.74621129]
 [0.71906734]
 [0.757236  ]
 [0.81167567]]
13
[[0.68003285]
 [0.77757096]
 [0.71176457]
 [0

In [228]:
df_cosine_sim_columns = df_cosine_sim.shape[1]
df_cosine_sim_columns

114