In [1]:
import pandas as pd
import os
import gensim

import spacy
# Load spaCy's large English pipeline. The original note says it carries about
# 685k unique vectors; the exact count depends on the installed model version.
nlp = spacy.load('en_core_web_lg')

In [2]:
# Directory that holds the input CSV. Set the HARRY_DATA_DIR environment variable
# to point somewhere else; the default is the original author's location, so
# existing setups keep working unchanged.
DATA_DIR = os.environ.get('HARRY_DATA_DIR', r'D:\Sandeep_assignment\data')

# Later cells open files by relative name, so make DATA_DIR the working directory.
os.chdir(DATA_DIR)

In [3]:
# Read the sentence corpus; the relative file name is resolved against the
# current working directory. Then preview the first rows.
data = pd.read_csv(
    'harry.csv',
    encoding='latin',
)
data.head()

Unnamed: 0,id,text
0,36527,The last trace of steam evaporated in the autu...
1,36521,But how the doors were slamming all along the ...
2,36406,Haven't you noticed how none of the spells you...
3,36395,Harry watched with terror and elation as Molly...
4,36383,Harry pulled the Invisibility Cloak from insid...


In [4]:
# Reference sentence: the text of the first row whose id equals 1140.
# .iloc[0] raises IndexError if no row carries that id.
ref_sent = data.loc[data['id'].eq(1140), 'text'].iloc[0]

In [5]:
# Tokenize the reference sentence into words with gensim.
# NOTE(review): ref_sent_words is not referenced by any later cell visible here;
# the similarity step works on the raw sentence instead.
ref_sent_words = gensim.utils.simple_preprocess(ref_sent)

In [6]:
# Run the raw reference sentence (not the gensim tokens) through the spaCy
# pipeline; the result is what the similarity calls below compare against.
ref_sent_vec = nlp(ref_sent)

In [7]:
# Parse every sentence of the corpus with spaCy. nlp.pipe streams the texts
# through the pipeline in batches, which is the recommended way to process many
# texts and is faster than calling nlp() once per row. Results come back in
# input order, so all_docs[i] still corresponds to row position i of data.
all_docs = list(nlp.pipe(data['text']))

In [8]:
# Score every document against the reference sentence with spaCy's
# Doc.similarity (cosine similarity of the document vectors: 1.0 means the
# vectors point the same way; values can in principle be negative).
sims = [doc.similarity(ref_sent_vec) for doc in all_docs]

# doc_id is the row position of each document within all_docs / data.
doc_id = list(range(len(all_docs)))

# Build the score table once, after all scores are known. Building it inside
# the loop recreated the whole frame on every iteration and left sim_docs
# undefined when all_docs was empty.
sim_docs = pd.DataFrame({'doc_id': doc_id, 'sims': sims}, columns=['doc_id', 'sims'])

In [9]:
# Rank the documents from most to least similar and preview the top of the list.
sim_docs_sorted = sim_docs.sort_values('sims', ascending=False)
sim_docs_sorted.head()

Unnamed: 0,doc_id,sims
962,962,1.0
17,17,0.970573
647,647,0.957789
749,749,0.954897
996,996,0.952174


In [10]:
# Take ranking positions 1..5 and fetch the matching rows of data by position.
# Position 0 is skipped, presumably because the reference sentence itself
# ranks first with a score of 1.0 -- verify if the corpus contains duplicates.
top_5_sim_docs = data.iloc[sim_docs_sorted['doc_id'].iloc[1:6]]
top_5_sim_docs

Unnamed: 0,id,text
17,35736,She glanced at him and then looked back out of...
647,13239,He turned his head back to Harry and stared at...
749,9773,"Harry lunged forward, he seized a handful of t..."
996,19,"For a second, Mr. Dursley didn't realize what ..."
414,21444,Harry emerged from behind his towel; the chang...


In [11]:
# Sanity check: print the sentence with id 35736 from the top-5 table, followed
# by the reference sentence (id 1140), so the two can be compared by eye.
print(top_5_sim_docs.loc[top_5_sim_docs['id'] == 35736, 'text'].values)
print(data.loc[data['id'] == 1140, 'text'].values)

['She glanced at him and then looked back out of the window.']
["He glanced at Harry and then looked quickly out of the window, pretending he hadn't looked."]


In [12]:
# Put each top sentence next to its similarity score. Concatenating along the
# columns aligns the two pieces on their shared row index.
top_sim_scores = pd.concat(
    [top_5_sim_docs, sim_docs_sorted['sims'].iloc[1:6]],
    axis=1,
)

In [13]:
# Remove the 'id' column so only the text and its score remain.
# errors='ignore' keeps the cell re-runnable: it reassigns top_sim_scores, so a
# second run would otherwise raise KeyError because 'id' is already gone.
top_sim_scores = top_sim_scores.drop(columns='id', errors='ignore')
top_sim_scores

Unnamed: 0,text,sims
17,She glanced at him and then looked back out of...,0.970573
647,He turned his head back to Harry and stared at...,0.957789
749,"Harry lunged forward, he seized a handful of t...",0.954897
996,"For a second, Mr. Dursley didn't realize what ...",0.952174
414,Harry emerged from behind his towel; the chang...,0.952061


In [14]:
# Print each of the top 5 sentences followed by its similarity score,
# rounded to two decimals.
for text, sim in top_sim_scores[['text', 'sims']].itertuples(index=False, name=None):
    print("The most similar sentences are: {} \n with a similarity score of {:.2f}\n".format(text, sim))

The most similar sentences are: She glanced at him and then looked back out of the window. 
 with a similarity score of 0.97

The most similar sentences are: He turned his head back to Harry and stared at him as though he couldn't believe his eyes. 
 with a similarity score of 0.96

The most similar sentences are: Harry lunged forward, he seized a handful of the brute's hair, but it was dragging Ron away as easily as though he were a rag doll  -   Then, out of nowhere, something hit Harry so hard across the face he was knocked off his feet again. 
 with a similarity score of 0.95

The most similar sentences are: For a second, Mr. Dursley didn't realize what he had seen  -  then he jerked his head around to look again. 
 with a similarity score of 0.95

The most similar sentences are: Harry emerged from behind his towel; the changing room was blurred because he was not wearing his glasses, but he could still tell that everyone's face was turned towards him. 
 with a similarity score of 