In [45]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from difflib import SequenceMatcher as SM
from collections import defaultdict
from gensim import corpora

In [46]:
# Toy corpus: every string below is treated as one document.
documents = [
    "The responsiveness of our app is ensured by an ElasticSearch cluster.",
    "The app is made of a Vue.js frontend and a Node.js backend, both written in Typescript.",
    "The almost real-time data processing pipelines hold components written in Rust and Golang.",
    "Our stack is mostly in Node both on the backend and frontend, and we work with React for our interfaces and GraphQL as API.",
    "Our Mobile Apps are made with Swift and Kotlin."
]

# Common words that are discarded during tokenisation.
stoplist = set('for a of the and to in'.split())

# Lowercase each document, split it on whitespace and drop the stop words.
# Punctuation stays attached to the tokens (e.g. "backend," != "backend").
tokenized_docs = []
for document in documents:
    kept_words = []
    for word in document.lower().split():
        if word in stoplist:
            continue
        kept_words.append(word)
    tokenized_docs.append(kept_words)

# Count how many times each token occurs across the whole corpus.
frequency = defaultdict(int)
for tokens in tokenized_docs:
    for tok in tokens:
        frequency[tok] = frequency[tok] + 1

# Keep only the tokens that occur at least twice in the corpus.
texts = [
    [tok for tok in tokens if frequency[tok] >= 2]
    for tokens in tokenized_docs
]

# Build the token <-> id mapping, then turn every document into its
# bag-of-words representation through `doc2bow`.
dictionary = corpora.Dictionary(texts)
corpus = list(map(dictionary.doc2bow, texts))

In [47]:
from gensim import models
# Train a two-topic LSI model on the bag-of-words corpus; the dictionary
# supplies the id -> token mapping.
lsi = models.LsiModel(corpus=corpus, num_topics=2, id2word=dictionary)

In [48]:
# Query text that will be compared against the corpus documents.
doc = "Application user interface is made of a React.js frontend and a Node.js backend written with Typescript."
# Lowercase + whitespace split, then map the tokens to bag-of-words ids.
query_tokens = doc.lower().split()
vec_bow = dictionary.doc2bow(query_tokens)
# Project the bag-of-words query into the LSI space.
vec_lsi = lsi[vec_bow]
print(vec_lsi)

[(0, 1.153275503526602), (1, -0.7112020856454169)]


In [49]:
from gensim import similarities
# Project the whole corpus into LSI space, then build a similarity index over it.
corpus_lsi = lsi[corpus]
index = similarities.MatrixSimilarity(corpus_lsi)

In [50]:
# Round-trip the index through disk: write it out, then load it back from
# the same file.
index_path = './similarity_01.index'
index.save(index_path)
index = similarities.MatrixSimilarity.load(index_path)

In [51]:
# Query the index with the LSI-space vector of the query document.
sims = index[vec_lsi]
# Show (document_number, document_similarity) pairs.
print([(doc_number, similarity) for doc_number, similarity in enumerate(sims)])

[(0, 0.9220505), (1, 0.948815), (2, 0.7230117), (3, 0.6288132), (4, 0.5600751)]


In [52]:
# Blend four similarity measures between the query `doc` and every corpus
# document into a single score on a 0-100 scale, keyed by document text.
items = {}

# Keep the ranking under its own name instead of rebinding `sims`: once `sims`
# holds (position, score) tuples, re-running this cell would enumerate those
# tuples and fail when the sort key tries to negate one.
ranked_sims = sorted(enumerate(sims), key=lambda item: item[1], reverse=True)

for doc_position, doc_score in ranked_sims:
    candidate = documents[doc_position]
    component_scores = [
        # LSI similarity scaled to 0-100. `doc_score` is presumably a NumPy
        # scalar coming from the gensim index; converting it to a built-in
        # float keeps the stored score a plain Python float.
        float(doc_score) * 100,
        fuzz.ratio(doc, candidate),
        fuzz.partial_ratio(doc, candidate),
        # difflib ratio is in [0, 1]; scale it like the other components.
        SM(None, doc, candidate).ratio() * 100,
    ]
    # Unweighted mean; dividing by the list length keeps the divisor in sync
    # with the number of components.
    items[candidate] = {"score": sum(component_scores) / len(component_scores)}

In [53]:
# Show the blended score stored for each document.
print(items)

{'The app is made of a Vue.js frontend and a Node.js backend, both written in Typescript.': {'score': 83.33032234738634}, 'The responsiveness of our app is ensured by an ElasticSearch cluster.': {'score': 48.08160872266472}, 'The almost real-time data processing pipelines hold components written in Rust and Golang.': {'score': 45.26085882272917}, 'Our stack is mostly in Node both on the backend and frontend, and we work with React for our interfaces and GraphQL as API.': {'score': 44.60799537697552}, 'Our Mobile Apps are made with Swift and Kotlin.': {'score': 39.87439416220646}}


In [54]:
# Label each document: "passed" when its score is strictly above 80,
# "failed" otherwise.
for document in documents:
    entry = items[document]
    entry["status"] = "passed" if entry["score"] > 80 else "failed"

In [55]:
# Show each document's score together with its pass/fail status.
print(items)

{'The app is made of a Vue.js frontend and a Node.js backend, both written in Typescript.': {'score': 83.33032234738634, 'status': 'passed'}, 'The responsiveness of our app is ensured by an ElasticSearch cluster.': {'score': 48.08160872266472, 'status': 'failed'}, 'The almost real-time data processing pipelines hold components written in Rust and Golang.': {'score': 45.26085882272917, 'status': 'failed'}, 'Our stack is mostly in Node both on the backend and frontend, and we work with React for our interfaces and GraphQL as API.': {'score': 44.60799537697552, 'status': 'failed'}, 'Our Mobile Apps are made with Swift and Kotlin.': {'score': 39.87439416220646, 'status': 'failed'}}
