In [1]:
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.pipeline import Pipeline
from stop_words import get_stop_words
import numpy as np
import os



In [2]:
# English stop-word list from the `stop_words` package; the functions in this
# notebook pass it as `stop_words=` to their scikit-learn vectorizers.
stopWords = get_stop_words('en')

In [3]:
def printPathsWithSimilarities(paths, cosineSimilarities):
    """Print one line per path: the path followed by the score at the same index.

    ``cosineSimilarities`` is indexed by position, so it must be at least as
    long as ``paths``; otherwise the lookup raises ``IndexError``.
    """
    for position, path in enumerate(paths):
        print(path, cosineSimilarities[position])

In [25]:
def getTop5FilesUsing_TfidfVectorizer(pathToFiles, query):
    """Rank the files of a directory by TF-IDF similarity to a query.

    The query and the text of every file are vectorized together by a single
    ``TfidfVectorizer``; the query row is then compared with each file row.

    Despite the name, ALL files are returned in ranked order; callers slice
    the result (e.g. ``[:5]``) to keep the best matches.

    Parameters
    ----------
    pathToFiles : str
        Directory whose regular files form the corpus.
    query : str
        Free-text query.

    Returns
    -------
    tuple
        ``(paths, cosine_similarities)``. ``paths`` is a list of
        ``pathToFiles + "/" + file_name`` strings ordered from most to least
        similar; ``cosine_similarities`` is a NumPy array holding the
        matching scores in the same order. Both are empty when the directory
        contains no regular files.
    """
    # Sort the listing so results are reproducible (os.listdir order is
    # arbitrary) and skip entries that are not regular files, such as
    # sub-directories, which open() cannot read.
    listOfFiles = [
        name for name in sorted(os.listdir(pathToFiles))
        if os.path.isfile(os.path.join(pathToFiles, name))
    ]
    if not listOfFiles:
        return ([], np.array([]))

    # Row 0 is the query; rows 1.. are the files, in listOfFiles order.
    # NOTE(review): because the query is part of the fitted corpus it also
    # contributes to the document-frequency statistics (min_df, IDF).
    files = [query]
    for name in listOfFiles:
        # Context manager closes each handle instead of leaking it.
        with open(pathToFiles + "/" + name, 'r') as handle:
            files.append(handle.read().replace('\n', ' '))

    tfXidf = TfidfVectorizer(stop_words = stopWords, lowercase=True, min_df=0.5, max_features=10).fit_transform(files)

    # TfidfVectorizer L2-normalizes rows by default, so the plain dot product
    # computed by linear_kernel is the cosine similarity.
    cosine_similarities = linear_kernel(tfXidf[0:1], tfXidf[1:]).flatten()

    # One stable descending sort shared by paths and scores keeps them aligned
    # and makes the order of tied scores deterministic.
    related_docs_indices = (-cosine_similarities).argsort(kind="stable")
    cosine_similarities = cosine_similarities[related_docs_indices]

    paths = [pathToFiles + "/" + listOfFiles[indice] for indice in related_docs_indices]
    return (paths, cosine_similarities)

In [26]:
# Rank every file under "files" against the query "regression" with the
# TfidfVectorizer variant, then show only the five best matches.
paths, cosineSimilarities = getTop5FilesUsing_TfidfVectorizer("files", "regression")
printPathsWithSimilarities(paths[:5], cosineSimilarities[:5])

files/text44.txt 0.822480031379848
files/text22.txt 0.6035870152131216
files/text4.txt 0.5866642654744673
files/text33.txt 0.5636651476674287
files/text11.txt 0.5395933739251162




In [23]:
def getTop5FilesUsing_TfidfTransformer(pathToFiles, query):
    """Rank the files of a directory by TF-IDF similarity to a query.

    Term counts come from a ``CountVectorizer`` fitted on the files only; a
    ``TfidfTransformer`` fitted on those counts then weights both the files
    and the query, so both live in the same TF-IDF space.

    Despite the name, ALL files are returned in ranked order; callers slice
    the result (e.g. ``[:5]``) to keep the best matches.

    Parameters
    ----------
    pathToFiles : str
        Directory whose regular files form the corpus.
    query : str
        Free-text query.

    Returns
    -------
    tuple
        ``(paths, cosine_similarities)``. ``paths`` is a list of
        ``pathToFiles + "/" + file_name`` strings ordered from most to least
        similar; ``cosine_similarities`` is a NumPy array holding the
        matching scores in the same order. Both are empty when the directory
        contains no regular files.
    """
    # Sort the listing so results are reproducible (os.listdir order is
    # arbitrary) and skip entries that are not regular files, such as
    # sub-directories, which open() cannot read.
    listOfFiles = [
        name for name in sorted(os.listdir(pathToFiles))
        if os.path.isfile(os.path.join(pathToFiles, name))
    ]
    if not listOfFiles:
        return ([], np.array([]))

    files = []
    for name in listOfFiles:
        # Context manager closes each handle instead of leaking it.
        with open(pathToFiles + "/" + name, 'r') as handle:
            files.append(handle.read().replace('\n', ' '))

    vectorizer = CountVectorizer(stop_words = stopWords, lowercase=True, min_df=0.5, max_features=10)
    transformer = TfidfTransformer()

    # Keep the count matrices sparse; nothing below needs dense arrays.
    filesVectors = vectorizer.fit_transform(files)
    queryVector = vectorizer.transform([query])

    # Fit IDF on the corpus ONLY and reuse it for the query. Re-fitting the
    # transformer on the single query row would weight the query with IDF
    # values unrelated to the documents it is compared against.
    tfXidfFiles = transformer.fit_transform(filesVectors)
    tfXidfQuery = transformer.transform(queryVector)

    # Rows are L2-normalized by TfidfTransformer's default settings, so the
    # dot product is the cosine similarity. linear_kernel accepts sparse
    # input and returns a dense (n_files, 1) array by default.
    cosine_similarities = linear_kernel(tfXidfFiles, tfXidfQuery).flatten()

    # One stable descending sort shared by paths and scores keeps them aligned
    # and makes the order of tied scores deterministic.
    related_docs_indices = (-cosine_similarities).argsort(kind="stable")
    cosine_similarities = cosine_similarities[related_docs_indices]

    paths = [pathToFiles + "/" + listOfFiles[indice] for indice in related_docs_indices]
    return (paths, cosine_similarities)

In [24]:
# Same query as the TfidfVectorizer run, answered with the
# CountVectorizer + TfidfTransformer variant; show only the five best matches.
paths, cosineSimilarities = getTop5FilesUsing_TfidfTransformer("files", "regression")
printPathsWithSimilarities(paths[:5], cosineSimilarities[:5])

files/text44.txt 0.8408504621964032
files/text22.txt 0.6300746275457452
files/text4.txt 0.6143291663316863
files/text33.txt 0.5896064658468518
files/text11.txt 0.5666647558836073


