### Given set of text files we will try to find the cosine similarities of documents with a query document. For this demonstration, we will find the tf-idf values calculated for every word in the documents

In [1]:
import os, glob, math, string
import re
import numpy as np

In [2]:
def getTFDict(document):
    """Returns a dictionary for each document whose keys are all the unique words in the document and whose values are their 
    corresponding tf."""
    tfDict={}
    for word in document:
        if word in tfDict:
            tfDict[word] += 1
        else:
            tfDict[word] = 1
    #Computes tf for each word           
    for word in tfDict:
        tfDict[word] = tfDict[word] / len(document)
    return tfDict

In [3]:
def getCountDict(tfDict):
    """ Returns a dictionary whose keys are all the unique words in
    the dataset and whose values count the number of documents in which
    the word appears.
    """
    countDict = {}
    # Run through each review's tf dictionary and increment countDict's (word, doc) pair
    for review in tfDict:
        for word in review:
            if word in countDict:
                countDict[word] += 1
            else:
                countDict[word] = 1
    return countDict

In [4]:
def getIDFDict(data, countDict):
    """ Returns a dictionary whose keys are all the unique words in the
    dataset and whose values are their corresponding idf.
    """
    idfDict = {}
    for word in countDict:
        idfDict[word] = math.log(len(data) / countDict[word])
    return idfDict

In [5]:
def getDocumentTFIDFDict(tfDict, idfDict):
    """ Returns a dictionary whose keys are all the unique words in the
    document and whose values are their corresponding tfidf.
    """
    documentTFIDFDict = {}
    #For each word in the document, we multiply its tf and its idf.
    for word in tfDict:
        documentTFIDFDict[word] = round(tfDict[word] * idfDict[word], 4)
    return documentTFIDFDict

In [6]:
def getTFIDFVector(document, wordDict):
    tfidfVector = [0] * len(wordDict)
    # For each unique word, if it is in the document, store its TF-IDF value.
    for i, word in enumerate(wordDict):
          if word in document:
                tfidfVector[i] = document[word]
    return tfidfVector

In [7]:
def removeQueryDoc(file):
    return file!=query

`query` is the name of the file which has the query document. 

In [8]:
query = 'd_query.txt'

### Cleaning data and computing tf and idf dictionaries

In [9]:
documents = filter(removeQueryDoc, glob.glob("*.txt"))

# list of list of all words in a document
data=[]

#Storing the tf dict of every document in a list
tfDict = []

# storing the count of every word in all documents in a dict
countDict = {}

# storing the idf value of every word in the documents
idfDict = {}

# cleaning and data preparation
for file in glob.glob("*.txt"):
    f=open(file,'r')
    contents = f.read()
    data_row=[]
    contents = re.sub('[^a-zA-Z0-9 +]', '', contents)
    for word in contents.split():
        data_row.append(word.lower())
    data.append(data_row)
    documentTFDict = getTFDict(data_row)
    # calculate the tf Map
    tfDict.append(documentTFDict)

# calculate the idf Map of all the words in the documents
countDict = getCountDict(tfDict)

# find the idfCountMap of the dataset
idfDict = getIDFDict(data, countDict)


### c) Computing tf-idf values for each word in every document as well as the query, printing them as vectors

In [10]:
wordDict = sorted(countDict.keys())
# computing vector for query document

# a) convert words into vectors
tfidfVector={}
# b) iterating over every tfDict of each document

for i,file in enumerate(glob.glob("*.txt")):
    #calculate the TFIDF Dict
    documentTFDict = getDocumentTFIDFDict(tfDict[i], idfDict)
    tfidfVector[file] = getTFIDFVector(documentTFDict, wordDict)

For file d_query.txt
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.3662, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

### c) Printing Similarity of every document with query

In [11]:
# part c)
similarities = []
for i,file in enumerate(documents):
    review_similarity = np.dot(tfidfVector[query], tfidfVector[file])
    similarities.append(review_similarity)
    print("Similarity of Query Document and "+ str(file) + " is " + str(review_similarity))

Similarity of Query Document and d4.txt is 0.01530716
Similarity of Query Document and d5.txt is 0.0
Similarity of Query Document and d1.txt is 0.0
Similarity of Query Document and d2.txt is 0.0
Similarity of Query Document and d3.txt is 0.0


### Highest Similarity is for d4.txt

In [12]:
print(max(similarities))

0.01530716


## Using sklearn vectorizer

Note: We get 0 for the query term 'java coffee mocha' (d_query.txt) with all documents except for `d4.txt` as the term 'java' is present in all 
the documents and we negate its overall precedence assuming its behavior like a stop word. However we see that the 
vectorizer from sklearn we still get some similarity values of the query document
with the others.

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
import glob, re
tfidf_vectorizer = TfidfVectorizer()

# list of list of all words in a document
data=[]

# cleaning and data preparation
for file in glob.glob("*.txt"):
    f=open(file,'r')
    contents = f.read()
    contents = re.sub('[^a-zA-Z0-9 +]', '', contents)
    data.append(contents.lower())

In [14]:
from sklearn.metrics.pairwise import cosine_similarity
import math

In [15]:
documents = filter(removeQueryDoc, glob.glob("*.txt"))
tfidf_matrix = tfidf_vectorizer.fit_transform(data)

for i,file in enumerate(documents):
    print("For file "+ file + " and the query document, the cosine similarity is {0:.2f}".format(cosine_similarity(tfidf_matrix[0:1], tfidf_matrix)[0][i+1])) 

For file d4.txt and the query document, the cosine similarity is 0.28
For file d5.txt and the query document, the cosine similarity is 0.09
For file d1.txt and the query document, the cosine similarity is 0.06
For file d2.txt and the query document, the cosine similarity is 0.03
For file d3.txt and the query document, the cosine similarity is 0.03
