# 功課二 - Create Dictionary and Calculate tf
* dictionary.txt
    * t_index , term, df
    * in ascending order
* Transfer each document into a tf-idf unit vector.
* Write a function cosine(Docx, Docy) which loads the tf-idf vectors of documents x and y and returns their cosine similarity.

## Import

In [1]:
import numpy as np
import pandas as pd
from collections import Counter
import pickle
import math
import glob
import os

In [2]:
from nltk.stem import PorterStemmer
p = PorterStemmer()
stemmed_doc =[]

In [3]:
import nltk
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

## Create Dictionaries

In [4]:
os.chdir('./Data/IRTM/')
my_files = glob.glob('*.txt')

In [5]:
my_dict={}

punctuations = '''!()-[]{};:'`"\,<>./?@#$%^&*_~'''
numbers ='1234567890'
for file in my_files:

    with open(file) as f:
        lines = f.read()
    
    # 處理標點符號與大小寫, 句子沒有斷乾淨
    for punc in punctuations:
        lines = lines.replace(punc, ' ').replace('\n','')
    for num in numbers:
        lines = lines.replace(num,'')
        
    doc_list = lines.lower().split(' ')
    
    stemmed_doc = [p.stem(token) for token in doc_list if p.stem(token)!= '\n' and not p.stem(token).isnumeric() and len(p.stem(token))>1 ]
    
    
    final_tokens=[]
    for word in stemmed_doc:
        if word not in final_tokens and word not in stop_words and not word.isnumeric() and len(word)>1:
            final_tokens.append(word)

    
    for token in final_tokens:
        if token in my_dict:
            my_dict[token]+=1
        else:
            my_dict[token]=1

In [6]:
len(my_dict)

12332

In [7]:
import operator
df = pd.DataFrame(sorted(my_dict.items(), key=operator.itemgetter(0)),columns=['term', 'df'])

In [8]:
df['t_index']= range(1, len(df) + 1) 

In [9]:
cols = ['t_index' , 'term' , 'df']
df = df[cols]

In [10]:
df

Unnamed: 0,t_index,term,df
0,1,aan,1
1,2,aaron,2
2,3,ab,1
3,4,aback,1
4,5,abahd,1
...,...,...,...
12327,12328,zubin,1
12328,12329,zuric,1
12329,12330,zurich,1
12330,12331,zutshi,1


In [11]:
# df.to_csv(r'c:\data\pandas.txt', header=None, index=None, sep=' ', mode='a')

## Transfer to unit vector

In [12]:
for file in my_files:
    
    unit_df = pd.DataFrame(columns=['t_index' , 'tf_idf'])
    
    with open(file) as f:
        lines = f.read()
    
    # 處理標點符號與大小寫, 句子沒有斷乾淨
    for punc in punctuations:
        lines = lines.replace(punc, ' ').replace('\n','')
    for num in numbers:
        lines = lines.replace(num,'')
        
    doc_list = lines.lower().split(' ')
    
    stemmed_doc = [p.stem(token) for token in doc_list if p.stem(token)!= '\n' and not p.stem(token).isnumeric() and len(p.stem(token))>1 ]
    
    
    final_tokens=[]
    for word in stemmed_doc:
        if word not in stop_words and not word.isnumeric() and len(word)>1:
            final_tokens.append(word)
        
    N = len(my_files)
    # create the tf-idf vector
    for tok in final_tokens:
        tf = final_tokens.count(tok)/ len(final_tokens)
        unit_df.loc[len(unit_df)]=[df[df.term == tok].t_index.to_list()[0] , math.log(float(N) / df[df.term == tok].df.to_list()[0]) ]
#     print(unit_df)
    unit_df.to_csv('/home/emma/bilab/Steph_C/IR/doc'+file, header=None, index=None, sep=' ', mode='a')

## Cosine Similarity Function

In [14]:
from numpy import dot
from numpy.linalg import norm

In [15]:
def cosine(doc_x , doc_y):
    
    # read files
    x = pd.read_csv(doc_x, sep=" ", header=None)
    y = pd.read_csv(doc_y, sep=" ", header=None)
    x.columns = ['t_index' , 'tf_idf']
    y.columns = ['t_index' , 'tf_idf']
    
    # get bag of words
    bag_of_words = set(x.t_index.to_list()+y.t_index.to_list())
    
    x_list =[]
    y_list =[]
    
    # calculate cosine similarity
    for word in bag_of_words:
        
        if word in x.t_index.to_list():
            x_list.append(x[x.t_index == word].tf_idf.to_list()[0])
        elif word not in x.t_index.to_list():
            x_list.append(0)
        
        if word in y.t_index.to_list():
            y_list.append(y[y.t_index == word].tf_idf.to_list()[0])
        elif word not in y.t_index.to_list():
            y_list.append(0)
            
    
    return  dot(x_list, y_list)/(norm(x_list)*norm(y_list))
    


In [17]:
cosine('/home/emma/bilab/Steph_C/IR/doc1.txt','/home/emma/bilab/Steph_C/IR/doc2.txt')

0.14331883553130642