# Split the documents in words.

In [1]:
import math
import string
import sys

In [2]:
# This function will return a list of the lines of text in the file.
def read_file(filename): 
      
    try:
        with open(filename, 'r') as f:
            data = f.read()
        return data
      
    except IOError:
        print("Error opening or reading input file: ", filename)
        sys.exit()
  

In [3]:
# splitting the text lines into words
# mapping upper case to lower case and
# punctuation to spaces
translation_table = str.maketrans(string.punctuation+string.ascii_uppercase,
                    " "*len(string.punctuation)+string.ascii_lowercase)

In [4]:
translation_table

{33: 32,
 34: 32,
 35: 32,
 36: 32,
 37: 32,
 38: 32,
 39: 32,
 40: 32,
 41: 32,
 42: 32,
 43: 32,
 44: 32,
 45: 32,
 46: 32,
 47: 32,
 58: 32,
 59: 32,
 60: 32,
 61: 32,
 62: 32,
 63: 32,
 64: 32,
 91: 32,
 92: 32,
 93: 32,
 94: 32,
 95: 32,
 96: 32,
 123: 32,
 124: 32,
 125: 32,
 126: 32,
 65: 97,
 66: 98,
 67: 99,
 68: 100,
 69: 101,
 70: 102,
 71: 103,
 72: 104,
 73: 105,
 74: 106,
 75: 107,
 76: 108,
 77: 109,
 78: 110,
 79: 111,
 80: 112,
 81: 113,
 82: 114,
 83: 115,
 84: 116,
 85: 117,
 86: 118,
 87: 119,
 88: 120,
 89: 121,
 90: 122}

In [4]:
# returns a list of the words in the file
def get_words_from_line_list(text): 
      
    text = text.translate(translation_table)
    word_list = text.split()
      
    return word_list

Now that we have the word list, we will now calculate the frequency of occurrences of the words.1

In [5]:
# counts frequency of each word
# returns a dictionary which maps the words to  their frequency.
def count_frequency(word_list): 
      
    D = {}
      
    for new_word in word_list:
          
        if new_word in D:
            D[new_word] = D[new_word] + 1
              
        else:
            D[new_word] = 1
              
    return D


In [6]:
# returns dictionary of (word, frequency) pairs from the previous dictionary.
def word_frequencies_for_file(filename): 
      
    line_list = read_file(filename)
    word_list = get_words_from_line_list(line_list)
    freq_mapping = count_frequency(word_list)
  
    print("File", filename, ":", )
    print(len(line_list), "lines, ", )
    print(len(word_list), "words, ", )
    print(len(freq_mapping), "distinct words")
  
    return freq_mapping

Lastly, we will calculate the dot product to give the document distance.

In [7]:
# returns the dot product of two documents
def dotProduct(D1, D2): 
    Sum = 0.0
      
    for key in D1:
          
        if key in D2:
            Sum += (D1[key] * D2[key])
              
    return Sum

In [8]:
# returns the angle in radians between document vectors
def vector_angle(D1, D2): 
    numerator = dotProduct(D1, D2)
    denominator = math.sqrt(dotProduct(D1, D1)*dotProduct(D2, D2))
      
    return math.acos(numerator / denominator)

In [10]:
def documentSimilarity(filename_1, filename_2):
      
   # filename_1 = sys.argv[1]
   # filename_2 = sys.argv[2]
    sorted_word_list_1 = word_frequencies_for_file(filename_1)
    sorted_word_list_2 = word_frequencies_for_file(filename_2)
    distance = vector_angle(sorted_word_list_1, sorted_word_list_2)
      
    print("The distance between the documents is: % 0.6f (radians)"% distance)


In [12]:
# Driver code
documentSimilarity('Document1.txt', 'Document2.txt')

File Document1.txt :
23 lines, 
5 words, 
5 distinct words
File Document2.txt :
71 lines, 
11 words, 
11 distinct words
The distance between the documents is:  0.830916 (radians)
