## Imports

In [1]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from collections import Counter
from num2words import num2words
import nltk
import os
import string
import numpy as np
import copy
import pandas as pd
import pickle
import re
import math

  (fname, cnt))


## Taking all folders

In [2]:
folders = [x[0] for x in os.walk(str(os.path.abspath('..') + '/data/stories/'))]
folders[0] = folders[0][:len(folders[0])-1]

In [3]:
len(folders)

3

## Collecting the file names and titles

In [4]:
dataset = []
c = False
for i in folders:
    file = open(i+"/index.html", 'r')
    text = file.read().strip()
    file.close()
    file_name = re.findall('><A HREF="(.*)">', text)
    file_title = re.findall('<BR><TD> (.*)\n', text)
    if c == False:
        file_name = file_name[2:]
        c = True
    for j in range(len(file_name)):
        dataset.append((str(i) +"/"+ str(file_name[j]), file_title[j]))

In [5]:
N = len(dataset)

In [6]:
def print_doc(id):
    print(dataset[id])
    file = open(dataset[id][0], 'r', encoding='cp1250')
    text = file.read().strip()
    file.close()
    print(text)

# Preprocessing

In [7]:
def convert_lower_case(data):
    return np.char.lower(data)

In [8]:
def remove_stop_words(data):
    stop_words = stopwords.words('english')
    words = word_tokenize(str(data))
    new_text = ""
    for w in words:
        if w not in stop_words and len(w) > 1:
            new_text = new_text + " " + w
    return new_text

In [9]:
def remove_punctuation(data):
    symbols = "!\"#$%&()*+-./:;<=>?@[\]^_`{|}~\n"
    for i in range(len(symbols)):
        data = np.char.replace(data, symbols[i], ' ')
        data = np.char.replace(data, "  ", " ")
    data = np.char.replace(data, ',', '')
    return data

In [10]:
def remove_apostrophe(data):
    return np.char.replace(data, "'", "")

In [11]:
def stemming(data):
    stemmer= PorterStemmer()
    
    tokens = word_tokenize(str(data))
    new_text = ""
    for w in tokens:
        new_text = new_text + " " + stemmer.stem(w)
    return new_text

In [12]:
def convert_numbers(data):
    tokens = word_tokenize(str(data))
    new_text = ""
    for w in tokens:
        try:
            w = num2words(int(w))
        except:
            a = 0
        new_text = new_text + " " + w
    new_text = np.char.replace(new_text, "-", " ")
    return new_text

In [13]:
def preprocess(data):
    data = convert_lower_case(data)
    data = remove_punctuation(data) #remove comma seperately
    data = remove_apostrophe(data)
    data = remove_stop_words(data)
    data = convert_numbers(data)
    data = stemming(data)
    data = remove_punctuation(data)
    data = convert_numbers(data)
    data = stemming(data) #needed again as we need to stem the words
    data = remove_punctuation(data) #needed again as num2word is giving few hypens and commas fourty-one
    data = remove_stop_words(data) #needed again as num2word is giving stop words 101 - one hundred and one
    return data

## Extracting Data

In [None]:
processed_text = []
processed_title = []
texts = []
titles = []

for i in dataset[:len(dataset)]:
    file = open(i[0], 'r', encoding="utf8", errors='ignore')
    text = file.read().strip()
    file.close()
    
    titles.append(i[1])
    texts.append(text)
    processed_text.append(word_tokenize(str(preprocess(text))))
    processed_title.append(word_tokenize(str(preprocess(i[1]))))

In [None]:
df['text'] = texts
df['indexed_texts'] = processed_text
df['indexed_titles'] = processed_title
df.head()

In [None]:
df.to_csv('dataset.csv')

## Calculating DF for all words

In [40]:
doc_freq = {}

for index in range(len(df)):
    tokens = df.indexed_texts[index]
    for word in tokens:
        try:
            doc_freq[word].add(index)
        except:
            doc_freq[word] = {index}

    tokens = df.indexed_titles[index]
    for word in tokens:
        try:
            doc_freq[word].add(index)
        except:
            doc_freq[word] = {index}
for index in doc_freq:
    doc_freq[index] = len(doc_freq[index])

In [42]:
distinct_words = len(doc_freq)

In [44]:
all_words = [x for x in doc_freq]

In [46]:
print(all_words[:20])

['sharewar', 'trial', 'project', 'freewar', 'need', 'support', 'continu', 'one', 'hundr', 'west', 'fifti', 'three', 'north', 'jim', 'prentic', 'copyright', 'thousand', 'nine', 'nineti', 'brandon']


In [57]:
def get_doc_freq(word):
    try:
        return DF[word]
    except:
        return 0

### Calculating TF-IDF for body, we will consider this as the actual tf-idf as we will add the title weight to this.

In [83]:
doc = 0
tf_idf_text = {}
for index in range(len(df)):
    tokens = df.indexed_texts[index]
    counter = Counter(tokens + df.indexed_titles[index])
    words_count = len(tokens + df.indexed_titles[index])
    
    for token in np.unique(tokens):
        #apply tf-idf equation
        total_frequency = counter[token]/words_count
        doc_freq = get_doc_freq(token)
        inverted_document_frequency = np.log((len(df)+1)/(doc_freq+1))
        tf_idf_text[doc, token] = total_frequency*inverted_document_frequency
    doc += 1

### Calculating TF-IDF for Title

In [84]:
doc = 0
tf_idf_title = {}
for index in range(len(df)):
    tokens = df.indexed_titles[index]
    counter = Counter(tokens + df.indexed_texts[index])
    words_count = len(tokens + df.indexed_texts[index])
    
    for token in np.unique(tokens):
        #apply tf-idf equation
        total_frequency = counter[token]/words_count
        doc_freq = get_doc_freq(token)
        inverted_document_frequency = np.log((len(df)+1)/(doc_freq+1))
        tf_idf_title[doc, token] = total_frequency*inverted_document_frequency
    doc += 1

In [116]:
tf_idf_text[(10,"Horror")]

KeyError: (10, 'Horror')

In [115]:
tf_idf_title[(0,"go")]
len(tf_idf_text)

344378

## Merging the TF-IDF according to weights

In [103]:
# adding a 0.3 depreciating weight to textual tf-idf values
alpha = 0.3
for i in tf_idf_text:
    tf_idf_text[i] *= alpha

In [104]:
for i in tf_idf_title:
    tf_idf_text[i] = tf_idf_title[i]

In [106]:
len(tf_idf_text)

344378

# TF-IDF Matching Score Ranking

In [111]:
def matching_score(k, query):
    preprocessed_query = preprocess(query)
    tokens = word_tokenize(str(preprocessed_query))

    print("Matching Score")
    print("\nQuery:", query)
    print("")
    print(tokens)
    
    query_weights = {}

    for key in tf_idf_text:
        
        if key[1] in tokens:
            try:
                query_weights[key[0]] += tf_idf[key]
            except:
                query_weights[key[0]] = tf_idf[key]
    
    query_weights = sorted(query_weights.items(), key=lambda x: x[1], reverse=True)

    print("")
    
    l = []
    
    for i in query_weights[:10]:
        l.append(i[0])
    
    print(l)
    

matching_score(10, "Horror")

Matching Score

Query: Horror

['horror']


KeyError: (11, 'horror')

In [41]:
print_doc(69)

('/Users/sharan/aarhus_itk/Information-Retrieval/2. TF-IDF Ranking - Cosine Similarity, Matching Score/stories/bluebrd.txt', 'Bluebeard')
BLUEBEARD

   Once upon a time... in the fair land of France, there lived a very powerful 
lord, the owner of estates, farms and a great splendid castle, and his name was
Bluebeard. This wasn't his real name, it was a nickname, due to the fact he had
a long shaggy black beard with glints of blue in it. He was very handsome and
charming, but, if the truth be told, there was something about him that made 
you feel respect, and a little uneasy...
   Bluebeard often went away to war, and when he did, he left his wife in 
charge of the castle... He had had lots of wives, all young, pretty and noble.
As bad luck would have it, one after the other, they had all died, and so the
noble lord was forever getting married again.
   "Sire," someone would ask now and again, "what did your wives die of?"
   "Hah, my friend," Bluebeard would reply, "one died of small

# TF-IDF Cosine Similarity Ranking

In [42]:
def cosine_sim(a, b):
    cos_sim = np.dot(a, b)/(np.linalg.norm(a)*np.linalg.norm(b))
    return cos_sim

### Vectorising tf-idf

In [43]:
D = np.zeros((N, total_vocab_size))
for i in tf_idf:
    try:
        ind = total_vocab.index(i[1])
        D[i[0]][ind] = tf_idf[i]
    except:
        pass

In [44]:
def gen_vector(tokens):

    Q = np.zeros((len(total_vocab)))
    
    counter = Counter(tokens)
    words_count = len(tokens)

    query_weights = {}
    
    for token in np.unique(tokens):
        
        tf = counter[token]/words_count
        df = doc_freq(token)
        idf = math.log((N+1)/(df+1))

        try:
            ind = total_vocab.index(token)
            Q[ind] = tf*idf
        except:
            pass
    return Q

In [49]:
def cosine_similarity(k, query):
    print("Cosine Similarity")
    preprocessed_query = preprocess(query)
    tokens = word_tokenize(str(preprocessed_query))
    
    print("\nQuery:", query)
    print("")
    print(tokens)
    
    d_cosines = []
    
    query_vector = gen_vector(tokens)
    
    for d in D:
        d_cosines.append(cosine_sim(query_vector, d))
        
    out = np.array(d_cosines).argsort()[-k:][::-1]
    
    print("")
    
    print(out)

#     for i in out:
#         print(i, dataset[i][0])

Q = cosine_similarity(10, "Adventure")

Cosine Similarity

Query: 51

['fifti', 'one']

[  0 101 167 211 127 307  44 154  87 438]


In [50]:
print_doc(51)

('/Users/sharan/aarhus_itk/Information-Retrieval/2. TF-IDF Ranking - Cosine Similarity, Matching Score/stories/bern', 'The Adventures of Bert and Bernece, by  Francis U. Kaltenbaugh')
THE ADVENTURES OF BERT AND BERNECE
  by Francis U. Kaltenbaugh

  In mid-town, the sun's brazen harshness was reinforced, as
it glared from a glass and ivory colored office building towering 
towards the heavens, stiff and erect in stature; symbolism oozed 
from its solar-heated shaft, as an unnoticed conversation unfolded 
ensconced near the tip of this man-made erection of glass and steel.
  
  "Stop squirming. You'll die for what you did," Bert threatened.
  
  "You'll never get away with this," I lied. "There are others, who 
know I came here for you."
  
  "You stole my woman; you're gonna pay," Bert accused.
  
  "What woman? I don't have a woman -- not me. I'm to enter seminary 
next month. I'm celibate," I babbled.
  
  "Sell a bit!  What the hell ... a polite way to say pimp or 
whoremaster?" he 