In [189]:
import pandas as pd
import numpy as np
import math
import ast
from collections import defaultdict
import matplotlib.pyplot as plt
from google.colab import drive
import string
from nltk.stem import PorterStemmer
import string
from nltk.corpus import stopwords
from nltk import word_tokenize
import nltk
import gzip

drive.mount('/content/gdrive')
data_dir = '/content/gdrive/My Drive/MachineLearning/'

def stopword_handling():
  my_file = open(data_dir + 'stopwords.txt', 'r')
  stopwords = my_file.readlines()
  stopwords = list(map(str.strip, stopwords))

def csv_to_dataframe(data_dir):
  data = pd.read_csv(data_dir + 'booksummaries.txt', sep = "\t")
  data.columns = ['Wikipedia_ID', 'Freebase_ID', 'Title', 'Author', 'Publication_Date', 'Genres', 'Plot']
  datatemp = data.copy(deep=True)
  data.drop(labels = ['Wikipedia_ID', 'Author', 'Freebase_ID', 'Publication_Date'], inplace = True, axis = 1)
  datatemp.drop(labels = ['Wikipedia_ID',  'Freebase_ID', 'Publication_Date'], inplace = True, axis = 1)
  return data, datatemp

def clean_data(data, datatemp):
  data = data.dropna(subset = ['Genres'])
  datatemp = datatemp.dropna(subset = ['Genres'])
  data.reset_index(drop = True, inplace = True)
  datatemp.reset_index(drop = True, inplace = True)
  return data, datatemp

def genre_to_dict(data, datatemp):
  data['Genres'] = data['Genres'].map(lambda x: ast.literal_eval(x))
  data['Genres'] = data['Genres'].map(lambda x: list(x.values()))

  datatemp['Genres'] = datatemp['Genres'].map(lambda x: ast.literal_eval(x))
  print(datatemp)
  datatemp['Genres'] = datatemp['Genres'].map(lambda x: list(x.values()))
  print(datatemp)
  return data, datatemp

data, datatemp = csv_to_dataframe(data_dir)
data, datatemp = clean_data(data, datatemp)
data, datatemp = genre_to_dict(data, datatemp)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
                                Title                Author  \
0                  A Clockwork Orange       Anthony Burgess   
1                          The Plague          Albert Camus   
2                A Fire Upon the Deep          Vernor Vinge   
3      All Quiet on the Western Front  Erich Maria Remarque   
4                A Wizard of Earthsea     Ursula K. Le Guin   
...                               ...                   ...   
12835                  The Third Lynx          Timothy Zahn   
12836                  Remote Control            Andy McNab   
12837               Transfer of Power           Vince Flynn   
12838                         Decoded                 Jay-Z   
12839                       Poor Folk    Fyodor Dostoyevsky   

                                                  Genres  \
0      {'/m/06n90': 'Science Fiction', '/m/0l67h': '

In [190]:
nltk.download('stopwords')
nltk.download('punkt')

# Removing Non-Ascii characters
def _removeNonAscii(s): 
    return "".join(i for i in s if ord(i)<128)

data['Plot'] = data['Plot'].map(_removeNonAscii)

# Tokenizing the words and removing punctuations and stopwords
def plot_tokenizer(w):

  stop = set(list(string.punctuation) + ['``', "'s", "''"] + stopwords.words('english'))
  return [item for item in word_tokenize(w.lower()) if item not in stop]

data['plot_token'] = data['Plot'].map(lambda x: plot_tokenizer(x))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [191]:
# Stemming
def stem(x1):
  ps = PorterStemmer()
  stemmed = []
  for a in x1:
    stemmed.append(ps.stem(a))
  return stemmed

data['plot_token'] = data['plot_token'].apply(lambda x: stem(x)) # this is a dataframe column that has the tokenized and stemmed words

In [192]:
# Converting dataframe column to dictionary
docdict = {}
for i in range(len(data)):
  docdict['doc' + str(i)] = data['plot_token'].iloc[i]

In [193]:
freqMax, normOfdoc, docNo = [],[],[]
dictionary, statOfdoc = {}, [docNo,normOfdoc,freqMax]
postingList, indexStruct = [], [dictionary,statOfdoc]
posting_idx = 0

In [194]:
j = 0
for id,doc in docdict.items():
  max_freq = 1
  docno = id
  for word in doc:
    if word not in dictionary:
        dictionary.update({word:[j,posting_idx]})
        postingList.append([j,1])
        posting_idx+=1 # keeps track of offset in the posting list
    else:
      postings = postingList[dictionary[word][1]]
      if j == postings[-2]:
        postings[-1] += 1
        max_freq = max(max_freq,postings[-1])
      else:
        postings.extend([j,1])
  freqMax.append(max_freq)
  docNo.append(docno)
  j +=1

In [195]:
n_docs = j
normOfdoc = [0 for i in range(n_docs)]

In [196]:
from math import sqrt,log2
def tf(f,norm,K=0.5):
	return (K + (1-K)*(f/norm))*(1 + log2(f))

In [197]:
def idf(df,N):
	return log2(N/(1+df)) + 1		

In [198]:
def tf_idf(f,f_norm,df,n_docs,K_f=0.5):
	return tf(f,f_norm,K_f)*idf(df,n_docs)

In [199]:
for key in dictionary:
		val = dictionary[key]
		posting = postingList[val[1]]
		dfTemp = len(posting) //2
		val[0] = dfTemp
		idfVal = idf(dfTemp,n_docs)
		#prev_doc = 0
		for i in range(dfTemp):
			doc_id = posting[2*i]
			freq = posting[2*i+1]
			freq_max = freqMax[doc_id]
			f = tf(freq,freq_max)
			normOfdoc[doc_id] += (idfVal*f)**2
			#posting[2*i],prev_doc = posting[2*i]-prev_doc,posting[2*i]
			#print(posting[2*i],prev_doc)


In [200]:
normOfdoc = [float("{:.3f}".format(sqrt(val))) for val in normOfdoc]
statOfdoc[1] = normOfdoc

In [201]:
import json
with gzip.GzipFile("index"+'.dict', 'w') as f:
	f.write(json.dumps(indexStruct).encode('utf-8')) 

with gzip.GzipFile("postingList"+'.idx', 'w') as f:
	f.write(json.dumps(postingList).encode('utf-8'))  