# Retrieving Relevant Documents using TF-IDF

### 1. Importing libraries

In [None]:
import pandas as pd
import PyPDF2 as pypdf2
import collections
import numpy as np
import string

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from os import listdir
import os.path
from os.path import isfile, join
import docx


### 2. Path of the folder containing documents
- Provide the folder path of your interest
- Reading the filenames that end with .pdf or .docx

In [1]:
# Folder to index; replace the placeholder with a real directory path.
file_path = "insert a path containing .pdf and .docx files"

# Collect the names of regular files in that folder whose names end with
# .pdf or .docx (the suffix check is case-sensitive).
files_name = [
    entry
    for entry in listdir(file_path)
    if isfile(join(file_path, entry)) and entry.endswith(('.pdf', '.docx'))
]

NameError: name 'listdir' is not defined

### 3. Reading the files and storing the text as a string

In [None]:
def docx_reader(file):
    """Return the text of a .docx document as a single string.

    Parameters:
        file: path (or file-like object) handed straight to
            ``docx.Document``.

    Returns:
        str: the text of every paragraph in ``doc.paragraphs``, joined
        with newlines.
    """
    doc = docx.Document(file)
    # Join with a newline rather than nothing: otherwise the last word of
    # one paragraph fuses with the first word of the next and the fused
    # token pollutes the word-frequency counts.
    return '\n'.join(para.text for para in doc.paragraphs)


def pdf_reader(file):
    """Return the extracted text of every page of a PDF as one string.

    Parameters:
        file: path of the PDF file to open.

    Returns:
        str: the text of each page, in page order, joined with newlines.
    """
    # NOTE(review): PdfFileReader / numPages / getPage / extractText are the
    # legacy PyPDF2 names; PyPDF2 3.x replaced them with PdfReader / pages /
    # extract_text. Confirm which PyPDF2 version this notebook targets.
    #
    # The context manager guarantees the handle is closed even when parsing
    # fails; the original left the file open.
    with open(file, 'rb') as pdf_object:
        pdfreader = pypdf2.PdfFileReader(pdf_object)
        # Extract inside the ``with`` block so the handle is still open
        # while the pages are being read. Pages are joined with a newline
        # so words on either side of a page break do not fuse together.
        return '\n'.join(
            pdfreader.getPage(page_index).extractText()
            for page_index in range(pdfreader.numPages)
        )

### 4. Cleaning the text and Calculating the frequency of words
- Split the text into words 
- Convert to lowercase
- Remove punctuation
- Remove stop words
- Calculate frequency of words in each file

In [None]:

def tokenisation_df(file_content):
    """Clean a text and count how often each remaining word occurs.

    The text is lower-cased, every character in ``string.punctuation`` is
    replaced by a space, English stop words are dropped, and the result is
    tokenised with ``word_tokenize`` before counting.

    Parameters:
        file_content (str): raw text of a document or of a search query.

    Returns:
        pandas.DataFrame: one row per distinct token with the columns
        ``event`` (the token) and ``freq`` (its count), in first-seen
        order. Both columns are present even when no token survives the
        cleaning, so callers can always index ``freq``.
    """
    # Lower-case, then turn punctuation into spaces so "end.Start" splits
    # into two words instead of merging into one.
    table = str.maketrans({key: ' ' for key in string.punctuation})
    cleaned = file_content.lower().translate(table)

    # A set gives constant-time membership tests; the stop-word list is
    # checked once per word of the document.
    stop_words = set(stopwords.words('english'))
    kept_words = [word for word in cleaned.split() if word not in stop_words]

    # Count the tokens of the cleaned text.
    tokens = word_tokenize(' '.join(kept_words))
    counts = collections.Counter(tokens)

    # Build the frame with explicit column names: the previous
    # from_dict/reset_index/rename chain produced no 'freq' column at all
    # for an empty counter.
    keywords_df = pd.DataFrame(list(counts.items()), columns=['event', 'freq'])
    return keywords_df

#helper function to read .pdf or .docx file

def text(x, z):
    """Read a document with the reader that matches its file extension.

    Parameters:
        x (str): file extension including the leading dot, e.g. ``'.pdf'``
            or ``'.docx'``; matched case-insensitively.
        z: path of the file, passed unchanged to the selected reader.

    Returns:
        str: whatever ``pdf_reader`` or ``docx_reader`` returns for ``z``.

    Raises:
        ValueError: if ``x`` is not one of the supported extensions.
    """
    switcher = {
        '.pdf': pdf_reader,
        '.docx': docx_reader,
    }

    reader = switcher.get(x.lower())
    if reader is None:
        # Fail with a clear message instead of the opaque
        # "'NoneType' object is not callable" the bare lookup produced.
        raise ValueError(
            "Unsupported file extension %r for %r; expected one of %s"
            % (x, z, ', '.join(sorted(switcher)))
        )

    return reader(z)


### 5. Calculating the TF-IDF score for each word in every document
- Calculate the Term Frequency (TF) of each word in a document
- Calculate the Inverse Document Frequency (IDF) of each word across all documents
- Calculate the TF-IDF score, i.e. TF * IDF, of each word

In [None]:
def main_df(files_name):
    """Compute the TF, IDF and TF-IDF of every word in every file.

    Each file is read from the module-level ``file_path`` folder via
    ``text`` and cleaned/counted with ``tokenisation_df``.

    Parameters:
        files_name: iterable of file names located inside ``file_path``.

    Returns:
        pandas.DataFrame: one row per (word, file) pair with the columns
        ``event``, ``freq``, ``File``, ``TF``, ``idf`` and ``Tf_idf``.
        The frame is empty (but has all six columns) when there are no
        files or no file yields any word.
    """
    columns = ['event', 'freq', 'File', 'TF', 'idf', 'Tf_idf']

    # TF: collect one frame per file and concatenate once at the end
    # (DataFrame.append was removed in pandas 2.0 and copied the whole
    # frame on every call).
    per_file_frames = []
    for file in files_name:
        # Reading the text of the file
        filename = join(file_path, file)
        tex = text(os.path.splitext(filename)[1], filename)

        # Cleaning the text and calculating the frequency of words
        keywords_frequency = tokenisation_df(tex)
        if keywords_frequency.empty:
            # Nothing survived the cleaning; skipping avoids dividing by a
            # zero total below.
            continue

        keywords_frequency['File'] = file

        # TF of a word = its count / total count of words in the file
        total_words = keywords_frequency['freq'].sum()
        keywords_frequency['TF'] = keywords_frequency['freq'] / total_words

        per_file_frames.append(keywords_frequency)

    if not per_file_frames:
        return pd.DataFrame(columns=columns)

    keywords_final = pd.concat(per_file_frames, ignore_index=True)

    # IDF: each file contributes at most one row per word, so the number of
    # rows sharing a word is the number of files that contain it.
    n_files = keywords_final['File'].nunique()
    doc_freq = keywords_final['event'].map(
        keywords_final['event'].value_counts()
    )
    idf = np.log(n_files / (1 + doc_freq))

    # Negative values (word present in every file) are replaced by 1, as in
    # the original row-by-row loop.
    # NOTE(review): this gives words found in every file a larger weight
    # than rarer words, which looks inverted for an IDF -- confirm intent.
    keywords_final['idf'] = idf.where(idf >= 0, 1.0)

    # TF-IDF score for each word
    keywords_final['Tf_idf'] = keywords_final['TF'] * keywords_final['idf']
    return keywords_final



### 6. Ranking the files based on user search
- Cleaning the input search to identify key words
- Based on the keywords each file is assigned a final score
- Score is calculated as the sum of TF-IDF scores of keywords with respect to each file
- Then the files are sorted based on score


In [None]:

def rank_df(input_content):
    """Score every file in ``files_name`` against a search query.

    The query is cleaned with ``tokenisation_df``; a file's score is the
    sum of the TF-IDF values (from ``main_df``) of the query words that
    occur in that file.

    Parameters:
        input_content (str): the user's search text.

    Returns:
        pandas.DataFrame: one row per entry of the module-level
        ``files_name``, in that order, with the columns ``File`` and
        ``Score``. Files containing none of the query words score 0.0.
    """
    # Cleaning the user input to get the distinct query words
    token_df = tokenisation_df(input_content)
    keywords_final = main_df(files_name)

    # Keep only the rows whose word appears in the query, then total the
    # TF-IDF per file in one pass instead of filtering the whole frame
    # once per (file, word) pair.
    matches = keywords_final[keywords_final['event'].isin(token_df['event'])]
    file_scores = matches.groupby('File')['Tf_idf'].sum()

    # Build the result in one go (DataFrame.append was removed in
    # pandas 2.0); files with no matching word default to 0.0.
    rank_matrix = pd.DataFrame({
        'File': list(files_name),
        'Score': [float(file_scores.get(name, 0.0)) for name in files_name],
    })
    return rank_matrix

# Input here: replace the placeholder with the keywords to search for
Relevancy_files = rank_df('enter keywords to search')

# Final list of files, highest score first.
# (The original sorted an undefined name, `Relevant_files`, which raised
# NameError; the frame computed above is the one to sort.)
Relevancy_files = Relevancy_files.sort_values(by='Score', ascending=False)
