In [1]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import sys
import os
import re
import string
import numpy as np

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
def load_files(directory):
    """
    Given a directory name, return a dictionary mapping the filename of each
    `.txt` file inside that directory to the file's contents as a string.

    Only regular files whose name ends in `.txt` are read; sub-directories
    (for example Jupyter's `.ipynb_checkpoints`) and other files are skipped.
    Files are decoded as UTF-8 and visited in sorted name order so the
    resulting dictionary has a deterministic iteration order.
    """
    contents = dict()
    for name in sorted(os.listdir(directory)):
        path = os.path.join(directory, name)
        # Skip anything that is not a plain `.txt` file, as the contract above
        # promises; opening a directory would raise an error.
        if not name.endswith('.txt') or not os.path.isfile(path):
            continue
        # The context manager guarantees the handle is closed even if
        # reading or decoding fails.
        with open(path, 'r', encoding='UTF-8') as handle:
            contents[name] = handle.read()
    return contents


def tokenize(document):
    """
    Given a document (represented as a string), return a list of all of the
    words in that document, in order.

    Process document by converting all words to lowercase, and removing any
    punctuation or English stopwords.

    Every character in `string.punctuation` is replaced by a space, the text
    is split on any run of whitespace (spaces, tabs, newlines), and tokens
    found in NLTK's English stopword list are dropped.
    """
    stop_words = set(stopwords.words('english'))

    # re.escape is required here: unescaped, the `\]` sequence inside
    # string.punctuation is parsed as an escaped `]`, so the backslash itself
    # would never be matched and would survive inside tokens.
    punctuation = re.compile('[' + re.escape(string.punctuation) + ']')
    cleaned = punctuation.sub(' ', document.lower())

    # str.split() with no argument splits on any whitespace and never yields
    # empty strings, so tabs/newlines/repeated spaces need no special casing.
    return [token for token in cleaned.split() if token not in stop_words]

def compute_idfs(documents):
    """
    Given a dictionary of `documents` that maps names of documents to a list
    of words, return a dictionary that maps words to their IDF values.

    The IDF of a word is the natural logarithm of the total number of
    documents divided by the number of documents containing that word, so
    every word occurring in at least one document gets an entry.
    """
    total_documents = len(documents)

    # Document frequency: how many documents contain each word at least once.
    document_frequency = {}
    for words in documents.values():
        for term in set(words):
            document_frequency[term] = document_frequency.get(term, 0) + 1

    return {
        term: np.log(total_documents / frequency)
        for term, frequency in document_frequency.items()
    }


def top_files(query, files, idfs, n):
    """
    Given a `query` (a set of words), `files` (a dictionary mapping names of
    files to a list of their words), and `idfs` (a dictionary mapping words
    to their IDF values), return a list of the filenames of the `n` top
    files that match the query, ranked according to tf-idf.

    To find the most relevant documents, we'll use tf-idf to rank documents
    based both on term frequency for words in the query as well as inverse
    document frequency for words in the query.

    Term frequency is log-scaled as `1 + log(count)`, so a word occurring
    once still contributes its full IDF and repeated occurrences add
    diminishing weight. Query words absent from a file contribute nothing.
    Files with equal scores keep the iteration order of `files`.
    """
    scored = []
    for filename, words in files.items():
        score = 0.0
        for term in query:
            occurrences = words.count(term)
            if occurrences:
                # A bare log(count) would be 0 for a single occurrence and
                # silently discard the match; the `1 +` offset avoids that.
                score += (1 + np.log(occurrences)) * idfs[term]
        scored.append((filename, score))

    # sorted() is stable, so ties are resolved deterministically.
    ranked = sorted(scored, key=lambda item: -item[1])
    return [filename for filename, _ in ranked[:n]]


def top_sentences(query, sentences, idfs, n):
    """
    Given a `query` (a set of words), `sentences` (a dictionary mapping
    sentences to a list of their words), and `idfs` (a dictionary mapping words
    to their IDF values), return a list of the `n` top sentences that match
    the query, ranked according to idf. If there are ties, preference should
    be given to sentences that have a higher query term density.

    A sentence's score is the sum of the IDF values of the query words it
    contains (each query word counted once). Query term density is the
    fraction of the sentence's words that are also in the query; a sentence
    with no words has density 0. Remaining ties keep the iteration order of
    `sentences`.
    """
    ranked = []
    for sentence, words in sentences.items():
        distinct_words = set(words)
        idf_total = sum(idfs[term] for term in query if term in distinct_words)

        # Density is computed once per sentence, after all matches are known,
        # and guards against an empty token list.
        matching_words = sum(1 for word in words if word in query)
        density = matching_words / len(words) if words else 0.0

        ranked.append((sentence, idf_total, density))

    # Highest IDF sum first, then highest density; list.sort() is stable.
    ranked.sort(key=lambda item: (-item[1], -item[2]))
    return [sentence for sentence, _, _ in ranked[:n]]

In [3]:
def main(corpus_dir="corpus", file_matches=1, sentence_matches=1):
    """
    Answer one question typed by the user from the documents in `corpus_dir`.

    Loads and tokenizes every file in `corpus_dir`, prompts for a query,
    selects the `file_matches` best files by tf-idf, splits those files into
    sentences, and prints the `sentence_matches` best sentences.

    The defaults reproduce the original behavior: the "corpus" directory,
    one file, and one sentence.
    """
    files = load_files(corpus_dir)
    file_words = {
        filename: tokenize(files[filename])
        for filename in files
    }
    file_idfs = compute_idfs(file_words)

    # Prompt user for query
    query = set(tokenize(input("Query: ")))

    # Determine top file matches according to TF-IDF
    filenames = top_files(query, file_words, file_idfs, n=file_matches)

    # Extract sentences from top files, keyed by the original sentence text.
    # NOTE(review): nltk.sent_tokenize relies on NLTK tokenizer data; only the
    # 'stopwords' package is downloaded in the visible cells - confirm the
    # sentence tokenizer data is installed on a fresh environment.
    sentences = dict()
    for filename in filenames:
        for passage in files[filename].split("\n"):
            for sentence in nltk.sent_tokenize(passage):
                tokens = tokenize(sentence)
                # Sentences made up only of stopwords/punctuation are skipped.
                if tokens:
                    sentences[sentence] = tokens

    # IDF values here are computed over the extracted sentences, not the files.
    sentence_idfs = compute_idfs(sentences)

    # Determine top sentence matches
    matches = top_sentences(query, sentences, sentence_idfs, n=sentence_matches)
    for match in matches:
        print(match)

In [4]:
# Run the pipeline once: main() prompts with "Query: " and prints the top match.
main()

Query: What are the types of supervised learning?
Types of supervised learning algorithms include Active learning , classification and regression.


In [5]:
# Second example run; the question is typed at the "Query: " prompt.
main()

Query: How do neurons connect in a neural network?
Neurons of one layer connect only to neurons of the immediately preceding and immediately following layers.


In [6]:
# Third example run; the question is typed at the "Query: " prompt.
main()

Query: When was Python 3.0 released?
Python 3.0 was released on 3 December 2008.
