In [1]:
from os import listdir
import os,sys
import nltk
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('stopwords')
stopword_list = stopwords.words('english')
import string
def load_files(directory):
    """
    Given a directory name, return a dictionary mapping the filename of each
    `.txt` file inside that directory to the file's contents as a string.
    """
    datafile = os.path.abspath(os.path.dirname(directory)+"corpus")
    ans=os.listdir(datafile)
    final={}
    for i in range(len(ans)):
        with open(os.path.join(datafile,ans[i]),encoding='utf-8') as fin:
                document = fin.read()
        final[os.path.join(datafile,ans[i])]=document
    return final
def remove_punctuation_marks(tokens):
    clean_tokens = []
    for tok in tokens:
        if tok not in string.punctuation:
            clean_tokens.append(tok)
    return clean_tokens
def tokenize(document):
    """
    Given a document (represented as a string), return a list of all of the
    words in that document, in order.

    Process document by coverting all words to lowercase, and removing any
    punctuation or English stopwords.
    """
    
    tokens=word_tokenize(document)
    tokens_clean = []
    for tok in tokens:
        if tok not in stopword_list:
            tokens_clean.append(tok)
    new=remove_punctuation_marks(tokens_clean)
    tokens_lower = []
    for tok in new:
        tokens_lower.append(tok.lower())
    return tokens_lower
def compute_idfs(documents):
    """
    Given a dictionary of `documents` that maps names of documents to a list
    of words, return a dictionary that maps words to their IDF values.

    Any word that appears in at least one of the documents should be in the
    resulting dictionary.
    """
    idf={}
    for i in documents.keys():
        for j in set(documents[i]):
            if j not in idf:
                idf[j]=1
            else:
                idf[j]+=1

    idf_final=dict((k,np.log(len(documents.keys())/v) ) for k, v in idf.items())
    return idf_final

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\POPO\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
files = load_files(sys.argv[1])

In [3]:
file_words = {
        filename: tokenize(files[filename])
        for filename in files
    }
file_idfs = compute_idfs(file_words)

In [12]:
query = set(tokenize(input("Query: ")))
query

Query: When was Python 3.0 released?


{'3.0', 'python', 'released', 'when'}

In [13]:
files_=file_words
idfs=file_idfs
def top_files(query, files_, idfs, n):
    """
    Given a `query` (a set of words), `files` (a dictionary mapping names of
    files to a list of their words), and `idfs` (a dictionary mapping words
    to their IDF values), return a list of the filenames of the the `n` top
    files that match the query, ranked according to tf-idf.
    """
    answer=dict()
    for i in files_.keys():
        answer[i]=0
        temp=dict()
        dist = nltk.FreqDist(files_[i])
        for k in query:
            if k in dist.keys():
                temp[k]=dist[k]
        for m in temp.keys():
            temp[m]=temp[m]*idfs[m]
        answer[i]=sum(temp.values())
    return list({k: answer[k] for k in sorted(answer, key=answer.get, reverse=True)}.keys())[:n]

In [14]:
filenames = top_files(query, files_, file_idfs, n=1)
filenames

['C:\\Users\\POPO\\Downloads\\questions\\corpus\\python.txt']

In [15]:
sentences = dict()
for filename in filenames:
    for passage in files[filename].split("\n"):
        for sentence in nltk.sent_tokenize(passage):
            tokens = tokenize(sentence)
            if tokens:
                sentences[sentence] = tokens

In [16]:
idfs = compute_idfs(sentences)

In [17]:
def similar(s1, s2):
    sameWords = set.intersection(set(s1), set(s2))
    return len(sameWords)
def top_sentences(query, sentences, idfs, n):
    """
    Given a `query` (a set of words), `sentences` (a dictionary mapping
    sentences to a list of their words), and `idfs` (a dictionary mapping words
    to their IDF values), return a list of the `n` top sentences that match
    the query, ranked according to idf. If there are ties, preference should
    be given to sentences that have a higher query term density.
    """
    answer=dict()
    for i in sentences.keys():
        answer[i]=0
        temp=dict()
        for k in query:
            for j in tokenize(i):
                if j==k:
                    if j not in temp:
                        temp[j]=1  
        for m in temp.keys():
            temp[m]*=idfs[m]
        answer[i]=(sum(temp.values()),similar(query,[i.lower() for i in i.split(" ")]))
        #print(answer[i],i,query)
    return [t[0] for t in sorted(answer.items(), key=lambda x:(x[1][0],x[1][1]),reverse=True)][:n]

In [11]:
n=1
print(query)
#What are the types of supervised learning?
top_sentences(query, sentences, idfs, n)

{'types', 'what', 'supervised', 'learning'}


['Types of supervised learning algorithms include Active learning , classification and regression.']

In [18]:
n=1
print(query)
#When was Python 3.0 released?
top_sentences(query, sentences, idfs, n)

{'python', 'when', '3.0', 'released'}


['Python 3.0 was released on 3 December 2008.']