# Information Retrieval - Assigment 2
Group 3: Hooshyar Hosna, Lima Rachel, Lorefice Alessandra

## Import packages

In [1]:
import nltk
import re
from numpy import cumsum
from pandas import DataFrame

## Import data

In [2]:
#read textual documents from file
documents_path = 'AssociatedPress.txt'
with open(documents_path, 'r', encoding='utf-8') as doc_f:
    corpus_list = doc_f.readlines()


## Functions

In [3]:
def createTokens(collection):
    
    '''
    given a collection as input, it gives a list of all the tokens of that collection
    and a list lists where each list contains the tokens for that document of the collection
    
    '''
    
    #create create a list of lists for the tokens of each document
    tokenized_collection = []

    for doc in collection:
        tokenized_collection.append(nltk.tokenize.word_tokenize(doc))
        
        
    #create a list of the total tokens
    tokens = []

    for doc_tokens in tokenized_collection:
        for word in doc_tokens:
            tokens.append(word)
            
            
    return tokens, tokenized_collection;

In [4]:
def distinctTerms(tokenized_collection, tokens):
    
    '''
    given a tokenized collection and the total tokens as input, it gives a list of all 
    the distinct terms of that collection and a list lists where each list contains 
    the distinct terms for that document of the collection
    
    '''
    
    #create create a list of lists for the distinct terms of each document
    distinct_terms_collection = []

    for doc_tokens in tokenized_collection:
        distinct_terms_collection.append(sorted(set(doc_tokens)))
        
        
    #create a list of the total distinct terms
    distinct_terms = list(set(tokens))
            
            
    return distinct_terms, distinct_terms_collection;

In [5]:
def is_number(s):
    if "," in s:
        s = s.replace(",","")
    try:
        float(s)
        return True
    except ValueError:
        return False

In [6]:
def TermsNoNum(terms):
    
    '''
    given as input a list of terms, it gives a list of terms without numbers
    
    '''
    
    terms_no_number = []

    for term in terms:
        if not is_number(term):
            terms_no_number.append(term) 
            
    return terms_no_number;
       

In [7]:
def keepOnlyAlph(terms):
    
    '''
    given a list of strings, it checks if the strings are composed just by
    alphabetical values and returns a list containing only those strings 
    
    '''
    
    only_alpha_value_terms = []
    
    for term in terms:
        if term.isalpha() or ("-" in term and re.search("[a-zA-Z]", term.replace("-",""))):
            only_alpha_value_terms.append(term)
                   
    return only_alpha_value_terms

In [8]:
def caseFolding(terms):
    
    '''
    given as input a list of terms, it gives a list of terms without uppercases
    
    '''

    terms_no_uppercase = []
    
    for term in terms:
        if term.isalpha():
            terms_no_uppercase.append(term.lower()) 
        else:
            terms_no_uppercase.append(term)
            
    return terms_no_uppercase

In [9]:
def removeStopWords(terms, n):
    
    '''
    given as input a list of terms and the number of stop words to be removed, 
    it gives a list of terms removing those stop words stop words
    if the number of stop words is bigger than the total number of stop words we have
    it will remove the maximum number of stop words possible
    
    '''

    terms_no_stopwords = []
    stop_words = nltk.corpus.stopwords.words('english')
    
    if n >= len(stop_words):
        n = len(stop_words)
    
    for term in terms:
        if term not in stop_words[0:n]:
            terms_no_stopwords.append(term)
            
    return terms_no_stopwords

In [10]:
def stemming(terms):
    
    '''
    given as input a list of terms, it gives a list of terms after stemming
    we can have repetitions
    
    '''

    ps = nltk.stem.PorterStemmer()
    terms_stemmed = []
    
    for term in terms:
        terms_stemmed.append(ps.stem(term))

    return terms_stemmed


In [11]:
def frequency(terms):
    
    '''
    given as input a list of tokens, it computes the frequency of a token in that list
    
    '''    

    return nltk.FreqDist(terms)

In [12]:
def reductionPercentage(before_red, after_red):
    
    '''
    takes as input 2 ints that represent a value before a given reduction and a value
    after a reduction has been applied and compute the percentange of that reduction
    
    '''
    
    perc_red = int(((before_red - after_red)*100)/before_red)
    
    return perc_red

In [13]:
def allReductionPercentage(numbers):
    '''
    takes as input a list of numbers and compure the sorted percentage reduction
    between consecutive values as a list. The firts element will be always 0
    
    '''
    reduction = [0]
    
    for i in range(1,len(numbers)):
        j = i-1
        reduction.append(reductionPercentage(numbers[j], numbers[i]))
        
    return reduction;
    

In [14]:
def proportionWithUnfiltered(unfiltered_value, value_to_compare):
    
    '''
    it takes as input a value after reduction and a value before reduction and
    computes the proportion with the value of unfiltered terms 
    
    '''
    
    return int(((unfiltered_value-value_to_compare)/unfiltered_value)*100)
    
        

## Build the table of reductions

In [15]:
Distinct_Terms = {"Type": ["Unfiltered", "No numbers", "Terms without symbols", "Case folding", "30 Stop Words", "150 Stop Words", "Stemming"]}


### Tokens

In [16]:
tokenized_terms, tokenized_terms_per_doc = createTokens(corpus_list)


### Distinct terms

In [17]:
distinct_terms, distinct_terms_per_doc = distinctTerms(tokenized_terms_per_doc, tokenized_terms)
unfiltered_value = len(distinct_terms)

### Distinct terms without numbers

In [18]:
no_numbers = TermsNoNum(distinct_terms)
no_numbers_value = len(no_numbers)

### Distinct terms without symbols (except for combined words e.g. "first-degree")

In [19]:
only_alpha_terms = keepOnlyAlph(no_numbers)
only_alpha_terms_value = len(only_alpha_terms)

### Distinct terms after case folding

In [20]:
case_fold = list(set(caseFolding(no_numbers)))
case_fold_value = len(case_fold)

### Distinct terms after removing 30 stop words

In [21]:
stop_words_30 = removeStopWords(case_fold, 30)
stop_words_30_value = len(stop_words_30)

### Distinct terms after removing 150 stop words

In [22]:
stop_words_150 = removeStopWords(case_fold, 150)
stop_words_150_value = len(stop_words_150)

### Distinct terms after stemming

In [23]:
stemmed = list(set(stemming(stop_words_150)))
stemmed_value = len(stemmed)

### Final results

In [24]:
number_filtered = [unfiltered_value, no_numbers_value, only_alpha_terms_value ,case_fold_value, 
                   stop_words_30_value, stop_words_150_value, stemmed_value]


reduction_filtered = allReductionPercentage(number_filtered)

proportion_with_unfiltered = [0]
for value in number_filtered[1:]:
    proportion_with_unfiltered.append(proportionWithUnfiltered(unfiltered_value, value))


In [25]:
Distinct_Terms['Numbers'] = number_filtered
Distinct_Terms['Reduction %'] = reduction_filtered
Distinct_Terms['Proportion with unfiltered %'] = proportion_with_unfiltered

table_distinct_terms = DataFrame(Distinct_Terms)
table_distinct_terms

Unnamed: 0,Type,Numbers,Reduction %,Proportion with unfiltered %
0,Unfiltered,47305,0,0
1,No numbers,43687,7,7
2,Terms without symbols,42554,2,10
3,Case folding,39154,7,17
4,30 Stop Words,39131,0,17
5,150 Stop Words,39023,0,17
6,Stemming,28132,27,40


## Frequency

In [26]:
frequency(caseFolding(tokenized_terms)).most_common(10)

[('the', 59845),
 (',', 51208),
 ('.', 43369),
 ('of', 25536),
 ('to', 24047),
 ('a', 21279),
 ('and', 20697),
 ('in', 20661),
 ('said', 12851),
 ('``', 11264)]

## The issues we faced

One of the problems we have faced was dealing with the words that include a number, such as "10-years-old". The issue came when we tried to remove the numbers. We have realized that some Python commands remove such phrases completely.

Removing symbols caused a problem similar to the previous one. Some words are made of more than one part and “-” is used between the parts. The result of erasing symbols was losing some words such as “first-degree”.

Another issue with removing numbers was finding out the symbol for decimal and Milhar number (it can be "," or ".", or"/").

In the beginning, we structured the case-folding function in a way that it will leave just the strings with all alphabetical characters in them. However, it would have been removed also some terms that we didn't remove before. This means that the number we would have obtained would have been misleading. As a result we created the "keepOnlyAlph" function.