# Projet de RIW

Par Antoine Apollis, Marine Sobas et Paul Viossat

## Installation

In [1]:
!pip install --user nltk



In [2]:
import nltk

# Fetch the 'stopwords' and 'punkt' NLTK packages into the local nltk_data
# directory (a no-op when they are already up to date, as the output below shows).
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to C:\Users\Antoine
[nltk_data]     Apollis\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Antoine
[nltk_data]     Apollis\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Expressions régulières et constantes

In [3]:
from re import compile

# Token separator: an optional punctuation mark followed by one or more
# spaces/double quotes, or a lone single or double quote.
punctuation_regex = compile(r'[,.;:?!—–&]?[" ]+|["\']')
# Matches strings made only of digits, commas and periods (the empty string
# matches too, because of the `*` quantifier).
number_regex = compile("^[0-9,.]*$")
# One line of the saved index file: group 1 is the term, group 2 is the raw
# run of postings that follows the " | " separator.
index_regex = compile(r'(\S+), \d+ \| (\(\S+, \d+ ;.*)')
# One posting inside such a line: group 1 is the document, group 2 the
# occurrence count, group 3 the space-separated positions.
doc_occ_pos_regex = compile(r'\((\S+), (\d+) ; ((?:\d+ ?)+)\) ')

In [4]:
# Path of the file the inverted index is written to and reloaded from; it is
# passed to open() as-is, so it is resolved against the working directory.
INDEX_FILENAME = 'INDEX'

## Fonctions de traitement du texte

In [5]:
from nltk.tokenize import word_tokenize

def tokenize(s):
    """Split a string into lowercase tokens.

    The string is cut wherever ``punctuation_regex`` matches; pieces of
    length 0 or 1 are discarded and the remaining ones are lowercased.

    Args:
        s: The text to tokenize.

    Returns:
        list[str]: The kept tokens, in order of appearance.
    """
    kept = []
    for piece in punctuation_regex.split(s):
        if len(piece) > 1:
            kept.append(piece.lower())
    return kept

In [6]:
from nltk.corpus import stopwords

# Rebinds the name `stopwords` (until here the corpus object imported above)
# to the set of English stop words it provides.
stopwords = set(stopwords.words('english'))

def remove_stop_words(tokens):
    """Drop every token that belongs to the module-level ``stopwords`` set.

    Args:
        tokens: An iterable of tokens.

    Returns:
        list: The tokens that are not stop words, in their original order.
    """
    return list(filter(lambda token: token not in stopwords, tokens))

In [7]:
def remove_numbers(array):
    """Drop the tokens that look like numbers.

    A token is dropped when ``number_regex`` matches it, i.e. when it is made
    only of digits, commas and periods. The empty string matches that pattern
    too, so empty tokens are dropped as well.

    Args:
        array: An iterable of string tokens.

    Returns:
        list[str]: The tokens that are not numbers, in their original order.
    """
    # The previous body iterated over an undefined name instead of `array`
    # and discarded the result, so every call raised NameError.
    return [w for w in array if not number_regex.match(w)]

In [8]:
from nltk.stem import WordNetLemmatizer, PorterStemmer

lemmatizer = WordNetLemmatizer()


def lemmatize(tokens):
    """Lemmatize each token with the shared ``WordNetLemmatizer`` instance.

    NOTE(review): the WordNet lemmatizer presumably needs the NLTK 'wordnet'
    corpus, while this notebook's setup cell only downloads 'stopwords' and
    'punkt' — confirm before relying on this function.

    Args:
        tokens: An iterable of tokens.

    Returns:
        list: One lemmatized token per input token, in the same order.
    """
    return list(map(lemmatizer.lemmatize, tokens))

stemmer = PorterStemmer()


def stem(tokens):
    """Stem each token with the shared ``PorterStemmer`` instance.

    Args:
        tokens: An iterable of tokens.

    Returns:
        list: One stemmed token per input token, in the same order.
    """
    return list(map(stemmer.stem, tokens))

## Construction de l’index

In [9]:
def save_index(index):
    """Write the inverted index to ``INDEX_FILENAME`` as text, one term per line.

    Each line has the form
    ``term, n_documents | (document, count ; pos pos ...) (document, ...) ``
    where every posting, including the last one, is followed by a space.

    Args:
        index: Mapping ``{term: {document: [count, [positions]]}}``.
    """
    with open(INDEX_FILENAME, 'w') as out:
        for word, postings in index.items():
            header = f'{word}, {len(postings)} | '
            body = ''.join(
                f'({document}, {entry[0]} ; {" ".join(str(p) for p in entry[1])}) '
                for document, entry in postings.items()
            )
            out.write(header + body + '\n')

In [10]:
def load_index():
    """Read back the inverted index stored in ``INDEX_FILENAME``.

    Every line is parsed with ``index_regex`` (term and raw postings) and the
    postings are then extracted with ``doc_occ_pos_regex``. A line that does
    not match ``index_regex`` makes the function fail with AttributeError.

    Returns:
        dict: ``{term: {document: [count, [positions]]}}`` with the count and
        the positions converted to ``int``.
    """
    inverted_index = {}
    with open(INDEX_FILENAME) as source:
        for line in source:
            parsed = index_regex.match(line)
            word, raw_postings = parsed.group(1), parsed.group(2)
            inverted_index[word] = {
                document: [int(count), [int(p) for p in positions.split(' ')]]
                for document, count, positions in doc_occ_pos_regex.findall(raw_postings)
            }
    return inverted_index

In [11]:
def extract_vocabulary(collection):
    """Collect the distinct tokens found across a whole collection.

    Args:
        collection: Mapping whose values are iterables of tokens (the keys
            are ignored).

    Returns:
        set: Every token that appears in at least one value.
    """
    return {token for tokens in collection.values() for token in tokens}

In [12]:
def load_document(filename):
    """Return the text of a file with trailing whitespace removed.

    The file is opened in text mode with the platform's default encoding.

    Args:
        filename: Path of the file to read.

    Returns:
        str: The file content, right-stripped.
    """
    with open(filename) as source:
        contents = source.read()
    return contents.rstrip()

In [13]:
from time import time
from os import listdir
from os.path import isfile, getsize

def build_inverted_index(directory):
    """Build (or reload) the positional inverted index of a collection.

    If the file ``INDEX_FILENAME`` already exists it is loaded with
    ``load_index()`` and returned as-is; ``directory`` is not looked at in
    that case. Otherwise every file found in the immediate sub-directories of
    ``directory`` is read, tokenized, stripped of stop words and stemmed; the
    index is then built in memory, written with ``save_index()`` and
    returned. Progress and timing messages are printed along the way.

    Args:
        directory: Collection root. Every entry it contains is listed as a
            sub-directory of documents. Document paths are formed as
            ``'./' + directory + '/' + sub_dir + '/' + filename``, so a path
            relative to the working directory is expected.

    Returns:
        dict: ``{term: {document_path: [count, [positions]]}}`` where the
        positions are 0-based offsets into the document's processed tokens.
    """
    if isfile(INDEX_FILENAME):
        return load_index()
    fullchrono = time()
    print('Chargement de la collection : ', end='')
    collection = dict()
    for sub_dir in listdir(directory):
        path = directory + '/' + sub_dir
        for filename in listdir(path):
            # Plain concatenation on purpose: this string becomes the
            # document identifier stored in the index.
            fullpath = './' + path + '/' + filename
            collection[fullpath] = list()
    print('fait')
    ndocuments = len(collection)
    print(f'La collection comporte {ndocuments} documents.\n======')
    progress = 0
    # Report about every 10 % of the collection. The step is clamped to 1:
    # with fewer than ten documents `ndocuments // 10` is 0, which used to
    # make the remaining-time estimate below divide by zero.
    step = max(1, ndocuments // 10)
    nextstep = step
    chrono = time()
    for fullpath in collection.keys():
        collection[fullpath] = stem(remove_stop_words(tokenize(load_document(fullpath))))
        progress += 1
        if progress > nextstep:
            print(f'Traitement de la collection en cours : encore {round((time() - chrono) / nextstep * (ndocuments - progress) / 60)} min')
            nextstep += step
    print('======\nCréation du vocabulaire : ', end='')
    vocabulary = extract_vocabulary(collection)
    print('fait')
    print(f'Le vocabulaire comporte {len(vocabulary)} éléments.\n======')
    index = {word: dict() for word in vocabulary}
    progress = 0
    nextstep = step
    chrono = time()
    for (document, tokens) in collection.items():
        for position, t in enumerate(tokens):
            postings = index[t]
            if document in postings:
                # Term already seen in this document: bump the count and
                # record the new position.
                postings[document][0] += 1
                postings[document][1].append(position)
            else:
                postings[document] = [1, [position]]
        progress += 1
        if progress > nextstep:
            print(f'Création de l’index en cours : encore {round((time() - chrono) / nextstep * (ndocuments - progress))} s')
            nextstep += step
    save_index(index)
    print('\nIndex créé et enregistré\n======')
    print(f'L’opération complète a nécessité {(time() - fullchrono) / 60:.1f} minutes.')
    print(f'L’index occupe {getsize(INDEX_FILENAME) // 1000} ko.')
    return index

In [14]:
# Index the 'pa1-data' collection, or reload the saved index when the
# INDEX_FILENAME file already exists.
index = build_inverted_index('pa1-data')