# Projet de RIW

Par Antoine Apollis, Marine Sobas et Paul Viossat

## Installation

In [None]:
!pip install --user nltk

In [None]:
import nltk

# Fetch the NLTK resources: the 'stopwords' corpus (read further down through
# stopwords.words('english')) and the 'punkt' tokenizer models.
# NOTE(review): a WordNetLemmatizer is instantiated later in this file; it
# presumably needs the 'wordnet' corpus, which is not downloaded here — confirm.
nltk.download('stopwords')
nltk.download('punkt')

## Expressions régulières et constantes

In [None]:
from re import compile

# Token separator used by tokenize(): an optional punctuation mark followed by
# one or more double quotes/spaces, or a lone double or single quote.
punctuation_regex = compile(r'[,.;:?!—–&]?[" ]+|["\']')
# Matches strings made only of digits, commas and periods (the empty string
# matches as well).
number_regex = compile("^[0-9,.]*$")
# One line of the index file: "<word>, <n> | <postings>". Group 1 is the word
# (word characters, commas, periods, hyphens); group 2 is the postings text.
# NOTE(review): this pattern and the next one require the document id to be
# digits, while build_inverted_index() keys documents by path string — a saved
# index may therefore fail to reload; confirm.
index_regex = compile(r'^([\w,.-]+), \d+ \| (\(\d+, \d+ ;.*)$')
# One posting "(<doc>, <count> ; <p1> <p2> ...) ": groups are the document id,
# the occurrence count and the space-separated positions.
doc_occ_pos_regex = compile(r'\((\d+), (\d+) ; ((?:\d+ ?)+)\) ')

In [None]:
# Path of the file written by save_index() and read by load_index();
# build_inverted_index() reloads the index from it when the file exists.
INDEX_FILENAME = 'INDEX'

## Fonctions de traitement du texte

In [None]:
from nltk.tokenize import word_tokenize

# Tokenizes a character string
def tokenize(s):
    """Split *s* on punctuation_regex and return the lower-cased pieces.

    Pieces of length 0 or 1 are discarded.
    """
    tokens = []
    for piece in punctuation_regex.split(s):
        if len(piece) > 1:
            tokens.append(piece.lower())
    return tokens

In [None]:
from nltk.corpus import stopwords

# English stop words as a set for fast membership tests.
# NOTE: this rebinds the name `stopwords`, which until now referred to the
# object imported from nltk.corpus.
stopwords = set(stopwords.words('english'))

# Removes stop words (from NLTK) from a list of tokens
def remove_stop_words(tokens):
    """Return the tokens of *tokens* that are not in the stop-word set, in order."""
    kept = []
    for token in tokens:
        if token in stopwords:
            continue
        kept.append(token)
    return kept

In [None]:
def remove_numbers(array):
    """Return the tokens of *array* that do not match number_regex.

    A token matches number_regex when it consists solely of digits, commas
    and periods; the empty string matches too and is therefore dropped.

    The previous version read an undefined name instead of its parameter
    and never returned the filtered list.
    """
    return [w for w in array if not number_regex.match(w)]

In [None]:
from nltk.stem import WordNetLemmatizer, PorterStemmer

# Single lemmatizer instance reused by lemmatize()
lemmatizer = WordNetLemmatizer()

# Lemmatizes a list of tokens
def lemmatize(tokens):
    """Return a new list with lemmatizer.lemmatize() applied to each token, in order."""
    lemmas = []
    for token in tokens:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

# Single stemmer instance reused by stem()
stemmer = PorterStemmer()

# Stems a list of tokens
def stem(tokens):
    """Return a new list with stemmer.stem() applied to each token, in order."""
    return list(map(stemmer.stem, tokens))

## Construction de l’index

In [None]:
def save_index(index, filename=None):
    """Write *index* to a text file, one line per word.

    Each line has the form::

        <word>, <number of documents> | (<doc>, <count> ; <p1> <p2> ...) ...

    with one parenthesised posting, followed by a space, per document.

    Parameters:
        index: mapping word -> {document: [count, [positions]]}.
        filename: destination path; defaults to INDEX_FILENAME.
    """
    if filename is None:
        filename = INDEX_FILENAME
    with open(filename, 'w') as f:
        for word, postings in index.items():
            # The original referenced an undefined `values`; the positions
            # are the second element of each posting.
            entries = ''.join(
                f'({document}, {count} ; {" ".join(map(str, positions))}) '
                for document, (count, positions) in postings.items()
            )
            f.write(f'{word}, {len(postings)} | {entries}\n')

In [None]:
def load_index(filename=None):
    """Read an index file and rebuild the inverted index from it.

    Every non-blank line must match index_regex; its postings are then
    extracted with doc_occ_pos_regex.

    Parameters:
        filename: path of the index file; defaults to INDEX_FILENAME.

    Returns:
        dict mapping word -> {document id (str): [count, [positions]]}.

    Raises:
        ValueError: if a non-blank line does not match index_regex
            (previously this surfaced as an AttributeError on None).
    """
    if filename is None:
        filename = INDEX_FILENAME
    inverted_index = dict()
    with open(filename, 'r') as f:
        for line_number, line in enumerate(f, start=1):
            if not line.strip():
                continue
            m = index_regex.match(line)
            if m is None:
                raise ValueError(
                    f'{filename}, line {line_number}: '
                    f'malformed index entry: {line.rstrip()!r}'
                )
            postings = dict()
            for document, count, positions in doc_occ_pos_regex.findall(m.group(2)):
                # split() without argument tolerates a trailing space in the
                # captured position list.
                postings[document] = [int(count), [int(p) for p in positions.split()]]
            inverted_index[m.group(1)] = postings
    return inverted_index

In [None]:
def extract_vocabulary(collection):
    """Return the set of all distinct tokens found in the values of *collection*."""
    distinct_tokens = set()
    for document_tokens in collection.values():
        distinct_tokens.update(document_tokens)
    return distinct_tokens

In [None]:
def load_document(filename):
    """Return the full text of *filename* with trailing whitespace removed."""
    with open(filename) as handle:
        content = handle.read()
    return content.rstrip()

In [None]:
from os.path import isfile
from os import listdir

def build_inverted_index(directory):
    """Build the positional inverted index of the collection in *directory*.

    If INDEX_FILENAME already exists, the index is loaded from it with
    load_index() and nothing is rebuilt. Otherwise every file found one
    level below *directory* (``directory/<sub_dir>/<file>``) is loaded,
    tokenized, stripped of stop words and stemmed; the index is then built,
    saved with save_index() and returned.

    Parameters:
        directory: path of the collection root, whose entries are listed
            as sub-directories.

    Returns:
        dict mapping word -> {document path: [count, [positions]]}, where
        positions are indices into the document's processed token list.
    """
    if isfile(INDEX_FILENAME):
        return load_index()

    print('Chargement de la collection : ', end='')
    collection = dict()
    for sub_dir in listdir(directory):
        path = directory + '/' + sub_dir
        for filename in listdir(path):
            fullpath = './' + path + '/' + filename
            collection[fullpath] = list()
    print('fait')
    print(f'La collection comporte {len(collection)} documents.')

    # One '=' is printed each time another 2 % of the documents is done.
    twopercents = len(collection) // 50

    nexttwopercents = twopercents
    print('Traitement de la collection en cours : ', end='')
    # Only values of existing keys are replaced, so iterating the dict while
    # assigning is safe.
    for progress, fullpath in enumerate(collection, start=1):
        collection[fullpath] = stem(remove_stop_words(tokenize(load_document(fullpath))))
        # The original tested undefined names (next_percent / percent) here.
        if progress > nexttwopercents:
            nexttwopercents += twopercents
            print('=', end='')

    print('\nCréation du vocabulaire : ', end='')
    vocabulary = extract_vocabulary(collection)
    print('fait')
    print(f'Le vocabulaire comporte {len(vocabulary)} éléments.')

    index = {word: dict() for word in vocabulary}
    # Restart the progress threshold for the second pass.
    nexttwopercents = twopercents
    print('Création de l’index en cours : ', end='')
    for progress, (document, tokens) in enumerate(collection.items(), start=1):
        for position, t in enumerate(tokens):
            posting = index[t].setdefault(document, [0, []])
            posting[0] += 1
            posting[1].append(position)
        if progress > nexttwopercents:
            nexttwopercents += twopercents
            print('=', end='')

    save_index(index)
    return index

In [None]:
# Build the index of the 'pa1-data' collection, or reload it from
# INDEX_FILENAME when that file already exists.
build_inverted_index('pa1-data')

## Paramètres de la collection