# FastText (8 points)

In [22]:
import fasttext
from datasets import load_dataset
import numpy as np

# Dataset
## Load and split dataset

In [9]:
dataset = load_dataset("imdb")
# Carve a validation set out of the official training split: 20% of the rows,
# stratified on the label, with a fixed seed.
train_dataset = dataset["train"].train_test_split(
    test_size=0.2,
    stratify_by_column="label",
    shuffle=True,
    seed=42,
)
train_df = train_dataset["train"]
valid_df = train_dataset["test"]
# The official test split is kept untouched for the final evaluation.
test_df = dataset["test"]
train_df.shape, valid_df.shape, test_df.shape

Found cached dataset imdb (/home/maxenceoden/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached processed dataset at /home/maxenceoden/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-d8e5ee367e34a906.arrow
Loading cached processed dataset at /home/maxenceoden/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-bc11a628dc776776.arrow
Loading cached processed dataset at /home/maxenceoden/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-39e499381803f536.arrow
Loading cached split indices for dataset at /home/maxenceoden/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-5f37fd0866e4f89f.arrow and /home/maxenceoden/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-dd5732a0e6ac784c.arrow


((20000, 2), (5000, 2), (25000, 2))

## Preprocessing

- Lowercase
- Remove punctuation

In [7]:
# Characters that remove_punctuation() replaces with a space. '!' is not
# listed because remove_punctuation() handles it separately, and '-' is not
# listed either, so hyphens stay in the text.
punctuation_filter = list('"#$%&\'()*+,./:;<=>?@[\\]^_`{|}~')

def to_lower_case(row: dict) -> dict:
    """
    Convert the 'text' entry of row to lower case (the row is updated in place)
    return: the same row, updated
    """
    lowered = row['text'].lower()
    row['text'] = lowered
    return row

def remove_punctuation(row: dict) -> dict:
    """
    Replace every character of punctuation_filter found in the 'text'
    entry of row with a space, then surround each '!' with spaces
    (the row is updated in place)
    return: the same row, updated
    """
    cleaned = row['text']
    for symbol in punctuation_filter:
        cleaned = cleaned.replace(symbol, ' ')
    # '!' is kept, but padded with spaces
    row['text'] = cleaned.replace('!', ' ! ')
    return row

def preprocessing(row: dict) -> dict:
    """
    Full cleaning of the 'text' entry of row: first replace the
    punctuation (see remove_punctuation), then lower-case the result
    (see to_lower_case)
    return: the same row, updated
    """
    row = remove_punctuation(row)
    row = to_lower_case(row)
    return row

In [44]:
# Clean every split, then shuffle it with a fixed seed (the same value as the
# train/validation split) so that the row order, and everything derived from
# it, is identical on every run instead of changing at each execution.
Xtest = test_df.map(preprocessing).shuffle(seed=42)
Xvalid = valid_df.map(preprocessing).shuffle(seed=42)
Xtrain = train_df.map(preprocessing).shuffle(seed=42)
# Look at one cleaned review to check the preprocessing
Xtest['text'][1]

Loading cached processed dataset at /home/maxenceoden/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-bc11a628dc776776.arrow
Loading cached processed dataset at /home/maxenceoden/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-09e237570f4ca186.arrow
Loading cached processed dataset at /home/maxenceoden/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-7377700f087d94f7.arrow


'in a genre by itself  this film has a limited audience and narrow appeal coupled with a subtle undertone which permeates the entire production  nevertheless  it is a remarkable piece of cinema which is as timeless as a rare work of art  capturing a time in québec rarely seen in movies  mon oncle antoine s strength lies in the depth of its characters and the richness of the settings  duplessis  québec  parochial and feudal  is brilliantly cast as the backdrop which could not possibly be achieved by anyone other than a pure laine québecois  br    br   it would be far too easy to resort to stereotypes  clichés and single-minded myopic statements in this story  yet the director chose to skip the forced imagery and instead  focused on the essence of life in rural québec of the time  that makes this film exceptional in its authenticity while not being pretentious in its presentation  if only more contemporary cinematic endeavors would do the same  the viewing public might not be forced to c

In [45]:
# Display the summary (features and number of rows) of the preprocessed test split
Xtest

Dataset({
    features: ['text', 'label'],
    num_rows: 25000
})

## Create fasttext input files

In [46]:
def gen_dataset(file_name: str, dataset) -> None:
    """
    Write dataset to file_name in the fastText supervised format:
    one "__label__<label> <text>" line per sample.

    file_name: path of the text file to create (overwritten if it exists)
    dataset: iterable of rows exposing the 'label' and 'text' keys
    return: None
    """
    # Write UTF-8 explicitly instead of relying on the platform default
    # encoding: the reviews contain non-ASCII characters (e.g. 'é').
    with open(file_name, 'w', encoding='utf-8') as f:
        for sample in dataset:
            # One sample per line: a line break inside the text would split
            # the review into several lines, only the first carrying the label.
            text = sample['text'].replace('\r', ' ').replace('\n', ' ')
            f.write(f"__label__{sample['label']} {text}\n")

In [47]:
# Export each preprocessed split to its own fastText input file
for _file_name, _split in (
    ("valid.txt", Xvalid),
    ("train.txt", Xtrain),
    ("test.txt", Xtest),
):
    gen_dataset(_file_name, _split)

# Train a FastText classifier with default parameters

In [61]:
# Baseline: supervised classifier trained with fastText's default parameters
default_model = fasttext.train_supervised(input='train.txt')

Read 4M words
Number of words:  86070
Number of labels: 2
Progress: 100.0% words/sec/thread: 4976132 lr:  0.000000 avg.loss:  0.424125 ETA:   0h 0m 0s


In [62]:
# Evaluate the baseline on the test file; per the fastText Python API the
# result is (number of examples, precision@1, recall@1).
default_model.test('test.txt')

(25000, 0.86912, 0.86912)

# Hyper parameters search

In [55]:
# Let fastText's autotune search the hyper-parameters, scoring each trial on
# the validation file, before training again with the best arguments found.
hyper_model = fasttext.train_supervised(
    input='train.txt',
    autotuneValidationFile='valid.txt',
)

Progress: 100.0% Trials:    9 Best score:  0.897200 ETA:   0h 0m 0s
Training again with best arguments
Read 4M words
Number of words:  86070
Number of labels: 2
Progress: 100.0% words/sec/thread:  996486 lr:  0.000000 avg.loss:  0.046579 ETA:   0h 0m 0s


In [58]:
# Evaluate the autotuned model on the same test file as the baseline; per the
# fastText Python API the result is (number of examples, precision@1, recall@1).
hyper_model.test("test.txt")

(25000, 0.89576, 0.89576)

# Differences between the models 

In [100]:
# Width of each column, without the one-space padding on both sides. The
# cells are padded by the format specs rather than by hand-typed spaces, so
# the table stays aligned whatever values autotune ends up selecting.
name_width, default_width, hyper_width = 10, 16, 19

# (displayed name, value for the default model, value for the autotuned model)
comparison_rows = [
    ("Dimension", default_model.get_dimension(), hyper_model.get_dimension()),
] + [
    (arg_name, getattr(default_model, arg_name), getattr(hyper_model, arg_name))
    for arg_name in ("minn", "maxn", "neg", "wordNgrams", "lr", "minCount")
]

print(f" {'Field':<{name_width}} | {'default model':^{default_width}} | {'hyper model':^{hyper_width}} |")
print("+".join("-" * (width + 2) for width in (name_width, default_width, hyper_width)) + "+")
for field, default_value, hyper_value in comparison_rows:
    print(f" {field:<{name_width}} | {default_value!s:<{default_width}} | {hyper_value!s:<{hyper_width}} |")

 Field      |  default model   |     hyper model     |
------------+------------------+---------------------+
 Dimension  | 100              | 92                  |
 minn       | 0                | 0                   |
 maxn       | 0                | 0                   |
 neg        | 5                | 5                   |
 wordNgrams | 1                | 2                   |
 lr         | 0.1              | 0.08499425639667486 |
 minCount   | 1                | 1                   |


# 2 wrongly classified examples

In [124]:
def search_wrong_classified(model, dataset=None, max_length=2000):
    """
    Find two misclassified samples that have different true labels.

    model: object whose predict(text) returns (labels, probabilities), the
        first label being a string of the form '__label__<int>'
    dataset: iterable of rows with the 'text' and 'label' keys; defaults to
        the notebook-level Xtest
    max_length: samples whose text has more characters than this are skipped,
        which keeps the returned examples short enough to read
    return: list of (sample, prediction) tuples; it holds fewer than two
        entries when the dataset does not contain enough suitable errors
    """
    if dataset is None:
        dataset = Xtest
    label_prefix = '__label__'
    errors = []
    for sample in dataset:
        if len(sample['text']) > max_length:
            continue
        prediction = model.predict(sample['text'])
        top_label = prediction[0][0]
        # Parse the whole label id rather than only its last character, so
        # that ids with more than one digit are compared correctly.
        if top_label.startswith(label_prefix):
            top_label = top_label[len(label_prefix):]
        if int(top_label) == sample['label']:
            continue
        # The second error is kept only if its true label differs from the
        # true label of the first one.
        if errors and sample['label'] == errors[0][0]['label']:
            continue
        errors.append((sample, prediction))
        if len(errors) == 2:
            break
    return errors

In [125]:
# Show up to two misclassified reviews (with different true labels) for the autotuned model
search_wrong_classified(hyper_model)

[({'text': 'uzumaki succeeds as at plunging you into a bizarre surreality where uzumaki shapes haunt and curse a town  it fails at being a competent horror movie  while the film is sure to draw attention mainly to it s bizarre plot line and a few interesting visual treats  it s going to come off better as a dark comedy than a horror film  it s definitely a film you should see if your into the kind of stuff- but if your looking for a scare or even a small chill  you ll want to look elsewhere  uzumaki doesn t really have much else up it s sleeve but a great chain of odd events  br    br   a',
   'label': 1},
  (('__label__0',), array([0.99533099]))),
 ({'text': 'okay  this movie starts out and it  looks  like it s going to be a cute comedy about a completely obsessed soap opera fan  she has no touch with reality whatsoever outside of the soap  sort of the inverse of the main characters in  pleasantville   and runs away to los angeles to meet a fictional character  well it is a cute movie

- The first comment is labelled positive (label 1) but the model classifies it as negative (`__label__0`), probably because the author says he was disappointed that the film does not work as a horror movie.
- The second comment is classified as negative. The author of the comment does not explicitly say that he likes the movie.