In [15]:
import os

import numpy as np
import torch
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest
from transformers import CamembertTokenizer, CamembertModel, CamembertForSequenceClassification, Trainer, TrainingArguments, AutoModel

from clustertools import *

In [16]:
# Names of the datasets to process, one entry per dataset.
datasets = [
    "africain",
    "antisemite",
    "arabe",
    "asie",
    "capacitiste",
    "chretiens",
    "gitan",
    "lgbt",
    "miso",
    "musulman",
]

# Path templates; the "{}" placeholder is filled in with str.format().
cluster_dir = "clusters/{}/"
datasets_dir = "datasets/{}.txt"

### Récupération des tweets

In [6]:
def load_msg(raw_dataset_file):
    """Read a text file and return its lines as a list of strings.

    Args:
        raw_dataset_file: Path of the text file to read.

    Returns:
        list[str]: The lines of the file, in order, without their line
        terminators. An empty file yields an empty list.
    """
    # Decode explicitly as UTF-8 so accented characters are read the same way
    # on every platform instead of depending on the locale's default encoding.
    with open(raw_dataset_file, "r", encoding="utf-8") as file:
        return file.read().splitlines()

### Encodage du jeu de données

On charge le modèle depuis le disque

In [7]:
# Directory that the tokenizer and model are loaded from.
model_dir = "models/tweetbert"
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [8]:
# Load the tokenizer from the local checkpoint directory.
# NOTE(review): `do_lowercase` does not look like a standard tokenizer argument
# (the usual spelling is `do_lower_case`) — confirm whether it has any effect.
tokenizer = CamembertTokenizer.from_pretrained(model_dir, do_lowercase=False)
# Load the model weights from the same directory. The warning printed by this
# cell reports that the checkpoint's lm_head weights are not used and that the
# pooler weights are newly initialized.
camembert = CamembertModel.from_pretrained(model_dir)

Some weights of the model checkpoint at models/tweetbert were not used when initializing CamembertModel: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing CamembertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertModel were not initialized from the model checkpoint at models/tweetbert and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Move the model to the selected device. The result is bound to `r`,
# presumably so the returned model is not echoed as the cell's output.
r = camembert.to(device)

In [10]:
def encode(sample, batch_size=50):
    """Encode a list of messages into one embedding vector per message.

    The messages are tokenized and passed through the module-level
    ``camembert`` model in batches. For every message, the last hidden state
    at sequence position 0 is kept as its embedding.

    Args:
        sample: Sequence of strings to encode.
        batch_size: Number of messages sent through the model at once.
            Defaults to 50.

    Returns:
        torch.Tensor: A CPU tensor with one row per message, rows in the same
        order as ``sample``. An empty ``sample`` yields a tensor with zero rows.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    if len(sample) == 0:
        # torch.cat() rejects an empty list of tensors, so build the empty
        # result explicitly with the model's hidden size as the row width.
        return torch.empty((0, camembert.config.hidden_size))

    encoded = []
    for start in range(0, len(sample), batch_size):
        batch = tokenizer(
            sample[start:start + batch_size],
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt",
        )
        # Release cached GPU memory before processing the next batch.
        torch.cuda.empty_cache()
        batch = batch.to(device)
        # Inference only: no gradients are needed.
        with torch.no_grad():
            outp = camembert(**batch)
        # Keep only the vector at position 0 of each sequence and move it back
        # to the CPU so the results do not accumulate on the GPU.
        encoded.append(outp['last_hidden_state'][:, 0, :].to('cpu'))
    return torch.cat(encoded)

### Utilisation de l'algorithme Isolation Forest sur tous les clusters pour supprimer les outliers
On sauvegarde les clusters débarrassés de leurs outliers

In [14]:
%%time

# For every dataset, run an Isolation Forest on each cluster file and save the
# messages predicted as inliers into a sibling "<dataset>_post_IF" directory.
for dataset in datasets:
    print("Dataset {}.".format(dataset))
    cd = cluster_dir.format(dataset)
    new_cd = cluster_dir.format(dataset + "_post_IF")
    # exist_ok=True replaces the check-then-create pattern.
    os.makedirs(cd, exist_ok=True)
    os.makedirs(new_cd, exist_ok=True)

    # The context manager closes the directory iterator. Filtering out
    # "info.txt" here (instead of list.remove) does not fail when the file is
    # absent, e.g. when the directory has just been created above.
    with os.scandir(path=cd) as dir_it:
        filenames = [
            entry.name
            for entry in dir_it
            if entry.is_file() and entry.name != "info.txt"
        ]

    for filename in filenames:
        print("Cluster {}".format(filename))
        msg = load_msg(cd + filename)
        if not msg:
            # Nothing to encode or fit on: skip instead of crashing.
            print("Cluster {} is empty, skipped.".format(filename))
            continue
        msg_encoded = encode(msg)
        # Fixed random_state keeps the outlier selection reproducible;
        # contamination=0.15 sets the expected proportion of outliers.
        if_model = IsolationForest(n_jobs=-1, random_state=42, contamination=0.15)
        pred = if_model.fit_predict(msg_encoded)
        # Predictions are 1 for inliers and -1 for outliers; keep the inliers.
        cluster_post_IF = [m for p, m in zip(pred, msg) if p == 1]
        save_cluster_raw_msg(cluster_post_IF, new_cd + filename)


Dataset africain.
Cluster cluster42_haineux.txt
Cluster cluster0_haineux.txt
Cluster cluster10_haineux.txt
Cluster cluster11_haineux.txt
Cluster cluster13_haineux.txt
Cluster cluster15_haineux.txt
Cluster cluster16_haineux.txt
Cluster cluster21_normal.txt
Cluster cluster23_haineux.txt
Cluster cluster25_haineux.txt
Cluster cluster31_normal.txt
Cluster cluster33_normal.txt
Cluster cluster38_haineux.txt
Cluster cluster39_haineux.txt
Cluster cluster3_haineux.txt
Cluster cluster40_haineux.txt
Cluster cluster44_normal.txt
Cluster cluster49_haineux.txt
Cluster cluster4_haineux.txt
Cluster cluster54_normal.txt
Cluster cluster55_normal.txt
Cluster cluster56_normal.txt
Cluster cluster59_haineux.txt
Cluster cluster60_normal.txt
Cluster cluster69_haineux.txt
Cluster cluster6_haineux.txt
Cluster cluster70_normal.txt
Cluster cluster71_normal.txt
Cluster cluster74_normal.txt
Cluster cluster76_normal.txt
Cluster cluster77_normal.txt
Cluster cluster81_haineux.txt
Cluster cluster85_normal.txt
Cluster cl