In [2]:
import os

import torch
from transformers import CamembertTokenizer, CamembertModel, CamembertForSequenceClassification, Trainer, TrainingArguments, AutoModel
#import pandas as pd
#from sklearn.model_selection import train_test_split
#import matplotlib.pyplot as plt
import numpy as np
from sklearn.decomposition import PCA
from clustertools import *

In [3]:
from minisom import MiniSom

In [1]:
# Names of the corpora to process, one entry per dataset.
datasets = [
    "africain",
    "antisemite",
    "arabe",
    "asie",
    "capacitiste",
    "chretiens",
    "gitan",
    "lgbt",
    "miso",
    "musulman",
]

# Path templates; the placeholder is filled with a dataset name via str.format.
cluster_dir = "clusters/{}/"
datasets_dir = "datasets/{}.txt"

### Récupération des tweets

In [5]:
def load_msg(raw_dataset_file):
    """Read a text file and return its content as a list of lines.

    The file is decoded explicitly as UTF-8 so that accented characters are
    read the same way on every platform, instead of depending on the locale's
    default encoding.

    Args:
        raw_dataset_file: path of the text file to read.

    Returns:
        list[str]: the lines of the file, without their line terminators.

    Raises:
        OSError: if the file cannot be opened.
        UnicodeDecodeError: if the file is not valid UTF-8.
    """
    with open(raw_dataset_file, "r", encoding="utf-8") as file:
        raw_dataset = file.read().splitlines()
    return raw_dataset

### Encodage du jeu de données

On charge le tokenizer et le modèle CamemBERT depuis le disque.

In [6]:
# Directory the tokenizer and the encoder are loaded from.
model_dir = "models/tweetbert"
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [7]:
# Load the tokenizer and the bare CamemBERT encoder from `model_dir`.
# NOTE(review): the usual Hugging Face spelling is `do_lower_case`; confirm
# whether `do_lowercase` has any effect on this tokenizer.
tokenizer = CamembertTokenizer.from_pretrained(model_dir, do_lowercase=False)
# The loading log below reports unused `lm_head.*` weights and a newly
# initialized pooler; encode() only reads `last_hidden_state`.
camembert = CamembertModel.from_pretrained(model_dir)

Some weights of the model checkpoint at models/tweetbert were not used when initializing CamembertModel: ['lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing CamembertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertModel were not initialized from the model checkpoint at models/tweetbert and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Move the encoder to the selected device. The return value is bound to `r`,
# presumably only to keep the cell from echoing the model repr.
r = camembert.to(device)

In [9]:
def encode(sample):
    """Embed every message of ``sample`` with the CamemBERT encoder.

    Messages are tokenized and encoded in batches of 50. For each message the
    hidden state at sequence position 0 of ``last_hidden_state`` is kept and
    moved to the CPU.

    Args:
        sample: sequence of messages supporting ``len`` and slicing
            (presumably a list of strings; verify against the caller).

    Returns:
        A CPU tensor with one row per message, made by concatenating the
        per-batch results.
    """
    batch_size = 50
    first_token_states = []
    for start in range(0, len(sample), batch_size):
        chunk = sample[start:start + batch_size]
        inputs = tokenizer(chunk, padding=True, truncation=True, max_length=512, return_tensors="pt")
        # Release cached GPU memory before the next forward pass.
        torch.cuda.empty_cache()
        inputs.to(device)
        with torch.no_grad():
            output = camembert(**inputs)
        hidden = output['last_hidden_state']
        first_token_states.append(hidden[:, 0, :].to('cpu'))
    return torch.cat(first_token_states)

### Analyse des résultats

In [10]:
# Load the sequence classifier used by predict() to label messages.
classifier = CamembertForSequenceClassification.from_pretrained("models/tweetbert_val")
#torch.cuda.empty_cache()
# Move it to the selected device. The result is bound to `r`, presumably only
# to keep the cell from echoing the model repr.
r = classifier.to(device)

In [11]:
def predict(sample):
    """Predict a class index for every message of ``sample``.

    Messages are tokenized and passed through ``classifier`` in batches of 10;
    the index of the highest logit is kept for each message.

    Args:
        sample: sequence of messages supporting ``len`` and slicing
            (presumably a list of strings; verify against the caller).

    Returns:
        A 1-D integer tensor on ``device`` holding one predicted class index
        per message. An empty ``sample`` yields an empty tensor.
    """
    pred = []
    for n in range(0, len(sample), 10):
        phrases = sample[n:n+10]
        enc = tokenizer(phrases, padding=True, truncation=True, max_length=512, return_tensors='pt')
        # Use the notebook-wide `device` (where the classifier lives) instead
        # of a hard-coded 'cuda:0', so the function also runs without a GPU.
        enc = enc.to(device)
        with torch.no_grad():
            outp = classifier(**enc)
        # Softmax is monotonic, so the argmax of the logits is the same class.
        pred.append(torch.argmax(outp.logits, dim=1))
    if not pred:
        # torch.cat rejects an empty list; return an empty label tensor instead.
        return torch.empty(0, dtype=torch.long, device=device)
    return torch.cat(pred)

### Partitionnement de tous les datasets et sauvegarde des clusters 
On ne sauvegarde que les clusters dont le pourcentage de messages haineux est supérieur ou égal à 80 % (étiquetés « haineux ») ou inférieur ou égal à 20 % (étiquetés « normal »).

In [13]:
# For each dataset: embed the messages, cluster them with a self-organising
# map, estimate the share of class-1 ("haineux") predictions per cluster, and
# save the clusters whose share is >= 80 % or <= 20 %.
for dataset in datasets:
    print("Travail sur le dataset {}.".format(dataset))
    print("Chargement des messages.")
    raw_dataset = load_msg(datasets_dir.format(dataset))
    print("Encodage.")
    dataset_encoded = encode(raw_dataset)
    # 2-D projection handed to clustertools through init_globals.
    pca = PCA(n_components=2)
    two_dim_dataset = pca.fit_transform(dataset_encoded)
    cd = cluster_dir.format(dataset)
    init_globals([msg.split() for msg in raw_dataset], raw_dataset, two_dim_dataset, cd)
    # SOM grid size: m x n nodes, i.e. m * n cluster ids.
    m = 10
    n = 10
    print("Partitionnement.")
    # The input length follows the embedding width instead of a hard-coded 768.
    som = MiniSom(m, n, dataset_encoded.shape[1], neighborhood_function='gaussian', random_seed=0)
    som.pca_weights_init(dataset_encoded)
    som.train(dataset_encoded, 100000, verbose=True)
    pred = []
    for msg in dataset_encoded:
        winner = som.winner(msg)
        # Flatten the (row, column) grid position into a single cluster id.
        pred.append(winner[0]*n+winner[1])
    res, wlists, mlists, cmlists, e2dmlists, mfw, hkw, clust_n_msg = parse(pred)
    print("Évaluation des résultats.")
    hate_pred = []
    for mlist in mlists:
        hate_pred.append(predict(mlist))
    hper = []
    for p in hate_pred:
        # Percentage of messages predicted as class 1 in the cluster.
        bc = torch.bincount(p, minlength=2)
        perc = (bc[1] / (bc[0]+bc[1])).item() * 100
        hper.append(perc)
    # makedirs also creates a missing parent directory and tolerates an
    # already existing target.
    os.makedirs(cd, exist_ok=True)
    print("Sauvegarde des résultats.")
    # Iterate over every grid node rather than a literal 100, so the loop
    # stays correct when m or n is changed.
    for i in range(m * n):
        if hper[i] >= 80 or hper[i] <= 20:
            lab = "haineux" if hper[i] >= 80 else "normal"
            save_cluster_info(cd + "info.txt", clust_n_msg[i], mfw[i], hkw[i], hper[i], None)
            save_cluster_raw_msg(mlists[i], cd + "cluster{}_{}.txt".format(i, lab))
    print()
                                 

Travail sur le dataset africain.
Chargement des messages.
Encodage.
Partitionnement.
 [ 100000 / 100000 ] 100% - 0:00:00 left 
 quantization error: 1.936816503551681
Évaluation des résultats.
Sauvegarde des résultats.

Travail sur le dataset antisemite.
Chargement des messages.
Encodage.
Partitionnement.
 [ 100000 / 100000 ] 100% - 0:00:00 left 
 quantization error: 1.7117942081307858
Évaluation des résultats.
Sauvegarde des résultats.

Travail sur le dataset arabe.
Chargement des messages.
Encodage.
Partitionnement.
 [ 100000 / 100000 ] 100% - 0:00:00 left 
 quantization error: 1.8146910193865977
Évaluation des résultats.
Sauvegarde des résultats.

Travail sur le dataset asie.
Chargement des messages.
Encodage.
Partitionnement.
 [ 100000 / 100000 ] 100% - 0:00:00 left 
 quantization error: 1.9508106050659635
Évaluation des résultats.
Sauvegarde des résultats.

Travail sur le dataset capacitiste.
Chargement des messages.
Encodage.
Partitionnement.
 [ 100000 / 100000 ] 100% - 0:00:00 le