In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import torch
from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer
import re
import nltk
from nltk.corpus import stopwords
from torch.utils.data import Dataset
from transformers import AutoTokenizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score
from sklearn import metrics
from spacy.lang.fr.stop_words import STOP_WORDS as fr_stop
from sklearn.model_selection import train_test_split


In [None]:
# Load the training set from a CSV path that is relative to the notebook's working directory.
train_df = pd.read_csv("../datas/train_data.csv")

In [None]:
# Preview the first rows of the freshly loaded frame.
train_df.head()

In [None]:
def remove_newlines(df):
    """Return a copy of ``df`` with line breaks inside string cells turned into spaces.

    Every run of carriage-return / line-feed characters is replaced by a single
    space, so the words on either side of a line break stay separate tokens
    instead of being glued together.

    Args:
        df: pandas DataFrame (or Series) to clean. It is not modified.

    Returns:
        A new object of the same type with the replacement applied.
    """
    return df.replace(r"[\r\n]+", " ", regex=True)
# Rebind train_df to the cleaned frame returned by remove_newlines.
train_df = remove_newlines(train_df)

In [None]:
# Preview the frame again after the newline clean-up.
train_df.head()

# Sex classification

In [None]:
# Build the modelling frame: drop the two date columns and discard the rows
# whose `sexe` value is "n.c.". The explicit copy makes `train` an independent
# frame, so the column assignments done in later cells do not raise pandas'
# SettingWithCopyWarning.
train = train_df.drop(["date_accident", "date_consolidation"], axis=1)
train = train[train.sexe != "n.c."].copy()

In [None]:
# Fetch the NLTK stop-word corpus and read the French list.
nltk.download('stopwords')
stop_words = stopwords.words('french')
# Membership tests against a list are linear in its length; a set makes the
# per-word lookup constant-time. `stop_words` itself is left as a list.
stop_word_set = set(stop_words)
# Drop every whitespace-separated token that exactly matches a stop word
# (the comparison is case-sensitive).
train['texte'] = train["texte"].apply(
    lambda x: ' '.join(word for word in x.split() if word not in stop_word_set)
)

In [None]:
# Drop the ID and filename columns from the modelling frame.
# NOTE(review): re-running this cell on the already-reduced frame raises KeyError.
train = train.drop(["ID", "filename"], axis=1)

In [None]:
# Add a positional identifier column running from 0 to len(train) - 1.
train['text_id'] = np.arange(len(train))

In [None]:
class JuridiqueDataset(Dataset):
    """Map-style dataset that tokenizes one DataFrame row per item.

    The frame must provide the columns ``text_id``, ``texte`` and ``sexe``.
    """

    def __init__(self,
                 df,
                 tokenizer,
                 args
                ):
        """Store the frame, the tokenizer and the shared settings.

        Args:
            df: DataFrame with ``text_id``, ``texte`` and ``sexe`` columns.
            tokenizer: callable invoked as
                ``tokenizer(text, max_length=..., truncation=True)`` and
                returning a mapping of encoded fields.
            args: settings shared across the project. When it is a dict, an
                optional ``"max_length"`` entry overrides the default of 512.
        """
        # args is a dict, a nice way to share the global arguments (even across multiple files)
        self.args = args
        self.tokenizer = tokenizer
        self.df = df

    def make_one_item(self, idx):
        """Tokenize the text at position ``idx`` and attach its id and label.

        Args:
            idx: integer position of the row in ``self.df``.

        Returns:
            A dict holding every field returned by the tokenizer plus
            ``'text_id'`` and ``'sexe'`` copied from the row.
        """
        # Materialise the row once instead of once per column.
        row = self.df.iloc[idx]
        max_length = self.args.get("max_length", 512) if isinstance(self.args, dict) else 512
        # Ask for truncation explicitly: max_length alone only implies it
        # through a warning-emitting fallback in the tokenizer.
        tokenizer_encoding = self.tokenizer(row.texte, max_length=max_length, truncation=True)
        outputs = dict(**tokenizer_encoding)

        outputs['text_id'] = row.text_id
        outputs['sexe'] = row.sexe

        return outputs

    def __len__(self) -> int:
        """Return the number of rows in the underlying frame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the encoded item at position ``idx``."""
        return self.make_one_item(idx)

In [None]:

# Checkpoint name shared by the tokenizer here and by the model built further down.
model_name = "almanach/camembert-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
import random
# No shared settings are needed for this quick check, so an empty dict is passed as args.
args = {}
ds = JuridiqueDataset(train,tokenizer,args)
# Pick a random position so that one encoded example can be inspected.
idx = random.choice(range(len(ds)))

In [None]:
# Show the token strings behind the input ids of the randomly chosen example.
print(tokenizer.convert_ids_to_tokens(ds[idx]['input_ids']))

# DataLoader

In [None]:
from torch.utils.data import DataLoader

In [None]:
## =============================================================================== ##
class CustomCollator():
    """Collate dataset items into padded ``torch.long`` tensors.

    Sequences are padded to the longest ``input_ids`` of the batch, on the side
    given by ``tokenizer.padding_side``, using ``tokenizer.pad_token_id`` for
    the ids and 0 for the attention mask.
    """

    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def __call__(self, batch):
        """Turn a list of item dicts into a dict of batch tensors.

        Args:
            batch: list of dicts with ``input_ids``, ``attention_mask``,
                ``sexe`` ("homme" or "femme") and ``text_id``.

        Returns:
            Dict with the keys ``input_ids``, ``attention_mask``, ``sexe`` and
            ``text_id``, each a ``torch.long`` tensor.

        Raises:
            KeyError: if a ``sexe`` value is neither "homme" nor "femme".
        """
        longest = max(len(sample["input_ids"]) for sample in batch)
        pad_on_right = self.tokenizer.padding_side == "right"

        def _pad(seq, filler):
            # Extend `seq` with `filler` up to the batch-wide length.
            extra = (longest - len(seq)) * [filler]
            return seq + extra if pad_on_right else extra + seq

        pad_id = self.tokenizer.pad_token_id
        label_ids = {"homme": 0, "femme": 1}

        collated = dict()
        collated["input_ids"] = torch.tensor(
            [_pad(sample["input_ids"], pad_id) for sample in batch], dtype=torch.long)
        collated["attention_mask"] = torch.tensor(
            [_pad(sample["attention_mask"], 0) for sample in batch], dtype=torch.long)
        collated["sexe"] = torch.tensor(
            [label_ids[sample["sexe"]] for sample in batch], dtype=torch.long)
        collated["text_id"] = torch.tensor(
            [sample["text_id"] for sample in batch], dtype=torch.long)
        return collated
    

In [None]:
# Instantiate the padding collator and the dataset that feed the DataLoader.
collator_function = CustomCollator(tokenizer)
my_dataset = JuridiqueDataset(train,tokenizer,args)

In [None]:
# Batches are produced in dataset order (shuffle=False), two items at a time,
# and padded by the custom collator.
# NOTE(review): this loader is the one later passed to the training loop; confirm
# that batch_size=2 and shuffle=False are intended for training.
data_loader = DataLoader(my_dataset,drop_last = False,num_workers=0,pin_memory=False,shuffle=False,
                              batch_size = 2,collate_fn = collator_function)

In [None]:
from tqdm import tqdm
# Pull only the first batch out of the loader; `batch` stays bound for the inspection cells below.
for batch in tqdm(data_loader):
    break

In [None]:
# Sanity-check the tensor shapes produced by the collator.
batch['input_ids'].shape,batch['sexe'].shape,batch['attention_mask'].shape

In [None]:
# Inspect the attention mask of the sampled batch (0 marks padded positions).
batch['attention_mask']

In [None]:
import torch.nn as nn
from transformers import AutoConfig, AutoModel
import torch.utils.checkpoint
import torch.nn.functional as F

class MyBertModel(nn.Module):
    """Pretrained transformer encoder followed by a single linear layer.

    The linear layer is applied to the encoder's last hidden state at the first
    sequence position and produces ``num_labels`` raw scores per example.
    """

    def __init__(self, model_name="almanach/camembert-base", num_labels=2):
        """Load the pretrained encoder and create the linear head.

        Args:
            model_name: checkpoint name handed to ``AutoConfig`` / ``AutoModel``.
            num_labels: output width of the linear head.
        """
        super().__init__()
        # The config is only read for `hidden_size`; it is not passed to the backbone.
        self.config = AutoConfig.from_pretrained(model_name, output_hidden_states=True)
        self.backbone = AutoModel.from_pretrained(model_name)
        self.fc = nn.Linear(self.config.hidden_size, num_labels)

    def forward(self, batch):
        """Return the head's scores for a collated batch.

        Args:
            batch: dict of tensors; only ``input_ids`` and ``attention_mask``
                are forwarded to the encoder, every other key is ignored.

        Returns:
            Tensor with ``num_labels`` scores per example.
        """
        encoder_kwargs = {}
        for name, tensor in batch.items():
            if name in ('input_ids', 'attention_mask'):
                encoder_kwargs[name] = tensor
        encoded = self.backbone(**encoder_kwargs)
        first_position = encoded.last_hidden_state[:, 0, :]
        return self.fc(first_position)


In [None]:
def train_one_step(batch,model,criterion):
    """Run one forward pass on ``batch`` and return the resulting loss.

    Args:
        batch: dict produced by the collator; must contain the ``'sexe'``
            label tensor next to the model inputs.
        model: callable taking the batch dict and returning the logits.
        criterion: loss called as ``criterion(logits, float_targets)``.

    Returns:
        The loss tensor. ``backward()`` is left to the caller.
    """
    # Send the batch to the device the model's weights actually live on.
    # Guessing from CUDA availability breaks when a model was deliberately
    # left on the CPU of a CUDA-capable machine. Fall back to the old guess
    # only for callables that expose no parameters.
    try:
        device = next(model.parameters()).device
    except (AttributeError, StopIteration):
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    batch = batch_to_device(batch,device)

    # One forward step with the batch.
    pred = model(batch)

    # Drop the singleton dimensions so logits and float targets line up.
    loss = criterion(pred.squeeze(),batch['sexe'].float().squeeze(-1))
    return loss

In [None]:
def train_one_epoch(epoch_number,data_loader,model,criterion,optimzer,lr_scheduler):
    """Train ``model`` for one pass over ``data_loader`` and print a summary line.

    Args:
        epoch_number: zero-based epoch index (printed as ``epoch_number + 1``).
        data_loader: iterable of batches accepted by ``train_one_step``.
        model: module to train; switched to training mode first.
        criterion: loss function forwarded to ``train_one_step``.
        optimzer: optimizer stepped after every batch (the parameter name is
            kept as-is for existing callers).
        lr_scheduler: learning-rate scheduler stepped after every batch; must
            provide ``get_last_lr()``.

    Returns:
        The same ``model`` object after the epoch.
    """
    losses = []
    model.train()
    start_time = time.time()
    pbar = tqdm(data_loader)
    for batch in pbar:
        loss = train_one_step(batch,model,criterion)
        loss_value = loss.item()
        pbar.set_postfix({"loss":loss_value})
        losses.append(loss_value)
        loss.backward()
        optimzer.step()
        lr_scheduler.step()
        # Reset the gradients on the optimizer that was passed in, not on a
        # notebook-level global that may be a different object or undefined.
        optimzer.zero_grad()

    # Read the learning rate from the scheduler argument; get_last_lr() is the
    # supported accessor for the most recently applied value.
    lr = lr_scheduler.get_last_lr()[0]
    elapsed_time = time.time() - start_time
    loss_ = np.mean(losses)
    print(f"Epoch {epoch_number + 1} :  lr={lr:.6f} t={elapsed_time:.0f}s loss : {loss_:.5f}")
    return model

In [None]:

def inference(valid_loader, model):
    """Run ``model`` over ``valid_loader`` and collect sigmoid scores.

    Args:
        valid_loader: iterable of batch dicts; tensor values are moved to the
            model's device, other values are passed through untouched.
        model: module with at least one parameter, called on each batch dict.

    Returns:
        DataFrame with one column, ``sexe_pred``, holding the sigmoid of the
        model output for every example, in loader order.
    """
    # Put the model in evaluation mode.
    model.eval()
    target_device = next(model.parameters()).device

    collected = []
    with torch.no_grad():
        for raw_batch in tqdm(valid_loader):
            moved = {}
            for name, value in raw_batch.items():
                moved[name] = value.to(target_device) if isinstance(value, torch.Tensor) else value

            scores = model(moved).sigmoid().squeeze()
            # A one-example batch squeezes down to a scalar; restore a length-1 axis.
            if scores.dim() == 0:
                scores = scores.unsqueeze(0)

            collected.append(scores.detach().cpu().numpy())

    stacked = np.concatenate(collected, axis=0)
    return pd.DataFrame({"sexe_pred": stacked.tolist()})

In [None]:
def batch_to_device(batch, device):
    """Return a new dict in which every tensor of ``batch`` has been moved to ``device``.

    Values that are not ``torch.Tensor`` instances are carried over unchanged.
    """
    def _relocate(value):
        return value.to(device) if isinstance(value, torch.Tensor) else value

    return {key: _relocate(batch[key]) for key in batch}


In [None]:
# Define your model
# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device(f"cuda:0" if torch.cuda.is_available() else "cpu")
# num_labels=1 sizes the linear head to a single output per example.
net = MyBertModel(model_name=model_name,num_labels=1)
net.to(device)

In [None]:
# Define an optimizer
import torch.optim as optim

# AdamW over every parameter of the network (encoder and linear head alike).
optimizer = optim.AdamW(net.parameters(),lr = 4e-6 )

In [None]:
# Define a scheduler for your model training
from transformers import get_linear_schedule_with_warmup,get_cosine_schedule_with_warmup

BATCH_SIZE = 6
EPOCHS = 8
# The scheduler is stepped once per batch of `data_loader`, so the step budget
# has to come from that loader's real length. Deriving it from BATCH_SIZE
# under-counts the steps whenever the loader uses another batch size, and the
# schedule then runs past its planned end.
steps_per_epoch = len(data_loader)
# Warm up over 4% of one epoch; the schedule expects whole step counts.
warmup_steps = int(0.04 * steps_per_epoch)
training_steps = EPOCHS * steps_per_epoch

scheduler = get_cosine_schedule_with_warmup(optimizer, warmup_steps, training_steps)

In [None]:
# Display the warm-up and total step counts handed to the scheduler.
warmup_steps,training_steps

In [None]:
import time

# BCEWithLogitsLoss works on raw logits, which matches the single-output head built with num_labels=1.
criterion = nn.BCEWithLogitsLoss().to(device)
# Train for EPOCHS passes over data_loader; train_one_epoch returns the model it was given.
for epoch_num in range(EPOCHS):
    net = train_one_epoch(epoch_num,data_loader,net,criterion,optimizer,scheduler)

In [None]:
# Loader used for inference.
# NOTE(review): valid_dataset is built from `train`, the same frame the model was
# trained on, so scores computed from it are not a held-out evaluation. Confirm.
collator_function = CustomCollator(tokenizer)
valid_dataset = JuridiqueDataset(train,tokenizer,args)
valid_loader = DataLoader(valid_dataset,drop_last = False,num_workers=0,pin_memory=False,shuffle=False,
                              batch_size = 6,collate_fn = collator_function)

In [None]:
# Collect the model's sigmoid scores for every example of valid_loader.
pred_df = inference(valid_loader,net)

# TF-IDF

In [None]:

# Baseline: binary TF-IDF features fed to a logistic regression.
vect = TfidfVectorizer(
  max_features=5000,
  stop_words=list(fr_stop), binary=True)

X = vect.fit_transform(train['texte'])
# Encode the labels as integers; the explicit cast keeps `y` numeric even when
# replace() leaves the column with object dtype.
train["sexe"] = train["sexe"].replace({'homme':0, "femme":1})
y = train['sexe'].astype(int)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=26)

# Fit on the training split only: fitting on all of X lets the model see the
# test rows and inflates every metric below.
clf = LogisticRegression(random_state=26).fit(X_train, y_train)

preds = clf.predict(X_test)
# ROC/AUC needs a continuous score; use the predicted probability of class 1
# rather than the thresholded labels.
probas = clf.predict_proba(X_test)[:, 1]

print("f1:", f1_score(y_test, preds, average='macro'))
print("accuracy:", accuracy_score(y_test, preds))
fpr, tpr, thresholds = metrics.roc_curve(y_test, probas)
print("AUC: ", metrics.auc(fpr, tpr))