In [23]:
import sys
sys.path.append("../src")

import pandas as pd
import numpy as np
from transformers import AutoTokenizer

from bert_model import MyBertModel
from embedding_model import EmbeddingModel
from collator import CustomCollator
from dataset import JuridiqueDataset
from preprocess import Preprocessing
from tqdm import tqdm
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import torch.optim as optim

from transformers import get_cosine_schedule_with_warmup

import random
import time

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import sklearn.metrics as metrics

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


# Import our data

In [2]:
# Load the raw training table from a CSV file referenced by a relative path.
train_df = pd.read_csv("../datas/train_data.csv")

'Le : 12/11/2019\n\xa0\n\xa0\nCour d’appel d’Agen \n\xa0\nchambre civile \n\xa0\nAudience publique du 25 janvier 2012 \n\xa0\nN° de RG: 11/00752 \n\xa0\n\xa0\nInfirme partiellement, réforme ou modifie certaines dispositions de la décision déférée \n\xa0\n\xa0\n\xa0\nREPUBLIQUE FRANCAISE\n\xa0\nAU NOM DU PEUPLE FRANCAIS\n\xa0\nARRÊT DU \n\xa0\n25 Janvier 2012 \xa0\n\xa0\n-------------------- \n\xa0\nRG N : 11/ 00752\n\xa0\n-------------------- \xa0\nMichaël X... \xa0\nSylvie X... \xa0\nPhilippe X... \xa0\n\xa0\nC/ \xa0\n\xa0\nS. A. FONDS DE GARANTIE AUTOMOBILE \xa0\nCAISSE NATIONALE DE SÉCURITÉ MILITAIRE \xa0\nCAISSE PRIMAIRE D’ASSURANCE MALADIE DE HAUTE GARONNE \xa0\nEtablissement UNEO MONTROUGE \xa0\nCAISSE DE SECURITE SOCIALE ETUDIANTE VITTAVI\xa0\n\xa0\n------------------- \xa0\n\xa0\nARRÊT no 123-2012 \xa0\n\xa0\nCOUR D’APPEL D’AGEN \xa0\nChambre Civile \xa0\n\xa0\nPrononcé par mise à disposition au greffe conformément au second alinéa de l’article 450 et 453 du Code de procédure c

# Remove rows with errors and keep only the sex column

In [None]:
# Keep only the rows whose "sexe" value differs from the "n.c." marker.
train_df = train_df.loc[train_df["sexe"].ne("n.c.")]

In [4]:
# Discard the columns that are not needed for the sex-prediction task.
train_df = train_df.drop(columns=["date_accident", "date_consolidation", "ID", "filename"])

# Preprocessing

In [None]:
# Wrap the frame in the project's Preprocessing helper and run its two cleaning
# steps in order; the value returned by each call is rebound to train_df.
# NOTE(review): presumably the helper keeps internal state so that the result of
# remove_stopwords() also reflects remove_newlines() — confirm in ../src/preprocess.py.
preprocess_train = Preprocessing(train_df)

train_df = preprocess_train.remove_newlines()
train_df = preprocess_train.remove_stopwords()


In [7]:
# Give every document a positional identifier: 0, 1, ..., n_rows - 1.
train_df["text_id"] = np.arange(train_df.shape[0])

In [None]:
# 80/20 split with a fixed seed; each part is copied so that later column
# assignments operate on independent frames.
df_train, df_test = (
    part.copy()
    for part in train_test_split(train_df, test_size=0.2, random_state=42)
)

# Let's call our created dataset

In [None]:
# Checkpoint identifier shared by the tokenizer and by the models built later
# in the notebook (both MyBertModel and EmbeddingModel receive model_name).
model_name = "almanach/camembert-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Empty options mapping handed to every JuridiqueDataset built in this notebook.
args = {}
# Build the project dataset over the training split and draw one random index.
# NOTE(review): neither `ds` nor `idx` is used again in the visible cells —
# presumably left over from a manual sanity check; confirm before removing.
ds = JuridiqueDataset(df_train, tokenizer, args)
idx = random.choice(range(len(ds)))

# Now we use our Collator and DataLoader

In [None]:
# Collator (built from the tokenizer) that the DataLoader uses as collate_fn.
collator_function = CustomCollator(tokenizer)
# Training dataset, constructed with the same arguments as `ds` above.
my_dataset = JuridiqueDataset(df_train, tokenizer, args)

In [None]:
# Training loader: two samples per batch, no shuffling, incomplete final batch
# kept, loading done in the main process.
data_loader = DataLoader(
    my_dataset,
    batch_size=2,
    shuffle=False,
    drop_last=False,
    num_workers=0,
    pin_memory=False,
    collate_fn=collator_function,
)

In [None]:
# Pull a single batch from the loader (the loop exits right after the first
# iteration) so that `batch` is bound for inspection.
for batch in tqdm(data_loader):
    break

# Training of the model

In [None]:
def train_one_step(batch,model,criterion):
    """
    Run one forward pass on a batch and return the resulting loss.

    Args:
        batch: mapping produced by the data loader. It must contain a 'sexe'
            entry holding the targets. Tensor values are moved to the model's
            device; every other value is passed through unchanged.
        model: torch module called as ``model(batch)`` and returning the
            predictions (one value per sample is assumed).
        criterion: loss callable invoked as ``criterion(predictions, targets)``.

    Returns:
        The loss tensor returned by ``criterion``; ``backward()`` is left to
        the caller.
    """
    # Follow the model's own device instead of hard-coding "cuda:0", so the step
    # also works when the model was deliberately placed elsewhere.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        device = first_param.device
    else:
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # Move only the tensor entries of the batch to that device.
    batch = {
        key: value.to(device) if isinstance(value, torch.Tensor) else value
        for key, value in batch.items()
    }

    # One forward step with the batch.
    pred = model(batch)

    # Flatten predictions and targets to 1-D. A bare squeeze() turns a
    # single-sample batch into a 0-dim tensor, which no longer matches the
    # 1-dim target and makes the loss raise a size-mismatch error.
    logits = pred.reshape(-1)
    targets = batch['sexe'].float().reshape(-1)

    # Compute the loss.
    loss = criterion(logits, targets)
    return loss

In [None]:
def train_one_epoch(epoch_number,data_loader,model,criterion,optimzer,lr_scheduler):
    """
    Train ``model`` for one pass over ``data_loader`` and print a summary line.

    Args:
        epoch_number: zero-based epoch index (printed as ``epoch_number + 1``).
        data_loader: iterable of batches accepted by ``train_one_step``.
        model: torch module to optimise; switched to training mode here.
        criterion: loss callable forwarded to ``train_one_step``.
        optimzer: optimizer stepped after each batch (the parameter name keeps
            its original spelling so keyword callers continue to work).
        lr_scheduler: learning-rate scheduler stepped once per batch.

    Returns:
        The same ``model`` object, after the epoch.
    """
    losses = []
    model.train()
    start_time = time.time()
    pbar = tqdm(data_loader)
    for batch in pbar:
        loss = train_one_step(batch,model,criterion)
        loss_value = loss.item()
        pbar.set_postfix({"loss":loss_value})
        losses.append(loss_value)
        loss.backward()
        optimzer.step()
        lr_scheduler.step()
        # Use the optimizer passed in, not a notebook-level global, so the
        # function works with whatever objects the caller provides.
        optimzer.zero_grad()

    # Same reasoning for the scheduler; get_last_lr() is the supported way to
    # read the current rate outside of step().
    lr = lr_scheduler.get_last_lr()[0]
    elapsed_time = time.time() - start_time
    # An empty loader yields no losses; report NaN without a numpy warning.
    loss_ = np.mean(losses) if losses else float("nan")
    print(f"Epoch {epoch_number + 1} :  lr={lr:.6f} t={elapsed_time:.0f}s loss : {loss_:.5f}")
    return model

In [None]:
def inference(valid_loader, model):
    """
    Score every batch of ``valid_loader`` with ``model`` and collect the outputs.

    The model is put in eval mode, gradients are disabled, and each batch's
    tensor entries are moved to the device of the model's first parameter.
    The model output goes through a sigmoid before being gathered.

    Returns:
        A DataFrame with a single column ``sexe_pred`` holding the collected
        values, in loader order.
    """
    model.eval()
    target_device = next(model.parameters()).device
    collected = []

    with torch.no_grad():
        for batch in tqdm(valid_loader):
            moved = {}
            for key, value in batch.items():
                moved[key] = value.to(target_device) if isinstance(value, torch.Tensor) else value

            probs = model(moved).sigmoid().squeeze()
            # squeeze() collapses a one-sample batch to 0-dim; restore the
            # batch axis so the later concatenation works.
            if probs.dim() == 0:
                probs = probs.unsqueeze(0)

            collected.append(probs.detach().cpu().numpy())

    stacked = np.concatenate(collected, axis=0)
    return pd.DataFrame({"sexe_pred": stacked.tolist()})

In [None]:
def batch_to_device(batch, device):
    """Return a new dict in which the tensor values of ``batch`` live on ``device``.

    Non-tensor values are carried over unchanged; ``batch`` itself is not modified.
    """
    return {
        name: batch[name].to(device) if isinstance(batch[name], torch.Tensor) else batch[name]
        for name in batch
    }


In [None]:
# Define your model
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
net = MyBertModel(model_name=model_name,num_labels=1)
net.to(device)

# Define an optimizer
optimizer = optim.AdamW(net.parameters(),lr = 4e-6 )

# Define a scheduler for your model training

BATCH_SIZE = 8
EPOCHS = 10
# The scheduler is stepped once per batch of `data_loader`, so the step counts
# must come from the loader itself. Deriving them from BATCH_SIZE undercounts
# the real number of steps whenever the loader uses a different batch size,
# and the cosine schedule then runs past its end and the rate rises again.
steps_per_epoch = len(data_loader)
# Warm up over 4% of one epoch; the scheduler expects integer step counts.
warmup_steps = int(0.04 * steps_per_epoch)
training_steps = EPOCHS * steps_per_epoch

scheduler = get_cosine_schedule_with_warmup(optimizer, warmup_steps, training_steps)

In [None]:
# Binary cross-entropy on raw logits, placed on the training device.
criterion = nn.BCEWithLogitsLoss()
criterion = criterion.to(device)
for epoch_num in range(EPOCHS):
    net = train_one_epoch(
        epoch_number=epoch_num,
        data_loader=data_loader,
        model=net,
        criterion=criterion,
        optimzer=optimizer,
        lr_scheduler=scheduler,
    )

In [None]:
# Rebuild the collator and wrap the held-out split in its own loader:
# eight samples per batch, original order, incomplete final batch kept.
collator_function = CustomCollator(tokenizer)

test_dataset = JuridiqueDataset(df_test, tokenizer, args)
test_loader = DataLoader(
    test_dataset,
    batch_size=8,
    shuffle=False,
    drop_last=False,
    num_workers=0,
    pin_memory=False,
    collate_fn=collator_function,
)


In [None]:
# Score the held-out loader with the trained classifier; the result is a
# DataFrame with one "sexe_pred" column of sigmoid outputs.
# NOTE(review): pred_df is not used again in the visible cells.
pred_df = inference(test_loader, net)

# Let's predict the sex of the victim!

In [None]:
def get_embeddings(model, df):
    """
    Run ``model`` over every row of ``df`` and return its outputs as a DataFrame.

    Relies on the notebook-level ``tokenizer``, ``args`` and ``batch_to_device``.

    Args:
        model: torch module called as ``model(batch)``; switched to eval mode.
        df: frame wrapped in a ``JuridiqueDataset`` and iterated without shuffling.

    Returns:
        A DataFrame with one row per model output row, in loader order.
        An empty DataFrame is returned when the loader yields no batch.
    """
    collator_function = CustomCollator(tokenizer)
    valid_dataset = JuridiqueDataset(df, tokenizer, args)
    valid_loader = DataLoader(valid_dataset, drop_last=False, num_workers=0, pin_memory=False, shuffle=False,
                              batch_size=2, collate_fn=collator_function)

    # Send batches to wherever the model actually lives instead of assuming
    # "cuda:0"; fall back to the old default for a parameter-free model.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        device = first_param.device
    else:
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    embed_predictions = []

    model.eval()
    with torch.no_grad():
        for batch in tqdm(valid_loader):
            batch = batch_to_device(batch, device)
            pred = model(batch)

            # Restore the batch axis when the model returns a single 1-D vector.
            if pred.dim() == 1:
                pred = pred.unsqueeze(0)
            embed_predictions.append(pred.detach().cpu().numpy())

    # np.concatenate rejects an empty list; an empty input gives an empty frame.
    if not embed_predictions:
        return pd.DataFrame()

    embeddings = np.concatenate(embed_predictions, axis=0)
    df_predict = pd.DataFrame(embeddings)

    return df_predict

In [None]:
# Rebind `net` to the embedding model built from the same checkpoint name.
# NOTE(review): this replaces the classifier trained above under the same
# variable name — confirm the trained model is no longer needed afterwards.
device = torch.device(f"cuda:0" if torch.cuda.is_available() else "cpu")
net = EmbeddingModel(model_name=model_name)
net.to(device)

In [None]:
# Embed the training split first, then the held-out split, with the same model.
df_embed_train, df_embed_test = (
    get_embeddings(net, split) for split in (df_train, df_test)
)

## Predictions on the test set

In [None]:
X = df_embed_train

# Encode the labels as integers explicitly. Series.replace with a dict relies on
# implicit downcasting (deprecated in pandas) to turn the object column into
# ints; without it the labels stay `object` and scikit-learn rejects them.
# Values that are already encoded pass through unchanged, so the cell can be
# re-run safely.
label_map = {"homme": 1, "femme": 0}
df_train["sexe"] = df_train["sexe"].map(lambda value: label_map.get(value, value)).astype(int)
df_test["sexe"] = df_test["sexe"].map(lambda value: label_map.get(value, value)).astype(int)
y = df_train['sexe']

# Fit a logistic-regression classifier on the training embeddings.
clf = LogisticRegression(C=1.5, random_state=1).fit(X, y)

preds_test = clf.predict(df_embed_test)

y_test = df_test["sexe"]


# Now, we compute our metrics

In [31]:
# Accuracy and macro-averaged F1 of the logistic-regression predictions.
acc_test = metrics.accuracy_score(y_test, preds_test)
f1_test = metrics.f1_score(y_test, preds_test, average='macro')
# Format with a fixed precision: `100 * round(x, 2)` can print binary
# floating-point artifacts such as 56.99999999999999%.
print(f"accuracy on the test set: {100 * acc_test:.1f}%")
print(f"F1 on the test set: {100 * f1_test:.1f}%")


accuracy on the test set: 74.0%
F1 on the test set: 54.0%
