In [1]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
import torch.nn as nn
from transformers import AutoConfig, AutoModel
import torch.utils.checkpoint
import torch.optim as optim


import nltk
from nltk.corpus import stopwords
from torch.utils.data import Dataset
from transformers import AutoTokenizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score
from sklearn import metrics
from spacy.lang.fr.stop_words import STOP_WORDS as fr_stop
from sklearn.model_selection import train_test_split

from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


# Import and clean datas

In [4]:
train_df = pd.read_csv("../datas/train_data.csv")

In [5]:
def remove_newlines(df):
    df = df.replace("\n", '', regex=True)
    return df
train_df = remove_newlines(train_df)

In [6]:
train_df.head()

Unnamed: 0,ID,filename,texte,sexe,date_accident,date_consolidation
0,0,Agen_100515.txt,Le : 12/11/2019 Cour d’appel d’Agen chambre ...,homme,1991-04-09,n.c.
1,1,Agen_1100752.txt,Le : 12/11/2019 Cour d’appel d’Agen chambre ...,homme,2005-06-10,2010-01-19
2,2,Agen_1613.txt,Le : 12/11/2019 Cour d’appel d’Agen Audience...,femme,1997-09-26,n.c.
3,3,Agen_2118.txt,Le : 12/11/2019 Cour d’appel d’Agen Audience...,femme,1982-08-07,1982-11-07
4,4,Agen_21229.txt,Le : 12/11/2019 Cour d’appel d’Agen Audience...,homme,1996-11-26,n.c.


# Only keep the sex column

In [7]:
train = train_df.drop(["date_accident", "date_consolidation"], axis=1)

In [8]:
nltk.download('stopwords')
stop_words = stopwords.words('french')
train['texte'] = train["texte"].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/SamuelLP/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [9]:
train = train.drop(["ID", "filename"], axis=1)

In [10]:
train['text_id'] = np.arange(len(train))

# Let's create our dataset

In [54]:
class JuridiqueDataset(Dataset):
    def __init__(self,
                 df,
                 tokenizer,
                 args
                ):
        # args is a dict, a nice way to share the global arguments (even accross multiple files)
        self.args = args
        self.tokenizer = tokenizer
        self.df = df

    def make_one_item(self,idx):
        # this function should encode (tokenize) a given text
        text_id = self.df.iloc[idx].text_id
        text = self.df.iloc[idx].texte
        sexe = self.df.iloc[idx].sexe
        tokenizer_encoding = self.tokenizer(text, max_length=512)
        outputs = dict(**tokenizer_encoding)

        outputs['text_id'] = text_id
        outputs['sexe'] = sexe

        return outputs

    def __len__(self) -> int:
        return len(self.df)

    def __getitem__(self,idx):
        return self.make_one_item(idx)

In [55]:

model_name = "almanach/camembert-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [56]:
import random
args = {}
ds = JuridiqueDataset(train,tokenizer,args)
idx = random.choice(range(len(ds)))

In [57]:
print(tokenizer.convert_ids_to_tokens(ds[idx]['input_ids']))

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


['<s>', '▁Le', '▁:', '▁12', '/11/', '2019', '▁Cour', '▁d', '’', 'appel', '▁Saint', '-', 'Denis', '▁Réunion', '▁chambre', '▁civile', '▁Audi', 'ence', '▁publique', '▁6', '▁mars', '▁2015', '▁N', '°', '▁', 'RG', ':', '▁13', '/01', '3', '66', '▁In', 'firm', 'e', '▁décision', '▁déf', 'érée', '▁toutes', '▁dispositions', ',', '▁l', '’', 'égard', '▁toutes', '▁parties', '▁recours', '▁REP', 'UB', 'L', 'IQUE', '▁FRANC', 'A', 'ISE', '▁AU', '▁', 'NOM', '▁DU', '▁PE', 'UP', 'LE', '▁FRANC', 'AIS', '▁C', 'OUR', '▁D', '’', 'APP', 'EL', '▁DE', '▁SAINT', '-', 'DEN', 'IS', '▁A', 'RR', '<unk>', 'T', '▁DU', '▁06', '▁MAR', 'S', '▁2015', '▁Chambre', '▁civile', '▁T', 'GI', '▁A', 'RR', '<unk>', 'T', '▁No', '15', '/', '▁P', 'B', '▁R', '.', '▁G', '▁:', '▁13', '/', '▁01', '3', '66', '▁X', '...', '▁C', '/', '▁Association', '▁G', 'ROU', 'P', 'EMENT', '▁SPORT', 'IF', '▁DE', '▁LA', '▁J', 'EU', 'NES', 'SE', '▁Organ', 'isme', '▁C', 'AIS', 'SE', '▁GE', 'NER', 'ALE', '▁DE', '▁SEC', 'UR', 'ITE', '▁SO', 'CI', 'ALE', '▁DE', '▁

# Now, we create our DataLoader

In [58]:
class CustomCollator():
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def __call__(self, batch):
        output = dict()
        output["input_ids"] = [sample["input_ids"] for sample in batch]
        output["attention_mask"] = [sample["attention_mask"] for sample in batch]
        output["sexe"] = [sample["sexe"] for sample in batch]
        output["text_id"] = [sample["text_id"] for sample in batch]


        batch_max = max([len(ids) for ids in output["input_ids"]])

        # add padding
        if self.tokenizer.padding_side == "right":
            output["input_ids"] = [s + (batch_max - len(s)) * [self.tokenizer.pad_token_id] for s in output["input_ids"]]
            output["attention_mask"] = [s + (batch_max - len(s)) * [0] for s in output["attention_mask"]]

        else:

            output["input_ids"] = [(batch_max - len(s)) * [self.tokenizer.pad_token_id] + s for s in output["input_ids"]]
            output["attention_mask"] = [(batch_max - len(s)) * [0] + s for s in output["attention_mask"]]

        # convert to tensors
        output["input_ids"] = torch.tensor(output["input_ids"], dtype=torch.long)
        output["attention_mask"] = torch.tensor(output["attention_mask"], dtype=torch.long)

        sexe_to_int = {"homme": 0,
                       "femme": 1,
                       "n.c.": -1}
        output["sexe"] = torch.tensor([sexe_to_int[item] for item in output["sexe"]], dtype=torch.long)
        output["text_id"] = torch.tensor(output["text_id"], dtype=torch.long)
        return output


In [59]:
collator_function = CustomCollator(tokenizer)
my_dataset = JuridiqueDataset(train,tokenizer,args)

In [60]:
data_loader = DataLoader(my_dataset,drop_last = False,num_workers=0,pin_memory=False,shuffle=False,
                              batch_size = 2,collate_fn = collator_function)

In [61]:
for batch in tqdm(data_loader):
    break

  0%|          | 0/385 [00:00<?, ?it/s]


In [62]:
batch['input_ids'].shape,batch['sexe'].shape,batch['attention_mask'].shape

(torch.Size([2, 512]), torch.Size([2]), torch.Size([2, 512]))

In [63]:
batch['attention_mask']

tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]])

## We create our model

In [64]:
class MyBertModel(nn.Module):
    def __init__(self, model_name="almanach/camembert-base", num_labels=2):
        super().__init__()
        self.config = AutoConfig.from_pretrained(model_name, output_hidden_states=True)
        self.backbone = AutoModel.from_pretrained(model_name)
        self.fc = nn.Linear(self.config.hidden_size, num_labels)

    def forward(self, batch):
        inputs = {k: v for k, v in batch.items() if k in ['input_ids', 'attention_mask']}
        outputs = self.backbone(**inputs)
        x = outputs.last_hidden_state[:, 0, :]
        x = self.fc(x)
        return x


# Training of the model

In [65]:
def train_one_step(batch,model,criterion):
    """
    Complete this function which should return the loss generate on the bacth data
    """
    # convert bacth data to same device as model
    device  = torch.device(f"cuda:0" if torch.cuda.is_available() else "cpu")
    batch = batch_to_device(batch,device)
    # one step forward with the bacth
    pred = model(batch)

    # compute loss
    loss = criterion(pred.squeeze(),batch['sexe'].float().squeeze(-1))
    return loss

In [66]:
import time
def train_one_epoch(epoch_number,data_loader,model,criterion,optimzer,lr_scheduler):
    losses = []
    model.train()
    start_time = time.time()
    pbar = tqdm(data_loader)
    for batch in pbar:
        loss = train_one_step(batch,model,criterion)
        pbar.set_postfix({"loss":loss.item()})
        losses.append(loss.item())
        loss.backward()
        optimzer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

    lr = scheduler.get_lr()[0]
    elapsed_time = time.time() - start_time
    loss_ = np.mean(losses)
    print(f"Epoch {epoch_number + 1} :  lr={lr:.6f} t={elapsed_time:.0f}s loss : {loss_:.5f}")
    return model

In [67]:
def inference(valid_loader, model):
    predictions = []
    model.eval()

    device = next(model.parameters()).device

    with torch.no_grad():
        for batch in tqdm(valid_loader):
            batch = {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in batch.items()}
            pred = model(batch).sigmoid().squeeze()

            if pred.dim() == 0:
                pred = pred.unsqueeze(0)

            predictions.append(pred.detach().cpu().numpy())

    predictions = np.concatenate(predictions, axis=0)

    df_predict = pd.DataFrame({"sexe_pred": predictions.tolist()})
    return df_predict

In [68]:
def batch_to_device(batch, device):
    """Moves only batch tensors to the specified device."""
    batch_dict = {}
    for key in batch:
        if isinstance(batch[key], torch.Tensor):
            batch_dict[key] = batch[key].to(device)
        else:
            batch_dict[key] = batch[key]
    return batch_dict


In [69]:
# Define your model
device = torch.device(f"cuda:0" if torch.cuda.is_available() else "cpu")
net = MyBertModel(model_name=model_name,num_labels=1)
net.to(device)

MyBertModel(
  (backbone): CamembertModel(
    (embeddings): CamembertEmbeddings(
      (word_embeddings): Embedding(32005, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): CamembertEncoder(
      (layer): ModuleList(
        (0-11): 12 x CamembertLayer(
          (attention): CamembertAttention(
            (self): CamembertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): CamembertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (Laye

In [70]:
# Define an optimzer

optimizer = optim.AdamW(net.parameters(),lr = 4e-6 )

In [71]:
# Define a scheduller for your model training
from transformers import get_linear_schedule_with_warmup,get_cosine_schedule_with_warmup

BATCH_SIZE = 8
EPOCHS = 10
warmup_steps = 0.04 * (len(train)//BATCH_SIZE)
training_steps = EPOCHS * (len(train)// (BATCH_SIZE))

scheduler = get_cosine_schedule_with_warmup(optimizer, warmup_steps, training_steps)

In [72]:
warmup_steps,training_steps

(3.84, 960)

In [73]:
criterion = nn.BCEWithLogitsLoss().to(device)
for epoch_num in range(EPOCHS):
    net = train_one_epoch(epoch_num,data_loader,net,criterion,optimizer,scheduler)

100%|██████████| 385/385 [01:22<00:00,  4.66it/s, loss=0.284]


Epoch 1 :  lr=0.000003 t=83s loss : 0.58452


100%|██████████| 385/385 [01:24<00:00,  4.53it/s, loss=0.309]


Epoch 2 :  lr=0.000000 t=85s loss : 0.57482


100%|██████████| 385/385 [01:25<00:00,  4.48it/s, loss=0.307]


Epoch 3 :  lr=0.000000 t=86s loss : 0.57042


100%|██████████| 385/385 [01:26<00:00,  4.46it/s, loss=0.299]


Epoch 4 :  lr=0.000003 t=86s loss : 0.57314


100%|██████████| 385/385 [01:26<00:00,  4.44it/s, loss=0.288]


Epoch 5 :  lr=0.000004 t=87s loss : 0.57232


100%|██████████| 385/385 [01:27<00:00,  4.42it/s, loss=0.184]


Epoch 6 :  lr=0.000003 t=87s loss : 0.47023


100%|██████████| 385/385 [01:26<00:00,  4.43it/s, loss=0.171]


Epoch 7 :  lr=0.000000 t=87s loss : 0.32552


100%|██████████| 385/385 [01:26<00:00,  4.43it/s, loss=0.168]


Epoch 8 :  lr=0.000000 t=87s loss : 0.29475


100%|██████████| 385/385 [01:26<00:00,  4.43it/s, loss=0.164]


Epoch 9 :  lr=0.000003 t=87s loss : 0.28901


100%|██████████| 385/385 [01:26<00:00,  4.43it/s, loss=0.137]

Epoch 10 :  lr=0.000004 t=87s loss : 0.25261





In [74]:
collator_function = CustomCollator(tokenizer)
valid_dataset = JuridiqueDataset(train,tokenizer,args)
valid_loader = DataLoader(valid_dataset,drop_last = False,num_workers=0,pin_memory=False,shuffle=False,
                              batch_size = 6, collate_fn = collator_function)

In [75]:
pred_df = inference(valid_loader,net)

100%|██████████| 129/129 [00:37<00:00,  3.44it/s]


# Let's predict the sex of the victim!

## We will use a TF-IDF

In [11]:
X = train[["texte", 'text_id']]

vect = TfidfVectorizer(
  max_features=5000,
  stop_words=list(fr_stop), binary=True)

X = vect.fit_transform(train['texte'])
train["sexe"] = train["sexe"].replace({'homme':0, "femme":1, "n.c.": -1})
y = train['sexe']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf = LogisticRegression(random_state=42).fit(X, y)

preds = clf.predict((X_test))

  train["sexe"] = train["sexe"].replace({'homme':0, "femme":1, "n.c.": -1})


# Now, we compute our metrics

In [12]:
print("accuracy:", accuracy_score(y_test, preds))
print("f1:", f1_score(y_test, preds, average='macro'))

accuracy: 0.9155844155844156
f1: 0.8854363376251788
