#### Author: Serge Wilson MENDY

### Dependencies installation

In [1]:
!pip install transformers



### Librairies importation

In [2]:
import pandas as pd
import torch
import torch.nn as nn
from torch.optim import AdamW
from transformers import BertTokenizer, BertForSequenceClassification




### Data importation

In [3]:
df = pd.read_csv("/kaggle/input/allocine-french-movie-reviews/train.csv", index_col=0)
df.head()

Unnamed: 0,film-url,review,polarity
0,http://www.allocine.fr/film/fichefilm-135259/c...,Si vous cherchez du cinéma abrutissant à tous ...,0
1,http://www.allocine.fr/film/fichefilm-172430/c...,"Trash, re-trash et re-re-trash...! Une horreur...",0
2,http://www.allocine.fr/film/fichefilm-15105/cr...,"Et si, dans les 5 premières minutes du film, l...",0
3,http://www.allocine.fr/film/fichefilm-188629/c...,Mon dieu ! Quelle métaphore filée ! Je suis ab...,0
4,http://www.allocine.fr/film/fichefilm-23514/cr...,"Premier film de la saga Kozure Okami, ""Le Sabr...",1


In [4]:
sampled_label_0 = df[df['polarity'] == 0].sample(n=15000, random_state=42)
sampled_label_1 = df[df['polarity'] == 1].sample(n=15000, random_state=42)


final_df = pd.concat([sampled_label_0, sampled_label_1])
final_df = final_df.sample(frac=1, random_state=42)
final_df = final_df.reset_index(drop=True)
final_df.head()


Unnamed: 0,film-url,review,polarity
0,http://www.allocine.fr/film/fichefilm-179924/c...,Si c’était une prod sci Fy on pourrait être in...,0
1,http://www.allocine.fr/film/fichefilm-59668/cr...,Exactement ce que je voulais voir en ce début ...,1
2,http://www.allocine.fr/film/fichefilm-53619/cr...,Excellent film ! Quoi de mieux pour comprendre...,1
3,http://www.allocine.fr/film/fichefilm-42167/cr...,"Un film très noir, qui nous dépeint la guerre ...",1
4,http://www.allocine.fr/film/fichefilm-201345/c...,"Un bon gros nanar, bien qu'assez moyen. Les 45...",0


### Preprocessing

In [5]:
input_texts = final_df["review"].tolist()
labels = final_df["polarity"].tolist()

num_classes = len(set(labels))

In [6]:
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_classes)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
label_to_index = {"négative":0, "positive":1}
index_to_label = {0:"négative", 1:"postive"}

In [8]:
input_encodings = tokenizer(input_texts, truncation=True, padding=True, return_tensors="pt")
encoded_labels = torch.tensor(labels)

In [9]:
from torch.utils.data import random_split, DataLoader


data = torch.utils.data.TensorDataset(input_encodings["input_ids"], input_encodings["attention_mask"], encoded_labels)

train_size = int(0.7 * len(data))
val_size = int(0.1 * len(data))
test_size = len(data) - train_size - val_size

train_data, val_data, test_data = random_split(data, [train_size, val_size, test_size], generator=torch.Generator().manual_seed(42))

batch_size = 16

train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=True)

### Model training

In [10]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [11]:
import wandb
wandb.init(project="Sentiment Analysis French")

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [12]:
from sklearn.metrics import accuracy_score
from tqdm import tqdm

optimizer = AdamW(model.parameters(), lr=1e-5)

num_epochs = 5

for epoch in range(num_epochs):
    model.train()
    for batch in tqdm(train_loader, desc=f'Epoch {epoch + 1}/{num_epochs}', unit='batches'):
        batch = tuple(t.to(device) for t in batch)
        batch_inputs, batch_attention_mask, batch_labels = batch
        optimizer.zero_grad()
        outputs = model(input_ids=batch_inputs, attention_mask = batch_attention_mask, labels=batch_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
    average_loss = loss / len(train_loader)
    wandb.log({"train_loss": average_loss, "epoch": epoch})

    valid_labels = []
    pred_valid_labels = []
    model.eval()
    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f'Epoch {epoch + 1}/{num_epochs}', unit='batches'):
            batch = tuple(t.to(device) for t in batch)
            input_ids, attention_mask, labels = batch

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            predictions = logits.argmax(dim=1).cpu().numpy()
            pred_valid_labels.extend(predictions)
            valid_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(valid_labels, pred_valid_labels)
    wandb.log({"val_accuracy": average_loss, "epoch": epoch})
    print(f'Epoch {epoch+1}, Taining loss: {average_loss}, Validation Accuracy: {accuracy}%')

Epoch 1/5: 100%|██████████| 1313/1313 [32:14<00:00,  1.47s/batches]
Epoch 1/5: 100%|██████████| 188/188 [01:43<00:00,  1.81batches/s]


Epoch 1, Taining loss: 0.0003507062210701406, Validation Accuracy: 0.9096666666666666%


Epoch 2/5: 100%|██████████| 1313/1313 [32:18<00:00,  1.48s/batches]
Epoch 2/5: 100%|██████████| 188/188 [01:44<00:00,  1.81batches/s]


Epoch 2, Taining loss: 0.0001326219498878345, Validation Accuracy: 0.928%


Epoch 3/5: 100%|██████████| 1313/1313 [32:19<00:00,  1.48s/batches]
Epoch 3/5: 100%|██████████| 188/188 [01:44<00:00,  1.81batches/s]


Epoch 3, Taining loss: 5.4231648391578346e-05, Validation Accuracy: 0.9313333333333333%


Epoch 4/5: 100%|██████████| 1313/1313 [32:18<00:00,  1.48s/batches]
Epoch 4/5: 100%|██████████| 188/188 [01:44<00:00,  1.81batches/s]


Epoch 4, Taining loss: 1.0595962521620095e-05, Validation Accuracy: 0.9306666666666666%


Epoch 5/5: 100%|██████████| 1313/1313 [32:19<00:00,  1.48s/batches]
Epoch 5/5: 100%|██████████| 188/188 [01:44<00:00,  1.80batches/s]

Epoch 5, Taining loss: 1.1792564009738271e-06, Validation Accuracy: 0.9176666666666666%





### Model evaluation

#### Accuracies computation

In [13]:
model.eval()

train_labels = []
pred_train_labels = []

for batch in train_loader:
    batch = tuple(t.to(device) for t in batch)
    input_ids, attention_mask, labels = batch
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions = logits.argmax(dim=1).cpu().numpy()
        pred_train_labels.extend(predictions)
        train_labels.extend(labels.cpu().numpy())

test_labels = []
pred_test_labels = []

for batch in test_loader:
    batch = tuple(t.to(device) for t in batch)
    input_ids, attention_mask, labels = batch
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions = logits.argmax(dim=1).cpu().numpy()
        pred_test_labels.extend(predictions)
        test_labels.extend(labels.cpu().numpy())


In [14]:
from sklearn.metrics import accuracy_score

accuracy_train = accuracy_score(train_labels, pred_train_labels)
accuracy_test = accuracy_score(test_labels, pred_test_labels)

print("Accuracy sur les données d'entrainement:", accuracy_train)
print("Accuracy sur les données de test :", accuracy_test)

Accuracy sur les données d'entrainement: 0.9900476190476191
Accuracy sur les données de test : 0.9133333333333333


### Model Deployement

In [15]:
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
huggingface_token = user_secrets.get_secret("huggingface")

!mkdir -p ~/.huggingface
!echo -n $huggingface_token > ~/.huggingface/token


In [16]:
model_name = "serge-wilson/sentiment_analysis_french"
model.push_to_hub(model_name)
tokenizer.push_to_hub(model_name)



pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/serge-wilson/sentiment_analysis_french/commit/82337d74b777d2a671e749da7a697d1513051639', commit_message='Upload tokenizer', commit_description='', oid='82337d74b777d2a671e749da7a697d1513051639', pr_url=None, pr_revision=None, pr_num=None)