In [1]:
import pandas as pd
import numpy as np

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import nltk

from sklearn.model_selection import train_test_split

from gensim.models import KeyedVectors

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from sklearn.preprocessing import LabelEncoder

In [2]:
# Loading pre-trained word embeddings for the French language
fasttext_model = KeyedVectors.load_word2vec_format('cc.fr.300.vec', binary=False)    

In [3]:
df = pd.read_csv('train.tsv', sep='\t')
df_test = pd.read_csv('test.tsv', sep='\t')
# 'url' attribute not used for classification
df.drop(['url'], axis=1, inplace=True)
df_test.drop(['url'], axis=1, inplace=True)

In [4]:
nltk.download("stopwords")
nltk.download("punkt")

french_stopwords = stopwords.words("french") + list(string.punctuation) + ["''", "``", "...", "’", "``", "«", "»", "``"]


def tokenize(text):
    tokens = word_tokenize(text)
    tokens = [token.lower() for token in tokens] # convert to lower case
    tokens = [token for token in tokens if token.isalpha()] # remove punctuation
    tokens = [token for token in tokens if token not in french_stopwords]  # remove stopwords
    return tokens

[nltk_data] Downloading package stopwords to /home/bina/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/bina/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
# tokenizing text to calculate word embeddings
df_text = df['text'].apply(tokenize)
df_headline = df['headline'].apply(tokenize)
df['combined_tokens'] = df_text + df_headline
df['length'] = df['combined_tokens'].apply(len) # used later to define padding

df_test_text = df_test['text'].apply(tokenize)
df_test_headline = df_test['headline'].apply(tokenize)
df_test['combined_tokens'] = df_test_text + df_test_headline
df_test['length'] = df_test['combined_tokens'].apply(len) # used later to define padding

max_length = int(np.percentile(df['length'], 95))  # Using 95th percentile to avoid outliers
print(f"95th percentile length of combined tokens: {max_length}")

95th percentile length of combined tokens: 1003


In [6]:
def pad_tokens(tokens, max_length):
    tokens = tokens[:max_length] + ['<pad>'] * max(0, max_length - len(tokens))
    return tokens

In [7]:
# padding needed so we have uniform dimension of word embeddings
df['padded_tokens'] = df['combined_tokens'].apply(lambda x: pad_tokens(x, max_length))
df_test['padded_tokens'] = df_test['combined_tokens'].apply(lambda x: pad_tokens(x, max_length))


In [8]:
# integer labels for categories
label_encoder = LabelEncoder()
df['encoded_category'] = label_encoder.fit_transform(df['category'])
df_test['encoded_category'] = label_encoder.transform(df_test['category'])

In [103]:
category_names = label_encoder.classes_
encoded_values = list(range(len(category_names)))
category_dict = dict(zip(encoded_values, category_names))


In [9]:
df

Unnamed: 0,category,headline,text,combined_tokens,length,padded_tokens,encoded_category
0,sports,"L'Ouganda à l'assaut des ""fimbu"" de la RDC","L'Ouganda, placé 79e au classement FIFA le 4 a...","[placé, classement, fifa, avril, a, pourtant, ...",184,"[placé, classement, fifa, avril, a, pourtant, ...",3
1,business,Stopper la détérioration de l’environnement po...,La responsable de la biodiversité des Nations ...,"[responsable, biodiversité, nations, unies, es...",321,"[responsable, biodiversité, nations, unies, es...",0
2,sports,Coupe d'Afrique des nations 2022 : le sélectio...,"Le sélectionneur de la Sierra Leone, John Keis...","[sélectionneur, sierra, leone, john, keister, ...",172,"[sélectionneur, sierra, leone, john, keister, ...",3
3,business,Tissus africains : pourquoi les teintureries h...,"Depuis plus de six siècles, une vaste zone sit...","[depuis, plus, six, siècles, vaste, zone, situ...",438,"[depuis, plus, six, siècles, vaste, zone, situ...",0
4,business,Les revenus pendant la pandémie des dix hommes...,"Pendant la pandémie de coronavirus, la richess...","[pendant, pandémie, coronavirus, richesse, com...",350,"[pendant, pandémie, coronavirus, richesse, com...",0
...,...,...,...,...,...,...,...
1471,sports,C1: Le PSG va disputer sa toute première finale,Le Paris SG s'est qualifié pour sa toute premi...,"[paris, sg, qualifié, toute, première, finale,...",133,"[paris, sg, qualifié, toute, première, finale,...",3
1472,sports,"Aubameyang invité à rejoindre un ""club plus am...","L'attaquant gabonais d'Arsenal, dont le contra...","[gabonais, dont, contrat, doit, expirer, fin, ...",150,"[gabonais, dont, contrat, doit, expirer, fin, ...",3
1473,politics,Guerre Ukraine - Russie : qui est Sergey Surov...,La désignation de Sergey Surovikin pour dirige...,"[désignation, sergey, surovikin, diriger, russ...",499,"[désignation, sergey, surovikin, diriger, russ...",2
1474,technology,"Svetlana Jitomirskaya, la mathématicienne à l'...",Il existe un type de papillon qui captive le p...,"[existe, type, papillon, captive, professeur, ...",871,"[existe, type, papillon, captive, professeur, ...",4


In [10]:
# creating vocabulary of all tokens
all_tokens = [token for tokens in df['padded_tokens'] for token in tokens]
vocab = set(all_tokens)
word_to_index = {word: idx for idx, word in enumerate(vocab)}
vocab_size = len(vocab)

In [11]:
class TextDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        tokens = self.texts[idx]
        indices = [word_to_index.get(token, len(vocab)) for token in tokens]
        return torch.tensor(indices, dtype=torch.long), torch.tensor(self.labels[idx], dtype=torch.long)


In [12]:
X_train, X_val, y_train, y_val = train_test_split(df['padded_tokens'].tolist(), df['encoded_category'].tolist(), test_size=0.2, random_state=42)


In [26]:
train_dataset = TextDataset(X_train, y_train)
val_dataset = TextDataset(X_val, y_val)

train_loader = DataLoader(train_dataset, batch_size=10, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=10, shuffle=False)

In [27]:
num_classes = 5
vocab_size = len(vocab)
embed_dim = 300  

In [28]:
vocab_size

41680

In [29]:
# word embedding matrix to use in the CNN Embedding layer
embedding_matrix = np.zeros((vocab_size, embed_dim))
for word, idx in word_to_index.items():
    if word in fasttext_model.key_to_index:
        embedding_matrix[idx] = fasttext_model[word]


N - batch size

max_length - maximum length of a text sequence

vocab_size - size of the vocabulary

embed_dim - dimensionality of word embeddings

In [48]:
class CNN_Text(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_classes, kernel_num=100, kernel_sizes=[3, 4, 5], dropout=0.5, embedding_matrix=None):
        super(CNN_Text, self).__init__()
        if embedding_matrix is not None:
            self.embedding = nn.Embedding.from_pretrained(torch.tensor(embedding_matrix, dtype=torch.float32), freeze=False)
        else:
            self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.convs = nn.ModuleList([nn.Conv2d(1, kernel_num, (K, embed_dim)) for K in kernel_sizes])
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(len(kernel_sizes) * kernel_num, num_classes)
    
    def forward(self, x):
        x = self.embedding(x).unsqueeze(1)  # (N, 1, max_length, embed_dim)
        x = [nn.functional.relu(conv(x)).squeeze(3) for conv in self.convs]  # [(N, kernel_num, W), ...]*len(kernel_sizes)
        x = [nn.functional.max_pool1d(line, line.size(2)).squeeze(2) for line in x]  # [(N, kernel_num), ...]*len(kernel_sizes)
        x = torch.cat(x, 1)  # (N, kernel_num * len(kernel_sizes))
        x = self.dropout(x)
        logits = self.fc(x)  # (N, num_classes)
        return logits

In [49]:
model = CNN_Text(vocab_size, embed_dim, num_classes, embedding_matrix=embedding_matrix)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

In [51]:
# training with early stopping
def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=10, patience=3):
    best_val_acc = 0.0
    no_improvement = 0

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        val_loss = 0.0
        correct = 0
        total = 0
        model.eval()
        with torch.no_grad():
            for inputs, labels in val_loader:
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                val_loss += loss.item()
                _, predicted = torch.max(outputs, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

        val_acc = correct / total

        if val_acc > best_val_acc:
            best_val_acc = val_acc
            no_improvement = 0
        else:
            no_improvement += 1

        if no_improvement >= patience:
            print(f"Early stopping at epoch {epoch + 1}")
            break

        print(f"Epoch {epoch + 1}, Train Loss: {running_loss / len(train_loader)}, Val Loss: {val_loss / len(val_loader)}, Val Acc: {val_acc}")


In [52]:
train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=10, patience=3)

Epoch 1, Train Loss: 1.4159768223762512, Val Loss: 1.317355751991272, Val Acc: 0.4594594594594595
Epoch 2, Train Loss: 1.0031138375401496, Val Loss: 0.9845913529396058, Val Acc: 0.7364864864864865
Epoch 3, Train Loss: 0.6285928912460804, Val Loss: 0.7737149596214294, Val Acc: 0.75
Epoch 4, Train Loss: 0.38354796059429647, Val Loss: 0.6965559154748917, Val Acc: 0.7871621621621622
Epoch 5, Train Loss: 0.22943217866122723, Val Loss: 0.6624760389328003, Val Acc: 0.8006756756756757
Epoch 6, Train Loss: 0.11999846268445254, Val Loss: 0.6622957706451416, Val Acc: 0.7871621621621622
Epoch 7, Train Loss: 0.07272455333732068, Val Loss: 0.6860948741436005, Val Acc: 0.8006756756756757
Early stopping at epoch 8


In [61]:
model.eval()
test_loss = 0.0
correct = 0
total = 0
predictions = []
with torch.no_grad():
    for inputs, labels in val_loader:
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        test_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        predictions.extend(predicted.cpu().numpy())
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

In [63]:
test_acc = correct / total
print(f"Test Loss: {test_loss / len(val_loader)}, Test Acc: {test_acc}")


Test Loss: 0.7034702569246292, Test Acc: 0.7871621621621622


In [71]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [97]:
label_encoder.classes_

array(['business', 'health', 'politics', 'sports', 'technology'],
      dtype=object)

In [108]:
conf_mat = confusion_matrix(y_val, predictions)


In [109]:
conf_mat

array([[63,  4,  8,  1,  1],
       [ 4, 59,  1,  1,  1],
       [15,  2, 45,  0,  2],
       [ 2,  4,  1, 60,  1],
       [ 6,  2,  1,  6,  6]])