In [2]:
import pandas as pd
import numpy as np

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import nltk

from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from sklearn.preprocessing import LabelEncoder

In [3]:
df = pd.read_csv('train.tsv', sep='\t')
df_test = pd.read_csv('test.tsv', sep='\t')
df.drop(['url'], axis=1, inplace=True)
df_test.drop(['url'], axis=1, inplace=True)

In [4]:
nltk.download("stopwords")
nltk.download("punkt")

french_stopwords = stopwords.words("french") + list(string.punctuation) + ["''", "``", "...", "’", "``", "«", "»", "``"]


def tokenize(text):
    tokens = word_tokenize(text)
    tokens = [token.lower() for token in tokens] # convert to lower case
    tokens = [token for token in tokens if token.isalpha()] # remove punctuation
    return tokens

[nltk_data] Downloading package stopwords to /home/bina/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/bina/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
df_text = df['text'].apply(tokenize)
df_headline = df['headline'].apply(tokenize)
df['combined_tokens'] = df_text + df_headline
df['length'] = df['combined_tokens'].apply(len)

df_test_text = df_test['text'].apply(tokenize)
df_test_headline = df_test['headline'].apply(tokenize)
df_test['combined_tokens'] = df_test_text + df_test_headline
df_test['length'] = df_test['combined_tokens'].apply(len) # used later to define padding

max_length = int(np.percentile(df['length'], 95))  # Using 95th percentile to avoid outliers
print(f"95th percentile length of combined tokens: {max_length}")

95th percentile length of combined tokens: 1795


In [6]:
def pad_tokens(tokens, max_length):
    tokens = tokens[:max_length] + ['<pad>'] * max(0, max_length - len(tokens))
    return tokens

In [7]:
df['padded_tokens'] = df['combined_tokens'].apply(lambda x: pad_tokens(x, max_length))
df_test['padded_tokens'] = df_test['combined_tokens'].apply(lambda x: pad_tokens(x, max_length))


In [8]:
label_encoder = LabelEncoder()
df['encoded_category'] = label_encoder.fit_transform(df['category'])
df_test['encoded_category'] = label_encoder.transform(df_test['category'])

In [32]:
df

Unnamed: 0,category,headline,text,combined_tokens,length,padded_tokens,encoded_category
0,sports,"L'Ouganda à l'assaut des ""fimbu"" de la RDC","L'Ouganda, placé 79e au classement FIFA le 4 a...","[placé, au, classement, fifa, le, avril, a, po...",319,"[placé, au, classement, fifa, le, avril, a, po...",3
1,business,Stopper la détérioration de l’environnement po...,La responsable de la biodiversité des Nations ...,"[la, responsable, de, la, biodiversité, des, n...",585,"[la, responsable, de, la, biodiversité, des, n...",0
2,sports,Coupe d'Afrique des nations 2022 : le sélectio...,"Le sélectionneur de la Sierra Leone, John Keis...","[le, sélectionneur, de, la, sierra, leone, joh...",305,"[le, sélectionneur, de, la, sierra, leone, joh...",3
3,business,Tissus africains : pourquoi les teintureries h...,"Depuis plus de six siècles, une vaste zone sit...","[depuis, plus, de, six, siècles, une, vaste, z...",734,"[depuis, plus, de, six, siècles, une, vaste, z...",0
4,business,Les revenus pendant la pandémie des dix hommes...,"Pendant la pandémie de coronavirus, la richess...","[pendant, la, pandémie, de, coronavirus, la, r...",598,"[pendant, la, pandémie, de, coronavirus, la, r...",0
...,...,...,...,...,...,...,...
1471,sports,C1: Le PSG va disputer sa toute première finale,Le Paris SG s'est qualifié pour sa toute premi...,"[le, paris, sg, qualifié, pour, sa, toute, pre...",209,"[le, paris, sg, qualifié, pour, sa, toute, pre...",3
1472,sports,"Aubameyang invité à rejoindre un ""club plus am...","L'attaquant gabonais d'Arsenal, dont le contra...","[gabonais, dont, le, contrat, doit, expirer, à...",235,"[gabonais, dont, le, contrat, doit, expirer, à...",3
1473,politics,Guerre Ukraine - Russie : qui est Sergey Surov...,La désignation de Sergey Surovikin pour dirige...,"[la, désignation, de, sergey, surovikin, pour,...",862,"[la, désignation, de, sergey, surovikin, pour,...",2
1474,technology,"Svetlana Jitomirskaya, la mathématicienne à l'...",Il existe un type de papillon qui captive le p...,"[il, existe, un, type, de, papillon, qui, capt...",1540,"[il, existe, un, type, de, papillon, qui, capt...",4


In [22]:
all_tokens = [token for tokens in df['padded_tokens'] for token in tokens]
vocab = set(all_tokens)
word_to_index = {word: idx for idx, word in enumerate(vocab)}
vocab_size = len(vocab)

In [23]:
class TextDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        tokens = self.texts[idx]
        indices = [word_to_index.get(token, len(vocab)) for token in tokens]
        return torch.tensor(indices, dtype=torch.long), torch.tensor(self.labels[idx], dtype=torch.long)


In [24]:
X_train, X_val, y_train, y_val = train_test_split(df['padded_tokens'].tolist(), df['encoded_category'].tolist(), test_size=0.2, random_state=42)


In [25]:
train_dataset = TextDataset(X_train, y_train)
val_dataset = TextDataset(X_val, y_val)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

In [27]:
train_loader.batch_size

32

In [39]:
class CNN_Text(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_classes, kernel_num=100, kernel_sizes=[3, 4, 5], dropout=0.5):
        super(CNN_Text, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.convs = nn.ModuleList([nn.Conv2d(1, kernel_num, (K, embed_dim)) for K in kernel_sizes])
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(len(kernel_sizes) * kernel_num, num_classes)
    
    def forward(self, x):
        x = self.embedding(x).unsqueeze(1)  # (N, 1, max_length, embed_dim)
        x = [nn.functional.relu(conv(x)).squeeze(3) for conv in self.convs]  # [(N, kernel_num, W), ...]*len(kernel_sizes)
        x = [nn.functional.max_pool1d(line, line.size(2)).squeeze(2) for line in x]  # [(N, kernel_num), ...]*len(kernel_sizes)
        x = torch.cat(x, 1)  # (N, kernel_num * len(kernel_sizes))
        x = self.dropout(x)
        logits = self.fc(x)  # (N, num_classes)
        return logits

In [40]:
num_classes = 5
vocab_size = len(vocab)
embed_dim = 300  
model = CNN_Text(vocab_size, embed_dim, num_classes)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

In [41]:
def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=10):
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        
        val_loss = 0.0
        correct = 0
        total = 0
        model.eval()
        with torch.no_grad():
            for inputs, labels in val_loader:
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                val_loss += loss.item()
                _, predicted = torch.max(outputs, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
        
        print(f"Epoch {epoch + 1}, Train Loss: {running_loss / len(train_loader)}, Val Loss: {val_loss / len(val_loader)}, Val Acc: {correct / total}")



In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
model = model.to(device)

In [42]:
train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=10)

Epoch 1, Train Loss: 1.856938207471693, Val Loss: 1.2453073024749757, Val Acc: 0.5878378378378378
Epoch 2, Train Loss: 1.1350007121627395, Val Loss: 1.0019733428955078, Val Acc: 0.7128378378378378
Epoch 3, Train Loss: 0.7827557550894247, Val Loss: 0.897068202495575, Val Acc: 0.7398648648648649
Epoch 4, Train Loss: 0.5982036614740217, Val Loss: 0.8988564252853394, Val Acc: 0.7128378378378378
Epoch 5, Train Loss: 0.4714181962850931, Val Loss: 0.8510267436504364, Val Acc: 0.7567567567567568
Epoch 6, Train Loss: 0.382610909439422, Val Loss: 0.828895315527916, Val Acc: 0.7533783783783784
Epoch 7, Train Loss: 0.3680238792219678, Val Loss: 0.8156717658042908, Val Acc: 0.7635135135135135
Epoch 8, Train Loss: 0.2887197360396385, Val Loss: 0.7901791721582413, Val Acc: 0.7736486486486487
Epoch 9, Train Loss: 0.25411405011608795, Val Loss: 0.8095291316509247, Val Acc: 0.7837837837837838
Epoch 10, Train Loss: 0.17124696237009926, Val Loss: 0.7890667378902435, Val Acc: 0.8040540540540541
