In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import StratifiedKFold
from lightgbm import LGBMClassifier
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from sklearn.metrics import f1_score

In [2]:
# Load the data
train_data = pd.read_csv('../data/train.csv')
test_data = pd.read_csv('../data/test.csv')
sample_submission = pd.read_csv('../data/sample_submission.csv')

# Preprocess the text data using CountVectorizer
count_vect = CountVectorizer()
X_train = count_vect.fit_transform(train_data['text']).astype('float32')
y_train = train_data['label'].astype('float32')
X_test = count_vect.transform(test_data['text']).astype('float32')

In [14]:
# Initialize the soft-voting model
models = []
lgbm_params = {'boosting_type': 'gbdt',
              'objective': 'multiclass',
              'num_class': 8,
              'num_leaves': 31,
              'learning_rate': 0.05,
              'feature_fraction': 0.9,
              'bagging_fraction': 0.8,
              'bagging_freq': 5,
              'verbose': -1,
              'random_state': 42,
              'num_iterations': 5 ##################
              }
lgbm = LGBMClassifier(**lgbm_params)
models.append(('lgbm', lgbm))

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = torch.device('cpu')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=8)
model.to(device)
models.append(('bert', model))

# Train the soft-voting model using Stratified K-fold
n_splits = 2 ##################
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
preds_list = []
for i, (train_index, val_index) in enumerate(skf.split(X_train, y_train)):
    print(f'Fold {i+1} LGB START')
    X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
    y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]

    # Train the LGBM model
    lgbm.fit(X_train_fold, y_train_fold,
             eval_set=[(X_val_fold, y_val_fold)],
             early_stopping_rounds=50,
             verbose=100)
    
    print(f'Fold {i+1} LGB FINISH')
    print('')
    print(f'Fold {i+1} BERT START')
    
    # Train the BERT model
    batch_size = 16
    train_dataset = torch.utils.data.TensorDataset(torch.tensor(X_train_fold.toarray()).to(device),
                                                   torch.tensor(y_train_fold.values).to(device))
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_dataset = torch.utils.data.TensorDataset(torch.tensor(X_val_fold.toarray()).to(device),
                                                 torch.tensor(y_val_fold.values).to(device))
    val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
    num_epochs = 1 ##################
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids, labels = batch
            outputs = model(input_ids, labels=labels)
            loss, logits = outputs[:2]
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        train_loss /= len(train_loader)

        model.eval()
        val_loss = 0
        val_preds = []
        with torch.no_grad():
            for batch in val_loader:
                input_ids, labels = batch
                outputs = model(input_ids, labels=labels)
                loss, logits = outputs[:2]
                val_loss += loss.item()
                preds = torch.argmax(logits, axis=1)
                val_preds.append(preds.cpu().numpy())
        val_loss /= len(val_loader)
        val_preds = np.concatenate(val_preds)
    
    print(f'Fold {i+1} BERT FINISH')
        
    # Predict on the test set
    lgbm_preds = lgbm.predict_proba(X_test)
    bert_preds = []
    test_dataset = torch.utils.data.TensorDataset(torch.tensor(X_test.toarray()).to(device))
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch[0]
            outputs = model(input_ids)
            logits = outputs[0]
            preds = torch.argmax(logits, axis=1)
            bert_preds.append(preds.cpu().numpy())
    bert_preds = np.concatenate(bert_preds)

    # Combine the predictions using soft-voting
    preds = np.average([lgbm_preds, bert_preds], axis=0, weights=[2, 1])

    # Store the predictions
    preds_list.append(preds)
    
preds_mean = np.mean(preds_list, axis=0)
preds_argmax = np.argmax(preds_mean, axis=1)

f1 = f1_score(y_train, preds_argmax, average='macro')
print(f'Macro F1 score: {f1}')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Fold 1 LGB START
Fold 1 LGB FINISH

Fold 1 BERT START


RuntimeError: The expanded size of the tensor (248567) must match the existing size (512) at non-singleton dimension 1.  Target sizes: [16, 248567].  Tensor sizes: [1, 512]

In [10]:
torch.cuda.empty_cache()

In [13]:
import gc
gc.collect()

0