In [None]:
# Enable IPython's autoreload extension in mode 2, so that edits to the
# project modules imported from `src` below are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

from src.preprocess_data import clean_text
from src.FastTextVectorizer import FastTextVectorizer
from src.utilits import metrics_model
from src.find_optimal_threshold import find_optimal_threshold

# Label columns used as prediction targets throughout the notebook.
TARGET = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# One seed for the data split and for the NumPy / PyTorch global generators.
# Without the two seeding calls below, weight initialisation and the shuffled
# training batches would differ between otherwise identical runs.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
torch.manual_seed(RANDOM_STATE)

# Prefer the GPU when one is visible to PyTorch, otherwise fall back to CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {DEVICE}")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Using device: cpu


In [7]:
# Read the labelled comments from disk.
df = pd.read_csv('data/train.csv')

# NOTE(review): test_size=0.999 holds out 99.9% of the rows, so only ~0.1% are
# used for fitting — presumably a quick smoke-test setting; confirm before
# trusting any reported metric.
train_df, val_df = train_test_split(
    df,
    test_size=0.999,
    random_state=RANDOM_STATE,
)

# Baseline pipeline: FastText features followed by a one-vs-rest logistic
# regression (liblinear solver) over the TARGET columns.
fasttext_step = FastTextVectorizer(vector_size=300, window=3, epochs=20, min_n=3, max_n=6)
ovr_logreg_step = OneVsRestClassifier(LogisticRegression(solver='liblinear'))
FastText_pipeline = Pipeline(steps=[
    ('fasttext', fasttext_step),
    ('clf', ovr_logreg_step),
])

train_texts = train_df['comment_text']
train_labels = train_df[TARGET]

print("FastText training started.")
FastText_pipeline.fit(train_texts, train_labels)
print("FastText training complete.")

FastText training started.
FastText training complete.




In [8]:
from src.loader import ToxictDataset, create_embedding_matrix_and_vocab

# MAX_LEN is handed to ToxictDataset; BATCH_SIZE is the mini-batch size used
# by both loaders.
MAX_LEN = 300
BATCH_SIZE = 64

# Build the embedding matrix and the word_to_idx lookup from the fitted
# FastText pipeline.
embedding_matrix, word_to_idx = create_embedding_matrix_and_vocab(FastText_pipeline)

# Both splits are wrapped with the same text column, label columns, length
# setting and vocabulary lookup.
dataset_args = ('comment_text', TARGET, MAX_LEN, word_to_idx)
train_dataset = ToxictDataset(train_df, *dataset_args)
val_dataset = ToxictDataset(val_df, *dataset_args)

# Only the training batches are shuffled; validation is read in dataset order.
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=BATCH_SIZE)
val_loader = DataLoader(dataset=val_dataset, shuffle=False, batch_size=BATCH_SIZE)
print("DataLoaders ready.")

Vocab size: 291, Embedding dim: 300
DataLoaders ready.


In [9]:
from src.BiLSTM import BiLSTMClassifier
from src.train_LSTM import train_model, evaluate_model

# Hyperparameters for the classifier. The embedding matrix supplies both the
# vocabulary size (its first dimension) and the embedding width (its second).
model_kwargs = dict(
    vocab_size=embedding_matrix.shape[0],
    embed_dim=embedding_matrix.shape[1],
    pretrained_embeddings=embedding_matrix,
    hidden_dim=256,  # 128 * 2
    n_layers=2,
    dropout=0.3,
    output_dim=len(TARGET),  # one output per label column
)
model = BiLSTMClassifier(**model_kwargs)

# Move the parameters to the selected device, then show the architecture.
model.to(DEVICE)
print(model)

BiLSTMClassifier(
  (embedding): Embedding(291, 300, padding_idx=0)
  (lstm): LSTM(300, 256, num_layers=2, batch_first=True, dropout=0.3, bidirectional=True)
  (clf): Linear(in_features=512, out_features=6, bias=True)
)


In [11]:
# Train the BiLSTM, then collect labels and model outputs over the validation
# loader.
print("training")
train_model(model, train_loader, val_loader, n_epochs=5, lr=1e-3, device=DEVICE)

print("\nEvaluating")
val_labels, val_outputs = evaluate_model(model, val_loader, device=DEVICE)

# Thresholds are tuned against val_labels (as yielded by the loader) while the
# reports below score val_outputs against val_df[TARGET]. Fail loudly if the
# two label sources ever disagree (rows dropped or reordered by the dataset),
# because the metrics would otherwise be silently wrong.
assert np.array_equal(np.asarray(val_labels), val_df[TARGET].to_numpy()), (
    "Labels returned by evaluate_model do not match val_df[TARGET]; "
    "validation rows are misaligned."
)

# NOTE(review): thresholds are tuned and then scored on the same validation
# split, so the thresholded metrics are likely optimistic — confirm whether a
# separate hold-out set is intended.
print("optimal thresholds...")
best_thresholds, best_scores = find_optimal_threshold(val_labels, val_outputs)
print("\nMetrics with optimal thresholds:")
metrics_model(val_df[TARGET], val_outputs, thresholds=best_thresholds)
print(f'Validation ROC-AUC Score: {roc_auc_score(val_df[TARGET], val_outputs):.4f}')

training


KeyboardInterrupt: 