In [None]:
import os
import gc
import sys
import json
import time
import math
import random
from datetime import datetime
from collections import Counter, defaultdict
import joblib
import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import nltk
from tqdm import tqdm
from sklearn import preprocessing
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.init as init
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts, CosineAnnealingLR, ReduceLROnPlateau
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

import warnings
warnings.filterwarnings("ignore")

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
class CFG:
    """Hyper-parameters shared by the data pipeline, the model and the training loop."""

    # --- model ---
    HIDDEN_DIM = 128   # hidden size handed to ToxicModel
    N_LAYERS = 2       # layer count handed to ToxicModel
    OUTPUT_SIZE = 1    # output size handed to ToxicModel

    # --- data ---
    SEQ_LEN = 200      # fixed token length used when padding sequences
    BATCH_SIZE = 32    # DataLoader batch size
    NUM_WORKERS = 4    # DataLoader worker count

    # --- optimisation ---
    LR = 1e-4
    WEIGHT_DECAY = 1e-6
    EPOCHS = 20
    N_FOLDS = 5

In [None]:
def seed_everything(seed=42):
    """Seed Python, NumPy and PyTorch RNGs and put cuDNN in deterministic mode.

    Args:
        seed: integer seed applied to every generator (default 42).
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Same seed for every generator the notebook draws from.
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)

    # Trade cuDNN auto-tuning for repeatable kernels.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

In [None]:
class AverageMeter:
    """Running weighted average of a stream of values.

    Attributes (all start at 0):
        val: the most recent value passed to `update`.
        sum: sum of `val * n` over all updates.
        count: sum of `n` over all updates.
        avg: `sum / count` after the latest update.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear all recorded statistics."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record `val` as the latest value, weighted by `n`."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count

In [None]:
# English stop words from NLTK, consulted by `remove_stopwords`.
# Note: this rebinds the name `stopwords` (imported at the top from nltk.corpus)
# to whatever `.words('english')` returns.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be installed — confirm.
stopwords = nltk.corpus.stopwords.words('english')
# Single Porter stemmer instance used by `stemming`.
porter_stemmer = PorterStemmer()

def remove_stopwords(text):
    """Return the tokens of `text` that are not in the module-level `stopwords`.

    The comparison is an exact (case-sensitive) membership test and the
    original token order is preserved.
    """
    kept = []
    for token in text:
        if token in stopwords:
            continue
        kept.append(token)
    return kept

def stemming(text):
    """Return a list with the Porter stem of every word in `text`, in order."""
    return list(map(porter_stemmer.stem, text))

def tokenization(text):
    """Split `text` on single spaces.

    Consecutive, leading or trailing spaces therefore yield empty-string tokens.
    """
    return text.split(' ')

def text_to_sequences(word2idx, seq):
    """Convert tokenised sentences into lists of vocabulary indices.

    Args:
        word2idx: mapping from word to integer index. `build_vocab` produces
            such a mapping with lower-cased keys plus the reserved entries
            '_PAD' and '_UNK'.
        seq: iterable of sentences, each an iterable of string tokens.

    Returns:
        A new list holding one list of indices per sentence. `seq` itself is
        left untouched.
    """
    # Out-of-vocabulary tokens go to the reserved '_UNK' slot when the
    # vocabulary has one, instead of colliding with index 0.
    unk_idx = word2idx.get('_UNK', 0)

    def lookup(word):
        if word in word2idx:
            return word2idx[word]
        # The vocabulary keys are lower-cased, so retry case-insensitively
        # before giving up on the token.
        return word2idx.get(word.lower(), unk_idx)

    return [[lookup(word) for word in sentence] for sentence in seq]

def pad_sequences(sentences, seq_len):
    """Pack index sequences into a fixed-width integer matrix.

    Sequences shorter than `seq_len` are left-padded with zeros; longer ones
    keep only their first `seq_len` entries.

    Args:
        sentences: sequence of index lists.
        seq_len: number of columns in the result.

    Returns:
        Integer array of shape (len(sentences), seq_len).
    """
    padded = np.zeros((len(sentences), seq_len), dtype=int)
    for row, tokens in enumerate(sentences):
        if len(tokens) == 0:
            # Empty sentence: the row stays all zeros.
            continue
        kept = np.array(tokens)[:seq_len]
        # A negative start longer than the row clamps to column 0, so an
        # over-long sentence fills the whole row.
        padded[row, -len(tokens):] = kept
    return padded

def build_vocab(seq):
    """Build a frequency-ordered vocabulary from tokenised sentences.

    Words are lower-cased before counting; only words counted more than once
    are kept, most frequent first. Indices 0 and 1 are reserved for '_PAD'
    and '_UNK'.

    Args:
        seq: iterable of sentences, each an iterable of string tokens.

    Returns:
        (words, word2idx, idx2word): the ordered word list and the two
        lookup dictionaries derived from it.
    """
    counts = Counter(word.lower() for sentence in seq for word in sentence)

    # Stable sort: words with equal counts keep their first-seen order.
    frequent = [word for word, count in counts.items() if count > 1]
    frequent.sort(key=counts.get, reverse=True)

    words = ['_PAD', '_UNK'] + frequent
    word2idx = {}
    idx2word = {}
    for idx, word in enumerate(words):
        word2idx[word] = idx
        idx2word[idx] = word
    return words, word2idx, idx2word

def load_vectors(path_to_glove_file=None):
    """Load word vectors from a whitespace-separated text file.

    Each non-empty line is expected to be a word followed by its coefficients.

    Args:
        path_to_glove_file: optional path to the vectors file. When omitted,
            the notebook's original location
            '../input/glove6b100dtxt/glove.6B.100d.txt' is used.

    Returns:
        dict mapping each word to a 1-D float32 numpy array.
    """
    if path_to_glove_file is None:
        path_to_glove_file = os.path.join(
            '../input/glove6b100dtxt', "glove.6B.100d.txt"
        )

    embeddings_index = {}
    with open(path_to_glove_file, encoding="utf8") as f:
        for line in f:
            parts = line.split(maxsplit=1)
            if len(parts) < 2:
                # Blank line or a word without coefficients: nothing to parse.
                continue
            word, coefs = parts
            embeddings_index[word] = np.fromstring(coefs, "f", sep=" ")

    print("Found %s word vectors." % len(embeddings_index))
    return embeddings_index
        
def create_embedding_matrix(word_index, embedding_dict, embedding_dim=100):
    """Assemble an embedding matrix for the vocabulary from pretrained vectors.

    Args:
        word_index: mapping from word to row index.
        embedding_dict: mapping from word to its pretrained vector.
        embedding_dim: length of each vector (default 100).

    Returns:
        Array of shape (len(word_index) + 1, embedding_dim). Rows of words
        missing from `embedding_dict` stay all-zero.
    """
    matrix = np.zeros((len(word_index) + 1, embedding_dim))

    found = 0
    for word, row in word_index.items():
        vector = embedding_dict.get(word)
        if vector is None:
            # No pretrained vector for this word: leave its row at zero.
            continue
        matrix[row] = vector
        found += 1

    missing = len(word_index) - found
    print("Converted %d words (%d misses)" % (found, missing))
    return matrix


def _clean_and_tokenize(texts):
    """Normalise a text Series, split it into tokens and drop stop words."""
    # regex=True is passed explicitly: pandas 2.0 changed the default of
    # Series.str.replace to literal matching, which would leave the text as is.
    cleaned = texts.str.replace(r'\d+', '0', regex=True)    # digit runs -> '0'
    cleaned = cleaned.str.replace(r'\W+', ' ', regex=True)  # non-word runs -> ' '
    return cleaned.apply(tokenization).apply(remove_stopwords)


def preprocess(df):
    """Add tokenised `more_toxic_text` / `less_toxic_text` columns to `df`.

    Args:
        df: DataFrame with string columns 'more_toxic' and 'less_toxic'.

    Returns:
        The same DataFrame object, with the two `*_text` columns added
        (each cell is a list of tokens).
    """
    for source_col in ('more_toxic', 'less_toxic'):
        df[source_col + '_text'] = _clean_and_tokenize(df[source_col])
    return df


def build_embedding(df):
    """Build the vocabulary and its pretrained embedding matrix, saving both.

    Side effects: writes 'word2idx.pkl' and 'embedding_matrix.pkl' to the
    current directory via joblib.

    Args:
        df: preprocessed DataFrame holding the token-list columns
            'more_toxic_text' and 'less_toxic_text'.

    Returns:
        (word2idx, embedding_matrix)
    """
    sentences = list(df['more_toxic_text'].values) + list(df['less_toxic_text'].values)

    # De-duplicate whole sentences so a repeated comment is counted once.
    # Tuples are used instead of np.unique: converting variable-length token
    # lists to an array fails on current NumPy, and equal-length lists would
    # be flattened into individual words. Sorting keeps the order deterministic.
    unique_sentences = sorted({tuple(sentence) for sentence in sentences})

    _, word2idx, _ = build_vocab(unique_sentences)
    joblib.dump(word2idx, 'word2idx.pkl', compress=1)

    print("Loading embeddings")
    embedding_dict = load_vectors()
    embedding_matrix = create_embedding_matrix(
        word2idx, embedding_dict
        )
    joblib.dump(embedding_matrix, 'embedding_matrix.pkl', compress=1)

    return word2idx, embedding_matrix


In [None]:
def train_one_step(model, data, optimizer, scheduler, loss_fn, device):
    """Run one optimisation step on a batch of (more toxic, less toxic) pairs.

    Args:
        model: scoring network; called once per side of the pair.
        data: batch dict with tensors under 'more_toxic', 'less_toxic' and
            'target'. Its values are replaced in place by copies on `device`.
        optimizer: optimizer stepped after the backward pass.
        scheduler: learning-rate scheduler stepped once per batch.
        loss_fn: ranking loss called as loss_fn(score_more, score_less, target).
        device: device the batch is moved to.

    Returns:
        The loss tensor of this batch.
    """
    optimizer.zero_grad()
    for key, value in data.items():
        data[key] = value.to(device)
    more_toxic = data['more_toxic']
    less_toxic = data['less_toxic']
    target = data['target']
    logit1 = model(more_toxic)
    logit2 = model(less_toxic)
    # The ranking loss expects scores and target to have the same number of
    # dimensions; drop the trailing singleton of a (batch, 1) model output so
    # it lines up with a (batch,) target.
    if logit1.dim() == target.dim() + 1:
        logit1 = logit1.squeeze(-1)
        logit2 = logit2.squeeze(-1)
    loss = loss_fn(logit1, logit2, target)
    loss.backward()
    optimizer.step()
    scheduler.step()
    return loss


def train_one_epoch(model, train_loader, optimizer, scheduler, loss_fn, device):
    """Train `model` for one pass over `train_loader`.

    Args:
        model: network to train (switched to train mode here).
        train_loader: iterable of batch dicts; each batch must hold a 'target'
            tensor whose first dimension is the batch size.
        optimizer, scheduler, loss_fn, device: forwarded to `train_one_step`.

    Returns:
        The sample-weighted average training loss of the epoch.
    """
    model.train()
    losses = AverageMeter()
    tk0 = tqdm(train_loader, total=len(train_loader))
    for idx, data in enumerate(tk0):
        # Weight by the real size of this batch: `loader.batch_size` is None
        # with a custom batch sampler and too large for a partial last batch.
        n_samples = data['target'].size(0)
        loss = train_one_step(model, data, optimizer, scheduler, loss_fn, device)
        losses.update(loss.item(), n_samples)
        tk0.set_postfix(loss=losses.avg, stage="train")
    tk0.close()
    return losses.avg


def valid_one_step(model, data, loss_fn, device):
    """Compute the ranking loss of one validation batch.

    Gradient tracking is not disabled here; the caller decides that.

    Args:
        model: scoring network; called once per side of the pair.
        data: batch dict with tensors under 'more_toxic', 'less_toxic' and
            'target'. Its values are replaced in place by copies on `device`.
        loss_fn: ranking loss called as loss_fn(score_more, score_less, target).
        device: device the batch is moved to.

    Returns:
        The loss tensor of this batch.
    """
    for key, value in data.items():
        data[key] = value.to(device)
    more_toxic = data['more_toxic']
    less_toxic = data['less_toxic']
    target = data['target']
    logit1 = model(more_toxic)
    logit2 = model(less_toxic)
    # The ranking loss expects scores and target to have the same number of
    # dimensions; drop the trailing singleton of a (batch, 1) model output so
    # it lines up with a (batch,) target.
    if logit1.dim() == target.dim() + 1:
        logit1 = logit1.squeeze(-1)
        logit2 = logit2.squeeze(-1)
    loss = loss_fn(logit1, logit2, target)
    return loss

 
def valid_one_epoch(model, valid_loader, loss_fn, device):
    """Evaluate `model` on every batch of `valid_loader` without gradients.

    Args:
        model: network to evaluate (switched to eval mode here).
        valid_loader: iterable of batch dicts; each batch must hold a 'target'
            tensor whose first dimension is the batch size.
        loss_fn, device: forwarded to `valid_one_step`.

    Returns:
        The sample-weighted average validation loss.
    """
    model.eval()
    losses = AverageMeter()
    tk0 = tqdm(valid_loader, total=len(valid_loader))
    for idx, data in enumerate(tk0):
        # Weight by the real size of this batch so a smaller final batch does
        # not count as a full one in the average.
        n_samples = data['target'].size(0)
        with torch.no_grad():
            loss = valid_one_step(model, data, loss_fn, device)
        losses.update(loss.item(), n_samples)
        tk0.set_postfix(loss=losses.avg, stage="valid")
    tk0.close()
    return losses.avg

 

def predict_one_step(model, data, device):
    """Score one inference batch.

    Every tensor in `data` is replaced in place by its copy on `device`;
    the model is then applied to the 'toxic' entry and its output returned.
    """
    for key in list(data):
        data[key] = data[key].to(device)
    return model(data['toxic'])

 

def predict_one_epoch(model, test_loader, device):
    """Score every batch of `test_loader` and return the flattened predictions.

    Args:
        model: network to run (switched to eval mode here).
        test_loader: iterable of batches accepted by `predict_one_step`.
        device: device the batches are moved to.

    Returns:
        1-D numpy array with all predictions concatenated in loader order;
        an empty float32 array when the loader yields no batch.
    """
    model.eval()
    predictions = []
    tk0 = tqdm(test_loader, total=len(test_loader))
    for idx, data in enumerate(tk0):
        with torch.no_grad():
            logit = predict_one_step(model, data, device)
        predictions.append(logit.view(-1).detach().cpu().numpy())
    # Close the bar explicitly, as the train/valid loops do.
    tk0.close()
    if not predictions:
        # np.concatenate refuses an empty list.
        return np.empty(0, dtype=np.float32)
    return np.concatenate(predictions)


In [None]:
class ToxicModel(nn.Module):
    """Bidirectional LSTM scorer on top of pretrained word embeddings.

    The LSTM output is mean-pooled and max-pooled over the sequence axis, the
    two pooled vectors are concatenated and passed through a linear layer.

    Args:
        output_size: number of output units of the final linear layer.
        embedding_matrix: 2-D array (num_words, embed_dim) used to initialise
            the trainable embedding weights.
        hidden_dim: LSTM hidden size per direction.
        n_layers: number of stacked LSTM layers.
        drop_prob: dropout applied between LSTM layers (default 0.3).
    """

    def __init__(self, output_size, embedding_matrix, hidden_dim, n_layers, drop_prob=0.3):
        super(ToxicModel, self).__init__()
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        num_words = embedding_matrix.shape[0]
        embed_dim = embedding_matrix.shape[1]

        self.embedding = nn.Embedding(num_words, embed_dim)
        # Start from the pretrained vectors; the weights remain trainable.
        self.embedding.weight = nn.Parameter(
            torch.tensor(embedding_matrix, dtype=torch.float32)
        )

        self.lstm = nn.LSTM(embed_dim, hidden_dim, n_layers, dropout=drop_prob,
                            bidirectional=True, batch_first=True)
        # Two directions give 2 * hidden_dim features per step; concatenating
        # the mean- and max-pooled vectors doubles that again.
        self.fc = nn.Linear(hidden_dim * 2 * 2, output_size)

    def init_hidden(self, batch_size):
        """Return zero (h0, c0) states on the same device/dtype as the model."""
        reference = next(self.parameters()).detach()
        shape = (self.n_layers * 2, batch_size, self.hidden_dim)
        return (reference.new_zeros(shape), reference.new_zeros(shape))

    def forward(self, toxic):
        """Score a batch of index sequences.

        Args:
            toxic: integer tensor of shape (batch, seq_len).

        Returns:
            Tensor of shape (batch, output_size).
        """
        batch_size = toxic.size(0)
        h = self.init_hidden(batch_size)
        x = toxic.long()
        embeds = self.embedding(x)
        lstm_out, _ = self.lstm(embeds, h)
        mean_ = torch.mean(lstm_out, 1)
        max_, _ = torch.max(lstm_out, 1)
        out = torch.cat((mean_, max_), 1)
        out = self.fc(out)
        return out

In [None]:
class ToxicDataset:
    """Map-style dataset of paired (more toxic, less toxic) index sequences.

    Both inputs are indexed as `array[item, :]`, so they must be 2-D
    array-likes with one row per pair.
    """

    def __init__(self, more_toxic, less_toxic):
        self.more_toxic = more_toxic
        self.less_toxic = less_toxic

    def __len__(self):
        """Number of pairs, taken from the `more_toxic` side."""
        return len(self.more_toxic)

    def __getitem__(self, item):
        """Return row `item` of both sides as long tensors plus a constant target of 1."""
        sides = (("more_toxic", self.more_toxic), ("less_toxic", self.less_toxic))
        sample = {
            name: torch.tensor(rows[item, :], dtype=torch.long)
            for name, rows in sides
        }
        sample["target"] = torch.tensor(1, dtype=torch.float)
        return sample

In [None]:
def fit_one_epoch(
    fold,
    seed,
    train_sentences_more,
    val_sentences_more,
    train_sentences_less,
    val_sentences_less,
    embedding_matrix,
):
    """Train one fold for CFG.EPOCHS epochs, checkpointing the best epoch.

    Despite the name, this runs the full multi-epoch loop for a fold. After
    every epoch whose validation loss improves, the model weights are saved
    to 'fold{fold}_best.pth'.

    Args:
        fold: fold number, used in the checkpoint file name.
        seed: seed passed to `seed_everything`.
        train_sentences_more / train_sentences_less: padded index matrices of
            the training pairs.
        val_sentences_more / val_sentences_less: padded index matrices of the
            validation pairs.
        embedding_matrix: pretrained embedding matrix handed to `ToxicModel`.

    Returns:
        The lowest validation loss seen over all epochs.
    """
    seed_everything(seed)

    train_dataset = ToxicDataset(
        train_sentences_more,
        train_sentences_less
        )

    valid_dataset = ToxicDataset(
         val_sentences_more,
         val_sentences_less
        )

    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.BATCH_SIZE,
                              shuffle=True,
                              num_workers=CFG.NUM_WORKERS, pin_memory=True, drop_last=True)

    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.BATCH_SIZE,
                              shuffle=False,
                              num_workers=CFG.NUM_WORKERS, pin_memory=True, drop_last=False)

    model = ToxicModel(CFG.OUTPUT_SIZE, embedding_matrix, CFG.HIDDEN_DIM, CFG.N_LAYERS)
    model.to(device)

    # NOTE(review): transformers' AdamW is, as far as I know, deprecated
    # upstream in favour of torch.optim.AdamW — confirm before upgrading.
    optimizer = AdamW(model.parameters(), lr=CFG.LR, weight_decay=CFG.WEIGHT_DECAY)

    # The scheduler is stepped once per batch, so T_0 is a number of batches.
    scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=50, T_mult=1, eta_min=1e-7, last_epoch=-1)

    criterion = nn.MarginRankingLoss(margin=0.5)

    best_score = np.inf

    for epoch in range(CFG.EPOCHS):

        train_one_epoch(model, train_loader, optimizer, scheduler, criterion, device)
        score = valid_one_epoch(model, valid_loader, criterion, device)
        if score < best_score:
            best_score = score
            print(f'Epoch {epoch+1} - Save Best Score: {score:.4f} Model')
            torch.save({'model': model.state_dict(),},f"fold{fold}_best.pth")

    # Drop the references that hold the weights first and collect garbage,
    # and only then ask the CUDA allocator to release its cached blocks.
    del model, optimizer, scheduler
    gc.collect()
    torch.cuda.empty_cache()
    return best_score

In [None]:
def run_fold(fold, seed, df, word2idx, embedding_matrix):
    """Prepare the train/validation matrices of one fold and train on them.

    Rows whose `kfold` equals `fold` form the validation set; all other rows
    form the training set.

    Returns:
        The best validation score reported by `fit_one_epoch`.
    """
    df_train = df.loc[df.kfold != fold].reset_index(drop=True)
    df_valid = df.loc[df.kfold == fold].reset_index(drop=True)

    def encode(frame, column):
        # token lists -> vocabulary indices -> fixed-length matrix
        token_lists = list(frame[column].values)
        indexed = text_to_sequences(word2idx, token_lists)
        return pad_sequences(indexed, CFG.SEQ_LEN)

    train_sentences_more = encode(df_train, 'more_toxic_text')
    val_sentences_more = encode(df_valid, 'more_toxic_text')
    train_sentences_less = encode(df_train, 'less_toxic_text')
    val_sentences_less = encode(df_valid, 'less_toxic_text')

    # Run the training for this fold.
    return fit_one_epoch(
        fold,
        seed,
        train_sentences_more,
        val_sentences_more,
        train_sentences_less,
        val_sentences_less,
        embedding_matrix,
    )

In [None]:
def run_training():
    """Load the fold-annotated pairs, build the embeddings and train every fold.

    Reads '../input/training-data/toxic_valid_folds.csv', preprocesses it,
    builds (and saves) the vocabulary and embedding matrix, then trains
    folds 0 .. CFG.N_FOLDS - 1 with seed 42, printing each fold's best score.
    """
    csv_path = os.path.join('../input/training-data','toxic_valid_folds.csv')

    # Preprocess the data
    df = preprocess(pd.read_csv(csv_path))

    # Build and save the vocabulary / embedding matrix
    word2idx, embedding_matrix = build_embedding(df)

    f = 0
    while f < CFG.N_FOLDS:
        score = run_fold(f, 42, df, word2idx, embedding_matrix)
        print(f'fold:{f} training completed!!! best score:{score}')
        f += 1

In [None]:
# Entry point: start the cross-validated training when run as the main module.
if __name__=='__main__':
    run_training()