In [None]:
!pip install ../input/pytorch-ignite-020/pytorch_ignite-0.2.0-py2.py3-none-any.whl

In [None]:
import pprint
import random
import os
import pathlib

import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

import torchtext
from torchtext.vocab import Vectors

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data.dataset import Subset

from ignite.engine import Engine, Events
from ignite.metrics import Accuracy, Loss, RunningAverage, Precision, Recall
from ignite.handlers import ModelCheckpoint, EarlyStopping
from ignite.contrib.handlers import ProgressBar

In [None]:
torch.cuda.is_available()

In [None]:
# Use the GPU when one is present, otherwise fall back to the CPU so the
# notebook still runs end to end on a machine without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
seed = 632

# These paths name the raw text vector files. The '.pt' suffix is left off on
# purpose: the torchtext Vectors loader is expected to pick up its own cached
# '.pt' tensor file for each name (see the original note about emulating the
# torchtext cache) — NOTE(review): confirm against the dataset contents.
torch_glove_path = '../input/glove-840b300d-for-torchtext/glove.840B.300d.txt'
torch_fasttext_path = '../input/fastextcrawl300d2m/crawl-300d-2M.vec'
input_path = '../input/jigsaw-unintended-bias-in-toxicity-classification'

In [None]:
def seed_torch(seed=632):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED`` and switches cuDNN into deterministic mode.

    Args:
        seed: integer seed applied to all generators.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Autotuning may pick a different convolution algorithm from run to run,
    # so it is disabled together with enabling deterministic kernels.
    torch.backends.cudnn.benchmark = False

## Dataset and embeddings

In [None]:
def preprocess(data):
    '''
    Replace every punctuation / special character in ``data`` with a space.

    Credit goes to https://www.kaggle.com/gpreda/jigsaw-fast-compact-solution

    Args:
        data: the raw comment text (``str``).

    Returns:
        A string of the same length in which each character listed in
        ``punct`` has been replaced by a single space.
    '''
    # The backslash before '×' is now written as an explicit '\\'. The original
    # '\×' was an invalid escape sequence that only worked by accident and
    # triggers a warning on current Python versions. The character set is unchanged.
    punct = "/-'?!.,#$%()*+-/:;<=>@[\\]^_`{|}~`" + \
        '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\\×™√²—–&'

    # A single translate pass replaces all listed characters at once instead of
    # scanning the whole text once per punctuation character.
    return data.translate(str.maketrans(dict.fromkeys(punct, ' ')))

def tokenize(data):
    """Clean ``data`` with ``preprocess`` and split the result on whitespace."""
    return preprocess(data).split()


In [None]:
# Comment text: lower-cased and split by the custom `tokenize`; batches are
# batch-first. No fixed length is set (the MAX_LEN cap is currently disabled).
TEXT = torchtext.data.Field(
    tokenize=tokenize,
    lower=True,
    batch_first=True,
    include_lengths=False,
)

# Toxicity target: the raw score is binarised at 0.5 while the file is read.
LABEL = torchtext.data.Field(
    sequential=False,
    use_vocab=False,
    is_target=True,
    batch_first=True,
    preprocessing=lambda x: int(float(x) >= 0.5),
)

# Identity columns: binarised the same way, with an empty cell treated as 0.
LABEL_IDENTITY = torchtext.data.Field(
    sequential=False,
    use_vocab=False,
    batch_first=True,
    preprocessing=lambda x: 0 if x == '' else int(float(x) >= 0.5),
)

# CSV column name -> (attribute name on each example, field that parses it).
train_fields = dict(
    comment_text=('text', TEXT),
    target=('label', LABEL),
    **{
        column: (column, LABEL_IDENTITY)
        for column in (
            'male',
            'female',
            'homosexual_gay_or_lesbian',
            'christian',
            'jewish',
            'muslim',
            'black',
            'white',
            'psychiatric_or_mental_illness',
        )
    },
)

train_dataset = torchtext.data.TabularDataset(
    path=f'{input_path}/train.csv',
    format='csv',
    fields=train_fields,
)
  

In [None]:
# Peek at the first 15 tokens of the first training example.
train_dataset[0].text[:15]

In [None]:
# Build the vocabulary from the training set; min_freq=1 keeps every token
# that occurs at least once.
TEXT.build_vocab(train_dataset, min_freq=1)
vocab = TEXT.vocab

In [None]:
# Number of entries in the vocabulary.
len(TEXT.vocab)


In [None]:
# Attach pretrained vectors from both embedding files to the vocabulary.
# cache='.' is the directory handed to the Vectors loader for its cached tensors.
# NOTE(review): presumably the two vector sets are concatenated per token, which
# would make the embedding dimension the sum of both — confirm.
vocab.load_vectors([
    Vectors(torch_glove_path, cache='.'),
    Vectors(torch_fasttext_path, cache='.')
])

In [None]:
# Quick inspection of what the field and its vocabulary expose
# (names containing an underscore are filtered out).
print('Attributes of TEXT : ',
      list(filter(lambda name: '_' not in name, dir(TEXT))))
print('Attributes of TEXT.vocab : ',
      list(filter(lambda name: '_' not in name, dir(TEXT.vocab))))
print('First 5 values TEXT.vocab.itos : ', TEXT.vocab.itos[:5])
print('First 5 key, value pairs of TEXT.vocab.stoi : ',
      dict(list(TEXT.vocab.stoi.items())[:5]))

## Model

In [None]:
class SpatialDropout(nn.Dropout2d):
    """Dropout2d applied to a 3-D input by treating its last axis as channels.

    The input is rearranged so the last axis sits in the channel position of a
    4-D tensor, passed through ``nn.Dropout2d`` and rearranged back. The result
    has the same shape as the input.
    """

    def forward(self, x):
        # (N, T, K) -> (N, K, T) -> (N, K, 1, T): last axis becomes the channel axis.
        as_channels = x.transpose(1, 2).unsqueeze(2)
        dropped = super().forward(as_channels)
        # (N, K, 1, T) -> (N, K, T) -> (N, T, K): restore the original layout.
        return dropped.squeeze(2).transpose(1, 2)

class TextModel(nn.Module):
    """Text classifier combining a multi-kernel CNN branch with a stacked BiLSTM branch.

    Token ids are embedded and passed through spatial dropout. The embeddings
    then feed (a) parallel 1-D convolutions that are max-pooled over time and
    projected by ``fc``, and (b) two stacked bidirectional LSTMs whose outputs
    are mean- and max-pooled over time. Both branches are concatenated and
    mapped to ``num_classes`` logits.

    Args:
        vocab_size: number of rows of the embedding table.
        embedding_dim: width of each embedding vector.
        kernel_sizes: iterable of convolution kernel widths.
        num_filters: output channels of every convolution.
        num_classes: number of output logits.
        d_prob: dropout probability used before the dense layers.
        mode: ``'static'`` (pretrained, frozen), ``'nonstatic'`` (pretrained,
            trainable) or ``'rand'`` (random initialisation).
        hidden_dim: width of the dense hidden layers.
        lstm_units: hidden size of each LSTM direction.
        emb_vectors: optional tensor of pretrained embeddings copied into the
            embedding table according to ``mode``.
        spatial_drop: dropout probability of the embedding spatial dropout.
    """

    def __init__(self, vocab_size, embedding_dim, kernel_sizes, num_filters,
                 num_classes, d_prob, mode, hidden_dim, lstm_units,
                 emb_vectors=None, spatial_drop=0.1):
        super(TextModel, self).__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.kernel_sizes = kernel_sizes
        self.num_filters = num_filters
        self.num_classes = num_classes
        self.d_prob = d_prob
        self.mode = mode
        # NOTE(review): padding_idx=1 presumably matches the pad token index of
        # the vocabulary used by the caller — confirm.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=1)
        self.embedding_dropout = SpatialDropout(spatial_drop)

        if emb_vectors is not None:
            self.load_embeddings(emb_vectors)

        self.conv = nn.ModuleList([nn.Conv1d(in_channels=embedding_dim,
                                             out_channels=num_filters,
                                             kernel_size=k, stride=1) for k in kernel_sizes])
        self.lstm1 = nn.LSTM(embedding_dim, lstm_units,
                             bidirectional=True, batch_first=True)
        self.lstm2 = nn.LSTM(lstm_units * 2, lstm_units,
                             bidirectional=True, batch_first=True)
        # Not used by forward(); kept so the parameter set (state_dict keys)
        # and the order of random initialisation stay unchanged.
        self.lstm_body = nn.LSTM(
            embedding_dim, lstm_units, bidirectional=True, batch_first=True)
        self.dropout = nn.Dropout(d_prob)
        self.fc = nn.Linear(len(kernel_sizes) * num_filters, hidden_dim)
        # CNN branch (hidden_dim) + mean and max pooling of a BiLSTM (2 * 2 * lstm_units).
        self.fc_total = nn.Linear(hidden_dim * 1 + lstm_units * 4, hidden_dim)
        self.fc_final = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        """Return ``(batch, num_classes)`` logits for a batch-first tensor of token ids."""
        x_emb = self.embedding(x)
        x_emb = self.embedding_dropout(x_emb)

        # Every convolution needs at least `kernel_size` time steps, so short
        # batches are zero-padded on the time axis up to the widest kernel
        # (previously a hard-coded 5, which only fit kernel_sizes up to 5).
        min_len = max(self.kernel_sizes)
        if x_emb.shape[1] < min_len:
            x_emb = F.pad(x_emb, (0, 0, 0, min_len - x_emb.shape[1]), value=0)

        # CNN branch: Conv1d expects (batch, channels, time).
        conv_in = x_emb.transpose(1, 2)
        x = [F.relu(conv(conv_in)) for conv in self.conv]
        x = [F.max_pool1d(c, c.size(-1)).squeeze(dim=-1) for c in x]
        x = torch.cat(x, dim=1)
        x = self.fc(self.dropout(x))

        # Recurrent branch: two stacked bidirectional LSTMs.
        h_lstm1, _ = self.lstm1(x_emb)
        h_lstm2, _ = self.lstm2(h_lstm1)

        # average pooling
        avg_pool2 = torch.mean(h_lstm2, 1)
        # global max pooling
        max_pool2, _ = torch.max(h_lstm2, 1)

        out = torch.cat([x, avg_pool2, max_pool2], dim=1)
        out = F.relu(self.fc_total(self.dropout(out)))
        out = self.fc_final(out)

        return out

    def load_embeddings(self, emb_vectors):
        """Initialise the embedding table from ``emb_vectors`` according to ``self.mode``.

        Raises:
            ValueError: if ``self.mode`` is not static, nonstatic or rand.
        """
        if 'static' in self.mode:
            self.embedding.weight.data.copy_(emb_vectors)
            # The flag must be set on the Parameter itself: `.data` is a
            # detached tensor, so toggling requires_grad there never froze
            # (or unfroze) the embedding weights.
            if 'non' not in self.mode:
                self.embedding.weight.requires_grad = False
                print('Loaded pretrained embeddings, weights are not trainable.')
            else:
                self.embedding.weight.requires_grad = True
                print('Loaded pretrained embeddings, weights are trainable.')
        elif self.mode == 'rand':
            print('Randomly initialized embeddings are used.')
        else:
            raise ValueError(
                'Unexpected value of mode. Please choose from static, nonstatic, rand.')


## Evaluator (implemented, but not yet wired into training)

In [None]:
class JigsawEvaluator:
    """Competition metric: overall AUC blended with power means of per-subgroup AUCs.

    Credits to https://www.kaggle.com/c/jigsaw-unintended-bias-in-toxicity-classification/discussion/90527

    ``y_true`` is indexed with boolean masks and ``y_identity`` as
    ``y_identity[:, i]``, so both are expected to be NumPy arrays with one row
    per example and, for ``y_identity``, one column per subgroup.
    """

    def __init__(self, y_true, y_identity, power=-5, overall_model_weight=0.25):
        self.y, self.y_i = y_true, y_identity
        self.n_subgroups = y_identity.shape[1]
        self.power = power
        self.overall_model_weight = overall_model_weight

    @staticmethod
    def _compute_auc(y_true, y_pred):
        """ROC AUC, or NaN when ``roc_auc_score`` rejects the input with ValueError."""
        try:
            score = roc_auc_score(y_true, y_pred)
        except ValueError:
            score = np.nan
        return score

    def _compute_subgroup_auc(self, i, y_pred):
        """AUC restricted to rows where identity column ``i`` equals 1."""
        in_subgroup = self.y_i[:, i] == 1
        return self._compute_auc(self.y[in_subgroup], y_pred[in_subgroup])

    def _compute_bpsn_auc(self, i, y_pred):
        """AUC over rows where identity flag plus label sums to exactly 1."""
        selected = (self.y_i[:, i] + self.y) == 1
        return self._compute_auc(self.y[selected], y_pred[selected])

    def _compute_bnsp_auc(self, i, y_pred):
        """AUC over rows where identity flag plus label does not sum to 1."""
        selected = (self.y_i[:, i] + self.y) != 1
        return self._compute_auc(self.y[selected], y_pred[selected])

    def compute_bias_metrics_for_model(self, y_pred):
        """Return a ``(3, n_subgroups)`` array: rows are subgroup, BPSN and BNSP AUC."""
        scorers = (
            self._compute_subgroup_auc,
            self._compute_bpsn_auc,
            self._compute_bnsp_auc,
        )
        records = np.zeros((len(scorers), self.n_subgroups))
        for row, scorer in enumerate(scorers):
            for col in range(self.n_subgroups):
                records[row, col] = scorer(col, y_pred)
        return records

    def _calculate_overall_auc(self, y_pred):
        """ROC AUC over all rows (errors from ``roc_auc_score`` propagate)."""
        return roc_auc_score(self.y, y_pred)

    def _power_mean(self, array):
        """Generalised mean of ``array`` with exponent ``self.power``."""
        powered = np.power(array, self.power)
        mean_of_powers = sum(powered) / len(array)
        return np.power(mean_of_powers, 1 / self.power)

    def get_final_metric(self, y_pred):
        """Blend the overall AUC with the average power mean of the three bias rows."""
        bias_metrics = self.compute_bias_metrics_for_model(y_pred)
        bias_score = np.average([
            self._power_mean(bias_metrics[row]) for row in range(3)
        ])
        weight = self.overall_model_weight
        return weight * self._calculate_overall_auc(y_pred) + (1 - weight) * bias_score

In [None]:
# Identity columns of the training data, in the same order as the identity
# fields declared for the dataset.
identity_columns = [
    'male',
    'female',
    'homosexual_gay_or_lesbian',
    'christian',
    'jewish',
    'muslim',
    'black',
    'white',
    'psychiatric_or_mental_illness',
]

## Ignite processing and metrics

In [None]:
# Loss shared by the training step and by the `Loss` metrics attached to the evaluators.
criterion = nn.CrossEntropyLoss()

In [None]:
def process_function(engine, batch):
    """Run one optimisation step on ``batch`` and return the loss as a Python float.

    Relies on the module-level ``model``, ``optimizer`` and ``criterion``;
    ``engine`` is accepted because the Engine passes it but is not used.
    """
    model.train()
    optimizer.zero_grad()
    logits = model(batch.text)
    batch_loss = criterion(logits, batch.label)
    batch_loss.backward()
    optimizer.step()
    return batch_loss.item()



In [None]:
def eval_function(engine, batch):
    """Score ``batch`` with the module-level ``model`` in eval mode, without gradients.

    Returns:
        ``(y_pred, y)``: the model output for ``batch.text`` and ``batch.label``.
    """
    model.eval()
    with torch.no_grad():
        logits = model(batch.text)
    return logits, batch.label



In [None]:
# One engine runs the training step; two separate engines run the same
# evaluation step so train and validation metrics are tracked independently.
trainer = Engine(process_function)
train_evaluator = Engine(eval_function)
validation_evaluator = Engine(eval_function)

In [None]:
# Expose a running average of the per-batch loss returned by process_function
# as the 'loss' metric on the trainer.
RunningAverage(output_transform=lambda x: x).attach(trainer, 'loss')

In [None]:
def thresholded_output_transform(output):
    """Prepare ``(y_pred, y)`` from the evaluator for the ``Accuracy`` metric.

    The model emits one logit per class and is trained with cross-entropy, so
    ``y_pred`` is ``(batch, num_classes)`` and the metric picks the class by
    arg-max. The logits are therefore passed through untouched. Rounding them
    first (as the binary-sigmoid recipe does) collapses close logits to the
    same value and lets ties decide the predicted class.

    Args:
        output: ``(y_pred, y)`` tuple returned by the evaluation step.

    Returns:
        The same ``(y_pred, y)`` pair, unchanged.
    """
    y_pred, y = output
    return y_pred, y

In [None]:
# Each evaluator gets its own Accuracy and cross-entropy Loss instances,
# registered under the names 'accuracy' and 'ce'.
for evaluator in (train_evaluator, validation_evaluator):
    Accuracy(output_transform=thresholded_output_transform).attach(
        evaluator, 'accuracy')
    Loss(criterion).attach(evaluator, 'ce')
# Do not leave the loop variable behind in the notebook namespace.
del evaluator

In [None]:
# Progress bar attached to the trainer; it displays the running 'loss' metric.
pbar = ProgressBar(persist=True, bar_format="")
pbar.attach(trainer, ['loss'])

In [None]:
def log_training_results(engine):
    """Evaluate on ``train_loader`` and log the resulting metrics via the progress bar."""
    train_evaluator.run(train_loader)
    report = "Training Results - Epoch: {} \nMetrics\n{}".format(
        engine.state.epoch,
        pprint.pformat(train_evaluator.state.metrics),
    )
    pbar.log_message(report)

def log_validation_results(engine):
    """Evaluate on ``val_loader``, log the metrics and reset the progress bar counters."""
    validation_evaluator.run(val_loader)
    report = "Validation Results - Epoch: {} \nMetrics\n{}".format(
        engine.state.epoch,
        pprint.pformat(validation_evaluator.state.metrics),
    )
    pbar.log_message(report)
    # Reset both counters on the progress bar object after logging.
    pbar.last_print_n = 0
    pbar.n = 0



# Only the validation pass is run and logged after each training epoch; the
# extra pass over the training set is left disabled.
# trainer.add_event_handler(Events.EPOCH_COMPLETED, log_training_results)
# if USE_VALIDATION:
trainer.add_event_handler(Events.EPOCH_COMPLETED, log_validation_results)

In [None]:


def score_function(engine):
    """Return the negated 'ce' metric of ``engine`` so that a lower loss scores higher."""
    return -engine.state.metrics['ce']

# Early stopping on the validation score (negative 'ce'): checked each time the
# validation evaluator completes a run, and it terminates `trainer` when the
# score stops improving (patience=1).
handler = EarlyStopping(patience=1, score_function=score_function, trainer=trainer)
# if USE_VALIDATION:
validation_evaluator.add_event_handler(Events.COMPLETED, handler)


In [None]:
# Checkpoint handler: writes state dicts into the 'best_model' directory with the
# 'textcnn' filename prefix, keeping only the single best-scoring checkpoint
# according to score_function (negative validation 'ce').
# NOTE(review): the directory is expected to be empty/new on each run — confirm
# the behaviour when re-running the notebook in the same working directory.
best_model_save = ModelCheckpoint(
    'best_model', 'textcnn', n_saved=1,
    create_dir=True, save_as_state_dict=True,
    score_function=score_function)



## Training with Stratified folds

In [None]:
def get_predictions(model, loader):
    """Return the class-1 probability for every example yielded by ``loader``.

    Args:
        model: module mapping ``batch.text`` to ``(batch, classes)`` logits.
        loader: iterable of batches exposing a ``text`` attribute.

    Returns:
        1-D NumPy array with the softmax probability of class index 1, in
        the order the batches were produced.
    """
    model.eval()

    per_batch = []
    with torch.no_grad():
        for batch in loader:
            positive_prob = model(batch.text).softmax(dim=1)[:, 1]
            # Bring the result back to host memory before converting to NumPy.
            per_batch.append(positive_prob.cpu().numpy())
    return np.concatenate(per_batch)

In [None]:
class TorchtextSubset(Subset):
    """``Subset`` that also exposes the wrapped dataset's ``fields`` and ``sort_key``.

    The two attributes are copied from the wrapped dataset so that code which
    reads them from a dataset object also works on the subset.
    """

    def __init__(self, dataset, indices):
        super().__init__(dataset, indices)
        self.sort_key = dataset.sort_key
        self.fields = dataset.fields

In [None]:
# --- Model hyperparameters (passed to TextModel) ---
kernel_sizes = [3, 4, 5]  # widths of the parallel convolutions
num_filters = 64          # output channels per convolution
num_classes = 2           # number of output logits
d_prob = 0.5              # dropout before the dense layers
mode = 'nonstatic'        # pretrained embeddings that remain trainable
hidden_dim = 256
lstm_units = 128
spatial_drop = 0.1        # dropout applied to the embeddings

# --- Training setup ---
batch_size = 1024
n_folds = 10              # number of stratified splits
num_folds_use = 1         # how many of those splits are actually trained
num_epoch = 10            # upper bound on epochs per fold

In [None]:
# Seeded, shuffled stratified splitter; `labels` holds the binarised target of
# every training example and is used as the stratification variable.
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)
labels = [example.label for example in train_dataset]

In [None]:
batch_test_size = 1024

# The test file only provides the comment text; it is parsed with the same TEXT
# field (and therefore the same vocabulary) as the training data.
test_dataset = torchtext.data.TabularDataset(f'{input_path}/test.csv',
                                            format='csv',
                                            fields={'comment_text': ('text', TEXT)})

# No shuffling and no sorting, so predictions come out in file order. The
# iterator now uses the notebook-wide `device` setting instead of a hard-coded
# 'cuda', so batches are placed on the device chosen in the configuration cell.
test_loader = torchtext.data.Iterator(test_dataset, batch_size=batch_test_size,
                                      device=device, shuffle=False, sort=False)

# test_loader = torchtext.data.BucketIterator(
#     test_dataset, batch_size=batch_test_size, device=device,
#     sort_key=lambda x: len(x.text),
#     sort_within_batch=True, repeat=False, shuffle=False)

In [None]:
idx_splits = list(skf.split(range(len(train_dataset)), y=labels))[:num_folds_use]

# The embedding matrix is the same for every fold, so read its shape once.
vocab_size, embedding_dim = vocab.vectors.shape

# Test-set probabilities of each trained fold. They are averaged after the loop;
# previously every fold overwrote `predictions`, so only the last one was used.
fold_predictions = []

for train_idx, val_idx in idx_splits:
    seed_torch(seed)
    train_ds = TorchtextSubset(train_dataset, train_idx)
    val_ds = TorchtextSubset(train_dataset, val_idx)

    # Bucket examples of similar length together to limit padding per batch.
    train_loader, val_loader = torchtext.data.BucketIterator.splits(
        [train_ds, val_ds], batch_sizes=[batch_size, batch_size], device=device,
        sort_key=lambda x: len(x.text),
        sort_within_batch=True, repeat=False)

    # `model` and `optimizer` are module-level on purpose: process_function and
    # eval_function read them as globals.
    model = TextModel(vocab_size=vocab_size,
                      embedding_dim=embedding_dim,
                      kernel_sizes=kernel_sizes,
                      num_filters=num_filters,
                      num_classes=num_classes,
                      d_prob=d_prob,
                      mode=mode,
                      hidden_dim=hidden_dim,
                      lstm_units=lstm_units,
                      spatial_drop=spatial_drop,
                      emb_vectors=vocab.vectors)
    model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

    # NOTE(review): the engines, the EarlyStopping handler and the ModelCheckpoint
    # object are created once at module level and shared by all folds, and this
    # call registers one more checkpoint handler per fold. That is harmless while
    # num_folds_use == 1; rebuild them per fold before training several folds.
    validation_evaluator.add_event_handler(Events.EPOCH_COMPLETED, best_model_save, {'text_model': model})
    trainer.run(train_loader, max_epochs=num_epoch)

    # load best model
    model_path = next(pathlib.Path('best_model').rglob('*'))
    model_state_dict = torch.load(model_path)
    model.load_state_dict(model_state_dict)

    fold_predictions.append(get_predictions(model, test_loader))

# Average the per-fold probabilities (with a single fold this is just that fold).
predictions = np.mean(fold_predictions, axis=0)

In [None]:
# Load the sample submission and set its 'prediction' column to the model's
# test-set predictions.
df_sub = pd.read_csv(f'{input_path}/sample_submission.csv').assign(
    prediction=predictions)

In [None]:
# Sanity check: show the first rows of the submission frame.
df_sub.head()

In [None]:
# Write the submission file without the DataFrame index column.
df_sub.to_csv('submission.csv', index=False)