In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import json, string, re, random, pickle, gc, operator, time, sys
from contextlib import contextmanager
from collections import Counter

import torch
from torch import nn, optim
from torch.utils.data import TensorDataset, DataLoader, Sampler
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import torch.nn.functional as F
from keras_preprocessing.text import text_to_word_sequence

from tqdm.auto import tqdm
tqdm.pandas()

from matplotlib import pyplot as plt
%matplotlib inline

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
# disable progress bars when submitting
def is_interactive():
    """Report whether the notebook is treated as an interactive session.

    The only signal used is the absence of the ``SHLVL`` environment variable.

    Returns:
        bool: True when ``SHLVL`` is not set, False otherwise.
    """
    shell_level = os.environ.get('SHLVL')
    return shell_level is None

# Outside an interactive session, shadow tqdm with a pass-through so that
# wrapping an iterable in tqdm(...) costs nothing and prints nothing.
if not is_interactive():
    def nop(it, *a, **k):
        """Return the iterable unchanged; extra tqdm arguments are ignored."""
        return it

    tqdm = nop

In [None]:
def manual_seed(seed=420, cuda=False):
    """Seed the random number generators used in this notebook.

    Seeds Python's ``random``, NumPy and PyTorch, and stores the seed in the
    ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Integer seed shared by all generators.
        cuda: When True, also seed the CUDA generator and switch cuDNN to
            deterministic mode with benchmarking turned off.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_generator in (random.seed, np.random.seed, torch.manual_seed):
        seed_generator(seed)
    if not cuda:
        return
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

manual_seed(cuda=True)

In [None]:
@contextmanager
def timer(msg):
    """Context manager that prints a start line and, on normal exit, the elapsed time.

    Args:
        msg: Label placed in square brackets in both printed lines.

    The elapsed time is printed as ``HH:MM:SS.ss``. If the wrapped block
    raises, the closing line is not printed.
    """
    began_at = time.time()
    print(f'[{msg}] start...')
    yield
    total_seconds = time.time() - began_at
    whole_hours, leftover = divmod(total_seconds, 3600)
    whole_minutes, secs = divmod(leftover, 60)
    elapsed_str = "{:0>2}:{:0>2}:{:05.2f}".format(int(whole_hours), int(whole_minutes), secs)
    print(f'[{msg}] done in {elapsed_str}.')

In [None]:
# Upper bound on the number of most-frequent tokens kept in the vocabulary
# (the <PAD> and <UNK> entries are added on top of this when the index is built).
MAX_VOCAB_SIZE = 100000
# Comments longer than this many tokens are truncated; it is also the width of the index tensors.
MAX_LEN = 256
# Label columns read from train.csv; the model emits one output per entry.
TARGET_COLUMNS = ['target'] #, 'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat']

In [None]:
# Identity annotation columns of train.csv. A training comment counts as
# "having identities" when any of these columns is >= 0.5, which feeds the
# per-sample loss weights computed while processing train.csv.
identity_columns = ['asian', 'atheist',
       'bisexual', 'black', 'buddhist', 'christian', 'female',
       'heterosexual', 'hindu', 'homosexual_gay_or_lesbian',
       'intellectual_or_learning_disability', 'jewish', 'latino', 'male',
       'muslim', 'other_disability', 'other_gender',
       'other_race_or_ethnicity', 'other_religion',
       'other_sexual_orientation', 'physical_disability',
       'psychiatric_or_mental_illness', 'transgender', 'white']

In [None]:
# Character-level normalisation table applied by tokenize() after lower-casing:
# typographic quotes/ellipsis -> ASCII, "&" -> " and ", superscript/subscript
# digits -> plain digits, small-capital (and look-alike) letters -> ASCII letters.
# NOTE(review): the "s": "s" and "x": "x" entries map a character to itself and
# therefore have no effect; presumably a small-capital variant was intended for
# "s" — confirm before changing.
special_characters = {
    "’": "'", "‘": "'", "´": "'", "`": "'", "…": "...", "&": " and ", "“": '"', "”": '"',
    "⁰": "0", "¹": "1", "²": "2", "³": "3", "⁴": "4", "⁵": "5", "⁶": "6", "⁷": "7", "⁸": "8", "⁹": "9",
    "₀": "0", "₁": "1", "₂": "2", "₃": "3", "₄": "4", "₅": "5", "₆": "6", "₇": "7", "₈": "8", "₉": "9", 
    "ᴀ": "a", "ʙ": "b", "ᴄ": "c", "ᴅ": "d", "ᴇ": "e", "ғ": "f", "ɢ": "g", "ʜ": "h", "ɪ": "i", 
    "ᴊ": "j", "ᴋ": "k", "ʟ": "l", "ᴍ": "m", "ɴ": "n", "ᴏ": "o", "ᴘ": "p", "ǫ": "q", "ʀ": "r", 
    "s": "s", "ᴛ": "t", "ᴜ": "u", "ᴠ": "v", "ᴡ": "w", "x": "x", "ʏ": "y", "ᴢ": "z"
}
# Contraction -> expansion table. Both sides are lower-cased because tokenize()
# lower-cases the text before this table is applied. The file is opened in a
# `with` block so the handle is closed as soon as the JSON has been parsed.
with open('../input/english-contractions/contractions.json', 'r') as contractions_file:
    contractions = json.load(contractions_file)
contractions = {key.lower():value.lower() for key, value in contractions.items()}
# Characters that tokenize() replaces with a space. The backslash before "×" is
# written as an explicit "\\" (same resulting string as before, but no longer an
# invalid escape sequence).
punct = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~`" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\\×™√²—–&'

# The alternations are built from literal keys, so every key is escaped; the
# replacement callbacks look the matched text up in the dict and would fail on
# anything that is not exactly a key. Longer keys come first because regex
# alternation takes the first alternative that matches: without the ordering a
# short contraction could win over a longer one that starts with it.
special_characters_re = re.compile('({})'.format('|'.join(
    re.escape(key) for key in sorted(special_characters.keys(), key=len, reverse=True))))
special_characters_map = lambda match: special_characters[match.group(0)]
contractions_re = re.compile('({})'.format('|'.join(
    re.escape(key) for key in sorted(contractions.keys(), key=len, reverse=True))))
contractions_map = lambda match: contractions[match.group(0)]
# Translation table mapping every punctuation character to a single space.
punct_table = str.maketrans(punct, ' '*len(punct))

def tokenize(text):
    """Turn a raw comment into a list of normalised word tokens.

    Steps, in order: lower-case the text, replace special characters via
    ``special_characters``, expand contractions via ``contractions``, blank
    out every character of ``punct``, then split with keras'
    ``text_to_word_sequence`` (``lower=False`` because the text is already
    lower-cased).

    Args:
        text: The comment as a string.

    Returns:
        The value returned by ``text_to_word_sequence`` for the cleaned text.
    """
    normalized = special_characters_re.sub(special_characters_map, text.lower())
    expanded = contractions_re.sub(contractions_map, normalized)
    without_punct = expanded.translate(punct_table)
    return text_to_word_sequence(without_punct, lower=False)

In [None]:
# Load the training set, derive targets and per-sample loss weights, then
# tokenize every comment in place (train_text ends up holding token lists).
with timer('Process train.csv'):
    print("Loading File...")
    train_df = pd.read_csv('../input/jigsaw-unintended-bias-in-toxicity-classification/train.csv')
    # Missing comments must remain strings: the blanket numeric fill below would
    # otherwise turn them into the float 0.0, which tokenize() cannot lower-case.
    # An empty comment tokenizes to an empty list and is handled later by the
    # zero-length resolution step.
    train_df['comment_text'] = train_df['comment_text'].fillna('').astype(str)
    print("Filling NaNs with 0s")
    train_df.fillna(0.0, inplace=True)
    print(f'Training data size: {len(train_df)}')
    print("Extracting Data...")
    train_text, train_target = train_df['comment_text'].values, torch.from_numpy(train_df[TARGET_COLUMNS].astype(np.float32).values)
    #train_target = torch.where(train_target >= 0.5, torch.ones_like(train_target), torch.zeros_like(train_target))
    train_has_identities = (train_df[identity_columns].values >= 0.5).any(axis=1)
    train_is_toxic = (train_df['target'].values >= 0.5)
    # Per-sample weight before normalisation:
    #   identity mentioned, not toxic -> 3
    #   identity mentioned, toxic     -> 2
    #   no identity, toxic            -> 2
    #   no identity, not toxic        -> 1
    train_weights = (1 + ~train_is_toxic) * train_has_identities + train_is_toxic * ~train_has_identities + 1
    # Rescale so the weights average to 1.
    train_weights = train_weights.astype(np.float32) / train_weights.mean()
    train_weights = torch.from_numpy(train_weights)
    del train_df, train_has_identities, train_is_toxic
    gc.collect()
    print("Tokenizing...")
    for index, comment in enumerate(tqdm(train_text)):
        train_text[index] = tokenize(comment)
    gc.collect()

In [None]:
# Load the test set, keep its ids, and tokenize every comment in place
# (test_text ends up holding token lists).
with timer('Process test.csv'):
    print("Loading File...")
    test_df = pd.read_csv('../input/jigsaw-unintended-bias-in-toxicity-classification/test.csv')
    print(f'Testing data size: {len(test_df)}')
    print("Extracting Data...")
    # Guard against missing comments: a NaN would reach tokenize() as a float and
    # fail on .lower(). An empty string tokenizes to an empty list, which the
    # zero-length resolution step later turns into a single <UNK> token.
    test_ids, test_text = torch.from_numpy(test_df['id'].astype(np.int32).values), test_df['comment_text'].fillna('').astype(str).values
    del test_df
    gc.collect()
    print("Tokenizing...")
    for index, comment in enumerate(tqdm(test_text)):
        test_text[index] = tokenize(comment)
    gc.collect()

In [None]:
def _trim_to_max_len(corpus):
    """Cut every token list in ``corpus`` down to MAX_LEN tokens, in place.

    Returns:
        tuple: (number of comments that were truncated,
                int16 tensor holding the length of every comment after trimming).
    """
    trimmed = 0
    for position, tokens in enumerate(corpus):
        if len(tokens) > MAX_LEN:
            corpus[position] = tokens[:MAX_LEN]
            trimmed += 1
    return trimmed, torch.tensor([len(tokens) for tokens in corpus], dtype=torch.int16)

# Truncate over-long comments in both splits and record each comment's token count.
with timer('Calculating Lengths'):
    print('Trimming')
    trimmed_train, train_lengths = _trim_to_max_len(train_text)
    trimmed_test, test_lengths = _trim_to_max_len(test_text)
    oversized = trimmed_train + trimmed_test
    print(f'{oversized} comment(s) ({oversized*100/(len(train_text)+len(test_text))}%) are longers than {MAX_LEN}')
    gc.collect()

In [None]:
# Build the token vocabulary from the tokenized train AND test comments, then
# convert every comment into a fixed-width row of token ids.
# Intermediate objects are deleted as soon as they are no longer needed to keep
# peak memory down, so the statement order here matters.
with timer('Indexing tokens'):
    vocab = Counter()
    print('Counting training tokens...')
    vocab.update(token for comment in tqdm(train_text) for token in comment)
    print('Counting testing tokens...')
    vocab.update(token for comment in tqdm(test_text) for token in comment)
    print(f'Full vocabulary size is {len(vocab)} covering {sum(vocab.values())} tokens.')
    print('Top 20 words:', vocab.most_common(20))
    # Keep at most MAX_VOCAB_SIZE tokens, most frequent first.
    top_words, top_freq = zip(*vocab.most_common(min(len(vocab),MAX_VOCAB_SIZE)))
    print(f'Top-{len(top_words)} covers {sum(top_freq)*100/sum(vocab.values())}%.')
    del vocab
    del top_freq
    print("Building token index...")
    # Index 0 is reserved for <PAD> and index 1 for <UNK>; real tokens start at 2.
    token2index = {token:index for index, token in enumerate(['<PAD>', '<UNK>'] + list(top_words))}
    del top_words
    gc.collect()
    print("Indexing training data...")
    # Rows are zero-initialised, so positions past a comment's length hold the <PAD> id.
    train_input = torch.zeros(len(train_text), MAX_LEN, dtype=torch.int32)
    for index, comment in enumerate(tqdm(train_text)):
        # Tokens outside the kept vocabulary fall back to id 1 (<UNK>).
        train_input[index,:len(comment)] = torch.tensor([token2index.get(token,1) for token in comment], dtype=torch.int32) 
    del train_text
    gc.collect()
    print("Indexing testing data...")
    test_input = torch.zeros(len(test_text), MAX_LEN, dtype=torch.int32)
    for index, comment in enumerate(tqdm(test_text)):
        test_input[index,:len(comment)] = torch.tensor([token2index.get(token,1) for token in comment], dtype=torch.int32) 
    del test_text
    gc.collect()

In [None]:
# Comments that tokenized to nothing get length 1 and a single <UNK> token
# (id 1) in their first position, so no sequence has length 0.
with timer('Resolving zero length elements'):
    zero_mask = train_lengths.eq(0)
    print(f'found {zero_mask.sum().item()} zero length elements in training data')
    train_input[zero_mask, 0] = 1
    train_lengths[zero_mask] = 1
    zero_mask = test_lengths.eq(0)
    print(f'found {zero_mask.sum().item()} zero length elements in testing data')
    test_input[zero_mask, 0] = 1
    test_lengths[zero_mask] = 1

In [None]:
# Pickled embedding tables to concatenate per token. 'size' is the number of
# columns each file contributes to the combined embedding matrix.
embedding_files = [
    {'file': '../input/pickled-crawl300d2m-for-kernel-competitions/crawl-300d-2M.pkl', 'size': 300},
    {'file': '../input/pickled-glove840b300d-for-10sec-loading/glove.840B.300d.pkl', 'size': 300}
]

In [None]:
# Build the embedding matrix: row i is the concatenation of token i's vector
# from every file in embedding_files, in order.
vector_size = sum(file['size'] for file in embedding_files)
word_embeddings = torch.empty(len(token2index), vector_size, dtype=torch.float32)
word_embeddings[0] = 0  # the <PAD> row stays all-zero
start = 0
for file in embedding_files:
    with timer(f'Loading {file["file"]}'):
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        # The `with` block closes the file handle once the table is loaded.
        with open(file["file"], 'rb') as embedding_file:
            w2v = pickle.load(embedding_file)
    size = file['size']
    end = start + size
    # Random unit-length vector used for <UNK> and for tokens missing from this file.
    unk_gen = lambda: F.normalize(torch.randn(size), p=2, dim=0)
    word_embeddings[1, start:end] = unk_gen()
    not_found = []
    for token, index in tqdm(token2index.items()):
        if index < 2: continue  # <PAD> and <UNK> are filled above
        try:
            word_embeddings[index, start:end] = torch.from_numpy(w2v[token])
        except KeyError:
            # Each missing token gets its own random vector for this file's slice.
            word_embeddings[index, start:end] = unk_gen()
            not_found.append((index, token))
    print(f'Could not find vectors for {len(not_found)} token(s) ({len(not_found)*100/(len(token2index)-2)}%)')
    # Lower index means more frequent, so sorting lists the most frequent misses first.
    not_found.sort()
    print('Top-10 not found words:\n'+'\n'.join(f'{str(index+1)}- {token}' for index, token in not_found[:10]))
    del w2v
    del not_found
    gc.collect()
    # The next file fills the following column slice.
    start = end

In [None]:
class LengthSortedBatchSampler(Sampler):
    """Batch sampler that groups samples of similar length.

    On every pass the sample indices are shuffled, then ordered by length, and
    cut into consecutive chunks of ``batch_size``; the chunks are yielded in a
    random order. Each yielded batch is a 1-D tensor of sample indices.

    Args:
        lengths: 1-D tensor with the length of every sample.
        batch_size: Maximum number of samples per batch.
    """

    def __init__(self, lengths, batch_size):
        self.lengths = lengths
        self.batch_size = batch_size

    def __iter__(self):
        # Shuffle first so that samples of equal length do not always land in
        # the same relative order after sorting.
        order = torch.randperm(self.lengths.size(0))
        _, by_length = self.lengths[order].sort()
        order = order[by_length]
        # Visit the fixed-size chunks of the sorted order in random sequence.
        chunk_starts = (self.batch_size * torch.randperm(len(self))).tolist()
        for first in chunk_starts:
            yield order[first:first + self.batch_size]

    def __len__(self):
        # Ceiling division: the last batch may be smaller than batch_size.
        return -(-len(self.lengths) // self.batch_size)

In [None]:
# Batch size for the GRU model (a later cell reassigns BATCH_SIZE for BERT inference).
BATCH_SIZE = 512
# Training batches are built by LengthSortedBatchSampler, i.e. grouped by similar length.
train_dataset = TensorDataset(train_lengths, train_input, train_target, train_weights)
train_dataloader = DataLoader(train_dataset, batch_sampler=LengthSortedBatchSampler(train_lengths, BATCH_SIZE))
# The test loader is given only a batch size (no sampler, no shuffle argument);
# get_predictions() writes its outputs sequentially in iteration order.
test_dataset = TensorDataset(test_lengths, test_input)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

In [None]:
class SpatialDropout(nn.Dropout2d):
    """Dropout2d applied to sequence tensors shaped (N, T, K).

    The input is rearranged to (N, K, 1, T) so that Dropout2d sees the K
    features as channels, then the result is rearranged back to (N, T, K).
    """

    def forward(self, x):
        # (N, T, K) -> (N, K, T) -> (N, K, 1, T)
        channels_first = x.transpose(1, 2).unsqueeze(2)
        dropped = super().forward(channels_first)
        # (N, K, 1, T) -> (N, K, T) -> (N, T, K)
        return dropped.squeeze(2).transpose(1, 2)

class Model(nn.Module):
    """Bidirectional GRU classifier on top of pretrained word embeddings.

    Pipeline: embedding lookup -> SpatialDropout -> multi-layer bidirectional
    GRU -> max- and mean-pooling over time -> residual fully connected layers
    -> linear head producing one logit per output.

    Args:
        embeddings: 2-D float tensor (vocab_size, emb_dim) used to initialise
            the embedding layer via ``nn.Embedding.from_pretrained``.
        hidden_size: GRU hidden size per direction.
        layer_count: Number of stacked GRU layers.
        linear_layer_count: Number of residual Dropout/Linear/PReLU blocks.
        num_outputs: Number of logits produced by the head.
    """

    def __init__(self, embeddings, hidden_size=128, layer_count=2, linear_layer_count=2, num_outputs=len(TARGET_COLUMNS)):
        super(Model, self).__init__()
        emb_dim = embeddings.size(1)
        self.embed = nn.Embedding.from_pretrained(embeddings)
        self.embed_dropout = SpatialDropout(0.2)
        self.gru = nn.GRU(emb_dim, hidden_size, num_layers=layer_count, batch_first=True, bidirectional=True)
        feats_size = 2*2*hidden_size # bidirectional * (mean+max) * hidden_size
        self.linear_layers = nn.ModuleList([nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(feats_size, feats_size),
            nn.PReLU()
        ) for _ in range(linear_layer_count)])
        self.head = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(feats_size, num_outputs)
        )

    def forward(self, sentences, lengths):
        """Compute logits for a batch.

        Args:
            sentences: Long tensor (N, T) of token ids, padded on the right.
            lengths: Integer tensor (N,) with the true length (>= 1) of every
                sequence; may live on any device.

        Returns:
            Tensor (N, num_outputs) of raw logits.
        """
        embeddings = self.embed(sentences)
        embeddings = self.embed_dropout(embeddings)
        # pack_padded_sequence needs the lengths as a CPU integer tensor in
        # current PyTorch releases, so move/cast them here instead of relying
        # on every caller to do it.
        packed_input = pack_padded_sequence(embeddings, lengths.cpu().long(), batch_first=True, enforce_sorted=False)
        packed_output, _ = self.gru(packed_input)
        # output is (N, T_max, 2*hidden) with zeros at padded time steps.
        output, _ = pad_packed_sequence(packed_output, batch_first=True)
        lengths = lengths.to(output.device)
        # Exclude padded steps from the max: their zeros would otherwise win
        # whenever every real activation of a feature is negative.
        steps = torch.arange(output.size(1), device=output.device).unsqueeze(0)
        is_padding = (steps >= lengths.long().unsqueeze(1)).unsqueeze(2)
        output_max, _ = output.masked_fill(is_padding, float('-inf')).max(dim=1)
        # Padded steps are zero, so the plain sum divided by the true length is the mean.
        output_mean = output.sum(dim=1)/(lengths.unsqueeze(1).float()+(1e-8))
        feats = torch.cat([output_max, output_mean], dim=1)
        for layer in self.linear_layers:
            # Residual connection around every fully connected block.
            feats = feats + layer(feats)
        return self.head(feats)

In [None]:
# Make 1st target more important
# Per-output loss weights: every output starts at 1, the first one is set to
# len(TARGET_COLUMNS)+1, then all are halved. With the single 'target' column
# this yields a weight of 1.0.
target_weights = torch.ones(len(TARGET_COLUMNS), dtype=torch.float32)
target_weights[0] = len(TARGET_COLUMNS)+1
target_weights /= 2
# Kept on the GPU because loss_fn multiplies it with GPU tensors.
target_weights = target_weights.cuda()

def loss_fn(predicted, target, weights):
    """Weighted binary cross-entropy on logits, averaged over all elements.

    Args:
        predicted: Logits, one column per output.
        target: Targets with the same shape as ``predicted``.
        weights: One weight per sample (1-D); broadcast across the outputs.

    Returns:
        Scalar tensor: mean of the element-wise loss after scaling by the
        per-sample weights and the module-level ``target_weights``.
    """
    per_element = F.binary_cross_entropy_with_logits(predicted, target, reduction='none')
    sample_scaled = per_element * weights.unsqueeze(1)
    fully_scaled = sample_scaled * target_weights
    return fully_scaled.mean()

In [None]:
def fit_one_epoch(model, optimizer, epoch):
    """Train ``model`` for one pass over ``train_dataloader``.

    Relies on the module-level ``train_dataloader``, ``train_lengths``,
    ``num_epochs`` and ``loss_fn``.

    Args:
        model: Network to train; called as ``model(sentences, lengths)``.
        optimizer: Optimizer updating the model's parameters.
        epoch: 1-based epoch number, used only for the progress description.

    Returns:
        float: Batch losses averaged with each batch weighted by its size.
    """
    model.train()
    show_progress = is_interactive()
    progress = tqdm(train_dataloader, desc="Epoch {}/{}".format(epoch, num_epochs))
    weighted_loss_sum = 0
    for lengths, sentences, target, weights in progress:
        optimizer.zero_grad()
        logits = model(sentences.cuda().long(), lengths.cuda())
        loss = loss_fn(logits, target.cuda(), weights.cuda())
        loss.backward()
        optimizer.step()
        batch_loss = loss.item()
        # The no-op tqdm replacement has no set_postfix/close, hence the guards.
        if show_progress:
            progress.set_postfix({'batch loss': batch_loss})
        weighted_loss_sum += batch_loss * lengths.size(0)
    if show_progress:
        progress.close()
    return weighted_loss_sum / train_lengths.size(0)

In [None]:
def get_predictions(model):
    """Run ``model`` over ``test_dataloader`` and return its probabilities.

    Relies on the module-level ``test_ids`` (for the output size) and
    ``test_dataloader``. The model is switched to eval mode and left there.

    Args:
        model: Network called as ``model(sentences, lengths)`` returning logits.

    Returns:
        1-D float CPU tensor with the sigmoid of the first output column for
        every test sample, in loader order.
    """
    model.eval()
    test_size = test_ids.size(0)
    predictions = torch.empty(test_size, dtype=torch.float)
    index = 0
    # Inference only: disabling gradient tracking avoids building an autograd
    # graph for every batch, which saves memory and time without changing results.
    with torch.no_grad():
        for batch_lengths, batch_sentences in tqdm(test_dataloader, desc="Test"):
            predicted = torch.sigmoid(model(batch_sentences.cuda().long(), batch_lengths.cuda()))
            index_to = index + predicted.size(0)
            predictions[index:index_to] = predicted[:,0].cpu().detach()
            index = index_to
    return predictions

In [None]:
def extract(history, key, default=None):
    """Collect ``key`` from every dict in ``history``.

    Args:
        history: Iterable of dicts.
        key: Key to look up in each dict.
        default: Value used for dicts that lack ``key``.

    Returns:
        list: One value per element of ``history``, in order.
    """
    values = []
    for record in history:
        values.append(record.get(key, default))
    return values

def view_history(history):
    """Plot loss against epoch for a training run, then print the raw history.

    Args:
        history: List of dicts carrying 'epoch' and 'loss' entries.
    """
    epochs = extract(history, 'epoch')
    losses = extract(history, 'loss')
    plt.plot(epochs, losses)
    plt.xlabel('epoch')
    plt.ylabel('loss')
    plt.show()
    print(history)

In [None]:
# Train num_models independent GRU models and blend their test predictions.
# After EVERY epoch of every model the test set is predicted and added to the
# running sum with weight 2**epoch, so later epochs count more.
num_models = 7
predictions = torch.zeros(test_ids.size(0), dtype=torch.float)
total_weight = 0
for index in range(1,num_models+1):
    print(f'Start training model#{index}')
    model = Model(embeddings=word_embeddings, hidden_size=128, layer_count=2, linear_layer_count=2).cuda()
    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    # Learning-rate factor 0.6**k after k scheduler steps (one step per epoch).
    lr_scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda epoch: 0.6**epoch)
    # Module-level name: fit_one_epoch reads num_epochs for its progress label.
    num_epochs = 4
    history = []
    for epoch in range(1,num_epochs+1):
        epoch_loss = fit_one_epoch(model, optimizer, epoch)
        print("Epoch {}/{}: Average Loss={}".format(epoch, num_epochs, epoch_loss), flush=True)
        history.append({'epoch': epoch, 'loss': epoch_loss})
        epoch_predictions = get_predictions(model)
        epoch_weight = 2**epoch
        predictions += epoch_weight * epoch_predictions
        # Accumulated across all models and epochs; used once for the final normalisation.
        total_weight += epoch_weight
        lr_scheduler.step()
    view_history(history)
# Turn the weighted sum into a weighted average.
predictions /= total_weight

In [None]:
# Drop everything the GRU stage created except `predictions`, which is still
# needed for the final blend, before the BERT stage starts.
del identity_columns, train_target, train_weights, test_ids, train_lengths, test_lengths, token2index, train_input, test_input, zero_mask
del word_embeddings, train_dataset, train_dataloader, test_dataset, test_dataloader, target_weights, model, optimizer, lr_scheduler
del history, epoch_predictions
gc.collect()

**BERT**

In [None]:
# Make the pytorch_pretrained_bert package importable from the attached dataset directory.
package_dir = "../input/ppbert/pytorch-pretrained-bert/pytorch-pretrained-BERT"
sys.path.append(package_dir)

In [None]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import warnings
import torch.utils.data
from pytorch_pretrained_bert import BertTokenizer, BertForSequenceClassification, BertAdam
from pytorch_pretrained_bert import BertConfig

# Show each distinct warning only once.
warnings.filterwarnings(action='once')
# The BERT stage runs on the GPU.
device = torch.device('cuda')

In [None]:
def convert_lines(example, max_seq_length, tokenizer):
    """Convert texts into fixed-length rows of BERT token ids.

    Every text is tokenized, cut to ``max_seq_length - 2`` word pieces, wrapped
    in ``[CLS]`` / ``[SEP]``, converted to ids and right-padded with 0 so that
    each row has exactly ``max_seq_length`` entries.

    Args:
        example: Iterable of strings.
        max_seq_length: Row length including the two special tokens.
        tokenizer: Object providing ``tokenize`` and ``convert_tokens_to_ids``.

    Returns:
        numpy array built from the list of id rows.
    """
    # Reserve two positions for [CLS] and [SEP].
    budget = max_seq_length - 2
    rows = []
    for text in tqdm(example):
        pieces = tokenizer.tokenize(text)[:budget]
        ids = tokenizer.convert_tokens_to_ids(["[CLS]"] + pieces + ["[SEP]"])
        rows.append(ids + [0] * (budget - len(pieces)))
    return np.array(rows)

In [None]:
# Row length (including [CLS]/[SEP]) used when converting text for BERT.
MAX_SEQUENCE_LENGTH = 220
SEED = 1234
# Directory of the pretrained uncased BERT-base files; used here to load the tokenizer.
BERT_MODEL_PATH = '../input/bert-pretrained-models/uncased_l-12_h-768_a-12/uncased_L-12_H-768_A-12/'

# Re-seed NumPy and PyTorch for the BERT stage.
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Model configuration comes from the fine-tuned checkpoint's dataset.
bert_config = BertConfig('../input/finetuned-bert-for-jigsaw-toxicity-classification/bert_config.json')
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_PATH, cache_dir=None,do_lower_case=True)

In [None]:
# Reload the test set (the earlier test_df was deleted) and convert its comments to BERT id rows.
test_df = pd.read_csv("../input/jigsaw-unintended-bias-in-toxicity-classification/test.csv")
test_df['comment_text'] = test_df['comment_text'].astype(str) 
# NOTE(review): after astype(str) a missing comment is already the string 'nan',
# so the fillna("DUMMY_VALUE") below never replaces anything. Left unchanged
# because the fine-tuned checkpoint's training-time preprocessing is not visible
# here — confirm which placeholder it was trained with before reordering.
X_test = convert_lines(test_df["comment_text"].fillna("DUMMY_VALUE"), MAX_SEQUENCE_LENGTH, tokenizer)

In [None]:
# Build a single-logit BERT classifier and load the fine-tuned weights.
model = BertForSequenceClassification(bert_config, num_labels=1)
model.load_state_dict(torch.load("../input/finetuned-bert-for-jigsaw-toxicity-classification/bert_pytorch.bin"))
model.to(device)
# Inference only: no parameter needs gradients.
for param in model.parameters():
    param.requires_grad = False
model.eval()

In [None]:
# Batch size for BERT inference (replaces the value used for the GRU stage).
BATCH_SIZE = 64
test_preds = np.zeros((len(X_test)))
test = torch.utils.data.TensorDataset(torch.tensor(X_test, dtype=torch.long))
# shuffle=False keeps batch i aligned with rows [i*BATCH_SIZE, (i+1)*BATCH_SIZE).
test_loader = torch.utils.data.DataLoader(test, batch_size=BATCH_SIZE, shuffle=False)
tk0 = tqdm(test_loader)
for i, (x_batch,) in enumerate(tk0):
    # Padding id is 0, so (x_batch > 0) marks the real tokens for the attention mask.
    pred = model(x_batch.to(device), attention_mask=(x_batch > 0).to(device), labels=None)
    test_preds[i * BATCH_SIZE:(i + 1) * BATCH_SIZE] = pred[:, 0].detach().cpu().squeeze().numpy()

# Logits -> probabilities.
test_pred = torch.sigmoid(torch.tensor(test_preds)).numpy().ravel()

In [None]:
# Equal-weight blend of the BERT probabilities and the GRU ensemble average.
final_predictions = 0.5*test_pred + 0.5*predictions.numpy()

In [None]:
# Write the submission file: one row per test id with its blended prediction.
submission = pd.DataFrame({'id': test_df['id'], 'prediction': final_predictions})
submission.to_csv('submission.csv', index=False)

In [None]:
# Preview the first rows of the submission as a sanity check.
submission.head()