In [2]:
#https://www.kaggle.com/christofhenkel/how-to-preprocessing-for-glove-part2-usage/
# Put these at the top of every notebook, to get automatic reloading and inline plotting
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import fastai
from fastai.train import Learner
from fastai.train import DataBunch
from fastai.callbacks import *
from fastai.basic_data import DatasetType
import fastprogress
from fastprogress import force_console_behavior
import numpy as np
from pprint import pprint
import pandas as pd
import os
import time

import gc
import random
from tqdm._tqdm_notebook import tqdm_notebook as tqdm
from keras.preprocessing import text, sequence
import torch
from torch import nn
from torch.utils import data
from torch.nn import functional as F

#from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold


tqdm.pandas()

# disable progress bars when submitting
def is_interactive():
   return 'SHLVL' not in os.environ

if not is_interactive():
    def nop(it, *a, **k):
        return it

    tqdm = nop

    fastprogress.fastprogress.NO_BAR = True
    master_bar, progress_bar = force_console_behavior()
    fastai.basic_train.master_bar, fastai.basic_train.progress_bar = master_bar, progress_bar
    
    
def seed_everything(seed=123):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
seed_everything(29)

Using TensorFlow backend.


In [3]:
#CRAWL_EMBEDDING_PATH = '../input/pickled-crawl300d2m-for-kernel-competitions/crawl-300d-2M.pkl'
#GLOVE_EMBEDDING_PATH = '../input/pickled-glove840b300d-for-10sec-loading/glove.840B.300d.pkl'

CRAWL_EMBEDDING_PATH = '/home/jupyter/jigsaw/data/crawl-300d-2M.pkl'
GLOVE_EMBEDDING_PATH = '/home/jupyter/jigsaw/data/glove.840B.300d.pkl'
TRAIN_CSV = '/home/jupyter/jigsaw/data/train.csv'
TEST_CSV = '/home/jupyter/jigsaw/data/test.csv'


#Of course we also need to adjust the load_embeddings function, to now handle the pickled dict.

NUM_MODELS = 2
LSTM_UNITS = 128
DENSE_HIDDEN_UNITS = 4 * LSTM_UNITS
MAX_LEN = 220

def get_coefs(word, *arr):
    return word, np.asarray(arr, dtype='float32')

def load_embeddings(path):
    with open(path,'rb') as f:
        emb_arr = pickle.load(f)
    return emb_arr


#The next function is really important. Although we put a lot of effort in making the preprocessing right 
#there are stil some out of vocabulary words we could easily fix. 
#One example I implement here is to try a "lower/upper case version of a" word 
#if an embedding is not found, which sometimes gives us an embedding. Sorry for the bad coding style in the loop

def build_matrix(word_index, path):
    embedding_index = load_embeddings(path)
    embedding_matrix = np.zeros((max_features + 1, 300))
    unknown_words = []
    
    for word, i in word_index.items():
        if i <= max_features:
            try:
                embedding_matrix[i] = embedding_index[word]
            except KeyError:
                try:
                    embedding_matrix[i] = embedding_index[word.lower()]
                except KeyError:
                    try:
                        embedding_matrix[i] = embedding_index[word.title()]
                    except KeyError:
                        unknown_words.append(word)
    return embedding_matrix, unknown_words


def sigmoid(x):
    return 1 / (1 + np.exp(-x))

class SpatialDropout(nn.Dropout2d):
    def forward(self, x):
        x = x.unsqueeze(2)    # (N, T, 1, K)
        x = x.permute(0, 3, 2, 1)  # (N, K, 1, T)
        x = super(SpatialDropout, self).forward(x)  # (N, K, 1, T), some features are masked
        x = x.permute(0, 3, 2, 1)  # (N, T, 1, K)
        x = x.squeeze(2)  # (N, T, K)
        return x

In [4]:
# def train_model(model_name, foldno, learn,test,output_dim,lr=0.001,
#                 batch_size=512, n_epochs=4,
#                 enable_checkpoint_ensemble=True):
    
#     all_test_preds = []
#     checkpoint_weights = [2 ** epoch for epoch in range(n_epochs)]
#     test_loader = torch.utils.data.DataLoader(test, batch_size=batch_size, shuffle=False)
#     valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=batch_size,shuffle=False)

#     all_valid_preds = []
    
#     n = len(learn.data.train_dl)
#     phases = [(TrainingPhase(n).schedule_hp('lr', lr * (0.6**(i)))) for i in range(n_epochs)]
#     sched = GeneralScheduler(learn, phases)
#     learn.callbacks.append(sched)
#     for epoch in range(n_epochs):
#         learn.fit(1)
#         test_preds = np.zeros((len(test), output_dim))   
#         valid_preds = np.zeros((len(valid_dataset),output_dim))
        
#         for i, x_batch in enumerate(valid_loader):
#             X = x_batch[0].cuda()
#             y_valid_pred = sigmoid(learn.model(X).detach().cpu().numpy())
#             valid_preds[i*batch_size:(i+1)*batch_size,:]=y_valid_pred
#         all_valid_preds.append(valid_preds)
        
#         for i, x_batch in enumerate(test_loader):
#             X = x_batch[0].cuda()
#             y_pred = sigmoid(learn.model(X).detach().cpu().numpy())
#             test_preds[i * batch_size:(i+1) * batch_size, :] = y_pred

#         all_test_preds.append(test_preds)
        
#         dict_save = model.state_dict().copy()
#         dict_save.pop('embedding.weight')
#         torch.save(dict_save, str(foldno) + '_' + model_name + '_' + str(epoch)+'_epoch.bin')
        
# #     if enable_checkpoint_ensemble:
# #         test_preds = np.average(all_test_preds, weights=checkpoint_weights, axis=0)   
# #         valid_preds = np.average(all_valid_preds, weights=checkpoint_weights, axis=0)
# #     else:
# #         test_preds = all_test_preds[-1]
        
#     #return test_preds, valid_preds
#     return np.array(all_test_preds), np.array(all_valid_preds)

In [5]:
#train = pd.read_csv('../input/jigsaw-unintended-bias-in-toxicity-classification/train.csv') #, nrows=100004)
#test = pd.read_csv('../input/jigsaw-unintended-bias-in-toxicity-classification/test.csv') #, nrows=1000)

train = pd.read_csv(TRAIN_CSV) #, nrows=100004)
test = pd.read_csv(TEST_CSV) #, nrows=1000)


#IDX = 1624387 #95000 

## Preprocessing
#See part1 for an explanation how I came to the list of symbols and contraction function. I copied them from that kernel.

symbols_to_isolate = '.,?!-;*"…:—()%#$&_/@＼・ω+=”“[]^–>\\°<~•≠™ˈʊɒ∞§{}·τα❤☺ɡ|¢→̶`❥━┣┫┗Ｏ►★©―ɪ✔®\x96\x92●£♥➤´¹☕≈÷♡◐║▬′ɔː€۩۞†μ✒➥═☆ˌ◄½ʻπδηλσερνʃ✬ＳＵＰＥＲＩＴ☻±♍µº¾✓◾؟．⬅℅»Вав❣⋅¿¬♫ＣＭβ█▓▒░⇒⭐›¡₂₃❧▰▔◞▀▂▃▄▅▆▇↙γ̄″☹➡«φ⅓„✋：¥̲̅́∙‛◇✏▷❓❗¶˚˙）сиʿ✨。ɑ\x80◕！％¯−ﬂﬁ₁²ʌ¼⁴⁄₄⌠♭✘╪▶☭✭♪☔☠♂☃☎✈✌✰❆☙○‣⚓年∎ℒ▪▙☏⅛ｃａｓǀ℮¸ｗ‚∼‖ℳ❄←☼⋆ʒ⊂、⅔¨͡๏⚾⚽Φ×θ￦？（℃⏩☮⚠月✊❌⭕▸■⇌☐☑⚡☄ǫ╭∩╮，例＞ʕɐ̣Δ₀✞┈╱╲▏▕┃╰▊▋╯┳┊≥☒↑☝ɹ✅☛♩☞ＡＪＢ◔◡↓♀⬆̱ℏ\x91⠀ˤ╚↺⇤∏✾◦♬³の｜／∵∴√Ω¤☜▲↳▫‿⬇✧ｏｖｍ－２０８＇‰≤∕ˆ⚜☁'
symbols_to_delete = '\n🍕\r🐵😑\xa0\ue014\t\uf818\uf04a\xad😢🐶️\uf0e0😜😎👊\u200b\u200e😁عدويهصقأناخلىبمغر😍💖💵Е👎😀😂\u202a\u202c🔥😄🏻💥ᴍʏʀᴇɴᴅᴏᴀᴋʜᴜʟᴛᴄᴘʙғᴊᴡɢ😋👏שלוםבי😱‼\x81エンジ故障\u2009🚌ᴵ͞🌟😊😳😧🙀😐😕\u200f👍😮😃😘אעכח💩💯⛽🚄🏼ஜ😖ᴠ🚲‐😟😈💪🙏🎯🌹😇💔😡\x7f👌ἐὶήιὲκἀίῃἴξ🙄Ｈ😠\ufeff\u2028😉😤⛺🙂\u3000تحكسة👮💙فزط😏🍾🎉😞\u2008🏾😅😭👻😥😔😓🏽🎆🍻🍽🎶🌺🤔😪\x08‑🐰🐇🐱🙆😨🙃💕𝘊𝘦𝘳𝘢𝘵𝘰𝘤𝘺𝘴𝘪𝘧𝘮𝘣💗💚地獄谷улкнПоАН🐾🐕😆ה🔗🚽歌舞伎🙈😴🏿🤗🇺🇸мυтѕ⤵🏆🎃😩\u200a🌠🐟💫💰💎эпрд\x95🖐🙅⛲🍰🤐👆🙌\u2002💛🙁👀🙊🙉\u2004ˢᵒʳʸᴼᴷᴺʷᵗʰᵉᵘ\x13🚬🤓\ue602😵άοόςέὸתמדףנרךצט😒͝🆕👅👥👄🔄🔤👉👤👶👲🔛🎓\uf0b7\uf04c\x9f\x10成都😣⏺😌🤑🌏😯ех😲Ἰᾶὁ💞🚓🔔📚🏀👐\u202d💤🍇\ue613小土豆🏡❔⁉\u202f👠》कर्मा🇹🇼🌸蔡英文🌞🎲レクサス😛外国人关系Сб💋💀🎄💜🤢َِьыгя不是\x9c\x9d🗑\u2005💃📣👿༼つ༽😰ḷЗз▱ц￼🤣卖温哥华议会下降你失去所有的钱加拿大坏税骗子🐝ツ🎅\x85🍺آإشء🎵🌎͟ἔ油别克🤡🤥😬🤧й\u2003🚀🤴ʲшчИОРФДЯМюж😝🖑ὐύύ特殊作戦群щ💨圆明园קℐ🏈😺🌍⏏ệ🍔🐮🍁🍆🍑🌮🌯🤦\u200d𝓒𝓲𝓿𝓵안영하세요ЖљКћ🍀😫🤤ῦ我出生在了可以说普通话汉语好极🎼🕺🍸🥂🗽🎇🎊🆘🤠👩🖒🚪天一家⚲\u2006⚭⚆⬭⬯⏖新✀╌🇫🇷🇩🇪🇮🇬🇧😷🇨🇦ХШ🌐\x1f杀鸡给猴看ʁ𝗪𝗵𝗲𝗻𝘆𝗼𝘂𝗿𝗮𝗹𝗶𝘇𝗯𝘁𝗰𝘀𝘅𝗽𝘄𝗱📺ϖ\u2000үսᴦᎥһͺ\u2007հ\u2001ɩｙｅ൦ｌƽｈ𝐓𝐡𝐞𝐫𝐮𝐝𝐚𝐃𝐜𝐩𝐭𝐢𝐨𝐧Ƅᴨןᑯ໐ΤᏧ௦Іᴑ܁𝐬𝐰𝐲𝐛𝐦𝐯𝐑𝐙𝐣𝐇𝐂𝐘𝟎ԜТᗞ౦〔Ꭻ𝐳𝐔𝐱𝟔𝟓𝐅🐋ﬃ💘💓ё𝘥𝘯𝘶💐🌋🌄🌅𝙬𝙖𝙨𝙤𝙣𝙡𝙮𝙘𝙠𝙚𝙙𝙜𝙧𝙥𝙩𝙪𝙗𝙞𝙝𝙛👺🐷ℋ𝐀𝐥𝐪🚶𝙢Ἱ🤘ͦ💸ج패티Ｗ𝙇ᵻ👂👃ɜ🎫\uf0a7БУі🚢🚂ગુજરાતીῆ🏃𝓬𝓻𝓴𝓮𝓽𝓼☘﴾̯﴿₽\ue807𝑻𝒆𝒍𝒕𝒉𝒓𝒖𝒂𝒏𝒅𝒔𝒎𝒗𝒊👽😙\u200cЛ‒🎾👹⎌🏒⛸公寓养宠物吗🏄🐀🚑🤷操美𝒑𝒚𝒐𝑴🤙🐒欢迎来到阿拉斯ספ𝙫🐈𝒌𝙊𝙭𝙆𝙋𝙍𝘼𝙅ﷻ🦄巨收赢得白鬼愤怒要买额ẽ🚗🐳𝟏𝐟𝟖𝟑𝟕𝒄𝟗𝐠𝙄𝙃👇锟斤拷𝗢𝟳𝟱𝟬⦁マルハニチロ株式社⛷한국어ㄸㅓ니͜ʖ𝘿𝙔₵𝒩ℯ𝒾𝓁𝒶𝓉𝓇𝓊𝓃𝓈𝓅ℴ𝒻𝒽𝓀𝓌𝒸𝓎𝙏ζ𝙟𝘃𝗺𝟮𝟭𝟯𝟲👋🦊多伦🐽🎻🎹⛓🏹🍷🦆为和中友谊祝贺与其想象对法如直接问用自己猜本传教士没积唯认识基督徒曾经让相信耶稣复活死怪他但当们聊些政治题时候战胜因圣把全堂结婚孩恐惧且栗谓这样还♾🎸🤕🤒⛑🎁批判检讨🏝🦁🙋😶쥐스탱트뤼도석유가격인상이경제황을렵게만들지않록잘관리해야합다캐나에서대마초와화약금의품런성분갈때는반드시허된사용🔫👁凸ὰ💲🗯𝙈Ἄ𝒇𝒈𝒘𝒃𝑬𝑶𝕾𝖙𝖗𝖆𝖎𝖌𝖍𝖕𝖊𝖔𝖑𝖉𝖓𝖐𝖜𝖞𝖚𝖇𝕿𝖘𝖄𝖛𝖒𝖋𝖂𝕴𝖟𝖈𝕸👑🚿💡知彼百\uf005𝙀𝒛𝑲𝑳𝑾𝒋𝟒😦𝙒𝘾𝘽🏐𝘩𝘨ὼṑ𝑱𝑹𝑫𝑵𝑪🇰🇵👾ᓇᒧᔭᐃᐧᐦᑳᐨᓃᓂᑲᐸᑭᑎᓀᐣ🐄🎈🔨🐎🤞🐸💟🎰🌝🛳点击查版🍭𝑥𝑦𝑧ＮＧ👣\uf020っ🏉ф💭🎥Ξ🐴👨🤳🦍\x0b🍩𝑯𝒒😗𝟐🏂👳🍗🕉🐲چی𝑮𝗕𝗴🍒ꜥⲣⲏ🐑⏰鉄リ事件ї💊「」\uf203\uf09a\uf222\ue608\uf202\uf099\uf469\ue607\uf410\ue600燻製シ虚偽屁理屈Г𝑩𝑰𝒀𝑺🌤𝗳𝗜𝗙𝗦𝗧🍊ὺἈἡχῖΛ⤏🇳𝒙ψՁմեռայինրւդձ冬至ὀ𝒁🔹🤚🍎𝑷🐂💅𝘬𝘱𝘸𝘷𝘐𝘭𝘓𝘖𝘹𝘲𝘫کΒώ💢ΜΟΝΑΕ🇱♲𝝈↴💒⊘Ȼ🚴🖕🖤🥘📍👈➕🚫🎨🌑🐻𝐎𝐍𝐊𝑭🤖🎎😼🕷ｇｒｎｔｉｄｕｆｂｋ𝟰🇴🇭🇻🇲𝗞𝗭𝗘𝗤👼📉🍟🍦🌈🔭《🐊🐍\uf10aლڡ🐦\U0001f92f\U0001f92a🐡💳ἱ🙇𝗸𝗟𝗠𝗷🥜さようなら🔼'

from nltk.tokenize.treebank import TreebankWordTokenizer
tokenizer = TreebankWordTokenizer()

isolate_dict = {ord(c):f' {c} ' for c in symbols_to_isolate}
remove_dict = {ord(c):f'' for c in symbols_to_delete}


def handle_punctuation(x):
    x = x.translate(remove_dict)
    x = x.translate(isolate_dict)
    return x

def handle_contractions(x):
    x = tokenizer.tokenize(x)
    return x

def fix_quote(x):
    x = [x_[1:] if x_.startswith("'") else x_ for x_ in x]
    x = ' '.join(x)
    return x

def preprocess(x):
    x = handle_punctuation(x)
    x = handle_contractions(x)
    x = fix_quote(x)
    
    x = x.replace("n't",'not')
    x = x.replace("N'T",'NOT')
    x = x.replace('tRump','Trump')
    x = x.replace("gov't","government")
    x = x.replace("Gov't","Government")
    x = x.replace("Twitler","Twitter")
    return x

In [6]:
#So lets apply that preprocess function to our text

train['comment_text'].head()

0    This is so cool. It's like, 'would you want yo...
1    Thank you!! This would make my life a lot less...
2    This is such an urgent design problem; kudos t...
3    Is this something I'll be able to install on m...
4                 haha you guys are a bunch of losers.
Name: comment_text, dtype: object

In [7]:
x_train = train['comment_text'].progress_apply(lambda x:preprocess(x))
y_aux_train = train[['target', 'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat']]
x_test = test['comment_text'].progress_apply(lambda x:preprocess(x))

# identity_columns = [
#     'male', 'female', 'homosexual_gay_or_lesbian', 'christian', 'jewish',
#     'muslim', 'black', 'white', 'psychiatric_or_mental_illness']
# # Overall
# weights = np.ones((len(x_train),)) / 4
# # Subgroup
# weights += (train[identity_columns].fillna(0).values>=0.5).sum(axis=1).astype(bool).astype(np.int) / 4
# # Background Positive, Subgroup Negative
# weights += (( (train['target'].values>=0.5).astype(bool).astype(np.int) +
#    (train[identity_columns].fillna(0).values<0.5).sum(axis=1).astype(bool).astype(np.int) ) > 1 ).astype(bool).astype(np.int) / 4
# # Background Negative, Subgroup Positive
# weights += (( (train['target'].values<0.5).astype(bool).astype(np.int) +
#    (train[identity_columns].fillna(0).values>=0.5).sum(axis=1).astype(bool).astype(np.int) ) > 1 ).astype(bool).astype(np.int) / 4
# loss_weight = 1.0 / weights.mean()

# y_train = np.vstack([(train['target'].values>=0.5).astype(np.int),weights]).T


#Credits - https://www.kaggle.com/matsuik/subgroup-negative-weighting
TOXICITY_COLUMN = 'target'
identity_columns = ['asian', 'atheist',
       'bisexual', 'black', 'buddhist', 'christian', 'female',
       'heterosexual', 'hindu', 'homosexual_gay_or_lesbian',
       'intellectual_or_learning_disability', 'jewish', 'latino', 'male',
       'muslim', 'other_disability', 'other_gender',
       'other_race_or_ethnicity', 'other_religion',
       'other_sexual_orientation', 'physical_disability',
       'psychiatric_or_mental_illness', 'transgender', 'white']
    
subgroup_bool_train = train[identity_columns].fillna(0)>=0.5
toxic_bool_train = train[TOXICITY_COLUMN].fillna(0)>=0.5
subgroup_negative_mask = subgroup_bool_train.values.sum(axis=1).astype(bool) & ~toxic_bool_train
# Overall
weights = np.ones((len(train),))
# Subgroup negative
weights += subgroup_negative_mask
loss_weight = 1.0 / weights.mean()

y_train = np.vstack([(train['target'].values>=0.5).astype(np.int),weights]).T


HBox(children=(IntProgress(value=0, max=1804874), HTML(value='')))




HBox(children=(IntProgress(value=0, max=97320), HTML(value='')))




In [8]:
max_features = 400000
tokenizer = text.Tokenizer(num_words = max_features, filters='',lower=False, oov_token = 'UNK')
tokenizer.fit_on_texts(list(x_train)+list(x_test))
print(len(tokenizer.word_counts)) 
#there are 473630 words in train. 

# tokenizer_test = text.Tokenizer(filters='',lower=False, oov_token = 'UNK')
# tokenizer_test.fit_on_texts(list(x_test))
# print(len(tokenizer_test.word_counts)) #there are 100349 words in test. 
# wordsonlyintest = set(tokenizer_test.word_counts.keys()).difference(set(tokenizer.word_counts.keys()))
# len(wordsonlyintest) #There are 15206 words only in test set, not in training data.

487757


In [9]:
gc.collect()
torch.cuda.empty_cache()

crawl_matrix, unknown_words_crawl = build_matrix(tokenizer.word_index, CRAWL_EMBEDDING_PATH)
print('n unknown words (crawl): ', len(unknown_words_crawl))

glove_matrix, unknown_words_glove = build_matrix(tokenizer.word_index, GLOVE_EMBEDDING_PATH)
print('n unknown words (glove): ', len(unknown_words_glove))

max_features = max_features or len(tokenizer.word_index) + 1
max_features

embedding_matrix = np.concatenate([crawl_matrix, glove_matrix], axis=-1)
embedding_matrix.shape

del crawl_matrix
del glove_matrix
gc.collect()

# x_train_torch = torch.tensor(x_train, dtype=torch.long)
y_train_torch = torch.tensor(np.hstack([y_train, y_aux_train]), dtype=torch.float32)

n unknown words (crawl):  148597
n unknown words (glove):  152051


In [10]:
# import pickle #most likely not needed
# pickle.dump(embedding_matrix, open('emb_mat.p','wb'))

# embedding_matrix = pickle.load( open( "emb_mat.p", "rb" ) )
# pickle.dump(tokenizer,open('tokensaved.p','wb'))
# tokenizer = pickle.load(open('tokensaved.p','rb'))

In [11]:
gc.collect()

0

# Sequence Bucketing

In [12]:
x_train = tokenizer.texts_to_sequences(x_train)
x_test = tokenizer.texts_to_sequences(x_test)

lengths = torch.from_numpy(np.array([len(x) for x in x_train]))

#maxlen = lengths.max() 
maxlen = 300
x_train_padded = torch.from_numpy(sequence.pad_sequences(x_train, maxlen=maxlen))
print(x_train_padded.shape)

# lengths = torch.from_numpy(np.array([len(x) for x in x_train]))
test_lengths = torch.from_numpy(np.array([len(x) for x in x_test]))
# maxlen = 299

# x_train_padded = torch.from_numpy(sequence.pad_sequences(x_train, maxlen=maxlen))
x_test_padded = torch.from_numpy(sequence.pad_sequences(x_test, maxlen=maxlen))



torch.Size([1804874, 300])


In [13]:
class SequenceBucketCollator():
    def __init__(self, choose_length, sequence_index, length_index, label_index=None):
        self.choose_length = choose_length
        self.sequence_index = sequence_index
        self.length_index = length_index
        self.label_index = label_index
        
    def __call__(self, batch):
        batch = [torch.stack(x) for x in list(zip(*batch))]
        
        sequences = batch[self.sequence_index]
        lengths = batch[self.length_index]
        
        length = self.choose_length(lengths)
        mask = torch.arange(start=maxlen, end=0, step=-1) < length
        padded_sequences = sequences[:, mask]
        
        batch[self.sequence_index] = padded_sequences
        
        if self.label_index is not None:
            return [x for i, x in enumerate(batch) if i != self.label_index], batch[self.label_index]
    
        return batch

In [14]:
class NeuralNet(nn.Module):
    def __init__(self, embedding_matrix, num_aux_targets):
        super(NeuralNet, self).__init__()
        embed_size = embedding_matrix.shape[1]
        
        self.embedding = nn.Embedding(max_features, embed_size)
        self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
        self.embedding.weight.requires_grad = False
        self.embedding_dropout = SpatialDropout(0.2) #SpatialDropout(0.3)
        
        self.lstm1 = nn.LSTM(embed_size, LSTM_UNITS, bidirectional=True, batch_first=True)
        self.lstm2 = nn.LSTM(LSTM_UNITS * 2, LSTM_UNITS, bidirectional=True, batch_first=True)
    
        self.linear1 = nn.Linear(DENSE_HIDDEN_UNITS, DENSE_HIDDEN_UNITS)
        self.linear2 = nn.Linear(DENSE_HIDDEN_UNITS, DENSE_HIDDEN_UNITS)
        
        self.linear_out = nn.Linear(DENSE_HIDDEN_UNITS, 1)
        self.linear_aux_out = nn.Linear(DENSE_HIDDEN_UNITS, num_aux_targets)
        
    def forward(self, x, lengths=None):
        h_embedding = self.embedding(x.long())
        h_embedding = self.embedding_dropout(h_embedding)
        
        h_lstm1, _ = self.lstm1(h_embedding)
        h_lstm2, _ = self.lstm2(h_lstm1)
        
        # global average pooling
        avg_pool = torch.mean(h_lstm2, 1)
        # global max pooling
        max_pool, _ = torch.max(h_lstm2, 1)
        
        h_conc = torch.cat((max_pool, avg_pool), 1)
        h_conc_linear1  = F.relu(self.linear1(h_conc))
        h_conc_linear2  = F.relu(self.linear2(h_conc))
        
        hidden = h_conc + h_conc_linear1 + h_conc_linear2
        
        result = self.linear_out(hidden)
        aux_result = self.linear_aux_out(hidden)
        out = torch.cat([result, aux_result], 1)
        
        return out
    
    
def custom_loss(data, targets):
    ''' Define custom loss function for weighted BCE on 'target' column '''
    bce_loss_1 = nn.BCEWithLogitsLoss(weight=targets[:,1:2])(data[:,:1],targets[:,:1])
    bce_loss_2 = nn.BCEWithLogitsLoss()(data[:,1:],targets[:,2:])
    return (bce_loss_1 * loss_weight) + bce_loss_2

# Training

In [15]:
print(x_train_padded.shape)
print(x_test_padded.shape)
print(y_train_torch.shape)

torch.Size([1804874, 300])
torch.Size([97320, 300])
torch.Size([1804874, 8])


In [16]:
x_train_padded.shape[0]

1804874

In [None]:
gc.collect()
torch.cuda.empty_cache()
    
batch_size = 512
SPLITS = 5
EPOCHS = 4
lr = 0.001
OUTPUT_DIM = 7
#kf = KFold(n_splits=SPLITS, random_state=47, shuffle=True)
splits = list(StratifiedKFold(n_splits=SPLITS, shuffle=True, random_state=47).split(x_train, y_train[:,0]))

test_dataset = data.TensorDataset(x_test_padded, test_lengths)
test_collator = SequenceBucketCollator(lambda lenghts: lenghts.max(), sequence_index=0, length_index=1)
test_loader = data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=test_collator)


train_collator = SequenceBucketCollator(lambda lenghts: lenghts.max(), 
                                        sequence_index=0, 
                                        length_index=1, 
                                        label_index=2)
fld = 0 
model_name = "model1"
test_preds_all_folds = []
valid_preds_all_epochs = [] 

results_oof = np.zeros((x_train_padded.shape[0],OUTPUT_DIM)) 
checkpoint_weights = [2 ** epoch for epoch in range(EPOCHS)]
    

#for train_index, val_index in kf.split(x_train_padded):
for fld, (train_index, val_index) in enumerate(splits):
    #print(train_index, test_index)
    #print(len(train_index),len(test_index))
    
    print("Fold",fld)
    #print(val_index)
    train_dataset = data.TensorDataset(x_train_padded[train_index], lengths[train_index], y_train_torch[train_index])
    valid_dataset = data.TensorDataset(x_train_padded[val_index], lengths[val_index], y_train_torch[val_index])
    
    train_loader = data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=train_collator)
    valid_loader = data.DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, collate_fn=train_collator)
    
    databunch = DataBunch(train_dl=train_loader, valid_dl=valid_loader, collate_fn=train_collator)

    model_idx = 1 #just one model
    seed_everything(1 + model_idx)
    model = NeuralNet(embedding_matrix, y_aux_train.shape[-1] )
    learn = Learner(databunch, model, loss_func=custom_loss)
    
    n = len(learn.data.train_dl)
    phases = [(TrainingPhase(n).schedule_hp('lr', lr * (0.6**(i)))) for i in range(EPOCHS)]
    sched = GeneralScheduler(learn, phases)
    learn.callbacks.append(sched)
    
    test_preds = np.zeros((len(test), OUTPUT_DIM))  
    test_preds_all_epochs = []
    valid_preds = np.zeros((len(valid_dataset),OUTPUT_DIM))
    valid_preds_all_epochs = [] 
    
    for epoch in range(EPOCHS):
        learn.fit(1)  
        
        dict_save = model.state_dict().copy()
        dict_save.pop('embedding.weight')
        torch.save(dict_save, str(fld) + '_' + model_name + '_' + str(epoch)+'_epoch.bin')
        
        for i, x_batch in enumerate(test_loader):
            X = x_batch[0].cuda()
            y_pred = sigmoid(learn.model(X).detach().cpu().numpy())
            test_preds[i * batch_size:(i+1) * batch_size, :] = y_pred

        test_preds_all_epochs.append(test_preds)
        
         
        for i, x_batch in enumerate(valid_loader):
            X = x_batch[0][0].cuda()
            y_valid_pred = sigmoid(learn.model(X).detach().cpu().numpy())
            valid_preds[i*batch_size:(i+1)*batch_size,:]=y_valid_pred
            
        valid_preds_all_epochs.append(valid_preds)
        
        
        
    test_preds_all_folds.append(np.average(test_preds_all_epochs, weights=checkpoint_weights, axis=0))
    results_oof[val_index] = np.average(valid_preds_all_epochs, weights=checkpoint_weights, axis=0)
    
    gc.collect()
    torch.cuda.empty_cache()
    
    

Fold 0
epoch     train_loss  valid_loss  time    
0         0.217625    0.218009    06:09     
epoch     train_loss  valid_loss  time    
0         0.214542    0.213919    06:10     
epoch     train_loss  valid_loss  time    
0         0.210838    0.211351    06:15     
epoch     train_loss  valid_loss  time    
0         0.207023    0.211398    06:14     
Fold 1
epoch     train_loss  valid_loss  time    
0         0.219800    0.217090    06:09     
epoch     train_loss  valid_loss  time    
0         0.217685    0.211323    05:51     
epoch     train_loss  valid_loss  time    
0         0.208620    0.212653    05:48     
epoch     train_loss  valid_loss  time    
0         0.205166    0.210587    05:44     
Fold 2
epoch     train_loss  valid_loss  time    


In [19]:
len(test_preds_all_folds), test_preds_all_folds[0].shape

(5, (97320, 7))

In [20]:
np.save('results_oof.npy',results_oof)

In [21]:
# From baseline kernel

def calculate_overall_auc(df, model_name):
    true_labels = df[TOXICITY_COLUMN]>0.5
    predicted_labels = df[model_name]
    return metrics.roc_auc_score(true_labels, predicted_labels)

def power_mean(series, p):
    total = sum(np.power(series, p))
    return np.power(total / len(series), 1 / p)

def get_final_metric(bias_df, overall_auc, POWER=-5, OVERALL_MODEL_WEIGHT=0.25):
    bias_score = np.average([
        power_mean(bias_df[SUBGROUP_AUC], POWER),
        power_mean(bias_df[BPSN_AUC], POWER),
        power_mean(bias_df[BNSP_AUC], POWER)
    ])
    return (OVERALL_MODEL_WEIGHT * overall_auc) + ((1 - OVERALL_MODEL_WEIGHT) * bias_score)



SUBGROUP_AUC = 'subgroup_auc'
BPSN_AUC = 'bpsn_auc'  # stands for background positive, subgroup negative
BNSP_AUC = 'bnsp_auc'  # stands for background negative, subgroup positive

def compute_auc(y_true, y_pred):
    try:
        return metrics.roc_auc_score(y_true, y_pred)
    except ValueError:
        return np.nan

def compute_subgroup_auc(df, subgroup, label, model_name):
    subgroup_examples = df[df[subgroup]>0.5]
    return compute_auc((subgroup_examples[label]>0.5), subgroup_examples[model_name])

def compute_bpsn_auc(df, subgroup, label, model_name):
    """Computes the AUC of the within-subgroup negative examples and the background positive examples."""
    subgroup_negative_examples = df[(df[subgroup]>0.5) & (df[label]<=0.5)]
    non_subgroup_positive_examples = df[(df[subgroup]<=0.5) & (df[label]>0.5)]
    examples = subgroup_negative_examples.append(non_subgroup_positive_examples)
    return compute_auc(examples[label]>0.5, examples[model_name])

def compute_bnsp_auc(df, subgroup, label, model_name):
    """Computes the AUC of the within-subgroup positive examples and the background negative examples."""
    subgroup_positive_examples = df[(df[subgroup]>0.5) & (df[label]>0.5)]
    non_subgroup_negative_examples = df[(df[subgroup]<=0.5) & (df[label]<=0.5)]
    examples = subgroup_positive_examples.append(non_subgroup_negative_examples)
    return compute_auc(examples[label]>0.5, examples[model_name])

def compute_bias_metrics_for_model(dataset,
                                   subgroups,
                                   model,
                                   label_col,
                                   include_asegs=False):
    """Computes per-subgroup metrics for all subgroups and one model."""
    records = []
    for subgroup in subgroups:
        record = {
            'subgroup': subgroup,
            'subgroup_size': len(dataset[dataset[subgroup]>0.5])
        }
        record[SUBGROUP_AUC] = compute_subgroup_auc(dataset, subgroup, label_col, model)
        record[BPSN_AUC] = compute_bpsn_auc(dataset, subgroup, label_col, model)
        record[BNSP_AUC] = compute_bnsp_auc(dataset, subgroup, label_col, model)
        records.append(record)
    return pd.DataFrame(records).sort_values('subgroup_auc', ascending=True)

In [22]:
submission = pd.DataFrame.from_dict({
    'id': test['id'],
    'prediction': np.mean(test_preds_all_folds, axis=0)[:, 0]
})

submission.to_csv('submission.csv', index=False)

In [23]:
submission

Unnamed: 0,id,prediction
0,7000000,0.005196
1,7000001,0.000049
2,7000002,0.002186
3,7000003,0.001291
4,7000004,0.994546
5,7000005,0.000067
6,7000006,0.001083
7,7000007,0.007646
8,7000008,0.011384
9,7000009,0.004104


In [24]:
results_oof.shape

(1804874, 7)

In [25]:
from sklearn import metrics

#vpreds = np.mean(all_valid_preds, axis=0)[:, 0] 

identity_columns = [
     'male', 'female', 'homosexual_gay_or_lesbian', 'christian', 'jewish',
     'muslim', 'black', 'white', 'psychiatric_or_mental_illness']

MODEL_NAME = 'model1'
#test_df_tmp=train.iloc[1624387:,].copy()
test_df_tmp =train.copy()

test_df_tmp[MODEL_NAME]=torch.sigmoid(torch.tensor(results_oof)).numpy()[:,0]
TOXICITY_COLUMN = 'target'
bias_metrics_df = compute_bias_metrics_for_model(test_df_tmp, identity_columns, MODEL_NAME, 'target')
print(get_final_metric(bias_metrics_df, calculate_overall_auc(test_df_tmp, MODEL_NAME)))
bias_metrics_df
#0.9423216517399409 - Seventh rb_5fold.ipynb value

0.9428340992650939


Unnamed: 0,bnsp_auc,bpsn_auc,subgroup,subgroup_auc,subgroup_size
2,0.957903,0.895763,homosexual_gay_or_lesbian,0.876495,10233
6,0.959438,0.89704,black,0.881579,13869
7,0.958984,0.905968,white,0.884828,23852
5,0.950737,0.925147,muslim,0.893764,19666
4,0.942714,0.947238,jewish,0.917866,7239
0,0.955146,0.953245,male,0.942372,40036
1,0.951314,0.959127,female,0.944204,50548
8,0.9685,0.933356,psychiatric_or_mental_illness,0.944246,4077
3,0.939724,0.968674,christian,0.94858,35507


0.942505853105013


Unnamed: 0,bnsp_auc,bpsn_auc,subgroup,subgroup_auc,subgroup_size
2,0.955956,0.897537,homosexual_gay_or_lesbian,0.874197,10233
6,0.961094,0.89318,black,0.88246,13869
7,0.959278,0.905455,white,0.88544,23852
5,0.951215,0.925525,muslim,0.896678,19666
4,0.94113,0.945905,jewish,0.913435,7239
0,0.954713,0.953521,male,0.942537,40036
1,0.950167,0.960046,female,0.944449,50548
8,0.970687,0.930848,psychiatric_or_mental_illness,0.946192,4077
3,0.937055,0.969449,christian,0.947428,35507
