# Imports

In [None]:
import gc
import os
import nltk
import time
import joblib
import pickle
import numpy as np
import pandas as pd
import scipy.optimize as optimize
from scipy import sparse
from pprint import pprint
from nltk.corpus import stopwords
from IPython.display import display
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings; warnings.filterwarnings("ignore")

In [None]:
def timer(func):
    """Decorator that prints how long each call to ``func`` took, in minutes.

    The wrapped function's return value is passed through unchanged. The
    elapsed time is printed only when ``func`` returns normally; if it raises,
    the exception propagates and nothing is printed.
    """
    import functools  # local import so the block does not depend on a new file-level import

    @functools.wraps(func)  # keep func's __name__/__doc__ on the wrapper
    def wrapper(*args, **kws):
        st = time.time()
        res = func(*args, **kws)
        tt = (time.time() - st) / 60
        print(f'Time taken is {tt:.2f} mins')
        return res
    return wrapper

In [None]:
def clean(data, col):
    """Normalise the text in ``data[col]`` and return ``data``.

    The column is rewritten IN PLACE on the DataFrame that is passed in; pass
    a copy if the original text must be preserved.

    Steps, in order: expand common English contractions, pad newlines with
    spaces, separate words joined by ``/ ! ? .``, shorten long runs of
    ``* ! ? '`` and of repeated letters, collapse repeated spaces, and drop
    every whitespace-separated token found in the module-level ``stop`` list.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the text column.
    col : str
        Name of the string column to clean.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with ``data[col]`` replaced.
    """
    # (pattern, replacement) pairs, applied top to bottom. Order matters:
    # e.g. "can't" must be handled before the generic "n't" rule.
    substitutions = (
        (r"what's", "what is "),
        (r"\'ve", " have "),
        (r"can't", "cannot "),
        (r"n't", " not "),
        (r"i'm", "i am "),
        (r"\'re", " are "),
        (r"\'d", " would "),
        (r"\'ll", " will "),
        (r"\'scuse", " excuse "),
        (r"\'s", " "),
        ('\n', ' \n '),
        (r'([a-zA-Z]+)([/!?.])([a-zA-Z]+)', r'\1 \2 \3'),
        (r'([*!?\'])\1\1{2,}', r'\1\1\1'),
        (r'([*!?\']+)', r' \1 '),
        (r'([a-zA-Z])\1{2,}\b', r'\1\1'),
        (r'([a-zA-Z])\1\1{2,}\B', r'\1\1\1'),
    )

    text = data[col]
    for pattern, replacement in substitutions:
        # regex=True is passed explicitly: the pandas default changed to
        # regex=False in pandas 2.0, which would turn these patterns into
        # literal strings that (mostly) never match.
        text = text.str.replace(pattern, replacement, regex=True)

    # One pass is enough: after it no run of two or more spaces remains.
    text = text.str.replace(r'[ ]{2,}', ' ', regex=True).str.strip()

    # Set membership is O(1) per token; `stop` itself is a list.
    stop_words = set(stop)
    text = text.apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

    data[col] = text
    return data

# English stopword list from NLTK; read as a module-level global by clean().
stop = stopwords.words('english')
# NOTE(review): `lemmatizer` is not referenced anywhere else in the visible
# part of this file — confirm whether it is still needed.
lemmatizer = nltk.stem.WordNetLemmatizer()

In [None]:
def regular(pre):
    """Min-max scale ``pre`` so its smallest value maps to 0 and its largest to 1.

    Parameters
    ----------
    pre : array-like supporting ``.min()``, ``.max()`` and arithmetic
        (e.g. ``numpy.ndarray``). The minimum and maximum are taken over the
        whole array.

    Returns
    -------
    Same shape as ``pre``. If every value in ``pre`` is identical the result
    is all zeros (instead of the NaNs a 0/0 division would produce).
    """
    lo = pre.min()
    span = pre.max() - lo
    shifted = pre - lo
    if span == 0:
        # Constant input: there is no spread to rescale, so avoid 0/0.
        return shifted * 0.0
    return shifted / span

In [None]:
# Test data: the comments to be scored. Later cells read the `text` column for
# inference and write `comment_id` plus a new `score` column to the submission.
df_sub = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")

In [None]:
@timer
def infer(data_path_name, n_folds, clean_prm = False):
    """Average the predictions of ``n_folds`` pickled pipelines on ``df_sub['text']``.

    Parameters
    ----------
    data_path_name : str
        Path prefix; fold ``k`` is loaded from ``f"{data_path_name}_fld{k}.pkl"``.
    n_folds : int
        Number of fold pipelines to load and average.
    clean_prm : bool, default False
        When True, the text is passed through ``clean`` before prediction.

    Returns
    -------
    numpy.ndarray
        Per-row mean over folds, min-max scaled by ``regular``.
    """
    if clean_prm:
        # Clean a copy, once. Cleaning `df_sub` itself would permanently alter
        # the text seen by every later model, and repeating it inside the loop
        # would feed re-cleaned text to every fold after the first.
        texts = clean(df_sub[['text']].copy(), 'text')['text']
    else:
        texts = df_sub['text']

    test_preds_arr = np.zeros((df_sub.shape[0], n_folds))
    for fld in range(n_folds):
        print("\n\n")
        print(f'Predicting model: {data_path_name}_fld{fld}.pkl')
        # NOTE: joblib.load unpickles arbitrary objects; only load trusted files.
        pipeline = joblib.load(f'{data_path_name}_fld{fld}.pkl')
        test_preds_arr[:,fld] = pipeline.predict(texts)
    return regular(test_preds_arr.mean(axis = 1))

# Toxic Infer

# preds_1 = infer("df", 7)

# preds_2 = infer("df_clean", 7,True)

# TF-IDF pipeline ensembles, 7 folds each, predicting on the raw (uncleaned) text.
# NOTE(review): in the visible source preds_1..preds_3 are only referenced by
# commented-out blending code — confirm they are still needed.
preds_1 = infer("../input/tfidf-model/df2_pkl/df2",7)
preds_2 = infer("../input/tfidf-model/dfu_pkl/dfu", 7)
preds_3 = infer("../input/extra-data-tfidf/dfe_pkl/dfe", 7)

# Model Ensemble

# Attention-Large

In [None]:
import os
import gc
import cv2
import copy
import time
import random

# For data manipulation
import numpy as np
import pandas as pd

# Pytorch Imports
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# For Transformer Models
from transformers import AutoTokenizer, AutoModel, AutoConfig
from torch.utils.data import TensorDataset, SequentialSampler, RandomSampler, DataLoader

# Utils
from tqdm import tqdm

# For descriptive error messages: ask CUDA to launch kernels synchronously so
# an error is reported at the call that caused it.
# NOTE(review): this is a debugging aid and presumably slows GPU inference — confirm it is wanted here.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [None]:
def set_seed(seed = 42):
    """Seed the Python, NumPy and PyTorch random number generators.

    Also switches cuDNN to deterministic mode (benchmarking off) and stores
    the seed in the ``PYTHONHASHSEED`` environment variable, so repeated runs
    of the notebook produce the same results.
    """
    # NOTE(review): PYTHONHASHSEED set at runtime presumably only affects
    # processes started afterwards, not the running interpreter — confirm.
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Every seeding function takes the seed as its single argument.
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)

    # cuDNN: no auto-tuned kernel selection, deterministic kernels only.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


set_seed(42)

class JigsawDataset(Dataset):
    """Dataset that tokenizes the ``text`` column of a DataFrame on demand.

    Each item is a dict with two tensors: ``'ids'`` (the tokenizer's
    ``input_ids``) and ``'mask'`` (its ``attention_mask``). Texts are
    truncated and padded to ``max_length``.
    """

    def __init__(self, df, tokenizer, max_length):
        self.df = df
        self.max_len = max_length
        self.tokenizer = tokenizer
        self.text = df['text'].values

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        encoded = self.tokenizer.encode_plus(
            self.text[index],
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
        )
        # Map tokenizer output keys to the shorter names the inference loop reads.
        key_map = (('ids', 'input_ids'), ('mask', 'attention_mask'))
        return {out_key: torch.tensor(encoded[src_key]) for out_key, src_key in key_map}
        
class AttentionHead(nn.Module):
    """Additive attention pooling over dimension 1 of the input.

    A score is computed per position as ``V(tanh(W(x)))``, soft-maxed along
    dimension 1, and used to take a weighted sum of the input along that
    same dimension.
    """

    def __init__(self, h_size, hidden_dim=512):
        super().__init__()
        self.W = nn.Linear(h_size, hidden_dim)
        self.V = nn.Linear(hidden_dim, 1)

    def forward(self, features):
        # assumes features is (batch, seq, h_size) — TODO confirm with callers
        scores = self.V(torch.tanh(self.W(features)))
        weights = torch.softmax(scores, dim=1)
        return torch.sum(weights * features, dim=1)

class JigsawModel(nn.Module):
    """Transformer regressor: attention pooling plus first-token pooling over the last four layers.

    The last four hidden-state layers are concatenated on the feature axis;
    the attention-pooled vector and the position-0 vector of that
    concatenation are joined and mapped to a single output by ``linear_out``.

    NOTE(review): if cells run top to bottom, this class is rebound by later
    ``class JigsawModel`` definitions before any inference call — confirm it
    is still needed.
    """

    def __init__(self,model_name):
        super().__init__()
        config = AutoConfig.from_pretrained(model_name)
        overrides = {
            "hidden_dropout_prob": 0.3,
            "layer_norm_eps": 1e-7,
            "output_hidden_states": True,
        }
        config.update(overrides)
        self.h_size = config.hidden_size
        self.transformer = AutoModel.from_pretrained(model_name, config = config)
        self.head = AttentionHead(self.h_size*4)
        # Not used by forward(); presumably kept so saved checkpoints still load — confirm.
        self.linear = nn.Linear(self.h_size*2, 1)
        self.linear_out = nn.Linear(self.h_size*8, 1)

    def forward(self, input_ids, attention_mask):
        layers = self.transformer(input_ids, attention_mask).hidden_states
        # Last four layers, newest first, joined on the feature axis.
        last_four = torch.cat((layers[-1], layers[-2], layers[-3], layers[-4]), -1)

        first_token = last_four[:, 0]
        attended = self.head(last_four)
        return self.linear_out(torch.cat([attended, first_token], -1))

@torch.no_grad()
def valid_fn(model, dataloader, device):
    """Run ``model`` over ``dataloader`` in eval mode and return all predictions.

    Parameters
    ----------
    model : callable as ``model(ids, mask)``; switched to eval mode here.
    dataloader : iterable with ``len()``, yielding dicts with ``'ids'`` and
        ``'mask'`` tensors.
    device : device the batch tensors are moved to before the forward pass.

    Returns
    -------
    numpy.ndarray
        1-D array with every batch's flattened outputs, in loader order.
    """
    model.eval()

    batch_preds = []
    for data in tqdm(dataloader, total=len(dataloader)):
        ids = data['ids'].to(device)
        mask = data['mask'].to(device)

        outputs = model(ids, mask)
        batch_preds.append(outputs.view(-1).cpu().detach().numpy())

    all_preds = np.concatenate(batch_preds)
    gc.collect()

    return all_preds

def inference(model_paths, dataloader, device, norm):
    """Average the predictions of several checkpoints of the current ``JigsawModel``.

    For each path a fresh ``JigsawModel(CONFIG['model_name'])`` is built, its
    weights are loaded, and ``valid_fn`` is run on ``dataloader``.

    Parameters
    ----------
    model_paths : iterable of str
        State-dict files, one per model.
    dataloader : loader accepted by ``valid_fn``.
    device : torch.device
        Device used for the model, the loaded weights and the batches.
    norm : bool
        When True, each model's predictions are min-max scaled with
        ``regular`` before averaging.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the per-model prediction arrays.
    """
    final_preds = []
    for i, path in enumerate(model_paths):
        model = JigsawModel(CONFIG['model_name'])
        # Use the `device` argument throughout (valid_fn moves batches there),
        # and remap saved tensors onto it so checkpoints saved on a GPU still
        # load on a CPU-only machine.
        model.to(device)
        model.load_state_dict(torch.load(path, map_location=device))

        print(f"Getting predictions for model {i+1}")
        preds = valid_fn(model, dataloader, device)

        if norm:
            preds = regular(preds)

        final_preds.append(preds)

        # Free the model before the next checkpoint is loaded.
        del model
        gc.collect()
        torch.cuda.empty_cache()

    return np.mean(np.array(final_preds), axis=0)

In [None]:
import os
import gc
import cv2
import copy
import time
import random

# For data manipulation
import numpy as np
import pandas as pd

# Pytorch Imports
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# For Transformer Models
from transformers import AutoTokenizer, AutoModel, AutoConfig
from torch.utils.data import TensorDataset, SequentialSampler, RandomSampler, DataLoader

# Utils
from tqdm import tqdm

# For descriptive error messages: ask CUDA to launch kernels synchronously so
# an error is reported at the call that caused it.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
# Turn off the tokenizers library's internal parallelism.
# NOTE(review): presumably to avoid its fork warning with DataLoader workers — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Attention Large

# Attention base

In [None]:
class JigsawModel(nn.Module):
    """Transformer regressor with attention pooling over the last hidden layer.

    Layer sizes are hard-coded to 768 features, so the backbone's hidden size
    must be 768.

    NOTE(review): if cells run top to bottom, this class is rebound by later
    ``class JigsawModel`` definitions before any inference call — confirm it
    is still needed.
    """

    def __init__(self, model_name):
        super().__init__()

        config = AutoConfig.from_pretrained(model_name)
        overrides = {
            "output_hidden_states": True,
            "hidden_dropout_prob": 0.2,
            "layer_norm_eps": 1e-7,
        }
        config.update(overrides)

        self.roberta = AutoModel.from_pretrained(model_name, config=config)

        # Per-position score, soft-maxed along dimension 1.
        self.attention = nn.Sequential(
            nn.Linear(768, 512),
            nn.Tanh(),
            nn.Linear(512, 1),
            nn.Softmax(dim=1),
        )

        self.regressor = nn.Sequential(nn.Linear(768, 1))

    def forward(self, input_ids, attention_mask):
        encoded = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        token_states = encoded.hidden_states[-1]

        # Attention-weighted sum of the token states along dimension 1.
        pooled = torch.sum(self.attention(token_states) * token_states, dim=1)
        return self.regressor(pooled)

# Attention Large + tanh

In [None]:
class JigsawModel(nn.Module):
    """Attention-pooling regressor over the last four layers with a tanh hidden layer.

    The attention-pooled vector and the position-0 vector of the concatenated
    last four hidden layers are joined, projected to ``h_size // 2`` features,
    passed through tanh, and mapped to a single output.

    NOTE(review): if cells run top to bottom, this class is rebound by the
    next ``class JigsawModel`` definition before any inference call — confirm
    it is still needed.
    """

    def __init__(self,model_name):
        super().__init__()
        config = AutoConfig.from_pretrained(model_name)
        overrides = {
            "hidden_dropout_prob": 0.,
            "layer_norm_eps": 1e-7,
            "output_hidden_states": True,
        }
        config.update(overrides)
        self.h_size = config.hidden_size
        self.transformer = AutoModel.from_pretrained(model_name, config = config)
        self.head = AttentionHead(self.h_size*4)
        self.linear = nn.Linear(self.h_size*8, self.h_size // 2)
        self.linear_out = nn.Linear(self.h_size // 2, 1)
        self.tanh = nn.Tanh()

    def forward(self, input_ids, attention_mask):
        layers = self.transformer(input_ids, attention_mask).hidden_states
        # Last four layers, newest first, joined on the feature axis.
        last_four = torch.cat((layers[-1], layers[-2], layers[-3], layers[-4]), -1)

        first_token = last_four[:, 0]
        attended = self.head(last_four)
        hidden = self.tanh(self.linear(torch.cat([attended, first_token], -1)))
        return self.linear_out(hidden)

# Attention large + dense

In [None]:
class JigsawModel(nn.Module):
    """Attention-pooling regressor over the last four layers with a dense head.

    The attention-pooled vector and the position-0 vector of the concatenated
    last four hidden layers are joined, layer-normalised, passed through
    dropout, and fed to a Linear -> Tanh -> Dropout -> Linear head that
    produces a single output.
    """

    def __init__(self,model_name):
        super().__init__()
        config = AutoConfig.from_pretrained(model_name)
        overrides = {
            "hidden_dropout_prob": 0.,
            "layer_norm_eps": 1e-7,
            "output_hidden_states": True,
        }
        config.update(overrides)
        self.h_size = config.hidden_size
        self.transformer = AutoModel.from_pretrained(model_name, config = config)
        self.head = AttentionHead(self.h_size*4)
        self.drop = nn.Dropout(p=0.2)
        self.layer_norm = nn.LayerNorm(self.h_size*8)
        self.dense = nn.Sequential(
            nn.Linear(self.h_size*8, self.h_size // 2),
            nn.Tanh(),
            nn.Dropout(0.2),
            nn.Linear(self.h_size // 2, 1),
        )

    def forward(self, input_ids, attention_mask):
        layers = self.transformer(input_ids, attention_mask).hidden_states
        # Last four layers, newest first, joined on the feature axis.
        last_four = torch.cat((layers[-1], layers[-2], layers[-3], layers[-4]), -1)

        first_token = last_four[:, 0]
        attended = self.head(last_four)
        joined = torch.cat([attended, first_token], -1)
        return self.dense(self.drop(self.layer_norm(joined)))

In [None]:
# Inference run -> preds45. Uses whichever JigsawModel / inference
# definitions are bound when this cell executes.
CONFIG = dict(
    model_name = '../input/j-dblv3-29-67-att-dense-5f-02ldrop-0-5margin',
    test_batch_size = 64,
    max_length = 128,
    num_classes = 1,
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
)

CONFIG["tokenizer"] = AutoTokenizer.from_pretrained(CONFIG['model_name'])

test_dataset = JigsawDataset(df_sub, CONFIG['tokenizer'], max_length=CONFIG['max_length'])
test_loader = DataLoader(test_dataset, batch_size=CONFIG['test_batch_size'],
                         num_workers=2, shuffle=False, pin_memory=True)

# One checkpoint per fold (5 folds), stored in the model directory itself.
MODEL_PATHS = [f"{CONFIG['model_name']}/Loss-Fold-{fold}.bin" for fold in range(5)]
preds = inference(MODEL_PATHS, test_loader, CONFIG['device'],True)
# Rescale the fold average back onto the full [0, 1] range.
preds45 = regular(preds)
preds45

# Meanpooling base

In [None]:
class JigsawModel(nn.Module):
    """Mean-pooling regressor: encoder -> mean over dimension 1 -> dropout -> linear.

    The output width is read from the global ``CONFIG['num_classes']`` at
    construction time and the input width is hard-coded to 768. The mean is
    taken over every position of the encoder output; the attention mask is
    passed to the encoder only and is not applied to the pooling.
    """

    def __init__(self, model_name):
        super().__init__()
        config = AutoConfig.from_pretrained(model_name)
        overrides = {"output_hidden_states": True, "hidden_dropout_prob": 0.}
        config.update(overrides)
        self.model = AutoModel.from_pretrained(model_name,config=config)
        self.drop = nn.Dropout(p=0.2)
        self.fc = nn.Linear(768, CONFIG['num_classes'])

    def forward(self, ids, mask):
        encoder_out = self.model(input_ids=ids, attention_mask=mask,
                                 output_hidden_states=False)
        # First output element, averaged along dimension 1 (presumably the sequence axis).
        pooled = torch.mean(encoder_out[0], 1)
        return self.fc(self.drop(pooled))

In [None]:
# Inference run -> preds42.
# NOTE(review): `preds42` is assigned again by the later
# j-btbtu-2970-...-4layer run, so if cells run in order this result never
# reaches the blend — confirm which model is meant to own the name.
CONFIG = dict(
    model_name = '../input/j-rbb-29-66-mpl-5f-02ldrop-0-5margin',
    test_batch_size = 64,
    max_length = 128,
    num_classes = 1,
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
)

CONFIG["tokenizer"] = AutoTokenizer.from_pretrained(CONFIG['model_name'])

test_dataset = JigsawDataset(df_sub, CONFIG['tokenizer'], max_length=CONFIG['max_length'])
test_loader = DataLoader(test_dataset, batch_size=CONFIG['test_batch_size'],
                         num_workers=2, shuffle=False, pin_memory=True)

# One checkpoint per fold (5 folds), stored in the model directory itself.
MODEL_PATHS = [f"{CONFIG['model_name']}/Loss-Fold-{fold}.bin" for fold in range(5)]
preds = inference(MODEL_PATHS, test_loader, CONFIG['device'],True)
# Rescale the fold average back onto the full [0, 1] range.
preds42 = regular(preds)
preds42

# Inference run -> preds43.
# NOTE(review): `preds43` is assigned again by the following
# j-rbbtu-2895 run, so if cells run in order this result never reaches the
# blend — confirm which model is meant to own the name.
CONFIG = dict(
    model_name = '../input/j-rbbo-29-83-mpl-5f-02ldrop-0margin',
    test_batch_size = 64,
    max_length = 128,
    num_classes = 1,
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
)

CONFIG["tokenizer"] = AutoTokenizer.from_pretrained(CONFIG['model_name'])

test_dataset = JigsawDataset(df_sub, CONFIG['tokenizer'], max_length=CONFIG['max_length'])
test_loader = DataLoader(test_dataset, batch_size=CONFIG['test_batch_size'],
                         num_workers=2, shuffle=False, pin_memory=True)

# One checkpoint per fold (5 folds), stored in the model directory itself.
MODEL_PATHS = [f"{CONFIG['model_name']}/Loss-Fold-{fold}.bin" for fold in range(5)]
preds = inference(MODEL_PATHS, test_loader, CONFIG['device'],True)
# Rescale the fold average back onto the full [0, 1] range.
preds43 = regular(preds)
preds43

In [None]:
# Inference run -> preds43.
# NOTE(review): this overwrites the `preds43` produced by the preceding
# j-rbbo-29-83 run — confirm that is intended.
CONFIG = dict(
    model_name = '../input/j-rbbtu-2895-mpl-5f-02ldrop-0-5margin',
    test_batch_size = 64,
    max_length = 128,
    num_classes = 1,
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
)

CONFIG["tokenizer"] = AutoTokenizer.from_pretrained(CONFIG['model_name'])

test_dataset = JigsawDataset(df_sub, CONFIG['tokenizer'], max_length=CONFIG['max_length'])
test_loader = DataLoader(test_dataset, batch_size=CONFIG['test_batch_size'],
                         num_workers=2, shuffle=False, pin_memory=True)

# One checkpoint per fold (5 folds), stored in the model directory itself.
MODEL_PATHS = [f"{CONFIG['model_name']}/Loss-Fold-{fold}.bin" for fold in range(5)]
preds = inference(MODEL_PATHS, test_loader, CONFIG['device'],True)
# Rescale the fold average back onto the full [0, 1] range.
preds43 = regular(preds)
preds43

In [None]:
# Inference run -> preds44.
CONFIG = dict(
    model_name = '../input/j-rbbtm-2886-mpl-5f-02ldrop-0-5margin',
    test_batch_size = 64,
    max_length = 128,
    num_classes = 1,
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
)

CONFIG["tokenizer"] = AutoTokenizer.from_pretrained(CONFIG['model_name'])

test_dataset = JigsawDataset(df_sub, CONFIG['tokenizer'], max_length=CONFIG['max_length'])
test_loader = DataLoader(test_dataset, batch_size=CONFIG['test_batch_size'],
                         num_workers=2, shuffle=False, pin_memory=True)

# One checkpoint per fold (5 folds), stored in the model directory itself.
MODEL_PATHS = [f"{CONFIG['model_name']}/Loss-Fold-{fold}.bin" for fold in range(5)]
preds = inference(MODEL_PATHS, test_loader, CONFIG['device'],True)
# Rescale the fold average back onto the full [0, 1] range.
preds44 = regular(preds)
preds44

In [None]:
# Inference run -> preds48.
CONFIG = dict(
    model_name = '../input/j-rbbtm-2880-mpl-5f-02ldrop-1-0margin',
    test_batch_size = 64,
    max_length = 128,
    num_classes = 1,
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
)

CONFIG["tokenizer"] = AutoTokenizer.from_pretrained(CONFIG['model_name'])

test_dataset = JigsawDataset(df_sub, CONFIG['tokenizer'], max_length=CONFIG['max_length'])
test_loader = DataLoader(test_dataset, batch_size=CONFIG['test_batch_size'],
                         num_workers=2, shuffle=False, pin_memory=True)

# One checkpoint per fold (5 folds), stored in the model directory itself.
MODEL_PATHS = [f"{CONFIG['model_name']}/Loss-Fold-{fold}.bin" for fold in range(5)]
preds = inference(MODEL_PATHS, test_loader, CONFIG['device'],True)
# Rescale the fold average back onto the full [0, 1] range.
preds48 = regular(preds)
preds48

# Meanpooling base 4layer

In [None]:
class JigsawModel(nn.Module):
    """Mean-pooling regressor on an encoder configured with 4 hidden layers.

    Apart from the ``num_hidden_layers: 4`` config override, the flow is
    encoder -> mean over dimension 1 -> dropout -> linear. The output width
    comes from the global ``CONFIG['num_classes']`` and the input width is
    hard-coded to 768. The attention mask is passed to the encoder only and
    is not applied to the pooling.
    """

    def __init__(self, model_name):
        super().__init__()
        config = AutoConfig.from_pretrained(model_name)
        overrides = {
            "output_hidden_states": True,
            "hidden_dropout_prob": 0.,
            "num_hidden_layers": 4,
        }
        config.update(overrides)
        self.model = AutoModel.from_pretrained(model_name,config=config)
        self.drop = nn.Dropout(p=0.2)
        self.fc = nn.Linear(768, CONFIG['num_classes'])

    def forward(self, ids, mask):
        encoder_out = self.model(input_ids=ids, attention_mask=mask,
                                 output_hidden_states=False)
        # First output element, averaged along dimension 1 (presumably the sequence axis).
        pooled = torch.mean(encoder_out[0], 1)
        return self.fc(self.drop(pooled))

In [None]:
# Inference run (10 folds) -> preds41.
CONFIG = dict(
    model_name = '../input/j-rbbtu-2897-mpl-10f-02ldrop-4layer-0-5margin',
    test_batch_size = 64,
    max_length = 128,
    num_classes = 1,
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
)

CONFIG["tokenizer"] = AutoTokenizer.from_pretrained(CONFIG['model_name'])

test_dataset = JigsawDataset(df_sub, CONFIG['tokenizer'], max_length=CONFIG['max_length'])
test_loader = DataLoader(test_dataset, batch_size=CONFIG['test_batch_size'],
                         num_workers=2, shuffle=False, pin_memory=True)

# One checkpoint per fold (10 folds), stored in the model directory itself.
MODEL_PATHS = [f"{CONFIG['model_name']}/Loss-Fold-{fold}.bin" for fold in range(10)]
preds = inference(MODEL_PATHS, test_loader, CONFIG['device'],True)
# Rescale the fold average back onto the full [0, 1] range.
preds41 = regular(preds)
preds41

# Inference run (10 folds) -> preds42.
# NOTE(review): this overwrites the `preds42` produced by the earlier
# j-rbb-29-66 run — confirm that is intended.
CONFIG = dict(
    model_name = '../input/j-btbtu-2970-mpl-10f-02ldrop-4layer-0-5margin',
    test_batch_size = 64,
    max_length = 128,
    num_classes = 1,
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
)

CONFIG["tokenizer"] = AutoTokenizer.from_pretrained(CONFIG['model_name'])

test_dataset = JigsawDataset(df_sub, CONFIG['tokenizer'], max_length=CONFIG['max_length'])
test_loader = DataLoader(test_dataset, batch_size=CONFIG['test_batch_size'],
                         num_workers=2, shuffle=False, pin_memory=True)

# One checkpoint per fold (10 folds), stored in the model directory itself.
MODEL_PATHS = [f"{CONFIG['model_name']}/Loss-Fold-{fold}.bin" for fold in range(10)]
preds = inference(MODEL_PATHS, test_loader, CONFIG['device'],True)
# Rescale the fold average back onto the full [0, 1] range.
preds42 = regular(preds)
preds42

# Meanpooling base + dense

In [None]:
class JigsawModel(nn.Module):
    """Mean-pooling regressor with a LayerNorm + dense head.

    Flow: encoder -> mean over dimension 1 -> LayerNorm -> dropout ->
    Linear(hidden_size, 256) -> Tanh -> Dropout -> Linear(256, 1). The
    attention mask is passed to the encoder only and is not applied to the
    pooling.

    NOTE(review): if cells run top to bottom, this class is rebound by later
    ``class JigsawModel`` definitions before the next inference call —
    confirm it is still needed.
    """

    def __init__(self, model_name):
        super().__init__()
        config = AutoConfig.from_pretrained(model_name)
        overrides = {"output_hidden_states": True, "hidden_dropout_prob": 0.}
        config.update(overrides)
        self.model = AutoModel.from_pretrained(model_name,config=config)
        self.drop = nn.Dropout(p=0.2)
        self.layer_norm = nn.LayerNorm(config.hidden_size)
        self.dense = nn.Sequential(
            nn.Linear(config.hidden_size, 256),
            nn.Tanh(),
            nn.Dropout(0.2),
            nn.Linear(256, 1),
        )

    def forward(self, ids, mask):
        encoder_out = self.model(input_ids=ids, attention_mask=mask,
                                 output_hidden_states=False)
        # First output element, averaged along dimension 1 (presumably the sequence axis).
        pooled = torch.mean(encoder_out[0], 1)
        return self.dense(self.drop(self.layer_norm(pooled)))

# MeanPooling-large

In [None]:
def get_data_loaders(data, model_name, max_length, bs, inference_only=False):
    """Tokenize ``data['text']`` up front and wrap the tensors in a DataLoader.

    Parameters
    ----------
    data : DataFrame with a ``text`` column, plus a ``y`` column unless
        ``inference_only`` is True.
    model_name : name or path handed to ``AutoTokenizer.from_pretrained``.
    max_length : length every text is truncated / padded to.
    bs : batch size.
    inference_only : when True, batches hold ``(input_ids, attention_mask)``;
        otherwise ``(input_ids, attention_mask, y)``.

    Returns
    -------
    DataLoader over a TensorDataset, unshuffled, with 2 workers and pinned memory.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    encoded = tokenizer.batch_encode_plus(
        data['text'].tolist(),
        add_special_tokens=True,
        return_attention_mask=True,
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors='pt',
    )

    # Both modes share the token tensors; targets are appended when present.
    tensors = [encoded['input_ids'], encoded['attention_mask']]
    if not inference_only:
        tensors.append(torch.tensor(data['y'].values))

    return DataLoader(
        TensorDataset(*tensors),
        batch_size=bs,
        num_workers=2, shuffle=False, pin_memory=True,
    )
    

def predict(model, data_loader, device):
    """Returns an np.array with predictions of the |model| on |data_loader|.

    Parameters
    ----------
    model : callable as ``model(input_ids=..., attention_mask=...)``; it is
        switched to eval mode and run without gradient tracking.
    data_loader : loader whose batches are sequences with input ids at
        position 0 and the attention mask at position 1, and whose
        ``.dataset`` supports ``len()``.
    device : device each batch tensor is moved to.

    Returns
    -------
    numpy.ndarray
        1-D float array of length ``len(data_loader.dataset)``, in loader order.
    """
    model.eval()

    result = np.zeros(len(data_loader.dataset))
    index = 0
    with torch.no_grad():
        for batch in data_loader:
            batch = tuple(b.to(device) for b in batch)

            pred = model(input_ids=batch[0], attention_mask=batch[1])

            # Flatten before reading the length: a model that squeezes its
            # output returns a 0-dim tensor for a batch of one, and
            # `pred.shape[0]` would then raise IndexError.
            flat = pred.reshape(-1).cpu().numpy()
            result[index : index + flat.shape[0]] = flat
            index += flat.shape[0]

    return result

def inference(model_paths, dataloader, device, norm):
    """Average the predictions of several checkpoints of the current ``JigsawModel``.

    For each path a fresh ``JigsawModel(CONFIG['model_name'])`` is built, its
    weights are loaded, and ``predict`` is run on ``dataloader``.

    Parameters
    ----------
    model_paths : iterable of str
        State-dict files, one per model.
    dataloader : loader accepted by ``predict``.
    device : torch.device
        Device used for the model, the loaded weights and the batches.
    norm : bool
        When True, each model's predictions are min-max scaled with
        ``regular`` before averaging.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the per-model prediction arrays.
    """
    final_preds = []
    for i, path in enumerate(model_paths):
        model = JigsawModel(CONFIG['model_name'])
        # Use the `device` argument for the model, the weights and predict(),
        # so they can never end up on different devices; map_location lets
        # checkpoints saved on a GPU load on a CPU-only machine.
        model.to(device)
        model.load_state_dict(torch.load(path, map_location=device))

        print(f"Getting predictions for model {i+1}")
        preds = predict(model, dataloader, device)

        if norm:
            preds = regular(preds)

        final_preds.append(preds)

        # Free the model before the next checkpoint is loaded.
        del model
        gc.collect()
        torch.cuda.empty_cache()

    return np.mean(np.array(final_preds), axis=0)

class JigsawModel(nn.Module):
    """Masked mean-pooling regressor: encoder -> masked mean -> linear.

    Token states are averaged with the attention mask as weights, so
    positions whose mask is 0 do not contribute to the pooled vector.

    NOTE(review): if cells run top to bottom, this class is rebound by the
    next ``class JigsawModel`` definition before any inference call — confirm
    it is still needed.
    """

    def __init__(self, model_name):
        super().__init__()

        config = AutoConfig.from_pretrained(model_name)
        config.update({"hidden_dropout_prob": 0.2})
        self.model = AutoModel.from_pretrained(model_name, config=config)
        self.linear = nn.Linear(config.hidden_size, 1)
        self.loss = nn.MSELoss()

    def forward(self, input_ids, attention_mask, labels=None):
        """Return predictions of shape ``(batch,)``, or the MSE loss when ``labels`` is given."""
        outputs = self.model(input_ids, attention_mask)
        last_hidden_state = outputs[0]

        # Broadcast the mask over the feature axis and take a masked mean.
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        # Clamp so an all-zero mask cannot cause a division by zero.
        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        logits = self.linear(mean_embeddings)

        # Squeeze only the trailing size-1 feature axis. A second squeeze(-1)
        # would also remove the batch axis when the batch holds one row,
        # leaving a 0-dim tensor that callers indexing `.shape[0]` cannot use.
        preds = logits.squeeze(-1)

        if labels is not None:
            return self.loss(preds.view(-1).float(), labels.view(-1).float())
        return preds

# Meanpooling large + dense

In [None]:
class JigsawModel(nn.Module):
    """Masked mean-pooling regressor with a LayerNorm + dense head.

    Flow: encoder -> attention-mask-weighted mean -> LayerNorm -> dropout ->
    Linear -> Tanh -> Dropout -> Linear(1).
    """

    def __init__(self, model_name):
        super().__init__()

        config = AutoConfig.from_pretrained(model_name)
        config.update({"hidden_dropout_prob": 0.})
        self.model = AutoModel.from_pretrained(model_name, config=config)
        # `linear` and `loss` are not used by forward(); presumably kept so
        # saved checkpoints still load — confirm before removing.
        self.linear = nn.Linear(config.hidden_size, 1)
        self.drop = nn.Dropout(p=0.2)
        self.layer_norm = nn.LayerNorm(config.hidden_size)
        self.dense = nn.Sequential(
            nn.Linear(config.hidden_size, config.hidden_size // 2),
            nn.Tanh(),
            nn.Dropout(0.2),
            nn.Linear(config.hidden_size // 2, 1)
        )
        self.loss = nn.MSELoss()

    def forward(self, input_ids, attention_mask, labels=None):
        """Return predictions of shape ``(batch,)``; ``labels`` is accepted but ignored."""
        outputs = self.model(input_ids, attention_mask)
        last_hidden_state = outputs[0]

        # Broadcast the mask over the feature axis and take a masked mean.
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        # Clamp so an all-zero mask cannot cause a division by zero.
        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        features = self.drop(self.layer_norm(mean_embeddings))
        logits = self.dense(features)

        # Squeeze only the trailing size-1 feature axis. A second squeeze(-1)
        # would also remove the batch axis when the batch holds one row,
        # leaving a 0-dim tensor that callers indexing `.shape[0]` cannot use.
        return logits.squeeze(-1)

# Inference run -> preds47.
# NOTE(review): `preds47` is assigned again by the later j-rblt-n2937 run,
# so if cells run in order this result never reaches the blend — confirm
# which model is meant to own the name.
CONFIG = dict(
    model_name = '../input/j-rbbmt-2889-mpl-dense-5f-02ldrop-08gm-0-5margin',
    test_batch_size = 64,
    max_length = 128,
    num_classes = 1,
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
)
test_loader = get_data_loaders(df_sub, CONFIG['model_name'], CONFIG['max_length'], CONFIG['test_batch_size'], inference_only=True)

# One checkpoint per fold (5 folds), stored in the model directory itself.
MODEL_PATHS = [f"{CONFIG['model_name']}/Loss-Fold-{fold}.bin" for fold in range(5)]
preds = inference(MODEL_PATHS, test_loader, CONFIG['device'],True)
# Rescale the fold average back onto the full [0, 1] range.
preds47 = regular(preds)
preds47

In [None]:
# Inference run -> preds46.
CONFIG = dict(
    model_name = '../input/j-dblv3-29-39-mpl-dense-03gm-5f-02ldrop-0-5margin',
    test_batch_size = 64,
    max_length = 128,
    num_classes = 1,
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
)
test_loader = get_data_loaders(df_sub, CONFIG['model_name'], CONFIG['max_length'], CONFIG['test_batch_size'], inference_only=True)

# One checkpoint per fold (5 folds), stored in the model directory itself.
MODEL_PATHS = [f"{CONFIG['model_name']}/Loss-Fold-{fold}.bin" for fold in range(5)]
preds = inference(MODEL_PATHS, test_loader, CONFIG['device'],True)
# Rescale the fold average back onto the full [0, 1] range.
preds46 = regular(preds)
preds46

# Inference run -> preds410.
# NOTE(review): `preds410` is not referenced by the final blend in the
# visible source — confirm whether this run is still needed.
CONFIG = dict(
    model_name = '../input/j-rblt-2937-mpl-dense-5f-02ldrop-0-5margin',
    test_batch_size = 64,
    max_length = 128,
    num_classes = 1,
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
)
test_loader = get_data_loaders(df_sub, CONFIG['model_name'], CONFIG['max_length'], CONFIG['test_batch_size'], inference_only=True)

# One checkpoint per fold (5 folds), stored in the model directory itself.
MODEL_PATHS = [f"{CONFIG['model_name']}/Loss-Fold-{fold}.bin" for fold in range(5)]
preds = inference(MODEL_PATHS, test_loader, CONFIG['device'],True)
# Rescale the fold average back onto the full [0, 1] range.
preds410 = regular(preds)
preds410

In [None]:
# Inference run -> preds47.
# NOTE(review): this overwrites the `preds47` produced by the earlier
# j-rbbmt-2889 run — confirm that is intended.
CONFIG = dict(
    model_name = '../input/j-rblt-n2937-mpl-dense-5f-02ldrop-0-5margin',
    test_batch_size = 64,
    max_length = 128,
    num_classes = 1,
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
)
test_loader = get_data_loaders(df_sub, CONFIG['model_name'], CONFIG['max_length'], CONFIG['test_batch_size'], inference_only=True)

# One checkpoint per fold (5 folds), stored in the model directory itself.
MODEL_PATHS = [f"{CONFIG['model_name']}/Loss-Fold-{fold}.bin" for fold in range(5)]
preds = inference(MODEL_PATHS, test_loader, CONFIG['device'],True)
# Rescale the fold average back onto the full [0, 1] range.
preds47 = regular(preds)
preds47

In [None]:
# w1 = [0.02, 0.02, 0.06]
# w2 = [0.2, 0.26]
# w3 = [1]
# Blend weights for the transformer predictions; w4[i] multiplies preds4{i+1},
# i.e. preds41 .. preds48 in that order. Some weights are negative.
w4 = [-0.023694, 0.015 , 0.047388, 0.019 , -0.018  , 0.019 , 0.07898, 0.07898]

In [None]:
# preds1 = preds11*w1[0] + preds12*w1[1] + preds13*w1[2]
# preds1 = regular(preds1)

# preds2 = preds21*w2[0] + preds22*w2[1]
# preds2 = regular(preds2)

# preds3 = preds31*w3[0]
# preds3 = regular(preds3)

# Weighted blend of the eight transformer predictions, min-max scaled afterwards.
blend_members = (preds41, preds42, preds43, preds44, preds45, preds46, preds47, preds48)
weighted = [member * weight for member, weight in zip(blend_members, w4)]
# Start the sum from the first term so the additions run in the same
# left-to-right order as a written-out `a + b + c + ...` expression.
preds4 = sum(weighted[1:], weighted[0])
preds4 = regular(preds4)

In [None]:
# w0 = [ 0.16, -0.07 ,-0.01 , 0.54]
# preds0 = preds1*w0[0] + preds2*w0[1] + preds3*w0[2] + preds4*w0[3]
# preds0 = regular(preds0)

# Visual sanity check: histogram of the blended, scaled scores (100 bins).
import matplotlib.pyplot as plt
plt.hist(preds4,bins = 100)
plt.show()

# x = [ 0.06 ,0.08 ,0.04 ,0.38]

# predsx = preds_1*x[0] + preds_2*x[1] + preds_3*x[2] + preds0*x[3]
# predsx = regular(predsx)

# plt.hist(predsx,bins = 100)
# plt.show()

# Store the blended scores as ranks; method='first' breaks ties by row order,
# so every row gets a distinct rank.
df_sub['score'] = pd.Series(preds4, index=df_sub.index).rank(method='first')
df_sub.head(10)

In [None]:
# Write the submission file: one row per comment, columns comment_id and score, no index column.
df_sub[['comment_id', 'score']].to_csv("submission.csv", index=False)