In [None]:
# Install detoxify from the copy shipped in the attached dataset: copy the package
# source into the working directory, pip-install it, then delete the copy.
!cp -r ../input/detoxify-sourcemodels/detoxify .
!pip install -q ./detoxify
!rm -r ./detoxify


# Copy the detoxify pretrained checkpoints and the transformers files from the dataset
# into the local torch-hub and huggingface cache directories.
!mkdir -p  /root/.cache/torch/hub/checkpoints
!mkdir -p  /root/.cache/huggingface/transformers
!cp -r ../input/detoxify-sourcemodels/torch/hub/checkpoints /root/.cache/torch/hub
!cp -r ../input/detoxify-sourcemodels/huggingface/transformers /root/.cache/huggingface


# Setting the environment variable TRANSFORMERS_OFFLINE=1 tells Transformers to use local files only
# and not to try to look anything up online. This makes it possible to run Transformers in a firewalled
# or no-network environment, such as a Kaggle inference kernel.
import os
os.environ["TRANSFORMERS_OFFLINE"] = "1"

In [None]:
import os

# Directory that receives the log file opened by get_logger().
OUTPUT_DIR = './'
# exist_ok=True creates the directory only when it is missing, without a separate
# check-then-create step.
os.makedirs(OUTPUT_DIR, exist_ok=True)


class CFG:
    """Settings for the LUKE-large fold ensemble used at inference time."""
    # folder holding config.pth, tokenizer/ and the per-fold *_best.pth checkpoints
    model_dir = '../input/jigsaw4-luke-large-training-tito-cv-strategy/'
    # DataLoader worker processes
    num_workers = 4
    # model name; also used (with '/' replaced by '-') as the checkpoint file prefix
    model = "studio-ousia/luke-large"
    # DataLoader batch size
    batch_size = 128
    # dropout probability applied before the output layer
    fc_dropout = 0.
    # column of the input frame that holds the comment text
    text = "text"
    # target / seed are not referenced by the inference code visible in this notebook
    target = "target"
    # number of output units of the final linear layer
    target_size = 1
    # positions kept from the start / end of an over-long token sequence
    head = 32
    tail = 32
    seed = 42
    # number of fold checkpoints whose predictions are averaged
    n_fold = 5
    # fixed sequence length produced by prepare_input
    max_len = head + tail

import os
import gc
import re
import sys
import json
import time
import math
import string
import pickle
import random
import joblib
import itertools
import warnings
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

# Swap the installed transformers stack for the wheels stored under ../input.
# pip runs with --no-index, so only the copied wheel files are considered.
for _pkg in ('transformers', 'tokenizers', 'huggingface_hub'):
    os.system(f'pip uninstall -q {_pkg} -y')

# (package, local wheel directory, wheel file) in installation order
_OFFLINE_WHEELS = (
    ('tokenizers',
     '/tmp/pip/cache-tokenizers/',
     '../input/tokenizers-0103/tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl'),
    ('huggingface_hub',
     '/tmp/pip/cache-huggingface-hub/',
     '../input/huggingface-hub-008/huggingface_hub-0.0.8-py3-none-any.whl'),
    ('transformers',
     '/tmp/pip/cache-transformers/',
     '../input/transformers-470/transformers-4.7.0-py3-none-any.whl'),
)
for _pkg, _wheel_dir, _wheel in _OFFLINE_WHEELS:
    os.system(f'mkdir -p {_wheel_dir}')
    os.system(f'cp {_wheel} {_wheel_dir}')
    os.system(f'pip install -q --no-index --find-links {_wheel_dir} {_pkg}')

# Import only after the reinstall so the freshly installed versions are the ones loaded.
import tokenizers
import transformers
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from transformers import LukeTokenizer, LukeModel, LukeConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

def get_score(df):
    """Return the fraction of rows where `more_toxic_pred` is strictly greater than `less_toxic_pred`."""
    correctly_ordered = df['more_toxic_pred'] > df['less_toxic_pred']
    return int(correctly_ordered.sum()) / len(df)


def get_logger(filename=OUTPUT_DIR+'train'):
    """Return the module logger, echoing bare messages to the console and to `<filename>.log`.

    Args:
        filename: path of the log file without the ``.log`` suffix.

    Returns:
        The ``logging.Logger`` registered under ``__name__``, set to INFO level, with
        exactly one stream handler and one file handler attached.
    """
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    # getLogger hands back the same object on every call, so handlers left over from an
    # earlier call (this cell is executed more than once) would emit every record again.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()

def seed_everything(seed=42):
    """Seed the Python, NumPy and PyTorch (CPU and CUDA) generators and request deterministic cuDNN."""
    os.environ['PYTHONHASHSEED'] = str(seed)
    for apply_seed in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        apply_seed(seed)
    torch.backends.cudnn.deterministic = True


seed_everything(seed=42)

# Competition inputs: the comments to score and the submission template.
_COMPETITION_DIR = '../input/jigsaw-toxic-severity-rating/'
test = pd.read_csv(_COMPETITION_DIR + 'comments_to_score.csv')
submission = pd.read_csv(_COMPETITION_DIR + 'sample_submission.csv')
print(test.shape, submission.shape)
for _frame in (test, submission):
    display(_frame.head())

# The tokenizer is loaded from the tokenizer/ folder inside the model directory.
CFG.tokenizer = LukeTokenizer.from_pretrained(CFG.model_dir + 'tokenizer/')

def prepare_input(text, cfg):
    """Tokenize `text` into fixed-length `torch.long` tensors of size `cfg.max_len`.

    With `cfg.tail == 0` the tokenizer itself truncates and pads to `cfg.max_len`.
    Otherwise the text is tokenized with the tokenizer's default truncation; a field
    longer than `cfg.max_len` keeps its first `cfg.head` and last `cfg.tail` positions,
    and a shorter one is right-padded (`input_ids` with the pad token id, every other
    field with 0). The head/tail branch assumes `cfg.head + cfg.tail == cfg.max_len`.

    Args:
        text: the string to encode.
        cfg: object exposing `tokenizer`, `max_len`, `head` and `tail`.

    Returns:
        The tokenizer's output mapping with every value replaced by a 1-D long tensor.
    """
    if cfg.tail == 0:
        # padding='max_length' is the current spelling of the deprecated pad_to_max_length=True
        inputs = cfg.tokenizer.encode_plus(text,
                                           return_tensors=None,
                                           add_special_tokens=True,
                                           max_length=cfg.max_len,
                                           padding='max_length',
                                           truncation=True)
        for k, v in inputs.items():
            inputs[k] = torch.tensor(v, dtype=torch.long)
        return inputs

    inputs = cfg.tokenizer.encode_plus(text,
                                       return_tensors=None,
                                       add_special_tokens=True,
                                       truncation=True)
    for k, v in inputs.items():
        if len(v) > cfg.max_len:
            # keep the beginning and the end of the sequence, drop the middle
            v = list(v[:cfg.head]) + list(v[-cfg.tail:])
        fill_value = cfg.tokenizer.pad_token_id if k == 'input_ids' else 0
        padded = np.full(cfg.max_len, fill_value, dtype=np.int64)
        # len(v) is measured after trimming, so it never exceeds the buffer
        padded[:len(v)] = v
        inputs[k] = torch.tensor(padded, dtype=torch.long)
    return inputs


class TestDataset(Dataset):
    """Dataset that yields the tokenized form of one comment per index."""

    def __init__(self, cfg, df):
        # cfg.text names the text column; missing entries are replaced by the string "none"
        text_column = df[cfg.text]
        self.cfg = cfg
        self.text = text_column.fillna("none").values

    def __len__(self):
        return len(self.text)

    def __getitem__(self, item):
        # str() guards against non-string cell values before tokenization
        return prepare_input(str(self.text[item]), self.cfg)
    
class CustomModel(nn.Module):
    """LUKE encoder followed by averaging over dim 1, dropout and one linear output layer."""

    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        # Without a config_path the config comes from cfg.model; otherwise the object
        # stored at config_path is loaded with torch.load.
        self.config = (
            torch.load(config_path)
            if config_path is not None
            else LukeConfig.from_pretrained(cfg.model, output_hidden_states=True)
        )
        # pretrained=True loads weights for cfg.model; otherwise the encoder is built from
        # the config alone and is expected to receive weights via load_state_dict.
        self.model = (
            LukeModel.from_pretrained(cfg.model, config=self.config)
            if pretrained
            else LukeModel(self.config)
        )
        self.fc_dropout = nn.Dropout(cfg.fc_dropout)
        self.fc = nn.Linear(self.config.hidden_size, cfg.target_size)

    def feature(self, inputs):
        """Return the encoder's first output averaged over dim 1.

        Presumably dim 1 is the token axis of a (batch, seq, hidden) tensor - confirm
        against the encoder's output layout.
        """
        hidden_states = self.model(**inputs)[0]
        return hidden_states.mean(dim=1)

    def forward(self, inputs):
        """Map a batch of tokenizer tensors to `cfg.target_size` raw outputs per example."""
        pooled = self.feature(inputs)
        return self.fc(self.fc_dropout(pooled))
    
def inference_fn(test_loader, model, device):
    """Run `model` over every batch of `test_loader` and return the stacked sigmoid outputs.

    The model is switched to eval mode and moved to `device`; each batch's tensors are
    moved there as well, and the forward pass runs without gradient tracking.
    """
    model.eval()
    model.to(device)
    batch_outputs = []
    for inputs in tqdm(test_loader, total=len(test_loader)):
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
        with torch.no_grad():
            raw = model(inputs)
        batch_outputs.append(torch.sigmoid(raw).cpu().numpy())
    return np.concatenate(batch_outputs)

test_dataset = TestDataset(CFG, test)
test_loader = DataLoader(
    test_dataset,
    batch_size=CFG.batch_size,
    shuffle=False,
    num_workers=CFG.num_workers,
    pin_memory=True,
    drop_last=False,
)
config_path = CFG.model_dir + "config.pth"
_checkpoint_prefix = CFG.model_dir + CFG.model.replace('/', '-')
predictions = []
for fold in range(CFG.n_fold):
    model = CustomModel(CFG, config_path=config_path, pretrained=False)
    # weights are read onto the CPU; inference_fn moves the model to `device`
    state = torch.load(f"{_checkpoint_prefix}_fold{fold}_best.pth", map_location=torch.device('cpu'))
    model.load_state_dict(state['model'])
    prediction = inference_fn(test_loader, model, device)
    predictions.append(prediction)
    # release this fold's model before the next one is built
    del model, state
    gc.collect()
    torch.cuda.empty_cache()

# average the per-fold predictions
submission['score'] = np.mean(predictions, axis=0)
comments5 = submission[['comment_id', 'score']].copy()
comments5

In [None]:
import os

# Directory that receives the log file opened by get_logger().
OUTPUT_DIR = './'
# exist_ok=True creates the directory only when it is missing, without a separate
# check-then-create step.
os.makedirs(OUTPUT_DIR, exist_ok=True)


class CFG:
    """Settings for the LUKE-base fold ensemble used at inference time."""
    # folder holding config.pth, tokenizer/ and the per-fold *_best.pth checkpoints
    model_dir = '../input/jigsaw4-luke-base-training-tito-cv-strategy/'
    # DataLoader worker processes
    num_workers = 4
    # model name; also used (with '/' replaced by '-') as the checkpoint file prefix
    model = "studio-ousia/luke-base"
    # DataLoader batch size
    batch_size = 128
    # dropout probability applied before the output layer
    fc_dropout = 0.
    # column of the input frame that holds the comment text
    text = "text"
    # target / seed are not referenced by the inference code visible in this notebook
    target = "target"
    # number of output units of the final linear layer
    target_size = 1
    # positions kept from the start / end of an over-long token sequence
    head = 32
    tail = 32
    seed = 42
    # number of fold checkpoints whose predictions are averaged
    n_fold = 5
    # fixed sequence length produced by prepare_input
    max_len = head + tail

import os
import gc
import re
import sys
import json
import time
import math
import string
import pickle
import random
import joblib
import itertools
import warnings
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

# Swap the installed transformers stack for the wheels stored under ../input.
# pip runs with --no-index, so only the copied wheel files are considered.
for _pkg in ('transformers', 'tokenizers', 'huggingface_hub'):
    os.system(f'pip uninstall -q {_pkg} -y')

# (package, local wheel directory, wheel file) in installation order
_OFFLINE_WHEELS = (
    ('tokenizers',
     '/tmp/pip/cache-tokenizers/',
     '../input/tokenizers-0103/tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl'),
    ('huggingface_hub',
     '/tmp/pip/cache-huggingface-hub/',
     '../input/huggingface-hub-008/huggingface_hub-0.0.8-py3-none-any.whl'),
    ('transformers',
     '/tmp/pip/cache-transformers/',
     '../input/transformers-470/transformers-4.7.0-py3-none-any.whl'),
)
for _pkg, _wheel_dir, _wheel in _OFFLINE_WHEELS:
    os.system(f'mkdir -p {_wheel_dir}')
    os.system(f'cp {_wheel} {_wheel_dir}')
    os.system(f'pip install -q --no-index --find-links {_wheel_dir} {_pkg}')

# Import only after the reinstall so the freshly installed versions are the ones loaded.
import tokenizers
import transformers
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from transformers import LukeTokenizer, LukeModel, LukeConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

def get_score(df):
    """Share of rows in which the `less_toxic_pred` value is strictly below the `more_toxic_pred` value."""
    ordered_mask = (df['less_toxic_pred'] < df['more_toxic_pred']).to_numpy()
    n_ordered = int(ordered_mask.sum())
    return n_ordered / len(df)


def get_logger(filename=OUTPUT_DIR+'train'):
    """Return the module logger, echoing bare messages to the console and to `<filename>.log`.

    Args:
        filename: path of the log file without the ``.log`` suffix.

    Returns:
        The ``logging.Logger`` registered under ``__name__``, set to INFO level, with
        exactly one stream handler and one file handler attached.
    """
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    # getLogger hands back the same object on every call, so handlers left over from an
    # earlier call (this cell is executed more than once) would emit every record again.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()

def seed_everything(seed=42):
    """Seed the Python, NumPy and PyTorch (CPU and CUDA) generators and request deterministic cuDNN."""
    os.environ['PYTHONHASHSEED'] = str(seed)
    for apply_seed in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        apply_seed(seed)
    torch.backends.cudnn.deterministic = True


seed_everything(seed=42)

# Competition inputs: the comments to score and the submission template.
_COMPETITION_DIR = '../input/jigsaw-toxic-severity-rating/'
test = pd.read_csv(_COMPETITION_DIR + 'comments_to_score.csv')
submission = pd.read_csv(_COMPETITION_DIR + 'sample_submission.csv')
print(test.shape, submission.shape)
for _frame in (test, submission):
    display(_frame.head())

# The tokenizer is loaded from the tokenizer/ folder inside the model directory.
CFG.tokenizer = LukeTokenizer.from_pretrained(CFG.model_dir + 'tokenizer/')

def prepare_input(text, cfg):
    """Tokenize `text` into fixed-length `torch.long` tensors of size `cfg.max_len`.

    With `cfg.tail == 0` the tokenizer itself truncates and pads to `cfg.max_len`.
    Otherwise the text is tokenized with the tokenizer's default truncation; a field
    longer than `cfg.max_len` keeps its first `cfg.head` and last `cfg.tail` positions,
    and a shorter one is right-padded (`input_ids` with the pad token id, every other
    field with 0). The head/tail branch assumes `cfg.head + cfg.tail == cfg.max_len`.

    Args:
        text: the string to encode.
        cfg: object exposing `tokenizer`, `max_len`, `head` and `tail`.

    Returns:
        The tokenizer's output mapping with every value replaced by a 1-D long tensor.
    """
    if cfg.tail == 0:
        # padding='max_length' is the current spelling of the deprecated pad_to_max_length=True
        inputs = cfg.tokenizer.encode_plus(text,
                                           return_tensors=None,
                                           add_special_tokens=True,
                                           max_length=cfg.max_len,
                                           padding='max_length',
                                           truncation=True)
        for k, v in inputs.items():
            inputs[k] = torch.tensor(v, dtype=torch.long)
        return inputs

    inputs = cfg.tokenizer.encode_plus(text,
                                       return_tensors=None,
                                       add_special_tokens=True,
                                       truncation=True)
    for k, v in inputs.items():
        if len(v) > cfg.max_len:
            # keep the beginning and the end of the sequence, drop the middle
            v = list(v[:cfg.head]) + list(v[-cfg.tail:])
        fill_value = cfg.tokenizer.pad_token_id if k == 'input_ids' else 0
        padded = np.full(cfg.max_len, fill_value, dtype=np.int64)
        # len(v) is measured after trimming, so it never exceeds the buffer
        padded[:len(v)] = v
        inputs[k] = torch.tensor(padded, dtype=torch.long)
    return inputs


class TestDataset(Dataset):
    """Dataset that yields the tokenized form of one comment per index."""

    def __init__(self, cfg, df):
        # cfg.text names the text column; missing entries are replaced by the string "none"
        text_column = df[cfg.text]
        self.cfg = cfg
        self.text = text_column.fillna("none").values

    def __len__(self):
        return len(self.text)

    def __getitem__(self, item):
        # str() guards against non-string cell values before tokenization
        return prepare_input(str(self.text[item]), self.cfg)
    
class CustomModel(nn.Module):
    """LUKE encoder followed by averaging over dim 1, dropout and one linear output layer."""

    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        # Without a config_path the config comes from cfg.model; otherwise the object
        # stored at config_path is loaded with torch.load.
        self.config = (
            torch.load(config_path)
            if config_path is not None
            else LukeConfig.from_pretrained(cfg.model, output_hidden_states=True)
        )
        # pretrained=True loads weights for cfg.model; otherwise the encoder is built from
        # the config alone and is expected to receive weights via load_state_dict.
        self.model = (
            LukeModel.from_pretrained(cfg.model, config=self.config)
            if pretrained
            else LukeModel(self.config)
        )
        self.fc_dropout = nn.Dropout(cfg.fc_dropout)
        self.fc = nn.Linear(self.config.hidden_size, cfg.target_size)

    def feature(self, inputs):
        """Return the encoder's first output averaged over dim 1.

        Presumably dim 1 is the token axis of a (batch, seq, hidden) tensor - confirm
        against the encoder's output layout.
        """
        hidden_states = self.model(**inputs)[0]
        return hidden_states.mean(dim=1)

    def forward(self, inputs):
        """Map a batch of tokenizer tensors to `cfg.target_size` raw outputs per example."""
        pooled = self.feature(inputs)
        return self.fc(self.fc_dropout(pooled))
    
def inference_fn(test_loader, model, device):
    """Run `model` over every batch of `test_loader` and return the stacked sigmoid outputs.

    The model is switched to eval mode and moved to `device`; each batch's tensors are
    moved there as well, and the forward pass runs without gradient tracking.
    """
    model.eval()
    model.to(device)
    batch_outputs = []
    for inputs in tqdm(test_loader, total=len(test_loader)):
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
        with torch.no_grad():
            raw = model(inputs)
        batch_outputs.append(torch.sigmoid(raw).cpu().numpy())
    return np.concatenate(batch_outputs)

test_dataset = TestDataset(CFG, test)
test_loader = DataLoader(
    test_dataset,
    batch_size=CFG.batch_size,
    shuffle=False,
    num_workers=CFG.num_workers,
    pin_memory=True,
    drop_last=False,
)
config_path = CFG.model_dir + "config.pth"
_checkpoint_prefix = CFG.model_dir + CFG.model.replace('/', '-')
predictions = []
for fold in range(CFG.n_fold):
    model = CustomModel(CFG, config_path=config_path, pretrained=False)
    # weights are read onto the CPU; inference_fn moves the model to `device`
    state = torch.load(f"{_checkpoint_prefix}_fold{fold}_best.pth", map_location=torch.device('cpu'))
    model.load_state_dict(state['model'])
    prediction = inference_fn(test_loader, model, device)
    predictions.append(prediction)
    # release this fold's model before the next one is built
    del model, state
    gc.collect()
    torch.cuda.empty_cache()

# average the per-fold predictions
submission['score'] = np.mean(predictions, axis=0)
comments4 = submission[['comment_id', 'score']].copy()
comments4

In [None]:
N_TRY = 100
comments = pd.read_csv('../input/jigsaw-toxic-severity-rating/comments_to_score.csv')

# score1 / score2 hold the LUKE-base (comments4) and LUKE-large (comments5) fold averages,
# in the same order as the names in my_list.
comments['score1'] = comments4['score']
comments['score2'] = comments5['score']


my_list = ['luke_base','luke_large']


validation = pd.read_csv('../input/jigsaw-toxic-severity-rating/validation_data.csv')

# Attach a comment_id to both sides of every annotated pair by matching on the comment text.
# Both joins are inner joins, so pairs with a text missing from `comments` are dropped.
validation_annotations = validation.set_index('more_toxic').join(
    comments[['comment_id', 'text']].set_index('text'), how='inner').set_index('less_toxic').join(
    comments[['comment_id', 'text']].set_index('text'),
    lsuffix='_more', rsuffix='_less', how='inner').reset_index(drop=True)

# The columns being blended sit right after comment_id and text.
model_scores = comments.iloc[:, 2:(2+len(my_list))].to_numpy(dtype=float)

# Random search over blend weights; a trial is scored by the share of validation pairs
# whose "more toxic" comment receives the higher blended score.
best_score = -1.0  # below any attainable percentage, so the first trial always sets best_w
best_w = None
for _ in range(N_TRY):
    w = np.random.rand(len(my_list))
    w = w / np.linalg.norm(w)
    # one matrix-vector product scores every comment for this weight vector
    scores = dict(zip(comments['comment_id'], model_scores @ w))
    percent_correct = 100.0 * (
        sum(validation_annotations['comment_id_more'].map(scores).gt(
            validation_annotations['comment_id_less'].map(scores)))
        / len(validation_annotations))
    print(percent_correct)
    if percent_correct > best_score:
        best_score = percent_correct
        best_w = w


print('----------')
print('----------')
print('----------')


# w / w_score describe the last trial only; best_* describe the winning trial
print('w_score',percent_correct)
print('best_score',best_score)

print('w',w)
print('w_norm',w/sum(w))

print('best_w',best_w)
print('best_w_norm',best_w/sum(best_w))


# Blend with the best weights found; the last random draw `w` is not necessarily the best one.
comments['score'] = model_scores @ best_w

comments_luke = comments[['comment_id', 'score']].copy()
comments_luke

In [None]:
from detoxify import Detoxify
import pandas as pd
import numpy as np


# Detoxify(<checkpoint>).predict takes either a single string or a list of strings.
# `device` selects where the model is allocated (defaults to cpu) and accepts any
# torch.device input.
# Smoke-test each checkpoint on the GPU with one sentence.
for _checkpoint in ('original', 'unbiased', 'multilingual'):
    model = Detoxify(_checkpoint, device='cuda')
    print(model.predict('Hello World...'))


# Wrapping the result of a list input in a DataFrame gives one row per text.
auxi = pd.DataFrame(model.predict(['Hello World...','Hello World...']))
auxi

In [None]:
# Detoxify checkpoints scored by the following cells; SEL indexes into this list.
select = [
    'original',
    'unbiased',
    'multilingual',
]
# Number of random weight vectors tried by each blend search.
N_TRY = 100

In [None]:
SEL = 0
comments = pd.read_csv('../input/jigsaw-toxic-severity-rating/comments_to_score.csv')

model = Detoxify(select[SEL], device='cuda')
# a one-item probe call reveals the label names this checkpoint predicts
auxi = pd.DataFrame(model.predict(['Hello World...']))
my_list = list(auxi.columns)

# create the label columns up front so they exist even when there is nothing to score
for f in my_list:
    comments[f]= 0.5

# predict() accepts a list of texts, so the comments are scored in fixed-size chunks:
# one forward pass per chunk instead of one per row, with a bounded batch size.
PREDICT_BATCH = 32
texts = comments['text'].tolist()
chunk_frames = [
    pd.DataFrame(model.predict(texts[start:start + PREDICT_BATCH]))
    for start in range(0, len(texts), PREDICT_BATCH)
]
if chunk_frames:
    comments[my_list] = pd.concat(chunk_frames, ignore_index=True)[my_list].to_numpy()

comments

In [None]:
validation = pd.read_csv('../input/jigsaw-toxic-severity-rating/validation_data.csv')

# Attach a comment_id to both sides of every annotated pair by matching on the comment text.
# Both joins are inner joins, so pairs with a text missing from `comments` are dropped.
validation_annotations = validation.set_index('more_toxic').join(
    comments[['comment_id', 'text']].set_index('text'), how='inner').set_index('less_toxic').join(
    comments[['comment_id', 'text']].set_index('text'),
    lsuffix='_more', rsuffix='_less', how='inner').reset_index(drop=True)

# The columns being blended sit right after comment_id and text.
label_scores = comments.iloc[:, 2:(2+len(my_list))].to_numpy(dtype=float)

# Random search over blend weights; a trial is scored by the share of validation pairs
# whose "more toxic" comment receives the higher blended score.
best_score = -1.0  # below any attainable percentage, so the first trial always sets best_w
best_w = None
for _ in range(N_TRY):
    w = np.random.rand(len(my_list))
    w = w / np.linalg.norm(w)
    # one matrix-vector product scores every comment for this weight vector
    scores = dict(zip(comments['comment_id'], label_scores @ w))
    percent_correct = 100.0 * (
        sum(validation_annotations['comment_id_more'].map(scores).gt(
            validation_annotations['comment_id_less'].map(scores)))
        / len(validation_annotations))
    print(percent_correct)
    if percent_correct > best_score:
        best_score = percent_correct
        best_w = w


print('----------')
print('----------')
print('----------')

# w / w_score describe the last trial only; best_* describe the winning trial
print('w_score',percent_correct)
print('best_score',best_score)

print('w',w)
print('w_norm',w/sum(w))

print('best_w',best_w)
print('best_w_norm',best_w/sum(best_w))


# Blend with the best weights found; the last random draw `w` is not necessarily the best one.
comments['score'] = label_scores @ best_w

comments1 = comments[['comment_id', 'score']].copy()

comments1[['comment_id', 'score']]

In [None]:
SEL = 1
comments = pd.read_csv('../input/jigsaw-toxic-severity-rating/comments_to_score.csv')

model = Detoxify(select[SEL], device='cuda')
# a one-item probe call reveals the label names this checkpoint predicts
auxi = pd.DataFrame(model.predict(['Hello World...']))
my_list = list(auxi.columns)

# create the label columns up front so they exist even when there is nothing to score
for f in my_list:
    comments[f]= 0.5

# predict() accepts a list of texts, so the comments are scored in fixed-size chunks:
# one forward pass per chunk instead of one per row, with a bounded batch size.
PREDICT_BATCH = 32
texts = comments['text'].tolist()
chunk_frames = [
    pd.DataFrame(model.predict(texts[start:start + PREDICT_BATCH]))
    for start in range(0, len(texts), PREDICT_BATCH)
]
if chunk_frames:
    comments[my_list] = pd.concat(chunk_frames, ignore_index=True)[my_list].to_numpy()

comments

In [None]:
validation = pd.read_csv('../input/jigsaw-toxic-severity-rating/validation_data.csv')

# Attach a comment_id to both sides of every annotated pair by matching on the comment text.
# Both joins are inner joins, so pairs with a text missing from `comments` are dropped.
validation_annotations = validation.set_index('more_toxic').join(
    comments[['comment_id', 'text']].set_index('text'), how='inner').set_index('less_toxic').join(
    comments[['comment_id', 'text']].set_index('text'),
    lsuffix='_more', rsuffix='_less', how='inner').reset_index(drop=True)

# The columns being blended sit right after comment_id and text.
label_scores = comments.iloc[:, 2:(2+len(my_list))].to_numpy(dtype=float)

# Random search over blend weights; a trial is scored by the share of validation pairs
# whose "more toxic" comment receives the higher blended score.
best_score = -1.0  # below any attainable percentage, so the first trial always sets best_w
best_w = None
for _ in range(N_TRY):
    w = np.random.rand(len(my_list))
    w = w / np.linalg.norm(w)
    # one matrix-vector product scores every comment for this weight vector
    scores = dict(zip(comments['comment_id'], label_scores @ w))
    percent_correct = 100.0 * (
        sum(validation_annotations['comment_id_more'].map(scores).gt(
            validation_annotations['comment_id_less'].map(scores)))
        / len(validation_annotations))
    print(percent_correct)
    if percent_correct > best_score:
        best_score = percent_correct
        best_w = w


print('----------')
print('----------')
print('----------')

# w / w_score describe the last trial only; best_* describe the winning trial
print('w_score',percent_correct)
print('best_score',best_score)

print('w',w)
print('w_norm',w/sum(w))

print('best_w',best_w)
print('best_w_norm',best_w/sum(best_w))


# Blend with the best weights found; the last random draw `w` is not necessarily the best one.
comments['score'] = label_scores @ best_w

comments2 = comments[['comment_id', 'score']].copy()

comments2[['comment_id', 'score']]

In [None]:
SEL = 2
comments = pd.read_csv('../input/jigsaw-toxic-severity-rating/comments_to_score.csv')

model = Detoxify(select[SEL], device='cuda')
# a one-item probe call reveals the label names this checkpoint predicts
auxi = pd.DataFrame(model.predict(['Hello World...']))
my_list = list(auxi.columns)

# create the label columns up front so they exist even when there is nothing to score
for f in my_list:
    comments[f]= 0.5

# predict() accepts a list of texts, so the comments are scored in fixed-size chunks:
# one forward pass per chunk instead of one per row, with a bounded batch size.
PREDICT_BATCH = 32
texts = comments['text'].tolist()
chunk_frames = [
    pd.DataFrame(model.predict(texts[start:start + PREDICT_BATCH]))
    for start in range(0, len(texts), PREDICT_BATCH)
]
if chunk_frames:
    comments[my_list] = pd.concat(chunk_frames, ignore_index=True)[my_list].to_numpy()

comments

In [None]:
validation = pd.read_csv('../input/jigsaw-toxic-severity-rating/validation_data.csv')

# Attach a comment_id to both sides of every annotated pair by matching on the comment text.
# Both joins are inner joins, so pairs with a text missing from `comments` are dropped.
validation_annotations = validation.set_index('more_toxic').join(
    comments[['comment_id', 'text']].set_index('text'), how='inner').set_index('less_toxic').join(
    comments[['comment_id', 'text']].set_index('text'),
    lsuffix='_more', rsuffix='_less', how='inner').reset_index(drop=True)

# The columns being blended sit right after comment_id and text.
label_scores = comments.iloc[:, 2:(2+len(my_list))].to_numpy(dtype=float)

# Random search over blend weights; a trial is scored by the share of validation pairs
# whose "more toxic" comment receives the higher blended score.
best_score = -1.0  # below any attainable percentage, so the first trial always sets best_w
best_w = None
for _ in range(N_TRY):
    w = np.random.rand(len(my_list))
    w = w / np.linalg.norm(w)
    # one matrix-vector product scores every comment for this weight vector
    scores = dict(zip(comments['comment_id'], label_scores @ w))
    percent_correct = 100.0 * (
        sum(validation_annotations['comment_id_more'].map(scores).gt(
            validation_annotations['comment_id_less'].map(scores)))
        / len(validation_annotations))
    print(percent_correct)
    if percent_correct > best_score:
        best_score = percent_correct
        best_w = w


print('----------')
print('----------')
print('----------')

# w / w_score describe the last trial only; best_* describe the winning trial
print('w_score',percent_correct)
print('best_score',best_score)

print('w',w)
print('w_norm',w/sum(w))

print('best_w',best_w)
print('best_w_norm',best_w/sum(best_w))


# Blend with the best weights found; the last random draw `w` is not necessarily the best one.
comments['score'] = label_scores @ best_w

comments3 = comments[['comment_id', 'score']].copy()

comments3[['comment_id', 'score']]

In [None]:
comments = pd.read_csv('../input/jigsaw-toxic-severity-rating/comments_to_score.csv')

# score1..score3 hold the blended scores of the three detoxify checkpoints,
# in the same order as the names in my_list.
comments['score1'] = comments1['score']
comments['score2'] = comments2['score']
comments['score3'] = comments3['score']
#comments['score4'] = comments_luke['score']
#comments['score5'] = comments5['score']

my_list = ['original','unbiased','multilingual']#,'comments_luke']#'luke_base','luke_large']


validation = pd.read_csv('../input/jigsaw-toxic-severity-rating/validation_data.csv')

# Attach a comment_id to both sides of every annotated pair by matching on the comment text.
# Both joins are inner joins, so pairs with a text missing from `comments` are dropped.
validation_annotations = validation.set_index('more_toxic').join(
    comments[['comment_id', 'text']].set_index('text'), how='inner').set_index('less_toxic').join(
    comments[['comment_id', 'text']].set_index('text'),
    lsuffix='_more', rsuffix='_less', how='inner').reset_index(drop=True)

# The columns being blended sit right after comment_id and text.
model_scores = comments.iloc[:, 2:(2+len(my_list))].to_numpy(dtype=float)

# Random search over blend weights; a trial is scored by the share of validation pairs
# whose "more toxic" comment receives the higher blended score.
best_score = -1.0  # below any attainable percentage, so the first trial always sets best_w
best_w = None
for _ in range(N_TRY):
    w = np.random.rand(len(my_list))
    w = w / np.linalg.norm(w)
    # one matrix-vector product scores every comment for this weight vector
    scores = dict(zip(comments['comment_id'], model_scores @ w))
    percent_correct = 100.0 * (
        sum(validation_annotations['comment_id_more'].map(scores).gt(
            validation_annotations['comment_id_less'].map(scores)))
        / len(validation_annotations))
    print(percent_correct)
    if percent_correct > best_score:
        best_score = percent_correct
        best_w = w


print('----------')
print('----------')
print('----------')

# w / w_score describe the last trial only; best_* describe the winning trial
print('w_score',percent_correct)
print('best_score',best_score)

print('w',w)
print('w_norm',w/sum(w))

print('best_w',best_w)
print('best_w_norm',best_w/sum(best_w))


# Blend with the best weights found; the last random draw `w` is not necessarily the best one.
comments['score'] = model_scores @ best_w


In [None]:
# Final blend on ranks: two thirds detoxify ensemble, one third LUKE ensemble.
# Re-running this cell blends the already blended column again.
_detoxify_rank = comments['score'].rank()
_luke_rank = comments_luke['score'].rank()
comments['score'] = (2/3)*_detoxify_rank + (1/3)*_luke_rank

In [None]:
# When some scores are tied, shuffle the rows with a fixed seed and rank with
# method='first', so tied comments receive distinct ranks in shuffled order.
_all_unique = len(comments) == comments['score'].nunique()
if not _all_unique:
    print(_all_unique)
    comments = comments.sample(n=len(comments),random_state=0)
    comments['score'] = comments['score'].rank(method='first')
    _all_unique = len(comments) == comments['score'].nunique()

print(_all_unique)

In [None]:
# Write the two submission columns to disk, then display the same frame.
_submission_columns = ['comment_id', 'score']
comments[_submission_columns].to_csv('submission.csv', index=False)
comments[_submission_columns]