# Imports

In [None]:
import sys
sys.path.append('../input/fasthugs')

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import TransformerMixin, BaseEstimator
import re 
import scipy
from scipy import sparse
import gc 
from IPython.display import display, HTML
from pprint import pprint
import warnings
import joblib
from tqdm.auto import tqdm
from bs4 import BeautifulSoup

# Silence every warning for the rest of the session (this also hides
# deprecation notices raised by the libraries used below).
warnings.filterwarnings("ignore")

# Let pandas display up to 300 characters per cell so comment texts stay readable.
pd.options.display.max_colwidth=300

# Training data 

## Convert the label to SUM of all toxic labels (This might help with maintaining toxicity order of comments)

In [None]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Module-level NLTK helpers shared by the cleaning functions in this notebook:
# `lemmatizer` is used by text_cleaning(), `stop` by text_cleaning() and clean().
lemmatizer = WordNetLemmatizer()
stop = stopwords.words('english')

def text_cleaning(text):
    """
    Normalise one raw comment into a lowercase, space-separated token string.

    Steps, in order:
    1. Remove embedded URL links
    2. Remove HTML tags (only the text nodes are kept)
    3. Remove emojis
    4. Replace every character that is not a-z, A-Z or a digit with a space
    5. Collapse runs of spaces, trim both ends and lowercase
    6. Lemmatize every token with the module-level `lemmatizer`
    7. Drop the tokens listed in the module-level `stop` list

    text - Text piece to be cleaned.
    """
    # Website links
    text = re.sub(r'https?://\S+|www\.\S+', r'', text)

    # HTML markup
    text = BeautifulSoup(text, 'lxml').get_text()

    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text)

    # Special characters -> space, squeeze spaces, trim, lowercase
    text = re.sub(' +', ' ', re.sub(r"[^a-zA-Z\d]", " ", text)).strip().lower()

    # Lemmatize token by token, then filter stopwords from the re-split result
    lemmatized = ' '.join(map(lemmatizer.lemmatize, text.split(' ')))
    kept_tokens = filter(lambda token: token not in stop, lemmatized.split(' '))
    return ' '.join(kept_tokens)

In [None]:
# Ordered (pattern, replacement) table applied by clean() after lowercasing.
# Order matters: later entries see the output of earlier ones.
_CLEAN_REPLACEMENTS = (
    # --- leetspeak / symbol substitutions -------------------------------
    ("4", "a"),
    ("2", "l"),
    ("5", "s"),
    ("1", "i"),
    ("!", "i"),
    ("|", "i"),
    ("0", "o"),
    ("l3", "b"),
    ("7", "t"),
    ("7", "+"),  # no-op: every "7" was already replaced by the previous entry
    ("8", "ate"),
    ("3", "e"),
    ("9", "g"),
    ("6", "g"),
    ("@", "a"),
    ("$", "s"),
    # --- glued words, misspellings and abbreviations ---------------------
    ("#ofc", " of fuckin course "),
    ("fggt", " faggot "),
    ("your", " your "),
    ("self", " self "),
    ("cuntbag", " cunt bag "),
    ("fartchina", " fart china "),
    ("youi", " you i "),
    ("cunti", " cunt i "),
    ("sucki", " suck i "),
    ("pagedelete", " page delete "),
    ("cuntsi", " cuntsi "),
    ("i'm", " i am "),
    ("offuck", " of fuck "),
    ("centraliststupid", " central ist stupid "),
    ("hitleri", " hitler i "),
    ("i've", " i have "),
    ("i'll", " sick "),
    ("fuck", " fuck "),
    ("f u c k", " fuck "),
    ("shit", " shit "),
    ("bunksteve", " bunk steve "),
    ("wikipedia", " social medium "),
    ("faggot", " faggot "),
    ("delanoy", " delanoy "),
    ("jewish", " jewish "),
    ("sexsex", " sex "),
    ("allii", " all ii "),
    ("i'd", " i had "),
    ("'s", " is "),
    ("youbollocks", " you bollocks "),
    ("dick", " dick "),
    ("cuntsi", " cuntsi "),
    ("mothjer", " mother "),
    ("cuntfranks", " cunt "),
    ("ullmann", " jewish "),
    # NOTE: applied as a regex, so the "." matches any character (e.g. "mrs").
    ("mr.", " mister "),
    ("aidsaids", " aids "),
    ("njgw", " nigger "),
    ("wiki", " social medium "),
    ("administrator", " admin "),
    ("gamaliel", " jewish "),
    ("rvv", " vanadalism "),
    ("admins", " admin "),
    ("pensnsnniensnsn", " penis "),
    ("pneis", " penis "),
    ("pennnis", " penis "),
    # NOTE: applied as a regex, so the "." matches any character.
    ("pov.", " point of view "),
    ("vandalising", " vandalism "),
    ("cock", " dick "),
    ("asshole", " asshole "),
    ("youi", " you "),
    ("afd", " all fucking day "),
    ("sockpuppets", " sockpuppetry "),
    ("iiprick", " iprick "),
    ("penisi", " penis "),
    ("warrior", " warrior "),
    ("loil", " laughing out insanely loud "),
    ("vandalise", " vanadalism "),
    ("helli", " helli "),
    ("lunchablesi", " lunchablesi "),
    ("special", " special "),
    ("ilol", " i lol "),
    # --- contractions -----------------------------------------------------
    (r'\b[uU]\b', 'you'),
    (r"what's", "what is "),
    (r"\'s", " is "),
    (r"\'ve", " have "),
    (r"can't", "cannot "),
    (r"n't", " not "),
    (r"i'm", "i am "),
    (r"\'re", " are "),
    (r"\'d", " would "),
    (r"\'ll", " will "),
    (r"\'scuse", " excuse "),
    # --- whitespace, repeated characters, punctuation --------------------
    (r'\s+', ' '),            # any whitespace run -> one space
    (r'(.)\1+', r'\1\1'),     # runs of the same character are cut to two
    # Character class: the "|" separators are literal members, and "|-|" is a
    # one-character range, so "-" itself is NOT removed by this pattern.
    ("[:|♣|'|§|♠|*|/|?|=|%|&|-|#|•|~|^|>|<|►|_]", ''),
    # Second contraction pass. Apostrophes were removed by the entry above,
    # so these can no longer match; kept for parity with the original.
    (r"what's", "what is "),
    (r"\'ve", " have "),
    (r"can't", "cannot "),
    (r"n't", " not "),
    (r"i'm", "i am "),
    (r"\'re", " are "),
    (r"\'d", " would "),
    (r"\'ll", " will "),
    (r"\'scuse", " excuse "),
    (r"\'s", " "),
    # --- punctuation spacing ---------------------------------------------
    ('\n', ' \n '),
    (r'([a-zA-Z]+)([/!?.])([a-zA-Z]+)', r'\1 \2 \3'),
    # Replace repeating characters more than 3 times to length of 3
    (r'([*!?\'])\1\1{2,}', r'\1\1\1'),
    # Add space around repeating characters
    (r'([*!?\']+)', r' \1 '),
    # Patterns with repeating letters
    (r'([a-zA-Z])\1{2,}\b', r'\1\1'),
    (r'([a-zA-Z])\1\1{2,}\B', r'\1\1\1'),
)


def _replace_legacy(series, pat, repl):
    """Apply Series.str.replace with the pandas < 2.0 default semantics.

    Before pandas 2.0 the default was regex=True, except that single-character
    patterns were always treated as literals. From 2.0 the default is
    regex=False. Passing `regex` explicitly gives the same result on both.
    """
    return series.str.replace(pat, repl, regex=len(pat) > 1)


def clean(data, col):
    """Normalise the text column `col` of `data` in place and return `data`.

    Steps: replace URLs with " social medium ", lowercase, apply the ordered
    substitutions in `_CLEAN_REPLACEMENTS`, collapse repeated spaces, strip,
    and finally drop the words listed in the module-level `stop` list.

    The original code called `str.replace` without `regex=`, so on pandas >= 2
    every regular-expression pattern silently became a literal string and
    stopped matching. All replacements now go through `_replace_legacy`, which
    pins the behaviour the function had on pandas 1.x.

    Args:
        data: DataFrame holding the column to clean; it is modified in place.
        col: Name of a string column of `data`.

    Returns:
        The same DataFrame object, with `data[col]` cleaned.
    """
    text = _replace_legacy(data[col], r'https?://\S+|www\.\S+', ' social medium ')
    text = text.str.lower()

    for pat, repl in _CLEAN_REPLACEMENTS:
        text = _replace_legacy(text, pat, repl)

    # The original ran this collapse-and-strip step twice; the second pass
    # cannot change anything, so it is applied once.
    text = _replace_legacy(text, r'[ ]{2,}', ' ').str.strip()

    # Set lookup instead of scanning the stopword list for every token.
    stop_words = set(stop)
    data[col] = text.apply(
        lambda x: ' '.join([word for word in x.split() if word not in stop_words])
    )

    return data

In [None]:
def P1_train(dataset='class', cleaned=False, df_muls=None, n_folds=7, frac_factor=1.5, frac=0.4):
    """Train one char-level TF-IDF + Ridge pipeline per fold and dump it with joblib.

    The regression target `y` is the weighted sum of the dataset's toxicity
    label columns, scaled by its maximum. For each fold a subsample is drawn
    (a fraction `frac` of the rows with y > 0 plus `frac * frac_factor` times
    as many rows with y == 0), written to `/kaggle/working/df_fld{fold}.csv`,
    read back and used to fit the pipeline. Pipelines are saved in the current
    directory as `{dataset}_c_{fold}.pkl` (cleaned) or `{dataset}_d_{fold}.pkl`.

    Args:
        dataset: 'class', 'bias' or 'ruddit'; selects the training CSV.
        cleaned: If True, run clean() and text_cleaning() on the text first.
        df_muls: Per-label multipliers, in the order of the label columns.
            Defaults to [1, 2, 1, 1, 1, 1].
        n_folds: Number of subsamples / pipelines.
        frac_factor: Ratio of sampled y == 0 rows to sampled y > 0 rows.
        frac: Fraction of the y > 0 rows sampled per fold (previously a
            hard-coded 0.4).

    Raises:
        ValueError: If `dataset` is not one of the supported names.
    """
    if dataset == 'class':
        df = pd.read_csv("../input/jigsaw-multilingual-toxic-comment-classification/jigsaw-toxic-comment-train.csv")
        cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    elif dataset == 'bias':
        df = pd.read_csv("../input/jigsaw-multilingual-toxic-comment-classification/jigsaw-unintended-bias-train.csv")
        cols = ['toxic', 'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat']
    elif dataset == 'ruddit':
        # The original never defined `cols` here, so this branch always died
        # with a NameError in the label loop below.
        # NOTE(review): despite its name the branch reads the Jigsaw toxic
        # comment classification train.csv; the label columns listed are the
        # ones that file is expected to carry - confirm.
        df = pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/train.csv")
        cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    else:
        # An explicit exception instead of `assert 1==0`, which is stripped
        # under `python -O` and carries no message.
        raise ValueError(
            f"unknown dataset {dataset!r}; expected 'class', 'bias' or 'ruddit'"
        )

    if df_muls is None:
        df_muls = [1, 2, 1, 1, 1, 1]

    # Target: weighted sum of the label columns, scaled by its maximum.
    df['y'] = 0
    for col, mul in zip(cols, df_muls):
        df['y'] = df['y'] + df[col] * mul
    df['y'] = df['y'] / df['y'].max()
    df = df[['comment_text', 'y']].rename(columns={'comment_text': 'text'})

    if cleaned:
        df = clean(df, 'text')
        tqdm.pandas()
        df['text'] = df['text'].progress_apply(text_cleaning)

    # The two partitions do not change between folds, so build them once.
    positives = df[df.y > 0]
    negatives = df[df.y == 0]
    # Clamp so that a large frac_factor cannot request more rows than exist
    # (DataFrame.sample raises in that case).
    n_negatives = min(int(len(positives) * frac * frac_factor), len(negatives))

    for fld in range(n_folds):
        print(f'Fold: {fld}')
        seed = 10 * (fld + 1)
        tmp_df = pd.concat(
            [positives.sample(frac=frac, random_state=seed),
             negatives.sample(n=n_negatives, random_state=seed)],
            axis=0,
        ).sample(frac=1, random_state=seed)
        tmp_df.to_csv(f'/kaggle/working/df_fld{fld}.csv', index=False)

    suffix = 'c' if cleaned else 'd'
    for fld in range(n_folds):
        print("\nTrain:")
        print(f' ****************************** FOLD: {fld} ******************************')
        fold_df = pd.read_csv(f'/kaggle/working/df_fld{fld}.csv')
        print(fold_df.shape)

        features = FeatureUnion([
            ("vect3", TfidfVectorizer(min_df=3, max_df=0.5, analyzer='char_wb', ngram_range=(3, 5))),
        ])
        pipeline = Pipeline([
            ("features", features),
            ("clf", Ridge()),
        ])

        # astype('U') turns every value into a unicode string before fitting
        # (presumably because texts that came back from the CSV as NaN would
        # otherwise break the vectorizer - confirm).
        pipeline.fit(fold_df['text'].values.astype('U'), fold_df['y'])

        # Persist the fitted pipeline for the evaluation / inference cells.
        joblib.dump(pipeline, f'{dataset}_{suffix}_{fld}.pkl')

In [None]:
# Training is switched off because the fitted pipelines are already saved.
# Set the guard to True to retrain all four variants
# (class/bias datasets x raw/cleaned text).
if False:   # trained pipelines are saved
    for dataset_name, negative_ratio in (('class', 1.5), ('bias', 0.3)):
        for use_cleaning in (False, True):
            P1_train(dataset=dataset_name, cleaned=use_cleaning,
                     frac_factor=negative_ratio, n_folds=7)

In [None]:
def P1_eval(dataset='class', cleaned=False, n_folds=7, test_only=False):
    """
    Predict with the saved per-fold pipelines on the validation pairs and on
    the comments to score.

    dataset   - prefix of the saved pipeline files ('class' or 'bias').
    cleaned   - if True, run clean() and text_cleaning() on the texts and load
                the `*_c_*` pipelines; otherwise load the `*_d_*` pipelines.
    n_folds   - number of saved pipelines to load (one output column each).
    test_only - if True, skip cleaning and prediction of the validation data.

    Returns (val_preds_arr1, val_preds_arr2, test_preds_arr): predictions for
    the `less_toxic` column, the `more_toxic` column and the submission
    `text` column, each with one row per comment and one column per fold.
    With test_only=True the two validation arrays are returned as all zeros.
    """
    # Validation and Evaluation
    df_val = pd.read_csv("../input/jigsaw-toxic-severity-rating/validation_data.csv")
    df_sub = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")

    n_val, n_sub = len(df_val), len(df_sub)
    val_preds_arr1 = np.zeros((n_val, n_folds))
    val_preds_arr2 = np.zeros((n_val, n_folds))
    test_preds_arr = np.zeros((n_sub, n_folds))

    with_validation = not test_only
    pair_columns = ('less_toxic', 'more_toxic')

    if cleaned:
        tqdm.pandas()
        df_sub = clean(df_sub, 'text')
        df_sub['text'] = df_sub['text'].progress_apply(text_cleaning)
        if with_validation:
            for column in pair_columns:
                df_val = clean(df_val, column)
            for column in pair_columns:
                df_val[column] = df_val[column].progress_apply(text_cleaning)

    for fld in range(n_folds):
        print("\nEval:")
        print(f' ****************************** FOLD: {fld} ******************************')

        # Cleaned and raw pipelines live in different input datasets.
        model_file = (f'../input/jrstc-linear-model/{dataset}_c_{fld}.pkl' if cleaned
                      else f'../input/jrstc-linear-comp/{dataset}_d_{fld}.pkl')
        pipeline = joblib.load(model_file)

        if with_validation:
            print("\npredict validation data ")
            for column, target in zip(pair_columns, (val_preds_arr1, val_preds_arr2)):
                target[:, fld] = pipeline.predict(df_val[column])

        print("\npredict test data ")
        test_preds_arr[:, fld] = pipeline.predict(df_sub['text'])

    return val_preds_arr1, val_preds_arr2, test_preds_arr

In [None]:
if False:
    # Disabled experiment: validation predictions of the raw-text pipelines
    # and a local copy of the blend-weight search.
    c_d_l, c_d_r, c_d_t = P1_eval(dataset='class', cleaned=False, n_folds=7)
    b_d_l, b_d_r, b_d_t = P1_eval(dataset='bias' , cleaned=False, n_folds=7)

    def sort_good_weight(L, R, L_, R_):
        """Print the five best (alpha, 1-alpha, accuracy %) blends of two models."""
        wts_acc = []
        for alpha in (step / 100 for step in range(1, 100, 1)):
            blended_left = alpha * L + (1-alpha) * L_
            blended_right = alpha * R + (1-alpha) * R_
            accuracy = np.round((blended_left < blended_right).mean() * 100,2)
            wts_acc.append((alpha, 1-alpha, accuracy))
        wts_acc.sort(key=lambda entry: entry[2], reverse=True)
        print(wts_acc[:5])


In [None]:
#print(np.round((L_1<R_1).mean()*100, 2)) # --> 68.37
#print(np.round((L_3<R_3).mean()*100, 2)) # --> 68.64
# sort_good_weight(L_1, R_1, L_3, R_3) --> [(0.47, 0.53, 69.21), (0.48, 0.52, 69.2), (0.49, 0.51, 69.19), (0.39, 0.61, 69.16), (0.4, 0.6, 69.16)]
# 69.18?

In [None]:
def sort_good_weight(L, R, L_, R_, top_k=5):
    """Grid-search the blend weight between two models and report the best ones.

    For alpha = 0.01, 0.02, ..., 0.99 the two models are blended as
    `alpha * first + (1 - alpha) * second`, and the percentage of positions
    where the blended `L` score is strictly below the blended `R` score is
    computed. In this notebook L / R hold the predictions for the
    `less_toxic` / `more_toxic` validation columns.

    The best entries are printed exactly as before. They are now also
    returned, so callers can use the selected weight instead of copying it
    from the printed output by hand.

    Args:
        L, R: Scores of the first model (numpy arrays of equal shape).
        L_, R_: Scores of the second model, same shape as L and R.
        top_k: How many of the best weightings to report (default 5).

    Returns:
        A list of `(alpha, 1 - alpha, accuracy_percent)` tuples sorted by
        accuracy, best first; ties keep ascending alpha order.
    """
    wts_acc = []
    for i in range(1, 100):
        alpha = i / 100
        L_wt = alpha * L + (1 - alpha) * L_
        R_wt = alpha * R + (1 - alpha) * R_
        wts_acc.append((alpha, 1 - alpha, np.round((L_wt < R_wt).mean() * 100, 2)))

    # sorted() is stable, also with reverse=True, so equal accuracies stay in
    # ascending alpha order.
    best = sorted(wts_acc, key=lambda x: x[2], reverse=True)[:top_k]
    print(best)
    return best

In [None]:
# Score with the four Ridge ensembles (class/bias dataset x raw/cleaned text).
# With test_only=True P1_eval skips the validation data, so the L_* / R_*
# arrays below are all zeros and only the T_* (submission) scores carry
# information.
test_only = True

c_d_l, c_d_r, c_d_t = P1_eval(dataset='class', cleaned=False, n_folds=7, test_only=test_only)
c_c_l, c_c_r, c_c_t = P1_eval(dataset='class', cleaned=True , n_folds=7, test_only=test_only)

b_d_l, b_d_r, b_d_t = P1_eval(dataset='bias' , cleaned=False, n_folds=7, test_only=test_only)
b_c_l, b_c_r, b_c_t = P1_eval(dataset='bias' , cleaned=True , n_folds=7, test_only=test_only)

# Average over the fold axis: one score per comment and ensemble.
L_1, R_1, T_1 = (fold_preds.mean(axis=1) for fold_preds in (c_d_l, c_d_r, c_d_t))
L_2, R_2, T_2 = (fold_preds.mean(axis=1) for fold_preds in (c_c_l, c_c_r, c_c_t))
L_3, R_3, T_3 = (fold_preds.mean(axis=1) for fold_preds in (b_d_l, b_d_r, b_d_t))
L_4, R_4, T_4 = (fold_preds.mean(axis=1) for fold_preds in (b_c_l, b_c_r, b_c_t))

# x: raw vs. cleaned within 'class'; y: raw vs. cleaned within 'bias';
# z: 'class' blend vs. 'bias' blend.
x, y, z = 0.45, 0.68, 0.39

# Validation pairs
L_c = x * L_1 + (1-x) * L_2
L_b = y * L_3 + (1-y) * L_4
L_y = z * L_c + (1-z) * L_b

R_c = x * R_1 + (1-x) * R_2
R_b = y * R_3 + (1-y) * R_4
R_y = z * R_c + (1-z) * R_b

# Submission comments
T_c = x * T_1 + (1-x) * T_2
T_b = y * T_3 + (1-y) * T_4
T_y = z * T_c + (1-z) * T_b

In [None]:
import sys
sys.path.append('../input/fasthugs')

In [None]:
from fastai.text.all import *
from fasthugs.data import TransformersTextBlock, TextGetter
from fasthugs.learner import TransLearner

from transformers import AutoModelForSequenceClassification
from sklearn.model_selection import StratifiedKFold
import gc
import pandas as pd
from bayes_opt import BayesianOptimization
import os

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import TransformerMixin, BaseEstimator

# Set TOKENIZERS_PARALLELISM to "false" in the process environment
# (presumably to switch off the Hugging Face tokenizers' parallelism - confirm).
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Seed constant; it is not referenced anywhere in the visible part of this notebook.
SEED=2021

In [None]:
def valid_model(model_name='distilroberta-base', bs=16, idx=1, n_models=5):
    """Predict the comments to score with every saved fine-tuned checkpoint.

    Loads `../input/jrstc-test/{model_name}_df{idx}_{i}` for i in
    0 .. n_models - 1, wraps each checkpoint in a `TransLearner` and collects
    its predictions for `comments_to_score.csv`.

    Args:
        model_name: Name of the transformer; selects both the tokenizer
            directory and the checkpoint directories.
        bs: Batch size for the dataloaders.
        idx: Index used in the checkpoint directory name (`_df{idx}_`).
        n_models: Number of checkpoints to load (previously a hard-coded 5).

    Returns:
        A list with one predictions object per checkpoint, as returned by
        `learn.tta` (callers in this notebook call `.numpy()` on each item).
    """
    df_sub = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")
    df_sub['comment_text'] = df_sub.text

    # The training frame with a dummy `all_tox` target appears to be needed
    # only to build dataloaders from which the test dataloader is derived;
    # nothing is trained in this function.
    df = pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/train.csv")
    df['all_tox'] = 0
    model_path = f'../input/roberta-transformers-pytorch/{model_name}'

    dblock = DataBlock(blocks = [TransformersTextBlock(pretrained_model_name=model_path), RegressionBlock(n_out=6)],
               get_x=TextGetter('comment_text'),
               get_y=ItemGetter('all_tox'))

    dls = dblock.dataloaders(df, bs=bs, val_bs=bs*1, num_workers=1)
    p_hdrop = 0.1

    preds_all = []
    for i in range(n_models):
        model = AutoModelForSequenceClassification.from_pretrained(f'../input/jrstc-test/{model_name}_df{idx}_{i}', 
                    num_labels=6, hidden_dropout_prob=p_hdrop)
        metrics = [rmse, R2Score(), PearsonCorrCoef(), SpearmanCorrCoef()]
        opt_func = Adam
        learn = TransLearner(dls, model, loss_func=BCEWithLogitsLossFlat(), metrics=metrics, opt_func=opt_func)

        test_dl = dls.test_dl(df_sub)
        preds_, _ = learn.tta(dl=test_dl, n=1, beta=0)
        preds_all.append(preds_)

        # Drop every reference to the checkpoint before collecting. The
        # original cleared only `learn`, so `model` kept the weights alive
        # while gc.collect() / empty_cache() ran.
        learn = None
        model = None
        gc.collect()
        torch.cuda.empty_cache()
    return preds_all

In [None]:
# Transformer predictions for the comments to score: three architectures,
# each with the idx=1 and idx=2 checkpoint families.
T_c_1 = valid_model(model_name='distilroberta-base', idx=1, bs=16)
T_b_1 = valid_model(model_name='distilroberta-base', idx=2, bs=16)

T_c_2 = valid_model(model_name='roberta-base', idx=1, bs=4)
T_b_2 = valid_model(model_name='roberta-base', idx=2, bs=4)

T_c_3 = valid_model(model_name='roberta-large', idx=1, bs=4)
T_b_3 = valid_model(model_name='roberta-large', idx=2, bs=4)

# Weights over the last (label) axis of the predictions.
w = np.array([1, 2, 1, 1, 1, 1])

# Stack the per-checkpoint predictions and reduce them with einsum: axis i
# (checkpoints) is summed, axis k (labels) is summed with weights `w`, and
# axis j is kept - assumes shape (checkpoints, comments, 6); TODO confirm.
T_c_1, T_b_1, T_c_2, T_b_2, T_c_3, T_b_3 = (
    np.einsum('ijk, k->j', np.array([ll.numpy() for ll in checkpoint_preds]), w)
    for checkpoint_preds in (T_c_1, T_b_1, T_c_2, T_b_2, T_c_3, T_b_3)
)

In [None]:
# Validation-side blend of the transformer models from precomputed .npy files.
w = np.array([1, 2, 1, 1, 1, 1])

# Same einsum reduction as for the submission predictions: sum over axis i,
# weighted sum over axis k, keep axis j.
L_c_1, L_c_2, L_c_3, L_b_1, L_b_2, L_b_3 = (
    np.einsum('ijk, k->j', np.load(npy_path), w)
    for npy_path in (
        '../input/jrstc-test/L_c_1.npy',
        '../input/jrstc-test/L_c_2.npy',
        '../input/jrstc-test/L_c_3.npy',
        '../input/jrstc-test/L_b_1.npy',
        '../input/jrstc-test/L_b_2.npy',
        '../input/jrstc-test/L_b_3.npy',
    )
)

R_c_1, R_c_2, R_c_3, R_b_1, R_b_2, R_b_3 = (
    np.einsum('ijk, k->j', np.load(npy_path), w)
    for npy_path in (
        '../input/jrstc-test/R_c_1.npy',
        '../input/jrstc-test/R_c_2.npy',
        '../input/jrstc-test/R_c_3.npy',
        '../input/jrstc-test/R_b_1.npy',
        '../input/jrstc-test/R_b_2.npy',
        '../input/jrstc-test/R_b_3.npy',
    )
)

# Weight search per architecture; the trailing comments record earlier output.
sort_good_weight(L_b_1, R_b_1, L_c_1, R_c_1) # [(0.57, 0.43000000000000005, 70.49), (0.48, 0.52, 70.48), (0.51, 0.49, 70.48), (0.52, 0.48, 70.48), (0.53, 0.47, 70.48)]
sort_good_weight(L_b_2, R_b_2, L_c_2, R_c_2) # [(0.61, 0.39, 70.43), (0.64, 0.36, 70.41), (0.6, 0.4, 70.39), (0.62, 0.38, 70.39), (0.63, 0.37, 70.39)]
sort_good_weight(L_b_3, R_b_3, L_c_3, R_c_3) # [(0.09, 0.91, 69.94), (0.08, 0.92, 69.93), (0.1, 0.9, 69.93), (0.25, 0.75, 69.93), (0.07, 0.9299999999999999, 69.92)]

# Blend the two checkpoint families within each architecture.
x = 0.5
L__1, R__1 = x * L_b_1 + (1-x) * L_c_1, x * R_b_1 + (1-x) * R_c_1

x = 0.6
L__2, R__2 = x * L_b_2 + (1-x) * L_c_2, x * R_b_2 + (1-x) * R_c_2

x = 0.1
L__3, R__3 = x * L_b_3 + (1-x) * L_c_3, x * R_b_3 + (1-x) * R_c_3

# Recorded results of earlier weight searches across architectures:
#sort_good_weight(L_1, R_1, L_2, R_2) # [(0.85, 0.15000000000000002, 70.53), (0.86, 0.14, 70.53), (0.76, 0.24, 70.52), (0.77, 0.22999999999999998, 70.52), (0.78, 0.21999999999999997, 70.52)]
#sort_good_weight(L_x, R_x, L_3, R_3) # [(0.59, 0.41000000000000003, 70.56), (0.56, 0.43999999999999995, 70.55), (0.57, 0.43000000000000005, 70.55), (0.61, 0.39, 70.55), (0.58, 0.42000000000000004, 70.54)]

# Blend architectures 1 and 2, then mix the result with architecture 3.
yy = 0.8
L_x, R_x = L__1 * yy + (1-yy) * L__2, R__1 * yy + (1-yy) * R__2
yy = 0.6
L_x, R_x = L_x * yy + (1-yy) * L__3, R_x * yy + (1-yy) * R__3

In [None]:
# Final blend of the Ridge and transformer scores, written to submission.csv.
df_sub = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")
# Ridge part: T_1 / T_2 are the fold-averaged 'class' pipelines (raw / cleaned
# text) and T_3 / T_4 the 'bias' ones. x and y mix raw vs. cleaned, z mixes
# class vs. bias.
# NOTE: this overrides the x, y, z and T_y computed in the Ridge ensemble cell.
x, y, z = 0.5, 0.5, 0.5 
T_y = z*(x*T_1+(1-x)*T_2) + (1-z)*(y*T_3+(1-y)*T_4)
# Transformer part: T_c_k / T_b_k are the idx=1 / idx=2 predictions of
# architecture k (1: distilroberta-base, 2: roberta-base, 3: roberta-large).
# x_, y_, z_ mix the two families per architecture; xx_ mixes architectures
# 1 and 2, yy_ mixes that result with architecture 3.
x_, y_, z_ = 0.5, 0.5, 0.1
xx_, yy_ = 0.5, 0.6
T_x = (xx_*(x_*T_b_1+(1-x_)*(T_c_1))+(1-xx_)*(y_*T_b_2+(1-y_)*(T_c_2))) * yy_ + (1-yy_)*(z_*T_b_3+(1-z_)*(T_c_3))
# ll_ is the weight of the transformer blend against the Ridge blend.
ll_ = 0.5
T_ = T_x * ll_ + (1-ll_) * T_y
df_sub['score'] = T_
df_sub[['comment_id', 'score']].to_csv("submission.csv", index=False)
# Last expression of the cell: shows a preview of the submission in the notebook.
df_sub[['comment_id', 'score']].head()

In [None]:
if False:
    # Disabled alternative: same blend as the active submission cell, but
    # with a different set of weights.
    df_sub = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")

    x, y, z = 0.45, 0.68, 0.39 
    x_, y_, z_ = 0.5, 0.6, 0.1
    xx_, yy_ = 0.8, 0.6
    ll_ = 0.4

    # Ridge blend
    T_y = z*(x*T_1+(1-x)*T_2) + (1-z)*(y*T_3+(1-y)*T_4)
    # Transformer blend
    T_x = (
        (xx_*(x_*T_b_1+(1-x_)*(T_c_1))+(1-xx_)*(y_*T_b_2+(1-y_)*(T_c_2))) * yy_
        + (1-yy_)*(z_*T_b_3+(1-z_)*(T_c_3))
    )
    T_ = T_x * ll_ + (1-ll_) * T_y

    df_sub['score'] = T_
    submission_columns = ['comment_id', 'score']
    df_sub[submission_columns].to_csv("submission.csv", index=False)
    df_sub[submission_columns].head()

In [None]:
if False:
    # Disabled: submission built from a single ensemble's fold-averaged
    # predictions (`test_preds_arr` must exist at module level to enable this).
    df_sub['score'] = test_preds_arr.mean(axis=1)

    # Number of scores that duplicate another score
    score_column = df_sub['score']
    score_column.count() - score_column.nunique()

    output_columns = ['comment_id', 'score']
    df_sub[output_columns].to_csv("submission.csv", index=False)
    df_sub[output_columns].head()

In [None]:
if False:
    # Disabled: grid search over three-model blend weights (w1, w2, w3) on the
    # prediction pairs (p1, p2), (p3, p4), (p5, p6); keeps the five best.
    wts_acc = [
        (w1, w2, w3,
         np.round((w1*p1 + w2*p3 + w3*p5 < w1*p2 + w2*p4 + w3*p6).mean() * 100,2))
        for w1, w2, w3 in (
            (i/100, (100 - i - j)/100, (1 - i/100 - (100 - i - j)/100))
            for i in range(30,70,1)
            for j in range(0,20,1)
        )
    ]
    sorted(wts_acc, key=lambda x:x[3], reverse=True)[:5]