In [None]:
class CFG:
    """Notebook-wide constants, read as class attributes (e.g. ``CFG.seed``)."""

    # --- run control ---
    debug = False       # when True the data-loading cell samples 2000 rows per training frame
    seed = 42           # random_state for that debug sampling
    n_fold = 4

    # --- columns ---
    text = "text_clean2"
    target = "target"

    # --- model / loader settings (not referenced by the cells visible here;
    #     presumably consumed by the RoBERTa part of the pipeline - TODO confirm) ---
    model_name = "../input/roberta-base-edited"
    model_dir = "../input/roberta-001-train-jigsaw2"
    max_len = 256
    target_size = 1
    hidden_size = 768
    fc_dropout = 0.
    batch_size = 32
    num_workers = 4

In [None]:
import os
import gc
import re
import gc
import time
import math
import random
import string
import pickle
from collections import Counter
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib_venn import venn2

from gensim import models
from gensim.models import KeyedVectors,FastText

from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.linear_model import Ridge
from sklearn.feature_selection import SelectKBest, mutual_info_regression,chi2, f_regression
from bs4 import BeautifulSoup

from sklearn.decomposition import TruncatedSVD, NMF, LatentDirichletAllocation
from scipy import sparse
import scipy
import seaborn as sns

from nltk.stem import PorterStemmer, SnowballStemmer
from unicodedata import category, name, normalize
from nltk.stem.lancaster import LancasterStemmer

import lightgbm as lgb

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import OneCycleLR
# NLP
from transformers import AutoTokenizer, AutoModel,get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup


import warnings
# Silence library warnings so the cell outputs stay readable.
warnings.filterwarnings('ignore')

# Use the fully-qualified option key. The bare "max_columns" pattern is ambiguous on
# pandas versions that also define "styler.render.max_columns", where it raises an
# OptionError; "display.max_columns" works on old and new pandas alike.
pd.set_option("display.max_columns", 100)

In [None]:
# Random Seed Initialize
RANDOM_SEED = 42

def seed_everything(seed=RANDOM_SEED):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices) and
    puts cuDNN into deterministic mode.

    Parameters
    ----------
    seed : int
        Seed value applied to all generators.
    """
    # Only affects interpreters started after this point (e.g. worker
    # subprocesses); the hash seed of the running kernel is already fixed.
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seeds every visible GPU; silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and pick different kernels from run to
    # run, which defeats the deterministic flag above - keep it off.
    torch.backends.cudnn.benchmark = False
seed_everything()

# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(f'Using device: {device}')

## Data Loading

In [None]:
# Raw inputs: the Ruddit dataset, the first Jigsaw competition (train, test and
# test labels) and the validation pairs of the severity-rating competition.
train_ruddit_df = pd.read_csv('../input/ruddit-jigsaw-dataset/Dataset/ruddit_with_text.csv')
train_1st_df = pd.read_csv('../input/jigsaw-toxic-comment-classification-challenge/train.csv')
train_1st_test_df = pd.read_csv('../input/jigsaw-toxic-comment-classification-challenge/test.csv')
train_1st_test_lb_df = pd.read_csv('../input/jigsaw-toxic-comment-classification-challenge/test_labels.csv')
validation_df = pd.read_csv("../input/jigsaw-toxic-severity-rating/validation_data.csv")

# Attach the test labels to the first-competition test comments (joined on 'id').
train_1st_test_df = train_1st_test_df.merge(train_1st_test_lb_df, on='id')

# Drop every test row that has a -1 in any column from "toxic" onwards
# (presumably rows that were never labelled - TODO confirm against the dataset docs).
drop_idx = (train_1st_test_df.loc[:,"toxic":]==-1).sum(axis = 1) >=1
train_1st_test_df = train_1st_test_df[~drop_idx].reset_index(drop=True)

# Stack the original train rows and the remaining labelled test rows into one frame.
train_1st_df = pd.concat([train_1st_df,train_1st_test_df]).reset_index(drop = True)

# Debug mode: work on a fixed, seeded 2000-row sample of each training frame.
if CFG.debug:
    train_ruddit_df = train_ruddit_df.sample(n=2000, random_state=CFG.seed).reset_index(drop=True)
    train_1st_df =  train_1st_df.sample(n=2000, random_state=CFG.seed).reset_index(drop=True)

In [None]:
# Harmonise column names: both frames expose the comment as "text",
# and Ruddit's score becomes "target".
train_ruddit_df = train_ruddit_df.rename(
    columns={"txt": "text", "offensiveness_score": "target"}
)
train_1st_df = train_1st_df.rename(columns={"comment_text": "text"})

# Ruddit rows whose text is only a placeholder carry no usable content - drop them.
idx = train_ruddit_df["text"].isin(["[deleted]", "[removed]"])
train_ruddit_df = train_ruddit_df.loc[~idx].reset_index(drop=True)

In [None]:
def make_jigsaw1_data(train_1st_df):
    """Remove validation texts from the Jigsaw-1 frame and build a weighted target.

    Rows whose "text" also appears in the global ``validation_df`` (as either
    ``less_toxic`` or ``more_toxic``) are dropped. Each label column is then
    multiplied by a fixed weight and "target" is the row-wise sum of the columns
    from "toxic" through "identity_hate".

    Returns the filtered, re-indexed frame; row counts are printed along the way.
    """
    print(f"Before:{len(train_1st_df)}")

    # Every distinct comment that occurs on either side of a validation pair.
    val_text_list = list(set(pd.concat([validation_df["less_toxic"],
                                        validation_df["more_toxic"]])))
    is_in_validation = train_1st_df["text"].isin(val_text_list)
    print(f"Num_duplicate_text_{sum(is_in_validation)}")

    train_1st_df = train_1st_df.loc[~is_in_validation].reset_index(drop=True)
    print(f"After:{len(train_1st_df)}")

    # Per-label weights applied before summing into a single score.
    label_weights = {'obscene': 0.16, 'toxic': 0.32, 'threat': 1.5,
                     'insult': 0.64, 'severe_toxic': 1.5, 'identity_hate': 1.5}
    for label, weight in label_weights.items():
        train_1st_df[label] = train_1st_df[label] * weight

    # The slice is positional by column label: it relies on the label columns
    # being contiguous from "toxic" to "identity_hate".
    train_1st_df["target"] = train_1st_df.loc[:, "toxic":"identity_hate"].sum(axis=1)
    return train_1st_df

In [None]:
def under_sample(input_df, frac = 0.1):
    """Keep only a fraction of the rows whose "target" equals 0.

    Rows with a non-zero target are all kept. The zero-target rows are sampled
    with ``frac`` and a fixed ``random_state`` of 1. The result is put back in
    the original row order and re-indexed from 0.
    """
    print(f"before_shape:{len(input_df)}")

    target = input_df["target"]
    kept_zero_rows = input_df[target == 0].sample(frac=frac, random_state=1)
    kept_rows = [kept_zero_rows, input_df[target != 0]]
    out_df = pd.concat(kept_rows).sort_index().reset_index(drop=True)

    print(f"After_shape:{len(out_df)}")
    print(f"Ratio :{len(out_df)/len(input_df)}")
    return out_df

In [None]:
# Drop comments that also occur in the validation pairs and add the weighted "target" column.
train_1st_df = make_jigsaw1_data(train_1st_df)

In [None]:
# Keep 20% of the rows with target == 0; all non-zero rows are retained.
train_1st_df = under_sample(train_1st_df, frac = 0.2)

In [None]:
# Distribution of the weighted target after under-sampling: histogram plus summary statistics.
train_1st_df["target"].hist(bins = 40)
train_1st_df["target"].describe()

## preprocess 
- reference
    - https://www.kaggle.com/sunnymarkliu/more-text-cleaning-to-increase-word-coverage

In [None]:
# Invisible / non-standard whitespace and control characters that are treated as a plain space.
spaces = ['\u200b', '\u200e', '\u202a', '\u202c', '\ufeff', '\uf0d8', '\u2061', '\x10', '\x7f', '\x9d', '\xad', '\xa0']
def remove_space(text):
    """Normalise whitespace in ``text``.

    Every character listed in ``spaces`` is replaced by a regular space, leading
    and trailing whitespace is stripped, and each run of whitespace is collapsed
    to a single space.
    """
    for space in spaces:
        text = text.replace(space, ' ')
    text = text.strip()
    # Raw string: '\s' in a normal string literal is an invalid escape sequence.
    text = re.sub(r'\s+', ' ', text)
    return text

In [None]:
# Typographic punctuation and invisible characters -> ASCII replacement.
special_punc_mappings = {"—": "-", "–": "-", "_": "-", '”': '"', "″": '"', '“': '"', '•': '.', '−': '-',
                         "’": "'", "‘": "'", "´": "'", "`": "'", '\u200b': ' ', '\xa0': ' ','،':'','„':'',
                         '…': ' ... ', '\ufeff': ''}
def clean_special_punctuations(text):
    """Replace each key of ``special_punc_mappings`` found in ``text`` by its mapped value."""
    # str.replace is a no-op when the symbol is absent, so no membership test is needed.
    for symbol, replacement in special_punc_mappings.items():
        text = text.replace(symbol, replacement)
    return text

In [None]:
# number
def clean_number(text):
    """Normalise how numbers are written inside ``text``.

    1. Insert a space between a number and a letter glued to it ("5kg" -> "5 kg").
    2. Re-attach ordinal suffixes split by step 1 ("1 st " -> "1st ").
    3. Remove commas between digits ("1,000,000" -> "1000000").
    """
    # Raw strings throughout: '\g' and '\d' are invalid escapes in normal literals.
    text = re.sub(r'(\d+)([a-zA-Z])', r'\g<1> \g<2>', text)
    text = re.sub(r'(\d+) (th|st|nd|rd) ', r'\g<1>\g<2> ', text)
    # Look-arounds do not consume the digits, so every separator of a multi-group
    # number is removed; a consuming '(\d+),(\d+)' left "1000,000" behind.
    text = re.sub(r'(?<=\d),(?=\d)', '', text)
    return text

In [None]:
# Literal substring -> replacement. Applied in insertion order, so longer / more specific
# keys must stay ahead of their prefixes (e.g. 'U.S.A.' before 'U.S.', 'f***' before 'f**').
rare_words_mapping = {' s.p ': ' ', ' S.P ': ' ', 'U.s.p': '', 'U.S.A.': 'USA', 'u.s.a.': 'USA', 'U.S.A': 'USA',
                      'u.s.a': 'USA', 'U.S.': 'USA', 'u.s.': 'USA', ' U.S ': ' USA ', ' u.s ': ' USA ', 'U.s.': 'USA',
                      # ' U.s ' used to map to 'USA' without the surrounding spaces, which glued
                      # the neighbouring words together ("the U.s is" -> "theUSAis").
                      ' U.s ': ' USA ', ' u.S ': ' USA ', 'fu.k': 'fuck', 'U.K.': 'UK', ' u.k ': ' UK ',
                      ' don t ': ' do not ', 'bacteries': 'batteries', ' yr old ': ' years old ', 'Ph.D': 'PhD',
                      'cau.sing': 'causing', 'Kim Jong-Un': 'The president of North Korea', 'savegely': 'savagely',
                      'Ra apist': 'Rapist', '2fifth': 'twenty fifth', '2third': 'twenty third',
                      '2nineth': 'twenty nineth', '2fourth': 'twenty fourth', '#metoo': 'MeToo',
                      'Trumpcare': 'Trump health care system', '4fifth': 'forty fifth', 'Remainers': 'remainder',
                      'Terroristan': 'terrorist', 'antibrahmin': 'anti brahmin',
                      'fuckboys': 'fuckboy', 'Fuckboys': 'fuckboy', 'Fuckboy': 'fuckboy', 'fuckgirls': 'fuck girls',
                      'fuckgirl': 'fuck girl', 'Trumpsters': 'Trump supporters', '4sixth': 'forty sixth',
                      'culturr': 'culture',
                      'weatern': 'western', '4fourth': 'forty fourth', 'emiratis': 'emirates', 'trumpers': 'Trumpster',
                      'indans': 'indians', 'mastuburate': 'masturbate', 'f**k': 'fuck', 'F**k': 'fuck', 'F**K': 'fuck',
                      ' u r ': ' you are ', ' u ': ' you ', '操你妈': 'fuck your mother', 'e.g.': 'for example',
                      'i.e.': 'in other words', '...': '.', 'et.al': 'elsewhere', 'anti-Semitic': 'anti-semitic',
                      'f***': 'fuck', 'f**': 'fuc', 'F***': 'fuck', 'F**': 'fuc','f*ck':'fuck',
                      'a****': 'assho', 'a**': 'ass', 'h***': 'hole', 'A****': 'assho', 'A**': 'ass', 'H***': 'hole',
                      's***': 'shit', 's**': 'shi', 'S***': 'shit', 'S**': 'shi', 'Sh**': 'shit','sh*t':'shit',
                      'p****': 'pussy', 'p*ssy': 'pussy', 'P****': 'pussy',
                      'p***': 'porn', 'p*rn': 'porn', 'P***': 'porn',
                      'st*up*id': 'stupid',
                      'd***': 'dick', 'di**': 'dick', 'h*ck': 'hack',
                      'b*tch': 'bitch', 'bi*ch': 'bitch', 'bit*h': 'bitch', 'bitc*': 'bitch', 'b****': 'bitch',
                      'b***': 'bitc', 'b**': 'bit', 'b*ll': 'bull'
                      }


def pre_clean_rare_words(text):
    """Apply every ``rare_words_mapping`` replacement to ``text``.

    Keys are matched as literal, case-sensitive substrings and are applied one
    after another in the mapping's insertion order, so a later rule sees the
    output of the earlier ones.
    """
    for rare_word, replacement in rare_words_mapping.items():
        if rare_word in text:
            text = text.replace(rare_word, replacement)
    return text

In [None]:
# de-contract the contraction
_APOSTROPHE = "['’]"     # straight or typographic apostrophe
_WORD_END = r"(?!\w)"    # the contraction must not continue into another word character

# (pattern, replacement) pairs, applied top to bottom. The specific forms come
# first so that e.g. "won't" is not caught by the generic "n't" rule.
_CONTRACTION_RULES = [
    (re.compile(pattern + _WORD_END), replacement)
    for pattern, replacement in [
        # specific
        ("[Ww]on" + _APOSTROPHE + "t", "will not"),
        ("[Cc]an" + _APOSTROPHE + "t", "can not"),
        ("[Yy]" + _APOSTROPHE + "all", "you all"),
        ("[Yy]a" + _APOSTROPHE + "ll", "you all"),
        # general
        ("[Ii]" + _APOSTROPHE + "m", "i am"),
        ("[Aa]in" + _APOSTROPHE + "t", "is not"),
        ("n" + _APOSTROPHE + "t", " not"),
        (_APOSTROPHE + "re", " are"),
        (_APOSTROPHE + "s", " is"),
        (_APOSTROPHE + "d", " would"),
        (_APOSTROPHE + "ll", " will"),
        (_APOSTROPHE + "t", " not"),
        (_APOSTROPHE + "ve", " have"),
    ]
]

def decontracted(text):
    """Expand English contractions ("can't" -> "can not", "they're" -> "they are").

    The rules used to require a literal space after the contraction, so anything
    followed by punctuation, a newline or the end of the text ("I can't.") was
    left untouched. They now match wherever the contraction is not followed by
    another word character; text followed by a space is expanded as before.

    Note that "'s" is always expanded to " is", including possessives.
    """
    for pattern, replacement in _CONTRACTION_RULES:
        text = pattern.sub(replacement, text)
    return text

In [None]:
regular_punct = list(string.punctuation)
extra_punct = [
    ',', '.', '"', ':', ')', '(', '!', '?', '|', ';', "'", '$', '&',
    '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£',
    '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',
    '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', '“', '★', '”',
    '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾',
    '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', '▒', '：', '¼', '⊕', '▼',
    '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲',
    'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', '∙', '）', '↓', '、', '│', '（', '»',
    '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø',
    '¹', '≤', '‡', '√', '«', '»', '´', 'º', '¾', '¡', '§', '£', '₤']
# Union of both lists without duplicates; '-' and '.' are deliberately left out,
# so hyphenated words and dotted tokens survive the two helpers below.
all_punct = [symbol for symbol in set(regular_punct + extra_punct)
             if symbol not in ('-', '.')]

def spacing_punctuation(text):
    """Surround every symbol of ``all_punct`` that occurs in ``text`` with spaces."""
    for symbol in all_punct:
        text = text.replace(symbol, f' {symbol} ')
    return text
def remove_punctuation(text):
    """Replace every symbol of ``all_punct`` that occurs in ``text`` with one space."""
    for symbol in all_punct:
        text = text.replace(symbol, ' ')
    return text

In [None]:
# Question / connective words that get split off from whatever they are glued to.
mis_connect_list = ['(W|w)hat', '(W|w)hy', '(H|h)ow', '(W|w)hich', '(W|w)here', '(W|w)ill']
mis_connect_re = re.compile('(%s)' % '|'.join(mis_connect_list))

# Literal misspellings fixed before the regex rules run.
mis_spell_mapping = {'whattsup': 'WhatsApp', 'whatasapp':'WhatsApp', 'whatsupp':'WhatsApp', 
                      'whatcus':'what cause', 'arewhatsapp': 'are WhatsApp', 'Hwhat':'what',
                      'Whwhat': 'What', 'whatshapp':'WhatsApp', 'howhat':'how that',
                      # why
                      'Whybis':'Why is', 
                      # How
                      "Howddo":"How do", 'Howeber':'However'}
def spacing_some_connect_words(text):
    """
    Separate question/connective words from adjacent text, e.g. 'Whyare' -> 'Why are'.

    Steps, in order: apply the literal fixes in ``mis_spell_mapping``; normalise
    space-delimited tokens that are a target word plus one stray character;
    put spaces around every match of ``mis_connect_re``; repair "WhatsApp";
    finally normalise whitespace with ``remove_space``.
    """
    # NOTE(review): 'ori' is never read again in this function.
    ori = text
    for error in mis_spell_mapping:
        if error in text:
            text = text.replace(error, mis_spell_mapping[error])
            
    # what
    # NOTE(review): inside a character class '|' is a literal, so [A|a] also matches '|'.
    text = re.sub(r" (W|w)hat+(s)*[A|a]*(p)+ ", " WhatsApp ", text)
    text = re.sub(r" (W|w)hat\S ", " What ", text)
    text = re.sub(r" \S(W|w)hat ", " What ", text)
    # why
    text = re.sub(r" (W|w)hy\S ", " Why ", text)
    text = re.sub(r" \S(W|w)hy ", " Why ", text)
    # How
    text = re.sub(r" (H|h)ow\S ", " How ", text)
    text = re.sub(r" \S(H|h)ow ", " How ", text)
    # which
    text = re.sub(r" (W|w)hich\S ", " Which ", text)
    text = re.sub(r" \S(W|w)hich ", " Which ", text)
    # where
    text = re.sub(r" (W|w)here\S ", " Where ", text)
    text = re.sub(r" \S(W|w)here ", " Where ", text)
    # 
    # NOTE(review): the pattern has no word boundaries, so it also splits inside
    # longer words ("show" -> "s how", "willing" -> "will ing") - confirm this is intended.
    text = mis_connect_re.sub(r" \1 ", text)
    # The split above breaks "WhatsApp" into "What sApp"; glue it back together.
    text = text.replace("What sApp", 'WhatsApp')
    
    text = remove_space(text)
    return text

In [None]:
def remove_number(text):
    """Delete every run of ASCII digits from ``text``."""
    return re.sub(r'[0-9]+', '', text)

In [None]:
def get_lower(text):
    """Return ``text`` converted to lower case."""
    return text.lower()

In [None]:
# https://www.kaggle.com/manabendrarout/pytorch-roberta-ranking-baseline-jrstc-train/notebook
def text_cleaning_1(text):
    '''
    Light cleaning that keeps punctuation and casing.

    1. Removes embedded URL links
    2. Removes HTML tags
    3. Removes emojis (characters matched by the emoji ranges below)
    4. Normalises whitespace, typographic punctuation, numbers, rare words
       and contractions, then puts spaces around punctuation and around
       question/connective words

    text - Text piece to be cleaned.
    '''
    # URLs go first so their punctuation is not spaced out by later steps.
    text = re.sub(r'https?://\S+|www\.\S+', r'', text)

    # Keep only the visible text of any HTML markup.
    text = BeautifulSoup(text, 'lxml').get_text()

    # NOTE(review): the last range (U+24C2..U+1F251) is far wider than emojis and
    # also covers e.g. CJK characters - confirm that removing those is intended.
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text)

    # Order matters: each step works on the output of the previous one.
    pipeline = (
        remove_space,
        clean_special_punctuations,
        clean_number,
        pre_clean_rare_words,
        decontracted,
        spacing_punctuation,
        spacing_some_connect_words,
        remove_space,
    )
    for step in pipeline:
        text = step(text)

    return text

In [None]:
def text_cleaning_2(text):
    '''
    Aggressive cleaning that keeps only ASCII letters and digits (case preserved).

    1. Removes embedded URL links
    2. Removes HTML tags
    3. Removes emojis
    4. Replaces every character outside a-z, A-Z and digits with a space
    5. Collapses repeated spaces and strips both ends

    text - Text piece to be cleaned.
    '''
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    without_urls = url_pattern.sub(r'', text)

    # Keep only the visible text of any HTML markup.
    visible_text = BeautifulSoup(without_urls, 'lxml').get_text()

    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    without_emoji = emoji_pattern.sub(r'', visible_text)

    alnum_only = re.sub(r"[^a-zA-Z\d]", " ", without_emoji)
    return re.sub(' +', ' ', alnum_only).strip()

In [None]:
def text_cleaning_3(text):
    '''
    Most aggressive cleaning: lower-case letters only.

    1. Removes embedded URL links
    2. Removes HTML tags
    3. Removes emojis
    4. Replaces every character outside a-z, A-Z and digits with a space
    5. Removes digits
    6. Collapses repeated spaces, strips both ends and lower-cases

    text - Text piece to be cleaned.
    '''
    cleaned = re.compile(r'https?://\S+|www\.\S+').sub(r'', text)

    # Keep only the visible text of any HTML markup.
    cleaned = BeautifulSoup(cleaned, 'lxml').get_text()

    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    cleaned = emoji_pattern.sub(r'', cleaned)

    cleaned = re.sub(r"[^a-zA-Z\d]", " ", cleaned)
    cleaned = remove_number(cleaned)
    cleaned = re.sub(' +', ' ', cleaned).strip()
    return get_lower(cleaned)

## basic_feat

In [None]:
def make_basic_feat(input_df):
    """Build simple length features from the "text" column.

    Returns a new frame, indexed like ``input_df``, with two columns:
    "Num_character" (``len`` of the text) and "Num_word" (number of
    whitespace-separated tokens).
    """
    texts = input_df["text"]
    return pd.DataFrame({
        "Num_character": texts.map(len),
        "Num_word": texts.map(lambda doc: len(doc.split())),
    })

In [None]:
# Character / word counts of the raw "text" column for both training frames.
basic_feat_df = make_basic_feat(train_1st_df)
basic_feat_df_ruddit = make_basic_feat(train_ruddit_df)

# Preview the first rows of each feature frame.
display(basic_feat_df.head())
display(basic_feat_df_ruddit.head())

## Tfidf / Countvec

In [None]:
def get_tfidf(input_df, col_name, mode = "char_wb"):
    """Fit a TF-IDF vectorizer on one text column.

    mode "char_wb": character 3-5-grams inside word boundaries.
    mode "n_gram" : word uni- and bi-grams.
    Both variants use min_df=3 and max_df=0.5.

    Returns ``(matrix, fitted_vectorizer)``; raises ``Exception`` for any other mode.
    """
    documents = input_df[col_name].copy()

    if mode not in ("char_wb", "n_gram"):
        raise Exception("SelectMode char_wb or n_gram")

    settings_by_mode = {
        "char_wb": dict(min_df=3, max_df=0.5, analyzer='char_wb', ngram_range=(3,5)),
        "n_gram": dict(ngram_range=(1,2), min_df=3, max_df=0.5),
    }
    vectorizer = TfidfVectorizer(**settings_by_mode[mode])
    return vectorizer.fit_transform(documents), vectorizer

In [None]:
def get_countvec(input_df, col_name, mode = "char_wb"):
    """Fit a count vectorizer on one text column.

    mode "char_wb": character 3-5-grams inside word boundaries.
    mode "n_gram" : word uni- and bi-grams.
    Both variants use min_df=3 and max_df=0.5.

    Returns ``(matrix, fitted_vectorizer)``; raises ``Exception`` for any other mode.
    """
    documents = input_df[col_name].copy()

    if mode not in ("char_wb", "n_gram"):
        raise Exception("SelectMode char_wb or n_gram")

    settings_by_mode = {
        "char_wb": dict(min_df=3, max_df=0.5, analyzer='char_wb', ngram_range=(3,5)),
        "n_gram": dict(ngram_range=(1,2), min_df=3, max_df=0.5),
    }
    vectorizer = CountVectorizer(**settings_by_mode[mode])
    count_mat = vectorizer.fit_transform(documents)
    return count_mat, vectorizer

## gensim_w2v_embedding

In [None]:
def get_gensim_embed(texts,ndim):
    """Average word vectors of the global ``w2v_model`` per text.

    Each text is split on whitespace. Words missing from the model contribute a
    zero vector, so they still count in the mean. Texts without any token keep
    an all-zero row.

    Returns an array of shape ``(len(texts), ndim)``.
    """
    swem_embedding = np.zeros((len(texts), ndim))

    for row, sentence in enumerate(tqdm(texts)):
        word_vectors = []
        for token in sentence.split():
            if w2v_model.key_to_index.get(token) is not None:
                word_vectors.append(w2v_model.get_vector(token))
            else:
                word_vectors.append(np.zeros(ndim, dtype=np.float32))
        if word_vectors:
            swem_embedding[row] = np.mean(np.stack(word_vectors), axis=0)
    return swem_embedding

## fasttext_embedding

In [None]:
def get_fasttext_embed(texts,ndim):
    """Average the vectors of the global ``fmodel`` (``fmodel.wv``) per text.

    Each text is split on whitespace; texts without any token keep an all-zero row.

    Returns an array of shape ``(len(texts), ndim)``.
    """
    swem_embedding = np.zeros((len(texts), ndim))

    for row, sentence in enumerate(tqdm(texts)):
        tokens = sentence.split()
        if tokens:
            swem_embedding[row] = np.mean(fmodel.wv[tokens], axis = 0)
    return swem_embedding

## Decomposition

In [None]:
def make_decompositon_mat(matrix, n_components = 300,mode = "svd",random_state = 0):
    """Reduce ``matrix`` to ``n_components`` dimensions.

    mode "svd" -> TruncatedSVD, "nmf" -> NMF, "lda" -> LatentDirichletAllocation.

    Returns ``(reduced_matrix, fitted_transformer)``; raises ``Exception`` for
    any other mode.
    """
    if mode not in ("svd", "nmf", "lda"):
        raise Exception("check mode")

    transformer_classes = {
        "svd": TruncatedSVD,
        "nmf": NMF,
        "lda": LatentDirichletAllocation,
    }
    transfomer = transformer_classes[mode](n_components = n_components,
                                           random_state=random_state)
    return transfomer.fit_transform(matrix), transfomer

In [None]:
def select_chi2_feature(matrix,y, k=1000):
    """Keep the ``k`` columns of ``matrix`` with the highest chi-squared score against ``y``.

    ``k`` is capped at the number of columns, so a small vocabulary no longer
    trips SelectKBest's "k <= n_features" check; ``k='all'`` is passed through.

    Returns ``(filtered_matrix, fitted_selector)``.
    """
    if k != 'all':
        k = min(k, matrix.shape[1])
    ch2_selector = SelectKBest(chi2, k=k)
    matrix_filtered = ch2_selector.fit_transform(matrix, y)
    return matrix_filtered, ch2_selector

In [None]:
def select_f_reg_feature(matrix,y, k=1000):
    """Keep the ``k`` columns of ``matrix`` with the highest f_regression score against ``y``.

    ``k`` is capped at the number of columns, so a small vocabulary no longer
    trips SelectKBest's "k <= n_features" check; ``k='all'`` is passed through.

    Returns ``(filtered_matrix, fitted_selector)``.
    """
    if k != 'all':
        k = min(k, matrix.shape[1])
    f_selector = SelectKBest(f_regression, k=k)
    matrix_filtered = f_selector.fit_transform(matrix, y)
    return matrix_filtered, f_selector

## Regression_model

In [None]:
def train(X, y, cv, model):
    """Cross-validated fit of ``model``, returning out-of-fold predictions.

    Parameters
    ----------
    X : feature matrix indexable by an array of row indices (``X[train_idx]``).
    y : targets, indexable the same way.
    cv : list of ``(train_idx, val_idx)`` pairs (must support ``len``).
    model : estimator with ``fit`` / ``predict``; used as a template only.

    Returns
    -------
    oof_pred : np.ndarray with one prediction per row of ``y``.
    val_score_df : DataFrame with columns "fold" and "RMSE".
    model_list : one independently fitted estimator per fold.
    """
    import copy  # local import: only needed to give every fold its own estimator

    oof_pred = np.zeros(len(y))
    val_rmse_list = []
    model_list = []

    for fold, (train_idx, val_idx) in enumerate(cv):
        print("*" * 100)
        print(f"FOLD : {fold + 1} / {len(cv)}")
        tra_x, tra_y = X[train_idx], y[train_idx]
        val_x, val_y = X[val_idx], y[val_idx]

        # Fit a fresh copy per fold. Refitting and appending the same object made
        # every entry of model_list point at the estimator of the last fold.
        fold_model = copy.deepcopy(model)
        fold_model.fit(tra_x,tra_y)

        val_pred = fold_model.predict(val_x)
        oof_pred[val_idx] = val_pred

        valid_score = np.sqrt(mean_squared_error(val_y, val_pred))
        val_rmse_list.append(valid_score)
        print(f"RMSE : {valid_score:.5f}")

        model_list.append(fold_model)

    print("**"*10,"FINISH","**"*10)
    oof_score = np.sqrt(mean_squared_error(y, oof_pred))
    print(f"oof_score:{oof_score}")
    val_score_df = pd.DataFrame({"fold" : np.arange(len(cv)), "RMSE" : val_rmse_list})

    return oof_pred, val_score_df, model_list

### lgb

In [None]:
def fit_lgbm(X, y, cv, params: dict=None, verbose=False):
    """Cross-validated LightGBM training with early stopping.

    Parameters
    ----------
    X : feature matrix indexable by an array of row indices.
    y : targets, indexable the same way.
    cv : list of ``(train_idx, val_idx)`` pairs (must support ``len``).
    params : LightGBM parameter dict; an empty dict is used when None.
    verbose : False for silent training, True to log every round, or an int
        logging period.

    Returns
    -------
    oof_pred : np.ndarray with one prediction per row of ``y``.
    val_score_df : DataFrame with columns "fold" and "RMSE".
    model_list : one trained booster per fold.
    """
    if params is None:
        params = {}

    # Early stopping and logging go through callbacks: the early_stopping_rounds /
    # verbose_eval keywords of lgb.train were removed in LightGBM 4.
    callbacks = [lgb.early_stopping(stopping_rounds=50, verbose=bool(verbose))]
    if verbose:
        # log_evaluation is the current name; older releases call it print_evaluation.
        log_callback = getattr(lgb, "log_evaluation", None) or getattr(lgb, "print_evaluation")
        callbacks.append(log_callback(period=1 if verbose is True else int(verbose)))

    oof_pred = np.zeros(len(y))
    val_rmse_list = []
    model_list = []

    for fold, (train_idx, val_idx) in enumerate(cv):
        print("*" * 100)
        print(f"FOLD : {fold + 1} / {len(cv)}")
        tra_x, tra_y = X[train_idx], y[train_idx]
        val_x, val_y = X[val_idx], y[val_idx]

        dtrain = lgb.Dataset(tra_x, tra_y)
        dvalid = lgb.Dataset(val_x, val_y)

        model = lgb.train(
            params, dtrain,
            valid_sets=[dtrain, dvalid],
            valid_names=['train', 'valid'],
            callbacks=callbacks
        )

        val_pred = model.predict(val_x)
        oof_pred[val_idx] = val_pred

        valid_score = np.sqrt(mean_squared_error(val_y, val_pred))
        val_rmse_list.append(valid_score)
        print(f"RMSE : {valid_score:.5f}")

        model_list.append(model)

    print("**"*10,"FINISH","**"*10)
    oof_score = np.sqrt(mean_squared_error(y, oof_pred))
    print(f"oof_score:{oof_score}")
    val_score_df = pd.DataFrame({"fold" : np.arange(len(cv)), "RMSE" : val_rmse_list})

    return oof_pred, val_score_df, model_list

In [None]:
%%time
# Apply the three cleaning variants to both training frames.
# tqdm.pandas() registers .progress_apply, which shows a progress bar per column.
tqdm.pandas()
train_1st_df["text_clean1"] = train_1st_df['text'].progress_apply(text_cleaning_1)
train_1st_df["text_clean2"] = train_1st_df['text'].progress_apply(text_cleaning_2)
train_1st_df["text_clean3"] = train_1st_df['text'].progress_apply(text_cleaning_3)

train_ruddit_df["text_clean1"] = train_ruddit_df['text'].progress_apply(text_cleaning_1)
train_ruddit_df["text_clean2"] = train_ruddit_df['text'].progress_apply(text_cleaning_2)
train_ruddit_df["text_clean3"] = train_ruddit_df['text'].progress_apply(text_cleaning_3)

### Tf-idf / CountVec

In [None]:
# Cleaned-text columns that every text feature (TF-IDF, word2vec, fastText) is built from.
use_cols = [
    "text_clean1",
    "text_clean2",
    "text_clean3"
]

In [None]:
def get_tfidf_dict(input_df, use_cols = use_cols):
    '''
    Fit a char_wb and an n_gram TF-IDF on every column in ``use_cols``.

    Returns two dicts sharing the keys "tfidf_char_<col>" and "tfidf_ngram_<col>":
    the first holds the TF-IDF matrices, the second the fitted vectorizers.
    The shapes of both matrices are printed for every column.
    '''
    tfidf_df_dict, tfidf_vec_dict = {}, {}

    for _col in tqdm(use_cols):
        fitted = {
            "char": get_tfidf(input_df, _col, mode = "char_wb"),
            "ngram": get_tfidf(input_df, _col, mode = "n_gram"),
        }

        for matrix, _ in fitted.values():
            print(matrix.shape)

        for kind, (matrix, vectorizer) in fitted.items():
            key = f"tfidf_{kind}_{_col}"
            tfidf_df_dict[key] = matrix
            tfidf_vec_dict[key] = vectorizer

    return tfidf_df_dict,tfidf_vec_dict

In [None]:
# Fit separate TF-IDF vectorizers for each dataset (6 per dataset: 3 columns x 2 modes).
tfidf_df_dict,tfidf_vec_dict = get_tfidf_dict(train_1st_df, use_cols = use_cols)
tfidf_df_dict_ruddit,tfidf_vec_dict_ruddit = get_tfidf_dict(train_ruddit_df, use_cols = use_cols)

In [None]:
def save_pkl(file_name, processor):
    """Pickle ``processor`` to ``./<file_name>`` and print "FINISH".

    Parameters
    ----------
    file_name : name of the output file, relative to the working directory.
    processor : any picklable object (fitted vectorizers, models, ...).
    """
    OUTPUT_DIR = './'
    file_name = os.path.join(OUTPUT_DIR,file_name)
    # Context manager: the handle is flushed and closed even if pickling fails.
    with open(file_name, 'wb') as out_file:
        pickle.dump(processor, out_file)

    print("FINISH")

In [None]:
def load_pkl(file_path):
    """Load and return the object pickled at ``file_path``.

    SECURITY: unpickling executes code embedded in the file - only use this on
    files written by this pipeline, never on untrusted input.
    """
    # Context manager: the handle is closed as soon as the object is loaded.
    with open(file_path, 'rb') as in_file:
        out_object = pickle.load(in_file)
    return out_object

In [None]:
# Persist the fitted vectorizers of both datasets in the working directory.
save_pkl("tfidf_vec_dict.pkl",tfidf_vec_dict)
save_pkl("tfidf_vec_dict_ruddit.pkl",tfidf_vec_dict_ruddit)

## SWEM
### Gensim(GoogleNews)

In [None]:
%%time
# Word vectors read by get_gensim_embed through the global name w2v_model.
w2v_model = KeyedVectors.load("../input/gensim-googlenewsvectorsnegative300/GoogleNews-vectors-negative300.gensim")

In [None]:
# Mean word2vec embedding (300-d) of every cleaned-text column, for both datasets.
# Keys follow the pattern "gensim_<column>".
gensim_df_dict = {}
gensim_df_dict_ruddit = {}
for _col in tqdm(use_cols):
    gensim_df_dict[f"gensim_{_col}"] = get_gensim_embed(train_1st_df[_col],ndim=300)
    gensim_df_dict_ruddit[f"gensim_{_col}"] = get_gensim_embed(train_ruddit_df[_col],ndim=300)

### Fasttext

In [None]:
%%time
# fastText model read by get_fasttext_embed through the global name fmodel.
fmodel = FastText.load('../input/jigsaw-regression-based-data/FastText-jigsaw-256D/Jigsaw-Fasttext-Word-Embeddings-256D.bin')

In [None]:
# Mean fastText embedding (256-d) of every cleaned-text column, for both datasets.
# Keys follow the pattern "fasttext_<column>".
fasttext_df_dict = {}
fasttext_df_dict_ruddit = {}
for _col in tqdm(use_cols):
    fasttext_df_dict[f"fasttext_{_col}"] = get_fasttext_embed(train_1st_df[_col],ndim=256)
    fasttext_df_dict_ruddit[f"fasttext_{_col}"] = get_fasttext_embed(train_ruddit_df[_col],ndim=256)

## Decomposition

In [None]:
def select_topfeat_and_svd(input_df_dict, y_for_select, classif = True):
    '''
    For every matrix in ``input_df_dict``: keep the 5000 best-scoring columns,
    then compress them to 100 SVD components.

    Column scoring uses chi2 when ``classif`` is truthy and f_regression otherwise.

    Returns three dicts keyed like ``input_df_dict``: the reduced matrices,
    the fitted selectors and the fitted SVD transformers.
    '''
    select_features = select_chi2_feature if classif else select_f_reg_feature

    tfidf_filtered_SVD_dict = {}
    tfidf_feat_selector = {}
    tfidf_svd_transfomer = {}

    for name, matrix in tqdm(input_df_dict.items()):
        # select 5000 feats
        top_columns, feat_selector = select_features(matrix, y_for_select, k=5000)
        # SVD
        svd_features, svd = make_decompositon_mat(top_columns,
                                                  n_components = 100,
                                                  mode = "svd",
                                                  random_state = 0 )

        key = f"{name}"
        tfidf_filtered_SVD_dict[key] = svd_features
        tfidf_feat_selector[key] = feat_selector
        tfidf_svd_transfomer[key] = svd

    return tfidf_filtered_SVD_dict, tfidf_feat_selector, tfidf_svd_transfomer

In [None]:
# Feature selection + SVD for the Jigsaw-1 TF-IDF matrices.
# Despite its name, y_for_chi2 is used with f_regression here (classif = False).
y_for_chi2 = train_1st_df["target"].values
tfidf_filtered_SVD_dict, tfidf_feat_selector, tfidf_svd_transfomer = select_topfeat_and_svd(tfidf_df_dict, y_for_chi2, classif = False)
print(tfidf_filtered_SVD_dict.keys())
print(tfidf_feat_selector.keys())
print(tfidf_svd_transfomer.keys())

In [None]:
# Persist the fitted selectors and SVD transformers of the Jigsaw-1 features.
save_pkl("tfidf_feat_selector.pkl",tfidf_feat_selector)
save_pkl("tfidf_svd_transfomer.pkl",tfidf_svd_transfomer)

In [None]:
# ruddit
# Same feature selection (f_regression) + SVD, fitted on the Ruddit TF-IDF matrices.
y_for_reg = train_ruddit_df["target"].values
tfidf_filtered_SVD_dict_ruddit, tfidf_feat_selector_ruddit, tfidf_svd_transfomer_ruddit = select_topfeat_and_svd(tfidf_df_dict_ruddit, y_for_reg, classif = False)
print(tfidf_filtered_SVD_dict_ruddit.keys())
print(tfidf_feat_selector_ruddit.keys())
print(tfidf_svd_transfomer_ruddit.keys())

In [None]:
# Persist the fitted selectors and SVD transformers of the Ruddit features.
save_pkl("tfidf_feat_selector_ruddit.pkl",tfidf_feat_selector_ruddit)
save_pkl("tfidf_svd_transfomer_ruddit.pkl",tfidf_svd_transfomer_ruddit)

## Start training (jigsaw 1st)

In [None]:
# 5-fold shuffled split with a fixed seed (note: CFG.n_fold is not used here).
fold = KFold(n_splits=5,shuffle=True,random_state=0)

# Materialise the (train_idx, val_idx) pairs so they can be reused for every model.
cv = list(fold.split(train_1st_df)) 
cv_ruddit = list(fold.split(train_ruddit_df)) 

# Regression targets of each dataset.
y = train_1st_df["target"]
y_ruddit = train_ruddit_df["target"]

### Ridge(TF-IDF)

In [None]:
# One cross-validated Ridge regression (alpha = 2) per Jigsaw-1 TF-IDF matrix.
# ridge_model_dict_1 maps the TF-IDF key to the list of fold models.
alpha = 2
ridge_model_dict_1 = {}
for k,v in tfidf_df_dict.items():
    print("====="*20)
    print("Get Started.")
    print(f"use_data : {k}")
    print("====="*20)
    model = Ridge(alpha=alpha)
    oof_pred, val_score_df, model_list = train(v, y, cv, model)
    display(val_score_df)
    ridge_model_dict_1[k] = model_list
    
print("====="*20)
print("FINISHED.")

In [None]:
# Persist the Ridge fold models trained on the Jigsaw-1 TF-IDF features.
save_pkl("ridge_model_dict_1.pkl",ridge_model_dict_1)

### lgb

In [None]:
# LightGBM parameters shared by every fit_lgbm call below.
params ={
    "boosting_type" : "gbdt",
    "objective" : "rmse", 
    "num_boost_round": 10000,  # upper bound; fit_lgbm stops early after 50 stale rounds
    "max_depth" : 5, 
    "num_leaves" : 28,  
    "learning_rate": 0.05, 
    "feature_fraction" : 0.5, 
    'bagging_fraction': 0.8,
    "bagging_freq" : 5, 
    "seed" : 42, 
    "reg_alpha":1, 
    "reg_lambda":1, 
    'min_child_samples': 20,
    'verbose':-1
}

#### lgb(gensim & Fasttext & basic_feat)

In [None]:
# One LightGBM model set per cleaned-text column on Jigsaw-1: the word2vec and
# fastText embeddings of that column are stacked with the two length features.
# Pairing by zip relies on both dicts having been filled in the same use_cols order.
lgb_model_dict_2 = {}
use_basic_feat = ["Num_character", "Num_word"]
for _gen, _fast in zip(gensim_df_dict.items(),fasttext_df_dict.items()):
    
    # Column-wise concatenation: 300 + 256 + 2 features per row.
    mat = np.hstack([_gen[1], 
                     _fast[1],
                     basic_feat_df[use_basic_feat].values
                    ])
    
    name = f"{_gen[0]}&{_fast[0]}&basic_feat"
    print("====="*20)
    print("Get Started.")
    print(f"use_data : {name}")
    print("====="*20)
    oof_pred, val_score_df, model_list = fit_lgbm(mat, y,  cv=cv,params=params)
    display(val_score_df)
    lgb_model_dict_2[name] = model_list
    
print("====="*20)
print("FINISHED.")

In [None]:
# Persist the LightGBM fold models trained on the Jigsaw-1 embedding features.
save_pkl("lgb_model_dict_2.pkl",lgb_model_dict_2)

#### lgb(TF-IDF(SVD))

In [None]:
# LightGBM on all Jigsaw-1 SVD blocks stacked side by side
# (6 TF-IDF matrices x 100 components, hence the "600dim" key).
concat_mat = []
lgb_model_dict_3 = {}
for k,v in tfidf_filtered_SVD_dict.items():
    concat_mat.append(v)
    
concat_mat =  np.hstack(concat_mat)
oof_pred, val_score_df, model_list = fit_lgbm(concat_mat, y,  cv=cv,params=params)
lgb_model_dict_3["TF-IDF(SVD)_600dim"] = model_list
print("====="*20)
print("FINISHED.")

In [None]:
# Persist the LightGBM fold models trained on the Jigsaw-1 SVD features.
save_pkl("lgb_model_dict_3.pkl",lgb_model_dict_3)

## Start training(ruddit)

### Ridge(TF-IDF)

In [None]:
# One cross-validated Ridge regression (alpha = 1) per Ruddit TF-IDF matrix.
# ridge_model_dict_1_ruddit maps the TF-IDF key to the list of fold models.
alpha = 1
ridge_model_dict_1_ruddit = {}
for k,v in tfidf_df_dict_ruddit.items():
    print("====="*20)
    print("Get Started.")
    print(f"use_data : {k}")
    print("====="*20)
    model = Ridge(alpha=alpha)
    oof_pred, val_score_df, model_list = train(v, y_ruddit, cv_ruddit, model)
    display(val_score_df)
    ridge_model_dict_1_ruddit[k] = model_list
    
print("====="*20)
print("FINISHED.")

In [None]:
# Persist the Ridge fold models trained on the Ruddit TF-IDF features.
save_pkl("ridge_model_dict_1_ruddit.pkl",ridge_model_dict_1_ruddit)

#### lgb(gensim & Fasttext & basic_feat)

In [None]:
# One LightGBM model set per cleaned-text column on Ruddit: the word2vec and
# fastText embeddings of that column are stacked with the two length features.
# Pairing by zip relies on both dicts having been filled in the same use_cols order.
lgb_model_dict_1_ruddit = {}
use_basic_feat = ["Num_character", "Num_word"]
for _gen, _fast in zip(gensim_df_dict_ruddit.items(),fasttext_df_dict_ruddit.items()):
    
    # Column-wise concatenation: 300 + 256 + 2 features per row.
    mat = np.hstack([_gen[1], 
                     _fast[1],
                     basic_feat_df_ruddit[use_basic_feat].values
                    ])
    
    name = f"{_gen[0]}&{_fast[0]}&basic_feat"
    print("====="*20)
    print("Get Started.")
    print(f"use_data : {name}")
    print("====="*20)
    oof_pred, val_score_df, model_list = fit_lgbm(mat, y_ruddit,  cv=cv_ruddit,params=params)
    display(val_score_df)
    lgb_model_dict_1_ruddit[name] = model_list
    
print("====="*20)
print("FINISHED.")

In [None]:
# Persist the LightGBM fold models trained on the Ruddit embedding features.
save_pkl("lgb_model_dict_1_ruddit.pkl",lgb_model_dict_1_ruddit)

#### lgb(TF-IDF(SVD))

In [None]:
# LightGBM on all Ruddit SVD blocks stacked side by side
# (6 TF-IDF matrices x 100 components, hence the "600dim" key).
concat_mat = []
lgb_model_dict_2_ruddit = {}
for k,v in tfidf_filtered_SVD_dict_ruddit.items():
    concat_mat.append(v)
    
concat_mat =  np.hstack(concat_mat)
oof_pred, val_score_df, model_list = fit_lgbm(concat_mat, y_ruddit,  cv=cv_ruddit,params=params)
lgb_model_dict_2_ruddit["TF-IDF(SVD)_600dim"] = model_list
print("====="*20)
print("FINISHED.")

In [None]:
# Persist the LightGBM fold models trained on the Ruddit SVD features.
save_pkl("lgb_model_dict_2_ruddit.pkl",lgb_model_dict_2_ruddit)

In [None]:
# Training is finished: drop the training frames and feature matrices to free memory
# before the validation features are built. The fitted vectorizers, selectors, SVD
# transformers and models are kept. Re-running an earlier cell that reads one of
# these names now requires re-running the cells that created it.
del (train_1st_df, 
     train_ruddit_df,
     tfidf_df_dict,
     tfidf_df_dict_ruddit,
     gensim_df_dict,
     gensim_df_dict_ruddit,
     fasttext_df_dict, 
     fasttext_df_dict_ruddit, 
     tfidf_filtered_SVD_dict, 
     tfidf_filtered_SVD_dict_ruddit)
gc.collect()

## Validation

In [None]:
# Give every distinct validation comment an integer id, so each comment is
# scored once even if it appears in many pairs.
# pd.concat replaces Series.append, which was deprecated in pandas 1.4 and removed in 2.0.
whole_unique_sentence = pd.concat([validation_df["less_toxic"],
                                   validation_df["more_toxic"]]).unique()
# comment text -> id (position of its first occurrence among the unique texts)
sentence_master_dict = {_:i for i, _ in enumerate(whole_unique_sentence)}
validation_df["less_toxic_id"] = validation_df["less_toxic"].map(sentence_master_dict)
validation_df["more_toxic_id"] = validation_df["more_toxic"].map(sentence_master_dict)
del whole_unique_sentence
gc.collect()

In [None]:
# One row per distinct validation comment: its integer id and its raw text.
val_df_for_pred = pd.DataFrame(data = {"id":sentence_master_dict.values(),
                                       "text":sentence_master_dict.keys()
                                      })
# The mapping is no longer needed once the ids are stored in the frames.
del sentence_master_dict
gc.collect()

### preprocess

In [None]:
%%time
# Apply the same three cleaning variants as for the training data.
tqdm.pandas()
val_df_for_pred["text_clean1"] = val_df_for_pred['text'].progress_apply(text_cleaning_1)
val_df_for_pred["text_clean2"] = val_df_for_pred['text'].progress_apply(text_cleaning_2)
val_df_for_pred["text_clean3"] = val_df_for_pred['text'].progress_apply(text_cleaning_3)

### get tfidf

In [None]:
def get_val_tfidf_dict(vec_dict, val_df_for_pred):
    """Apply each vectorizer in ``vec_dict`` to its matching text column.

    The column is taken from the vectorizer's key: the last two
    "_"-separated tokens of the key, re-joined with "_", name the column of
    ``val_df_for_pred`` that is passed to ``transform``.

    Args:
        vec_dict: mapping of name -> object exposing ``transform``.
        val_df_for_pred: frame holding the text columns named by the keys.

    Returns:
        dict with the same keys as ``vec_dict`` and the transformed output
        as values.
    """
    def _column_for(key):
        # Keep only the trailing two tokens of the key.
        return "_".join(key.split("_")[-2:])

    return {
        key: vectorizer.transform(val_df_for_pred[_column_for(key)])
        for key, vectorizer in tqdm(vec_dict.items())
    }

In [None]:
# Vectorise the validation sentences twice: once with the vectorizers in
# tfidf_vec_dict and once with those in tfidf_vec_dict_ruddit.
# NOTE(review): both dicts are built outside this section; presumably they hold
# the vectorizers fitted on the respective training sets — confirm.
tfidf_df_val_dict = get_val_tfidf_dict(tfidf_vec_dict, val_df_for_pred)
tfidf_df_val_dict_ruddit = get_val_tfidf_dict(tfidf_vec_dict_ruddit, val_df_for_pred)

### SWEM(gensim / fasttext)

In [None]:
# One 300-dim gensim embedding matrix per text column listed in use_cols,
# stored under the key "gensim_<column>".
gensim_df_val_dict = {
    f"gensim_{_col}": get_gensim_embed(val_df_for_pred[_col],ndim=300)
    for _col in tqdm(use_cols)
}

In [None]:
# One 256-dim fastText embedding matrix per text column listed in use_cols,
# stored under the key "fasttext_<column>".
fasttext_df_val_dict = {
    f"fasttext_{_col}": get_fasttext_embed(val_df_for_pred[_col],ndim=256)
    for _col in tqdm(use_cols)
}

### basic_feat

In [None]:
# Build the basic features for the validation sentences with make_basic_feat
# (defined outside this section) and preview the first rows.
basic_feat_val_df = make_basic_feat(val_df_for_pred)
basic_feat_val_df.head()

### select feat and SVD

In [None]:
def get_val_select_topfeat_and_svd(tfidf_df_val_dict,selector_dict,svd_transfomer_dict):
    """Filter and reduce the first six TF-IDF matrices, then stack them.

    The three dicts are aligned by position (insertion order), not by key:
    the i-th matrix goes through the i-th selector's ``transform`` and then
    the i-th SVD transformer's ``transform``, for i in 0..5.

    Args:
        tfidf_df_val_dict: name -> TF-IDF matrix of the validation text.
        selector_dict: name -> feature selector exposing ``transform``.
        svd_transfomer_dict: name -> reducer exposing ``transform``.

    Returns:
        The six reduced matrices concatenated column-wise with ``np.hstack``.
    """
    matrices = list(tfidf_df_val_dict.values())
    selectors = list(selector_dict.values())
    reducers = list(svd_transfomer_dict.values())

    reduced_blocks = []
    for pos in tqdm(range(6)):
        kept_columns = selectors[pos].transform(matrices[pos])
        reduced_blocks.append(reducers[pos].transform(kept_columns))

    return np.hstack(reduced_blocks)

In [None]:
# Build the stacked SVD feature matrix of the validation sentences twice:
# once through the first selector/SVD set and once through the *_ruddit set.
# Each call pairs the dict entries by position, so the three dicts passed
# together must share the same ordering.
val_svd_matrix = get_val_select_topfeat_and_svd(tfidf_df_val_dict,
                                                tfidf_feat_selector,
                                                tfidf_svd_transfomer)
val_svd_matrix_ruddit = get_val_select_topfeat_and_svd(tfidf_df_val_dict_ruddit,
                                                       tfidf_feat_selector_ruddit,
                                                       tfidf_svd_transfomer_ruddit)

In [None]:
# Sanity check: show (rows, columns) of both stacked SVD matrices.
print(val_svd_matrix.shape)
print(val_svd_matrix_ruddit.shape)

In [None]:
def valid(model_dict, df_val_dict):
    """Average each model family's fold predictions on the validation features.

    For every key in ``model_dict`` the matching feature matrix is looked up
    in ``df_val_dict`` under the same key, every fold model in the list
    predicts on it, and the predictions are averaged element-wise.

    The folds are taken from the model list itself rather than from the
    notebook-global ``cv``. A dict trained with a different splitter (for
    example ``cv_ruddit``) is therefore averaged over exactly the models it
    contains, with no IndexError and no silently skipped fold.

    Args:
        model_dict: mapping of name -> list of fitted models exposing
            ``predict``.
        df_val_dict: mapping of the same names -> feature matrix.

    Returns:
        list of arrays, one per key of ``model_dict`` in iteration order,
        each being the mean prediction across that key's fold models.

    Raises:
        KeyError: if a key of ``model_dict`` is missing from ``df_val_dict``.
    """
    preds_list = []
    for _name, _model_list in model_dict.items():
        features = df_val_dict[_name]
        fold_preds = [fold_model.predict(features) for fold_model in _model_list]
        preds_list.append(np.mean(fold_preds, axis=0))
    return preds_list

In [None]:
def scoring(pred):
    """Print the share of validation pairs that ``pred`` ranks correctly.

    ``pred`` is assigned positionally to the rows of the notebook-global
    ``val_df_for_pred`` and looked up by sentence id for both sides of each
    pair in the notebook-global ``validation_df``. A pair counts as correct
    when the less-toxic prediction is strictly below the more-toxic one.

    Side effects: overwrites ``val_df_for_pred["pred"]`` and the
    ``less_toxic_pred``, ``more_toxic_pred`` and ``correct`` columns of
    ``validation_df``. Returns ``None``; the score is only printed.
    """
    val_df_for_pred["pred"] = pred
    pred_by_id = val_df_for_pred.set_index("id")["pred"].to_dict()

    less_pred = validation_df["less_toxic_id"].map(pred_by_id)
    validation_df["less_toxic_pred"] = less_pred
    more_pred = validation_df["more_toxic_id"].map(pred_by_id)
    validation_df["more_toxic_pred"] = more_pred

    ranked_right = validation_df["less_toxic_pred"] < validation_df["more_toxic_pred"]
    validation_df["correct"] = ranked_right.astype(int)
    # score
    print(validation_df["correct"].mean())

### Ridge_TFIDF_1st

In [None]:
# Fold-averaged Ridge predictions, one array per TF-IDF variant in
# ridge_model_dict_1; each is scored on its own first.
_preds_list1 = valid(ridge_model_dict_1, tfidf_df_val_dict)
for _pred in _preds_list1:
    scoring(_pred)
    
# Then score the plain mean across all variants.
print("ensemble")
scoring(np.mean(_preds_list1,axis = 0))

### Ridge_TFIDF_ruddit

In [None]:
# Same procedure for the Ridge models in ridge_model_dict_1_ruddit, using the
# validation features produced by the *_ruddit vectorizers.
_preds_list2 = valid(ridge_model_dict_1_ruddit, tfidf_df_val_dict_ruddit)
for _pred in _preds_list2:
    scoring(_pred)
    
# Then score the plain mean across all variants.
print("ensemble")
scoring(np.mean(_preds_list2,axis = 0))

### lgb(gensim & fasttext & basic_feat)

In [None]:
def valid2(lgb_model_dict):
    """Predict with the embedding-based LightGBM models and average over folds.

    Reads the notebook globals ``gensim_df_val_dict``,
    ``fasttext_df_val_dict``, ``basic_feat_val_df`` and ``use_basic_feat``.
    For i in 0..2 the i-th gensim matrix, the i-th fastText matrix and the
    selected basic-feature columns are stacked column-wise and passed to
    every fold model of the i-th entry of ``lgb_model_dict``. Entries are
    paired by insertion order, not by key.

    The folds are taken from each model list itself rather than from the
    notebook-global ``cv``, so dicts trained with another splitter (for
    example ``cv_ruddit``) are averaged over exactly the models they hold.

    Args:
        lgb_model_dict: mapping of name -> list of fitted models exposing
            ``predict``; only the first three entries are used.

    Returns:
        list of three arrays, each the mean prediction across one entry's
        fold models.
    """
    # Materialise the dict views once instead of on every iteration.
    gensim_mats = list(gensim_df_val_dict.values())
    fasttext_mats = list(fasttext_df_val_dict.values())
    model_lists = list(lgb_model_dict.values())
    # The basic-feature block is identical for all three text variants.
    basic_feat_mat = basic_feat_val_df[use_basic_feat].values

    preds_list = []
    for i in tqdm(range(3)):
        mat = np.hstack([gensim_mats[i],
                         fasttext_mats[i],
                         basic_feat_mat
                        ])
        _preds = [fold_model.predict(mat) for fold_model in model_lists[i]]
        preds_list.append(np.mean(_preds, axis=0))
    return preds_list

In [None]:
#jigsaw 1st
# Fold-averaged predictions of the three embedding-based LightGBM entries in
# lgb_model_dict_2; each is scored on its own first.
preds_list3 = valid2(lgb_model_dict_2)
    
for _pred in preds_list3:
    scoring(_pred)
    
# Then score the plain mean across the three entries.
print("ensemble")
scoring(np.mean(preds_list3,axis = 0))

In [None]:
# ruddit
# Fold-averaged predictions of the three embedding-based LightGBM entries in
# lgb_model_dict_1_ruddit; each is scored on its own first.
preds_list4= valid2(lgb_model_dict_1_ruddit)
    
for _pred in preds_list4:
    scoring(_pred)
    
# Then score the plain mean across the three entries.
print("ensemble")
scoring(np.mean(preds_list4,axis = 0))

### lgb TF-IDF(SVD) only

In [None]:
# jigsaw 1st
# Every fold model stored under the SVD key predicts on the stacked SVD
# features; the mean over folds is scored.
preds5 = [
    _model.predict(val_svd_matrix)
    for _model in lgb_model_dict_3['TF-IDF(SVD)_600dim']
]
scoring(np.mean(preds5,axis = 0))

In [None]:
# ruddit
# Every fold model stored under the SVD key predicts on the Ruddit-pipeline
# SVD features; the mean over folds is scored.
preds6 = [
    _model.predict(val_svd_matrix_ruddit)
    for _model in lgb_model_dict_2_ruddit['TF-IDF(SVD)_600dim']
]
scoring(np.mean(preds6,axis = 0))

In [None]:
# Release w2v_model and fmodel before the RoBERTa stage and collect garbage.
# NOTE(review): both are created outside this section; presumably the gensim
# and fastText embedding models used by the SWEM features — confirm.
del w2v_model,fmodel
gc.collect()

### RoBERTa(validation)

In [None]:
OUTPUT_DIR = './'
# exist_ok makes this safe to re-run and avoids a separate existence check.
os.makedirs(OUTPUT_DIR, exist_ok=True)

def get_logger(filename=OUTPUT_DIR+'train'):
    """Return this module's logger, writing to the console and to a log file.

    The logger is named after ``__name__``, set to INFO, and given a console
    handler plus a file handler on ``f"{filename}.log"``, both with the bare
    "%(message)s" format.

    ``logging.getLogger`` returns the same object on every call, so handlers
    are attached only when an equivalent one is missing. Re-running the cell
    or calling this function again therefore no longer duplicates every log
    line. Calling it with a different ``filename`` still adds a file handler
    for that new file.

    Args:
        filename: log file path without the ".log" suffix.

    Returns:
        The configured ``logging.Logger``.
    """
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)

    # FileHandler stores its target as an absolute path in baseFilename.
    log_path = os.path.abspath(f"{filename}.log")
    # FileHandler subclasses StreamHandler, so the console check must compare
    # the exact type rather than use isinstance.
    has_console = any(type(h) is StreamHandler for h in logger.handlers)
    has_file = any(
        isinstance(h, FileHandler) and h.baseFilename == log_path
        for h in logger.handlers
    )

    if not has_console:
        handler1 = StreamHandler()
        handler1.setFormatter(Formatter("%(message)s"))
        logger.addHandler(handler1)
    if not has_file:
        handler2 = FileHandler(filename=f"{filename}.log")
        handler2.setFormatter(Formatter("%(message)s"))
        logger.addHandler(handler2)
    return logger

LOGGER = get_logger()

class JigsawDataset(Dataset):
    """Torch dataset that tokenizes one text per item.

    Texts come from column ``CFG.text`` of ``input_df``. When ``is_train`` is
    true, labels come from column ``CFG.target`` and each item also carries
    its label. The tokenizer is loaded from ``CFG.model_name``.
    """

    def __init__(self, CFG, input_df, is_train=True):
        self.CFG = CFG
        self.is_train = is_train
        self.text = input_df[CFG.text].values
        self.tokenizer = AutoTokenizer.from_pretrained(CFG.model_name)
        # Labels are only read in training mode; inference frames need no target column.
        if is_train:
            self.labels = input_df[CFG.target].values

    def __len__(self):
        return len(self.text)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask)`` plus ``label`` when training."""
        # Each text is truncated/padded to CFG.max_len, so items share one length.
        tokens = self.tokenizer.encode_plus(
            self.text[idx],
            add_special_tokens=True,
            truncation=True,
            padding='max_length',
            max_length=self.CFG.max_len,
        )
        sample = (
            torch.tensor(tokens['input_ids']),
            torch.tensor(tokens['attention_mask']),
        )
        if not self.is_train:
            return sample
        return sample + (torch.tensor(self.labels[idx]),)
    
class JigsawModel(nn.Module):
    """Pretrained backbone followed by dropout and one linear head.

    The backbone is loaded from ``CFG.model_name``. The head maps
    ``CFG.hidden_size`` features to ``CFG.target_size`` outputs, with dropout
    probability ``CFG.fc_dropout`` applied in front of it.
    """

    def __init__(self, CFG):
        super().__init__()
        self.CFG = CFG
        self.model = AutoModel.from_pretrained(CFG.model_name)
        self.fc_dropout = nn.Dropout(CFG.fc_dropout)
        self.fc = nn.Linear(CFG.hidden_size, CFG.target_size)

    def forward(self, input_ids, attention_mask):
        """Return the head's output for a batch of token ids and masks."""
        backbone_out = self.model(input_ids=input_ids,
                                  attention_mask=attention_mask)
        # Element 1 of the backbone output feeds the head.
        # NOTE(review): presumably the pooled sentence representation — confirm.
        pooled = backbone_out[1]
        return self.fc(self.fc_dropout(pooled))

def inference_fn(test_loader, model, device):
    """Run ``model`` over every batch of ``test_loader`` and collect outputs.

    The model is switched to eval mode and moved to ``device``. Each batch is
    expected to unpack into ``(input_ids, attention_mask)``. Forward passes
    run without gradient tracking.

    Args:
        test_loader: iterable of ``(input_ids, attention_mask)`` tensor pairs.
        model: callable module taking ``(input_ids, attention_mask)``.
        device: device the model and batches are moved to.

    Returns:
        numpy array of all batch outputs concatenated along the first axis,
        in loader order.
    """
    model.eval()
    model.to(device)

    batch_outputs = []
    for input_ids, attention_mask in tqdm(test_loader, total = len(test_loader)):
        with torch.no_grad():
            y_preds = model(input_ids.to(device), attention_mask.to(device))
        # Move to CPU before converting; numpy cannot read device tensors.
        batch_outputs.append(y_preds.to("cpu").numpy())
    return np.concatenate(batch_outputs)

In [None]:
# Inference-mode dataset: is_train=False makes each item
# (input_ids, attention_mask) with no label.
val_dataset = JigsawDataset(CFG, val_df_for_pred, is_train = False)
# shuffle=False and drop_last=False keep one prediction per row in row order,
# which scoring() relies on when it assigns predictions positionally.
val_loader = DataLoader(val_dataset, 
                        batch_size=CFG.batch_size,
                        shuffle=False,
                        num_workers=CFG.num_workers, 
                        pin_memory=True, 
                        drop_last=False)

In [None]:
# Predict the validation sentences with each fold's saved checkpoint in turn.
# Only one model is alive at a time; it is freed before the next fold loads.
bert_preds = []
for fold in range(CFG.n_fold):
    # Checkpoints are named "<model-name basename>_fold<k>_best.pth".
    checkpoint_path = CFG.model_dir+f"/{CFG.model_name.split('/')[-1]}_fold{fold}_best.pth"
    model = JigsawModel(CFG)
    # Weights are loaded onto the CPU; inference_fn moves the model to `device`.
    state = torch.load(checkpoint_path, map_location=torch.device('cpu'))
    model.load_state_dict(state['model'])
    prediction = inference_fn(val_loader, model, device)
    bert_preds.append(prediction)
    del model, state
    gc.collect()
    torch.cuda.empty_cache()

In [None]:
# Score the mean of the per-fold RoBERTa predictions.
scoring(np.mean(bert_preds,axis = 0))

In [None]:
# The validation dataset and loader are no longer needed; free them and
# release cached GPU memory.
del val_dataset, val_loader; gc.collect()
torch.cuda.empty_cache()

In [None]:
# Collapse every model family to a single prediction vector (mean over its
# members), in the column order used by pred_scaled_mat below.
pr1, pr2, pr3, pr4, pr5, pr6, pr7 = (
    np.mean(_family_preds, axis = 0)
    for _family_preds in (
        _preds_list1,  # 1st-ridge
        _preds_list2,  # ruddit-ridge
        preds_list3,   # 1st-lgb
        preds_list4,   # ruddit-lgb
        preds5,        # 1st-lgb-svd
        preds6,        # ruddit-lgb-svd
        bert_preds,    # bert
    )
)

# Min-max scale each vector separately so the models share one value range.
# The scaler is refitted on every call, so one instance can be reused.
scaler = MinMaxScaler()
(pr1_scaled, pr2_scaled, pr3_scaled, pr4_scaled,
 pr5_scaled, pr6_scaled, pr7_scaled) = [
    scaler.fit_transform(_raw.reshape(-1,1))
    for _raw in (pr1, pr2, pr3, pr4, pr5, pr6, pr7)
]

# One column per model family, one row per unique validation sentence.
pred_scaled_mat = np.hstack([
    pr1_scaled, pr2_scaled, pr3_scaled, pr4_scaled,
    pr5_scaled, pr6_scaled, pr7_scaled,
])

### save pred for validation

In [None]:
def make_valid_pred(pred_scaled_mat):
    """Attach every model's prediction for both sides of each validation pair.

    Column ``i`` of ``pred_scaled_mat`` is assigned positionally to the
    notebook-global ``val_df_for_pred`` and looked up by sentence id, giving
    ``Morita_less_toxic_pred_{i}`` and ``Morita_more_toxic_pred_{i}`` in the
    notebook-global ``validation_df``.

    Args:
        pred_scaled_mat: 2-D array with one column per model.

    Returns:
        ``validation_df`` itself (modified in place, not a copy).
    """
    n_models = pred_scaled_mat.shape[1]
    for i in range(n_models):
        val_df_for_pred["pred"] = pred_scaled_mat[:,i]
        pred_by_id = val_df_for_pred.set_index("id")["pred"].to_dict()

        less_side = validation_df["less_toxic_id"].map(pred_by_id)
        validation_df[f"Morita_less_toxic_pred_{i}"] = less_side
        more_side = validation_df["more_toxic_id"].map(pred_by_id)
        validation_df[f"Morita_more_toxic_pred_{i}"] = more_side
    return validation_df

In [None]:
# make_valid_pred returns validation_df itself, so validation_df_add_pred is
# the same object with the Morita_* prediction columns added.
validation_df_add_pred = make_valid_pred(pred_scaled_mat)
# Helper columns created by the id mapping and by scoring(); not part of the export.
drop_cols_for_valid = ["less_toxic_id","more_toxic_id","correct","less_toxic_pred","more_toxic_pred"]

In [None]:
# drop() returns a new frame, so validation_df keeps its helper columns.
# Re-running this cell alone raises KeyError because the columns are already gone.
validation_df_add_pred = validation_df_add_pred.drop(drop_cols_for_valid,axis = 1)

In [None]:
# Export the validation pairs with every model's scaled predictions, without the index column.
validation_df_add_pred.to_csv("./validation_df_add_pred.csv",index = False)

In [None]:
# Preview the exported frame.
validation_df_add_pred.head()