# Models used:
* TFIDF+RIDGE v3 
* TFIDF+RIDGE v4
* Jigsaw - Simple Models v1
*  Fork of Jigsaw - Simple Models cc8ddd (Roberta orig Data) or Fork of Jigsaw - Simple Models (TFID calib data)

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import progressbar
import nltk
import matplotlib.pyplot as plt
import re

from tqdm.auto import tqdm
from bs4 import BeautifulSoup

from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge, LogisticRegression, RidgeCV, ElasticNet, SGDRegressor

from scipy.sparse import hstack

import warnings
warnings.filterwarnings("ignore")

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))
        
from sklearn.naive_bayes import MultinomialNB
from scipy import sparse
import gc

/kaggle/input/jigsaw-toxic-severity-rating/sample_submission.csv
/kaggle/input/jigsaw-toxic-severity-rating/validation_data.csv
/kaggle/input/jigsaw-toxic-severity-rating/comments_to_score.csv
/kaggle/input/roberta-base/rust_model.ot
/kaggle/input/roberta-base/config.json
/kaggle/input/roberta-base/merges.txt
/kaggle/input/roberta-base/README.md
/kaggle/input/roberta-base/tokenizer.json
/kaggle/input/roberta-base/vocab.json
/kaggle/input/roberta-base/tf_model.h5
/kaggle/input/roberta-base/dict.txt
/kaggle/input/roberta-base/pytorch_model.bin
/kaggle/input/roberta-base/flax_model.msgpack
/kaggle/input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv
/kaggle/input/jigsaw-toxic-comment-classification-challenge/test_labels.csv
/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv
/kaggle/input/jigsaw-toxic-comment-classification-challenge/test.csv


# TFIDF+RIDGE v4

In [2]:
# Locations of the competition files under the Kaggle input mount.
# All four are absolute so they resolve regardless of the working directory
# (VALID_DATA_PATH used to be relative: "../input/...").
TRAIN_DATA_PATH = "/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv"
TEST_DATA_PATH = "/kaggle/input/jigsaw-toxic-severity-rating/comments_to_score.csv"
VALID_DATA_PATH = "/kaggle/input/jigsaw-toxic-severity-rating/validation_data.csv"
SAMPLE_SUBMISSION = "/kaggle/input/jigsaw-toxic-severity-rating/sample_submission.csv"

In [3]:
# Read the four CSV inputs into DataFrames.
df_train = pd.read_csv(TRAIN_DATA_PATH)
df_validation_data = pd.read_csv(VALID_DATA_PATH)
df_sample_submission = pd.read_csv(SAMPLE_SUBMISSION)
df_test = pd.read_csv(TEST_DATA_PATH)

# Untouched copies of the test and validation frames, taken before any
# cleaning is applied to the originals.
df_test_caps = df_test.copy()
df_validation_data_caps = df_validation_data.copy()

In [4]:
def text_cleaning(text):
    '''
    Normalise one piece of text into lower-case, space-separated tokens.

    Steps, in order:
    1. Drop embedded URLs (http://, https:// and www. forms)
    2. Drop HTML markup, keeping only the visible text
    3. Drop emojis
    4. Turn every character that is not a letter or a digit into a space
    5. Collapse runs of spaces, trim both ends and lower-case
    6. Lemmatize each token with the module-level `lemmatizer`
    7. Drop tokens found in the module-level `stop` collection

    text - Text piece to be cleaned.
    Returns the cleaned text as a single string.
    '''
    # URLs go first so their punctuation is not turned into stray tokens.
    text = re.sub(r'https?://\S+|www\.\S+', r'', text)

    # Let BeautifulSoup parse the markup and keep only the text nodes.
    text = BeautifulSoup(text, 'lxml').get_text()

    text = re.sub("["
                  u"\U0001F600-\U0001F64F"  # emoticons
                  u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                  u"\U0001F680-\U0001F6FF"  # transport & map symbols
                  u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                  u"\U00002702-\U000027B0"
                  u"\U000024C2-\U0001F251"
                  "]+", r'', text, flags=re.UNICODE)

    text = re.sub(r"[^a-zA-Z\d]", " ", text)  # special characters -> space
    text = re.sub(' +', ' ', text)            # squeeze repeated spaces
    text = text.strip().lower()

    # Lemmatize token by token, then re-split so stop-word filtering sees
    # the lemmatized tokens.
    lemmas = [lemmatizer.lemmatize(token) for token in text.split(' ')]
    text = ' '.join(lemmas)
    kept = [token for token in text.split(' ') if token not in stop]

    return ' '.join(kept)

In [5]:
# Literal (non-regex) substitutions applied, in this order, after lower-casing.
# They undo leetspeak, split glued-together words and map a few corpus-specific
# tokens onto a canonical form.
_CLEAN_LITERAL_STEPS = (
    ("4", "a"),
    ("2", "l"),
    ("5", "s"),
    ("1", "i"),
    ("!", "i"),
    ("|", "i"),
    ("0", "o"),
    ("l3", "b"),
    ("7", "t"),
    # NOTE(review): no-op, every "7" was already replaced by the step above;
    # possibly ("+", "t") was intended. Kept unchanged pending confirmation.
    ("7", "+"),
    ("8", "ate"),
    ("3", "e"),
    ("9", "g"),
    ("6", "g"),
    ("@", "a"),
    ("$", "s"),
    ("#ofc", " of fuckin course "),
    ("fggt", " faggot "),
    ("your", " your "),
    ("self", " self "),
    ("cuntbag", " cunt bag "),
    ("fartchina", " fart china "),
    ("youi", " you i "),
    ("cunti", " cunt i "),
    ("sucki", " suck i "),
    ("pagedelete", " page delete "),
    ("cuntsi", " cuntsi "),
    ("i'm", " i am "),
    ("offuck", " of fuck "),
    ("centraliststupid", " central ist stupid "),
    ("hitleri", " hitler i "),
    ("i've", " i have "),
    ("i'll", " sick "),
    ("fuck", " fuck "),
    ("f u c k", " fuck "),
    ("shit", " shit "),
    ("bunksteve", " bunk steve "),
    ("wikipedia", " social medium "),
    ("faggot", " faggot "),
    ("delanoy", " delanoy "),
    ("jewish", " jewish "),
    ("sexsex", " sex "),
    ("allii", " all ii "),
    ("i'd", " i had "),
    ("'s", " is "),
    ("youbollocks", " you bollocks "),
    ("dick", " dick "),
    ("cuntsi", " cuntsi "),
    ("mothjer", " mother "),
    ("cuntfranks", " cunt "),
    ("ullmann", " jewish "),
    # Literal on purpose: as a regex the "." matched any character ("mrs").
    ("mr.", " mister "),
    ("aidsaids", " aids "),
    ("njgw", " nigger "),
    ("wiki", " social medium "),
    ("administrator", " admin "),
    ("gamaliel", " jewish "),
    ("rvv", " vandalism "),
    ("admins", " admin "),
    ("pensnsnniensnsn", " penis "),
    ("pneis", " penis "),
    ("pennnis", " penis "),
    # Literal on purpose, same reason as "mr." above.
    ("pov.", " point of view "),
    ("vandalising", " vandalism "),
    ("cock", " dick "),
    ("asshole", " asshole "),
    ("youi", " you "),
    ("afd", " all fucking day "),
    ("sockpuppets", " sockpuppetry "),
    ("iiprick", " iprick "),
    ("penisi", " penis "),
    ("warrior", " warrior "),
    ("loil", " laughing out insanely loud "),
    ("vandalise", " vandalism "),
    ("helli", " helli "),
    ("lunchablesi", " lunchablesi "),
    ("special", " special "),
    ("ilol", " i lol "),
)

# Regular-expression substitutions applied, in this order, after the literal
# ones: contraction expansion, whitespace / repeated-character squeezing and
# punctuation handling.
_CLEAN_REGEX_STEPS = (
    (r'\b[uU]\b', 'you'),
    (r"what's", "what is "),
    (r"\'s", " is "),
    (r"\'ve", " have "),
    (r"can't", "cannot "),
    (r"n't", " not "),
    (r"i'm", "i am "),
    (r"\'re", " are "),
    (r"\'d", " would "),
    (r"\'ll", " will "),
    (r"\'scuse", " excuse "),
    (r'\s+', ' '),  # collapse any whitespace run into one space
    (r'(.)\1+', r'\1\1'),  # runs of a repeated character are cut to two
    ("[:|♣|'|§|♠|*|/|?|=|%|&|-|#|•|~|^|>|<|►|_]", ''),
    (r"what's", "what is "),
    (r"\'ve", " have "),
    (r"can't", "cannot "),
    (r"n't", " not "),
    (r"i'm", "i am "),
    (r"\'re", " are "),
    (r"\'d", " would "),
    (r"\'ll", " will "),
    (r"\'scuse", " excuse "),
    (r"\'s", " "),
    # Clean some punctuation
    ('\n', ' \n '),
    (r'([a-zA-Z]+)([/!?.])([a-zA-Z]+)', r'\1 \2 \3'),
    # Replace repeating characters more than 3 times to length of 3
    (r'([*!?\'])\1\1{2,}', r'\1\1\1'),
    # Add space around repeating characters
    (r'([*!?\']+)', r' \1 '),
    # patterns with repeating characters
    (r'([a-zA-Z])\1{2,}\b', r'\1\1'),
    (r'([a-zA-Z])\1\1{2,}\B', r'\1\1\1'),
    (r'[ ]{2,}', ' '),
)


def clean(data, col, stop_words=None):
    '''
    Normalise the text column `col` of `data` in place and return `data`.

    Steps, in order:
    1. Replace URLs with the placeholder " social medium "
    2. Lower-case the text
    3. Apply the literal substitutions in _CLEAN_LITERAL_STEPS
    4. Apply the regex substitutions in _CLEAN_REGEX_STEPS and trim both ends
    5. Split on whitespace and drop every token found in the stop words

    Every substitution states `regex=` explicitly. The default of
    `Series.str.replace` differs between pandas versions, and relying on it
    turns the regex steps into literal no-ops (or literal characters such as
    "$" and "|" into regexes).

    data       - DataFrame holding the text; its `col` column is overwritten.
    col        - Name of the text column to clean.
    stop_words - Optional iterable of tokens to drop. When omitted, the
                 module-level `stop` collection is used.

    Returns the same DataFrame object that was passed in.
    '''
    # URLs are replaced before lower-casing, exactly once.
    text = data[col].str.replace(r'https?://\S+|www\.\S+', ' social medium ', regex=True)
    text = text.str.lower()

    for old, new in _CLEAN_LITERAL_STEPS:
        text = text.str.replace(old, new, regex=False)

    for pattern, replacement in _CLEAN_REGEX_STEPS:
        text = text.str.replace(pattern, replacement, regex=True)
    text = text.str.strip()

    if stop_words is None:
        stop_words = stop
    # Frozen set: constant-time membership instead of scanning a list per word.
    blocked = frozenset(stop_words)
    data[col] = text.apply(
        lambda x: ' '.join([word for word in x.split() if word not in blocked])
    )

    return data

In [6]:
def add_FE(df,X, var):
    '''
    Append the share of upper-case letters in `df[var]` to the frame `X`.

    Three helper columns are written into `df` as a side effect:
    'nr_upper' (count of A-Z characters), 'len_text' (string length) and
    'pct_upper' (nr_upper / len_text). For empty or missing text the ratio
    is defined as 0.0 instead of the NaN that 0/0 would produce.

    df  - DataFrame holding the text column `var`; gains the helper columns.
    X   - DataFrame to which the 'pct_upper' column is appended.
    var - Name of the text column in `df`.

    Returns a new DataFrame: `X` with 'pct_upper' concatenated column-wise
    (rows are aligned on the index).
    '''
    df['nr_upper'] = df[var].str.count(r'[A-Z]')
    df['len_text'] = df[var].str.len()
    ratio = df['nr_upper'] / df['len_text']
    # Keep the ratio only where the text is non-empty; 0.0 everywhere else.
    df['pct_upper'] = ratio.where(df['len_text'] > 0, 0.0)
    return pd.concat((X, df['pct_upper']), axis=1)

In [7]:
def add_FE_np(df,X, var):
    '''
    Append the share of upper-case letters in `df[var]` as an extra column
    of the sparse matrix `X`.

    Three helper columns are written into `df` as a side effect:
    'nr_upper' (count of A-Z characters), 'len_text' (string length) and
    'pct_upper' (nr_upper / len_text). For empty or missing text the ratio
    is defined as 0.0 instead of the NaN that 0/0 would produce.

    df  - DataFrame holding the text column `var`; gains the helper columns.
    X   - Matrix with one row per row of `df`, accepted by scipy.sparse.hstack.
    var - Name of the text column in `df`.

    Returns the result of scipy.sparse.hstack: `X` plus one trailing column.
    '''
    df['nr_upper'] = df[var].str.count(r'[A-Z]')
    df['len_text'] = df[var].str.len()
    ratio = df['nr_upper'] / df['len_text']
    # Keep the ratio only where the text is non-empty; 0.0 everywhere else.
    df['pct_upper'] = ratio.where(df['len_text'] > 0, 0.0)
    # [:, None] turns the 1-D values into a single column for stacking.
    return hstack((X, df['pct_upper'].values[:, None]))

In [8]:
# Create a score that messure how much toxic is a comment
# Weight applied to each label column before the labels are summed into a
# single toxicity score.
cat_mtpl = dict(obscene=0.16, toxic=0.32, threat=1.5,
                insult=0.64, severe_toxic=1.5, identity_hate=1.5)

# Alternative weights, kept for reference:
# cat_mtpl = {'obscene': 3, 'toxic': 4, 'threat': 4,
#             'insult': 2, 'severe_toxic': 4, 'identity_hate': 2}

# Scale every label column by its weight.
for label, weight in cat_mtpl.items():
    df_train[label] = weight * df_train[label]

# Row-wise sum over the columns from 'toxic' through 'identity_hate'.
df_train['score'] = df_train.loc[:, 'toxic':'identity_hate'].sum(axis=1)

# The regression target is the score itself.
df_train['y'] = df_train['score']
'''
min_len = (df_train['y'] > 0).sum()  # len of toxic comments
df_y0_undersample = df_train[df_train['y'] == 0].sample(n=min_len, random_state=201)  # take non toxic comments
df_train_new = pd.concat([df_train[df_train['y'] > 0], df_y0_undersample])  # make new df
df_train_new'''
# Two independent copies: one to be cleaned, one left with the raw text.
df_train_new = df_train.copy()
df_train_caps = df_train.copy()

In [9]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords


# Shared by text_cleaning() and clean().
lemmatizer = WordNetLemmatizer()
# Stored as a set: the stop words are only ever used for `in` tests, once per
# token, and a set answers those in constant time instead of scanning a list.
stop = set(stopwords.words('english'))

# First cleaning pass over the training comments (in place on df_train_new).
df_train_new = clean(df_train_new, 'comment_text')

In [10]:
# Enable pandas' progress_apply via tqdm, then run text_cleaning over every
# comment (already processed by clean()) and store the result in 'clean_text'.
tqdm.pandas()
df_train_new['clean_text'] = df_train_new['comment_text'].progress_apply(text_cleaning)

  0%|          | 0/159571 [00:00<?, ?it/s]

In [11]:
# Append the 'pct_upper' column to the cleaned training frame. The ratio is
# computed from df_train_caps, the copy whose text was not passed through clean().
df_train_new = add_FE(df_train_caps,df_train_new, 'comment_text')

In [12]:
# Regression target and the cleaned text the TF-IDF vocabulary is fitted on.
labels = df_train_new['y']
comments = df_train_new['clean_text']

# Character n-grams of length 3 to 5 ('char_wb' analyzer), with min_df=3 and
# max_df=0.5 as document-frequency cut-offs.
vectorizer = TfidfVectorizer(min_df=3, max_df=0.5, analyzer='char_wb', ngram_range=(3,5))
comments_tr = vectorizer.fit_transform(comments)
# Bare expression: displays the matrix summary as the cell output.
comments_tr

<159571x202178 sparse matrix of type '<class 'numpy.float64'>'
	with 58327449 stored elements in Compressed Sparse Row format>

In [13]:
# Append 'pct_upper' (computed from the raw text in df_train_caps) as one
# extra column of the TF-IDF matrix.
comments_tr = add_FE_np(df_train_caps,comments_tr, "comment_text")

In [14]:
# Fit a Ridge regressor on the TF-IDF + 'pct_upper' features against the
# weighted toxicity score.
regressor = Ridge(random_state=42, alpha=0.8)
regressor.fit(comments_tr, labels)

Ridge(alpha=0.8, random_state=42)

In [15]:
# preprocess val data

'''tqdm.pandas()
df_validation_data = clean(df_validation_data, 'less_toxic')
df_validation_data = clean(df_validation_data, 'more_toxic')
df_validation_data['less_toxic'] = df_validation_data['less_toxic'].progress_apply(text_cleaning)
df_validation_data['more_toxic'] = df_validation_data['more_toxic'].progress_apply(text_cleaning)

less_toxic = vectorizer.transform(df_validation_data['less_toxic'])
more_toxic = vectorizer.transform(df_validation_data['more_toxic'])

less_toxic = add_FE_np(df_validation_data_caps,less_toxic, "less_toxic")
more_toxic = add_FE_np(df_validation_data_caps,more_toxic, "more_toxic")

# make predictions
y_pred_less_model2 = regressor.predict(less_toxic)
y_pred_more_model2 = regressor.predict(more_toxic)

(y_pred_less_model2 < y_pred_more_model2).mean()'''

'tqdm.pandas()\ndf_validation_data = clean(df_validation_data, \'less_toxic\')\ndf_validation_data = clean(df_validation_data, \'more_toxic\')\ndf_validation_data[\'less_toxic\'] = df_validation_data[\'less_toxic\'].progress_apply(text_cleaning)\ndf_validation_data[\'more_toxic\'] = df_validation_data[\'more_toxic\'].progress_apply(text_cleaning)\n\nless_toxic = vectorizer.transform(df_validation_data[\'less_toxic\'])\nmore_toxic = vectorizer.transform(df_validation_data[\'more_toxic\'])\n\nless_toxic = add_FE_np(df_validation_data_caps,less_toxic, "less_toxic")\nmore_toxic = add_FE_np(df_validation_data_caps,more_toxic, "more_toxic")\n\n# make predictions\ny_pred_less_model2 = regressor.predict(less_toxic)\ny_pred_more_model2 = regressor.predict(more_toxic)\n\n(y_pred_less_model2 < y_pred_more_model2).mean()'

In [16]:
'''import matplotlib.pyplot as plt
plt.hist(pd.concat( (pd.Series(y_pred_less_model2),pd.Series(y_pred_more_model2)) ,axis=1))
plt.show()'''

'import matplotlib.pyplot as plt\nplt.hist(pd.concat( (pd.Series(y_pred_less_model2),pd.Series(y_pred_more_model2)) ,axis=1))\nplt.show()'

#### Scaling the output and making it a probability

In [17]:
'''full = pd.concat( (pd.Series(y_pred_less_model2),pd.Series(y_pred_more_model2)) ,axis=0)

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(-6,+6))
full_scale = scaler.fit_transform(full.values.reshape(-1, 1))

avg_full_scale = np.mean(full_scale)
full_scale = full_scale - avg_full_scale
full_scale = 1/ (1 + np.exp(-full_scale))

y_pred_less_model2_sc = full_scale[:len(y_pred_less_model2),:].reshape(1, -1)[0]
y_pred_more_model2_sc = full_scale[len(y_pred_less_model2):,:].reshape(1, -1)[0]
(y_pred_less_model2_sc < y_pred_more_model2_sc).mean()'''

'full = pd.concat( (pd.Series(y_pred_less_model2),pd.Series(y_pred_more_model2)) ,axis=0)\n\nfrom sklearn.preprocessing import MinMaxScaler\nscaler = MinMaxScaler(feature_range=(-6,+6))\nfull_scale = scaler.fit_transform(full.values.reshape(-1, 1))\n\navg_full_scale = np.mean(full_scale)\nfull_scale = full_scale - avg_full_scale\nfull_scale = 1/ (1 + np.exp(-full_scale))\n\ny_pred_less_model2_sc = full_scale[:len(y_pred_less_model2),:].reshape(1, -1)[0]\ny_pred_more_model2_sc = full_scale[len(y_pred_less_model2):,:].reshape(1, -1)[0]\n(y_pred_less_model2_sc < y_pred_more_model2_sc).mean()'

In [18]:
'''import matplotlib.pyplot as plt
plt.hist(pd.concat( (pd.Series(y_pred_less_model2_sc),pd.Series(y_pred_more_model2_sc)),axis=1))
plt.show()'''

'import matplotlib.pyplot as plt\nplt.hist(pd.concat( (pd.Series(y_pred_less_model2_sc),pd.Series(y_pred_more_model2_sc)),axis=1))\nplt.show()'

In [19]:
# Apply the same two cleaning passes used on the training text to the
# comments that have to be scored.
df_test = clean(df_test, 'text')
df_test['text'] = df_test['text'].progress_apply(text_cleaning)

  0%|          | 0/7537 [00:00<?, ?it/s]

In [20]:
# NOTE: from here on df_test no longer names the DataFrame; it is rebound to
# the feature matrix (TF-IDF features, then TF-IDF + 'pct_upper').
df_test = vectorizer.transform(df_test['text'])
df_test = add_FE_np(df_test_caps,df_test, "text")
pred = regressor.predict(df_test)

# Rescale the raw predictions to the range [-6, 6]; the scaler is fitted on
# the test predictions themselves.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(-6,+6))
pred_scale = scaler.fit_transform(pred.reshape(-1, 1))

# Centre on the mean, then squash through the logistic function so every
# score lies strictly between 0 and 1.
avg_pred_scale = np.mean(pred_scale)
pred_scale = pred_scale - avg_pred_scale
pred_scale = 1/ (1 + np.exp(-pred_scale))

# Pair each comment_id (taken from the untouched copy) with its score;
# the two frames are joined column-wise on their index.
df_test_model2 = pd.concat((pd.DataFrame(df_test_caps['comment_id']), pd.DataFrame(pred_scale, columns=["score"])), axis=1)

In [21]:
'''import matplotlib.pyplot as plt
plt.hist(pred_scale)
plt.show()'''

'import matplotlib.pyplot as plt\nplt.hist(pred_scale)\nplt.show()'

In [22]:
# Persist the TF-IDF + Ridge scores on their own, without the index column.
df_test_model2.to_csv('./submission_model2.csv', index=False)

In [23]:
# Drop the large feature matrices and the fitted regressor, then ask the
# garbage collector to run before the next model is built.
del df_test, comments_tr, regressor
gc.collect()

1357

# Roberta Model + RandomForest (Orig Data)

In [24]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import torch
#from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel

In [25]:
# Reload the classification training set and collapse the six label columns
# into one binary target: 1 when at least one label is set, otherwise 0.
df = pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/train.csv")
df['y'] = (
    df[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']]
    .sum(axis=1)
    .gt(0)
    .astype(int)
)
# Keep only the text (renamed to 'text') and the target.
df = df.rename(columns={'comment_text': 'text'})[['text', 'y']]

In [26]:
class Dataset:
    '''
    Index-addressable view over a sequence of texts that tokenizes on access.

    text      - Indexable collection of texts; each item is passed through str().
    tokenizer - Callable invoked as tokenizer(text, max_length=..., padding=...,
                truncation=...) that returns a mapping with "input_ids" and
                "attention_mask".
    max_len   - Value forwarded to the tokenizer as max_length.
    '''

    def __init__(self, text, tokenizer, max_len):
        self.text, self.tokenizer, self.max_len = text, tokenizer, max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, item):
        # Tokenize a single text, padded / truncated to max_len.
        encoded = self.tokenizer(
            str(self.text[item]),
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
        )

        # Expose both fields as int64 tensors.
        return {
            field: torch.tensor(encoded[field], dtype=torch.long)
            for field in ("input_ids", "attention_mask")
        }
    
def generate_predictions(df, var, model_path, max_len, batch_size=32, device=None):
    '''
    Run the pretrained model at `model_path` over `df[var]` and collect the
    second element of its output tuple for every row.

    df         - DataFrame holding the texts.
    var        - Name of the text column in `df`.
    model_path - Path handed to AutoModel / AutoTokenizer.from_pretrained.
    max_len    - Token length every text is padded / truncated to.
    batch_size - Rows per forward pass (default 32).
    device     - Device to run on. When omitted, "cuda" is used if available
                 and "cpu" otherwise, so the function no longer fails on
                 machines without a GPU.

    Returns a numpy array with one entry per row of `df`, in row order.
    '''
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    device = torch.device(device)
    on_gpu = device.type == "cuda"

    # return_dict=False makes the model return a plain tuple.
    model = AutoModel.from_pretrained(model_path, return_dict=False)
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    model.to(device)
    model.eval()

    dataset = Dataset(text=df[var].values, tokenizer=tokenizer, max_len=max_len)
    # shuffle=False keeps the outputs aligned with the rows of df.
    data_loader = torch.utils.data.DataLoader(
        dataset, batch_size=batch_size, num_workers=4, pin_memory=on_gpu, shuffle=False
    )

    final_output = []

    # Inference only: no gradients are needed.
    with torch.no_grad():
        for batch in data_loader:
            batch = {key: value.to(device) for key, value in batch.items()}
            # Only the second tuple element is kept
            # (presumably the pooled sentence embedding; verify for the model used).
            _, output = model(**batch)
            final_output.extend(output.cpu().numpy())

    if on_gpu:
        torch.cuda.empty_cache()
    return np.array(final_output)

In [27]:
# Compute one model output vector per training comment, using the
# roberta-base files from the input directory and 192 tokens per text.
model_path =  '../input/roberta-base/'
X = generate_predictions(df, 'text', model_path, max_len=192)

In [28]:
def add_FE2(df,X, var):
    '''
    Append the share of upper-case letters in `df[var]` as an extra column
    of the dense array `X`.

    Three helper columns are written into `df` as a side effect:
    'nr_upper' (count of A-Z characters), 'len_text' (string length) and
    'pct_upper' (nr_upper / len_text). For empty or missing text the ratio
    is defined as 0.0 instead of the NaN that 0/0 would produce.

    df  - DataFrame holding the text column `var`; gains the helper columns.
    X   - 2-D array with one row per row of `df`.
    var - Name of the text column in `df`.

    Returns a new array: `X` with one trailing column holding the ratio.
    '''
    df['nr_upper'] = df[var].str.count(r'[A-Z]')
    df['len_text'] = df[var].str.len()
    ratio = df['nr_upper'] / df['len_text']
    # Keep the ratio only where the text is non-empty; 0.0 everywhere else.
    df['pct_upper'] = ratio.where(df['len_text'] > 0, 0.0)
    # [:, None] turns the 1-D values into a single column for stacking.
    return np.hstack((X, df['pct_upper'].values[:, None]))
# Append the 'pct_upper' column to the model outputs of the training text.
X = add_FE2(df,X,'text')

In [29]:
# Fit a random forest on the model outputs + 'pct_upper' against the binary
# target; class_weight="balanced" is set on the classifier.
model = RandomForestClassifier(random_state=0, n_estimators=100, max_depth=32, min_samples_leaf=50, class_weight="balanced")
model.fit(X, df['y']) 

RandomForestClassifier(class_weight='balanced', max_depth=32,
                       min_samples_leaf=50, random_state=0)

In [30]:
# The training features and frame are no longer needed once the forest is fitted.
del X, df
gc.collect()

64

In [31]:
'''df_val = pd.read_csv("../input/jigsaw-toxic-severity-rating/validation_data.csv")
X_less_toxic = generate_predictions(df_val, 'less_toxic', model_path, max_len=192)
X_more_toxic = generate_predictions(df_val, 'more_toxic', model_path, max_len=192)
X_less_toxic = add_FE2(df_val,X_less_toxic,'less_toxic')
X_more_toxic = add_FE2(df_val,X_more_toxic,'more_toxic')
y_pred_less_model4 = model.predict_proba(X_less_toxic)
y_pred_more_model4 = model.predict_proba(X_more_toxic)
(y_pred_less_model4[:, 1] < y_pred_more_model4[:, 1]).mean()'''

'df_val = pd.read_csv("../input/jigsaw-toxic-severity-rating/validation_data.csv")\nX_less_toxic = generate_predictions(df_val, \'less_toxic\', model_path, max_len=192)\nX_more_toxic = generate_predictions(df_val, \'more_toxic\', model_path, max_len=192)\nX_less_toxic = add_FE2(df_val,X_less_toxic,\'less_toxic\')\nX_more_toxic = add_FE2(df_val,X_more_toxic,\'more_toxic\')\ny_pred_less_model4 = model.predict_proba(X_less_toxic)\ny_pred_more_model4 = model.predict_proba(X_more_toxic)\n(y_pred_less_model4[:, 1] < y_pred_more_model4[:, 1]).mean()'

In [32]:
'''import matplotlib.pyplot as plt
plt.hist(pd.concat( (pd.Series(y_pred_less_model4[:, 1]),pd.Series(y_pred_more_model4[:, 1])) ,axis=1))
plt.show()'''

'import matplotlib.pyplot as plt\nplt.hist(pd.concat( (pd.Series(y_pred_less_model4[:, 1]),pd.Series(y_pred_more_model4[:, 1])) ,axis=1))\nplt.show()'

In [33]:
'''y_pred_less_model = 0.8 * y_pred_less_model2_sc + 0.2 * y_pred_less_model4[:,1]
y_pred_more_model = 0.8 * y_pred_more_model2_sc + 0.2 * y_pred_more_model4[:,1]
print((y_pred_less_model < y_pred_more_model).mean())'''

'y_pred_less_model = 0.8 * y_pred_less_model2_sc + 0.2 * y_pred_less_model4[:,1]\ny_pred_more_model = 0.8 * y_pred_more_model2_sc + 0.2 * y_pred_more_model4[:,1]\nprint((y_pred_less_model < y_pred_more_model).mean())'

In [34]:
# Score the comments with the second model: embed the raw text, append
# 'pct_upper', and keep column 1 of predict_proba as the score.
df_sub = pd.read_csv(TEST_DATA_PATH)
#X_test = vec.transform(df_sub['text'])
X_test = generate_predictions(df_sub, 'text', model_path, max_len=192)
X_test = add_FE2(df_sub,X_test,'text')
# NOTE: despite the df_ prefix this is the array returned by predict_proba.
df_test_model4 = model.predict_proba(X_test)
df_sub['score'] = df_test_model4[:, 1]

In [35]:
# Persist the second model's scores on their own, without the index column.
df_sub[['comment_id', 'score']].to_csv("submission_model4.csv", index=False)

In [36]:
'''from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
y_pred_less_model1_norm = scaler.fit_transform(y_pred_less_model1.reshape(-1, 1))
y_pred_more_model1_norm = scaler.fit_transform(y_pred_more_model1.reshape(-1, 1))
y_pred_less_model1_norm = y_pred_less_model1_norm.reshape(1, -1)[0]
y_pred_more_model1_norm = y_pred_more_model1_norm.reshape(1, -1)[0]
scaler = MinMaxScaler()
y_pred_less_model2_norm = scaler.fit_transform(y_pred_less_model2.reshape(-1, 1))
y_pred_more_model2_norm = scaler.fit_transform(y_pred_more_model2.reshape(-1, 1))
y_pred_less_model2_norm = y_pred_less_model2_norm.reshape(1, -1)[0]
y_pred_more_model2_norm = y_pred_more_model2_norm.reshape(1, -1)[0]'''

'from sklearn.preprocessing import MinMaxScaler\nscaler = MinMaxScaler()\ny_pred_less_model1_norm = scaler.fit_transform(y_pred_less_model1.reshape(-1, 1))\ny_pred_more_model1_norm = scaler.fit_transform(y_pred_more_model1.reshape(-1, 1))\ny_pred_less_model1_norm = y_pred_less_model1_norm.reshape(1, -1)[0]\ny_pred_more_model1_norm = y_pred_more_model1_norm.reshape(1, -1)[0]\nscaler = MinMaxScaler()\ny_pred_less_model2_norm = scaler.fit_transform(y_pred_less_model2.reshape(-1, 1))\ny_pred_more_model2_norm = scaler.fit_transform(y_pred_more_model2.reshape(-1, 1))\ny_pred_less_model2_norm = y_pred_less_model2_norm.reshape(1, -1)[0]\ny_pred_more_model2_norm = y_pred_more_model2_norm.reshape(1, -1)[0]'

In [37]:
'''import optuna

optuna.logging.set_verbosity(100)

def objective(trial):
    
    param = {
        "m1": trial.suggest_float("m1", 0.01, 1),
        "m2": trial.suggest_float("m2", 0.01, 1),
        "m3": trial.suggest_float("m3", 0.01, 1),
        "m4": trial.suggest_float("m4", 0.01, 1)}
        
    y_pred_less_model_ens = (param["m1"] * y_pred_less_model1 + param["m2"] * y_pred_less_model2_norm + param["m3"] * y_pred_less_model3[:, 1] + param["m4"] * y_pred_less_model4[:, 1])
    y_pred_more_model_ens = (param["m1"] * y_pred_more_model1 + param["m2"] * y_pred_more_model2_norm + param["m3"] * y_pred_more_model3[:, 1] + param["m4"] * y_pred_more_model4[:, 1])
    
    return (y_pred_less_model_ens < y_pred_more_model_ens).mean()
    
study = optuna.create_study(direction="maximize")
study.optimize(objective , n_trials =2000)

trial = study.best_trial
print("Best trial: {}".format(trial))'''

'import optuna\n\noptuna.logging.set_verbosity(100)\n\ndef objective(trial):\n    \n    param = {\n        "m1": trial.suggest_float("m1", 0.01, 1),\n        "m2": trial.suggest_float("m2", 0.01, 1),\n        "m3": trial.suggest_float("m3", 0.01, 1),\n        "m4": trial.suggest_float("m4", 0.01, 1)}\n        \n    y_pred_less_model_ens = (param["m1"] * y_pred_less_model1 + param["m2"] * y_pred_less_model2_norm + param["m3"] * y_pred_less_model3[:, 1] + param["m4"] * y_pred_less_model4[:, 1])\n    y_pred_more_model_ens = (param["m1"] * y_pred_more_model1 + param["m2"] * y_pred_more_model2_norm + param["m3"] * y_pred_more_model3[:, 1] + param["m4"] * y_pred_more_model4[:, 1])\n    \n    return (y_pred_less_model_ens < y_pred_more_model_ens).mean()\n    \nstudy = optuna.create_study(direction="maximize")\nstudy.optimize(objective , n_trials =2000)\n\ntrial = study.best_trial\nprint("Best trial: {}".format(trial))'

Best trial: FrozenTrial(number=1475, values=[0.6834396173774412], datetime_start=datetime.datetime(2022, 1, 5, 15, 11, 37, 857491), datetime_complete=datetime.datetime(2022, 1, 5, 15, 11, 37, 907676), params={'m1': 0.9554502249585517, 'm2': 0.010723429404399047, 'm3': 0.1600814882140605, 'm4': 0.5083912320129206}, distributions={'m1': UniformDistribution(high=1.0, low=0.01), 'm2': UniformDistribution(high=1.0, low=0.01), 'm3': UniformDistribution(high=1.0, low=0.01), 'm4': UniformDistribution(high=1.0, low=0.01)}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=1475, state=TrialState.COMPLETE, value=None)

In [38]:
# Final score: weighted blend of the two models, 0.8 for the TF-IDF + Ridge
# score and 0.2 for column 1 of the random forest's predict_proba output.
df_test_model_ens = df_test_model2.copy()
df_test_model_ens['score'] = 0.8 * df_test_model2['score'] +  0.2 * df_test_model4[:, 1]

In [39]:
# Write the blended scores as the final submission file, without the index column.
df_test_model_ens.to_csv('./submission.csv', index=False)