# Toxic Linear Public: LB 0.864
---
Improvements over the previous single model public SOTA (LB: 0.860):  
- Adding pseudo labels by a deep model trained on validation_data.csv
- Adding positive toxic(y>0) comments from test labels
- Limiting the max features to prevent overfitting

I only have 5 submissions a day, so I cannot perform as many experiments as I wish. I believe that the score for this notebook is a local optimum and a much higher LB score is possible with the same dataset and a different set of hyperparameters.  

I am requesting the Kaggle philanthropists in this competition to use some of their submissions to perform different experiments and post the results in the comments of this notebook. Feel free to use some of my ideas below. 

### Version 5: 0.864

In [None]:
"""
EXPERIMENTS
-----------
- 0.10 pseudo label weight: 0.861
- 0.025 pseudo label weight: 0.864
- Adding old test positives: 0.860 to 0.862
- Changed max features from 90k to 50k: 0.864
- Add word feature: 0.862 to 0.843
- Changed Weights: 0.860 (Slight Change), 0.846 (Bigger Change)
- Add 2019 competition toxic data: 0.862 to 0.795
- Rearranging score with papi and roberta lb 833: 0.838 to 0.824
- Rearranging score with papi+roberta 833+ensemble 864: 0.821
- Adding 10k word features with 50k ngram features: 0.864 to 0.850

TODO EXPERIMENTS 
----------------
- Normalize the pseudo labels with mean 0 and variance 1 before adding?
- What is the effect of random seed when selecting negatives? What is the variation, mean and the max?
- How to balance regularization while adding more features?
- What is the effect of casing (lowercase/uppercase)?
- How does overlap with valid in the training set affect the score?
- Best way to ensemble multiple rankings? 
- Effect of alpha regularization in Ridge 
- Effect of undersampling, min_df, adding word features, lowercase, etc.
- Effect of different weights and the difference between them. 
- What is the effect of cleaning (no clean vs clean vs very clean)?
- Linearly increasing weights? (obscene: 0.10, toxic: 0.20, ..., severe_toxic: 0.60) ?


TODO FURTHERMORE
----------------
- Add bert embeddings along with tfidf features
- Rearranging rankings with a cross encoder
- Using an unbiased model to detect false positives 
    - comments with words like gay and jew have high scores irrespective of context
"""

In [None]:
# Notebook Imports & Setup
from collections import Counter, defaultdict
from functools import partial
from tqdm.auto import tqdm
from pathlib import Path
from time import time
import pandas as pd
import numpy as np
import sklearn
import joblib
import re

from sklearn.feature_extraction.text import TfidfVectorizer
import sklearn.pipeline

In [None]:
from bs4 import BeautifulSoup
def text_cleaning(text):
    '''
    Reduce a raw comment to space-separated runs of letters and digits.

    Steps, in order:
    1. Strip URLs (anything starting with http://, https:// or www.).
    2. Parse the remainder as HTML and keep only its text content.
    3. Replace every character that is not a-z, A-Z or a digit with a space
       (this is what drops punctuation, symbols and emojis).
    4. Collapse runs of spaces and trim both ends.

    text - Text piece to be cleaned.
    Returns the cleaned string.
    '''
    # URLs go first so their punctuation never reaches the HTML parser.
    without_links = re.sub(r'https?://\S+|www\.\S+', r'', text)

    # get_text() discards the markup and keeps the visible text.
    plain = BeautifulSoup(without_links, 'lxml').get_text()

    # Everything outside letters/digits (including newlines and tabs) becomes a space.
    alnum_only = re.sub(r"[^a-zA-Z\d]", " ", plain)

    # Squeeze repeated spaces, then drop leading/trailing ones.
    return re.sub(' +', ' ', alnum_only).strip()

In [None]:
def optimize_weights(cols, trials=300000):
    """Random-search linear blend weights for the score columns in `cols`.

    Reads the notebook globals `sub` (one score column per entry of `cols`,
    keyed by its `text` column) and `valid` (pairs of `less_toxic` /
    `more_toxic` comments). For each column, `L{col}` and `M{col}` columns
    are added to `valid` holding the score of the less / more toxic comment.

    Each trial draws one uniform [0, 1) weight per column and measures the
    fraction of `valid` rows whose weighted `more_toxic` score is strictly
    greater than the weighted `less_toxic` score. The best weights seen are
    kept.

    cols   - names of the score columns in `sub` to blend.
    trials - number of random weight vectors to try.

    Returns a numpy array with the best weights found (all ones if no trial
    scores above 0).

    NOTE(review): the lookup is keyed on `sub.text`; if that column has been
    cleaned while `valid` holds raw text, most lookups will miss and yield
    NaN - confirm both sides use the same text form before relying on this.
    """
    # `random` is not among the notebook's top-level imports, so bring it
    # into scope here; without this the first trial raises NameError.
    import random

    # Index once instead of re-indexing `sub` for every column.
    scores_by_text = sub.set_index('text')
    for col in cols:
        dict_map = scores_by_text[col].to_dict()
        valid[f'L{col}'] = valid.less_toxic.map(dict_map)
        valid[f'M{col}'] = valid.more_toxic.map(dict_map)

    best_weights = [1] * len(cols)
    best_acc = 0
    # Plain arrays avoid pandas overhead inside the trial loop.
    less_dict = {p: valid[f'L{p}'].values for p in cols}
    more_dict = {p: valid[f'M{p}'].values for p in cols}

    for _ in tqdm(range(trials)):
        less, more = np.zeros(len(valid)), np.zeros(len(valid))
        weights = [random.random() for _ in range(len(cols))]
        for p, wt in zip(cols, weights):
            less += wt * less_dict[p]
            more += wt * more_dict[p]

        # Pairwise ranking accuracy; rows with NaN scores compare False.
        acc = (more > less).sum() / len(valid)
        if acc > best_acc:
            print('acc improved from', best_acc , 'to ', acc)
            best_acc = acc
            best_weights = weights

    print('Best Linear Accuracy: ', best_acc)
    print('Best Weights: ', best_weights)
    return np.array(best_weights)

def get_features(pipeline):
    """Return the model's features ranked by coefficient, largest first.

    pipeline - mapping-like object (e.g. a fitted sklearn Pipeline) whose
               'features' step exposes feature names and whose 'clf' step
               exposes a `coef_` array aligned with those names.

    Prints the total number of features and returns a DataFrame with columns
    'feat' (feature name) and 'val' (coefficient rounded to 2 decimals),
    sorted by 'val' in descending order.
    """
    extractor = pipeline['features']
    # `get_feature_names` was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only for old versions that lack it.
    name_getter = getattr(extractor, 'get_feature_names_out', None)
    if name_getter is None:
        name_getter = extractor.get_feature_names
    names = list(name_getter())

    print('Total features: ', len(names))

    coefs = np.round(pipeline['clf'].coef_, 2)
    # sorted() is stable, so features with equal weights keep vocabulary order.
    ranked = sorted(zip(names, coefs), key=lambda pair: pair[1], reverse=True)
    return pd.DataFrame(ranked, columns=['feat', 'val'])

In [None]:
# Weight of the pseudo-label score when it is added to the training target.
PSEUDO_LABEL_WEIGHT = 0.033

# Per-label weights; the training target y is the weighted sum of these columns.
FEATURE_WTS = {
    'severe_toxic': 1.5,
    'identity_hate': 1.5,
    'threat': 1.5,
    'insult': 0.64,
    'toxic': 0.32,
    'obscene': 0.16,
}

# Label column names, in the same order as the weights above.
FEATURES = [*FEATURE_WTS]
FEATURES

In [None]:
# Load the classification-challenge training set and derive a single target y.
old_train = pd.read_csv('../input/jigsaw-toxic-comment-classification-challenge/train.csv')

# y = weighted sum of the label columns, then divided by its maximum.
old_train['y'] = 0
for feat, wt in FEATURE_WTS.items():
    old_train['y'] = old_train['y'] + wt * old_train[feat]
old_train['y'] = old_train['y'] / old_train['y'].max()

# Balance the set: every row with y > 0 plus an equally sized,
# fixed-seed random sample of rows with y == 0.
has_label = old_train['y'] > 0
no_label = old_train['y'] == 0
pos = old_train[has_label]
neg = old_train[no_label].sample(len(pos), random_state=201)
old_train = pd.concat([pos, neg])
old_train

In [None]:
def read_old_test(data_dir='/kaggle/input/jigsaw-toxic-comment-classification-challenge'):
    """Load the classification-challenge test comments that carry real labels.

    data_dir - directory containing `test.csv` and `test_labels.csv`.
               Defaults to the Kaggle input path, so calling with no
               arguments behaves as before.

    The two files are left-joined on `id`. Any value of -1 is turned into
    NaN and every row containing a NaN is dropped, so only rows with no -1
    and no missing values remain.

    Returns the merged, filtered DataFrame.
    """
    data_dir = Path(data_dir)
    df_test = pd.read_csv(data_dir / 'test.csv')
    df_test_labels = pd.read_csv(data_dir / 'test_labels.csv')
    df = pd.merge(df_test, df_test_labels, how='left', on='id')
    # -1 -> NaN so a single dropna() removes those rows.
    df = df.replace(-1, np.nan).dropna()
    return df

# Score the labelled old-test rows with the same weighted-sum target as the training set.
old_test = read_old_test()
old_test['y'] = 0
for feat, wt in FEATURE_WTS.items():
    old_test['y'] = old_test['y'] + wt * old_test[feat]
old_test['y'] = old_test['y'] / old_test['y'].max()

# Only rows with a positive target are added to the training data.
old_test_pos = old_test[old_test['y'] > 0]

train = pd.concat([old_train, old_test_pos])

In [None]:
# Add Pseudo Labels
pseudo = pd.read_csv('../input/toxic-public-dataframes/old_pseudo_label.csv')
# Select the score column before converting, so only one dict is built.
comment_to_pseudo_label = pseudo.set_index('comment_text')['score'].to_dict()
train['soft_pseudo_label_score'] = train.comment_text.map(comment_to_pseudo_label)

# Comments absent from the pseudo-label file map to NaN. Adding NaN would
# poison the target (and make the regressor's fit fail), so those rows get
# no pseudo-label contribution. The raw column keeps its NaNs for inspection.
n_missing_pseudo = int(train.soft_pseudo_label_score.isna().sum())
if n_missing_pseudo:
    print('Comments without a pseudo label (treated as 0): ', n_missing_pseudo)

train.y = train.y + PSEUDO_LABEL_WEIGHT*train.soft_pseudo_label_score.fillna(0)
train.sort_values(by='y')

In [None]:
# Show the training rows whose text also appears in the validation pairs
valid = pd.read_csv('../input/toxic-public-dataframes/valid.csv')
seen_as_more_toxic = train['comment_text'].isin(valid['more_toxic'])
seen_as_less_toxic = train['comment_text'].isin(valid['less_toxic'])
in_valid = seen_as_more_toxic | seen_as_less_toxic
train[in_valid]

In [None]:
# Register progress_apply on pandas objects, then clean every training comment.
tqdm.pandas()
train['comment_text'] = train['comment_text'].progress_apply(text_cleaning)
train

In [None]:
import joblib
import sklearn.linear_model
import sklearn.pipeline

# Character n-gram (3-5, word-boundary aware) TF-IDF, capped at 50k features.
char_tfidf = TfidfVectorizer(
    analyzer='char_wb',
    ngram_range=(3, 5),
    min_df=3,
    max_df=0.5,
    lowercase=True,
    max_features=50000,
)

# Kept as a FeatureUnion so further vectorizers can be plugged in, e.g.:
# ('vec2', TfidfVectorizer(
#     min_df=3, max_df=0.75,
#     analyzer='word',
#     lowercase=False, max_features=10000,
# ))
features = sklearn.pipeline.FeatureUnion([('vec', char_tfidf)])

# Ridge regression on the TF-IDF features predicts the target y.
ridge = sklearn.linear_model.Ridge(alpha=0.5)
pipeline = sklearn.pipeline.Pipeline([
    ('features', features),
    ('clf', ridge),
])
pipeline.fit(train.comment_text, train.y)

# Persist the fitted pipeline next to the notebook outputs.
joblib.dump(pipeline, 'pipeline.pkl')

In [None]:
# Inspect the fitted model: ranked features, training error, per-row residuals
display(get_features(pipeline))

train['y_pred'] = pipeline.predict(train.comment_text)
residual = train.y_pred - train.y
print('Train RMSE:', ((residual**2).mean())**0.5)

# Absolute error per row; sorting puts the best-fit rows first.
train['delta'] = residual.abs()
train.sort_values(by='delta')

In [None]:
%%time
# Score the competition comments with the fitted pipeline.
sub = pd.read_csv('../input/jigsaw-toxic-severity-rating/comments_to_score.csv')
# Keep the raw text under `comment_text`; `text` is replaced by its cleaned form.
sub['comment_text'] = sub['text']
sub['text'] = sub['text'].progress_apply(text_cleaning)
sub['score'] = pipeline.predict(sub['text'])
sub

In [None]:
# Write only the id and score columns, without the index, as the submission file.
submission_columns = ['comment_id', 'score']
sub[submission_columns].to_csv('submission.csv', index=False)

In [None]:
# from tqdm.auto import tqdm
# import random 

# papi = pd.read_csv('../input/toxic-pseudo-labels/perspective_api_lb771.csv')
# roberta = pd.read_csv('../input/toxic-pseudo-labels/robertab_lb833.csv')
# ensemble = pd.read_csv('../input/toxic-pseudo-labels/public_ensemble_lb864.csv')

# papi_cid_to_score = papi.set_index('comment_id').to_dict()['score']
# roberta_cid_to_score = roberta.set_index('comment_id').to_dict()['score']
# ensemble_cid_to_score = ensemble.set_index('comment_id').to_dict()['score']
# comment_id_to_idx = {cid: idx for idx, cid in enumerate(sub.comment_id)}

# comment_ids = list(set(roberta.comment_id.values) & set(sub.comment_id.values))
# sub_scores = list(sub.score.values)
# num_swaps = 0
# for i in tqdm(range(100000000)): 
#     cid1, cid2 = random.choice(comment_ids), random.choice(comment_ids)
#     i1, i2 = comment_id_to_idx[cid1], comment_id_to_idx[cid2]
#     if sub_scores[i1] > sub_scores[i2]: 
#         continue
#     if (roberta_cid_to_score[cid1] > roberta_cid_to_score[cid2]) \
#     and (papi_cid_to_score[cid1] > papi_cid_to_score[cid2]) \
#     and (ensemble_cid_to_score[cid1] > ensemble_cid_to_score[cid2]): 
#         num_swaps += 1
#         sub_scores[i1], sub_scores[i2] = sub_scores[i2], sub_scores[i1]

# print('Number of swaps: ', num_swaps)

In [None]:
# sub['org_score'] = sub.score.rank(method='first')
# sub['score'] = sub_scores
# sub.score = sub.score.rank(method='first')
# sub

# # sub['swap_scores'] = sub_scores
# # sub.swap_scores = sub.swap_scores.rank(method='first')
# # sub.score = sub.score.rank(method='first')

# # sub