- no cross-validation (no k-fold split)
- only a simple test

# Import Detoxify model

In [None]:
from tqdm import tqdm
import pandas as pd
import torch 
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(f'Using device: {device}')

In [None]:
import torch
import transformers
from transformers import BertForSequenceClassification, BertTokenizer

# Model weights and tokenizer files saved by another notebook that has internet.
MODEL = "../input/output-detox/model"
TOKENIZER = "../input/output-detox/tokenizer/"

# Names given to the model's output columns; Detoxify.predict() pairs them
# with the columns by position, so the order matters.
CLASSES = [
    'toxicity',
    'severe_toxicity',
    'obscene',
    'threat',
    'insult',
    'identity_attack',
]

# modified a bit the code of the __init__ function of https://github.com/unitaryai/detoxify/blob/master/detoxify/detoxify.py
class Detoxify:
    """Thin wrapper around a locally stored BERT sequence classifier.

    The model is loaded from ``MODEL`` and the tokenizer from ``TOKENIZER``;
    the model is then moved to ``device``. :meth:`predict` returns one
    sigmoid score per name in ``CLASSES``.
    """

    def __init__(self, device="cpu"):
        """Load model and tokenizer from disk and move the model to ``device``."""
        super().__init__()
        self.model = BertForSequenceClassification.from_pretrained(MODEL)
        self.tokenizer = BertTokenizer.from_pretrained(TOKENIZER)
        self.class_names = CLASSES
        self.device = device
        self.model.to(self.device)

    @torch.no_grad()
    def predict(self, text):
        """Score ``text`` for every class in ``self.class_names``.

        Args:
            text: A single string or a collection of strings.

        Returns:
            A dict keyed by class name. For a single string each value is
            one score taken from the first output row; otherwise each value
            is a list with one float per input text.
        """
        self.model.eval()
        encoded = self.tokenizer(
            text, return_tensors="pt", truncation=True, padding=True
        )
        encoded = encoded.to(self.model.device)
        # The first element of the model output holds the raw logits.
        logits = self.model(**encoded)[0]
        probs = torch.sigmoid(logits).cpu().detach().numpy()

        single_text = isinstance(text, str)
        results = {}
        for col, label in enumerate(self.class_names):
            if single_text:
                results[label] = probs[0][col]
            else:
                # One column of the score matrix: this class for every text.
                results[label] = probs[:, col].tolist()
        return results

In [None]:
# Build the Detoxify wrapper on the selected device; get_scores() uses this global.
modelD = Detoxify(device=device)

In [None]:
# Comments to be scored; the "text" and "comment_id" columns are used further down.
comment_to_score = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")

In [None]:
# Plain Python list of the comment texts, as expected by get_scores().
listofcomment_to_score = comment_to_score["text"].tolist()
print("Number of comments to order:",len(listofcomment_to_score))
# Preview the first comment.
listofcomment_to_score[0]

In [None]:
def get_scores(listoftext, batch_size=100):
    """Score every text with the global ``modelD`` and return one number per text.

    The texts are passed to ``modelD.predict`` in slices of ``batch_size``
    (batching was introduced because of memory issues). Each returned score
    is the sum of the six per-class outputs for that text.

    Args:
        listoftext: Sliceable sequence of strings, e.g. a list.
        batch_size: Number of texts per ``predict`` call; must be >= 1.

    Returns:
        A list of summed scores in the same order as ``listoftext``. An
        empty input yields an empty list.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    class_keys = (
        'toxicity', 'severe_toxicity', 'obscene',
        'threat', 'insult', 'identity_attack',
    )

    scores = []
    # Stepping by batch_size never yields an empty trailing slice, so
    # predict() is not called with [] when the length is an exact multiple
    # of batch_size (or when the input is empty).
    for start in tqdm(range(0, len(listoftext), batch_size)):
        res = modelD.predict(listoftext[start:start + batch_size])
        per_class = [res[key] for key in class_keys]
        # zip(*per_class) yields, for each text, its six class scores.
        scores.extend(sum(values) for values in zip(*per_class))
    return scores

In [None]:
# Detoxify-based score (sum of the class outputs) for each comment to score.
scoresD = get_scores(listofcomment_to_score)

Just a trial (the Detoxify-only submission code below is left commented out).

In [None]:
import pandas as pd
#mysub = pd.DataFrame({"comment_id": comment_to_score["comment_id"], "score":scores})
#mysub.to_csv("submission.csv", index=False)

## Test on validation dataset 

In [None]:
import pandas as pd
# Validation pairs; the "less_toxic" and "more_toxic" text columns are used below.
validation = pd.read_csv("../input/jigsaw-toxic-severity-rating/validation_data.csv")

In [None]:
# Detoxify scores for the first 1001 validation pairs only.
less_toxic_scores = pd.Series(get_scores(validation["less_toxic"].iloc[:1001].tolist()))
more_toxic_scores = pd.Series(get_scores(validation["more_toxic"].iloc[:1001].tolist()))

In [None]:
def compute_accuracy(less_toxic_scores, more_toxic_scores):
    """Return the fraction of pairs whose scores are ordered correctly.

    A pair counts as correct when the "more toxic" score is strictly
    greater than the "less toxic" one (ties count as wrong). A value that
    is missing on one side is treated as 0 for the comparison. The number
    of correct pairs is divided by ``len(less_toxic_scores)``.

    Args:
        less_toxic_scores: pandas Series of scores for the less toxic texts.
        more_toxic_scores: pandas Series of scores for the more toxic texts.
    """
    correctly_ordered = more_toxic_scores.gt(less_toxic_scores, fill_value=0)
    n_pairs = len(less_toxic_scores)
    return correctly_ordered.sum() / n_pairs
    

In [None]:
# Pairwise ranking accuracy of the Detoxify scores on the validation subset.
compute_accuracy(less_toxic_scores, more_toxic_scores)

# More Data

In [None]:
### import random 
import numpy as np

train = pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/train.csv")

# Label columns that mark a comment as "non-empty" and that are summed into
# the regression target.
# NOTE(review): if the CSV has further label columns they are ignored here —
# confirm that this is intentional.
label_cols = ['toxic', 'insult', 'severe_toxic', 'obscene', 'threat']

# Random 0/1 flag used to keep a random part of the remaining comments.
# No seed is set in this cell, so the selection changes between runs.
train['random'] = np.random.randint(0, 2, size=len(train['toxic']))

# Each comparison is built as its own mask. In the original expression the
# trailing `| train['random']>=0.5` parsed as `(... | train['random']) >= 0.5`
# because `|` binds tighter than `>=`.
has_label = (train[label_cols] >= 1).any(axis=1)
randomly_kept = train['random'] >= 0.5
train_without_empty = train[has_label | randomly_kept]
train_without_empty = train_without_empty.drop(['random'], axis=1)

# Target: number of positive labels per comment (aligned on the row index).
train_without_empty['y'] = train[label_cols].sum(axis=1)

In [None]:
# Display the filtered training frame.
train_without_empty

# Pipeline

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# Candidate regressors; each one is fitted on character n-gram TF-IDF
# features and evaluated on the validation pairs.
models = [Ridge()]
for m in models:
    pipeline = Pipeline(
        [
            ("tfidf", TfidfVectorizer(min_df=3, max_df=0.5, analyzer='char_wb', ngram_range=(3, 5))),
            ("clf", m),
        ]
    )
    pipeline.fit(train_without_empty['comment_text'], train_without_empty['y'])
    # Predict inside the loop: less_preds/more_preds are otherwise only
    # defined by a later cell, so a top-to-bottom run would raise NameError
    # (or silently report the accuracy of a stale model).
    less_preds = pipeline.predict(validation["less_toxic"].tolist()[:1001])
    more_preds = pipeline.predict(validation["more_toxic"].tolist()[:1001])
    print(m, compute_accuracy(pd.Series(less_preds), pd.Series(more_preds)))

Try on validation set

In [None]:
# Pipeline predictions for the first 1001 validation pairs only.
less_preds = pipeline.predict(validation["less_toxic"].iloc[:1001].tolist())
more_preds = pipeline.predict(validation["more_toxic"].iloc[:1001].tolist())

In [None]:
# Pairwise ranking accuracy of the TF-IDF pipeline on the validation subset.
compute_accuracy(pd.Series(less_preds), pd.Series(more_preds))

In [None]:
def find_best_coef(l1, l2, m1, m2,
                   coefs=(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)):
    """Grid-search the blend weight between two scorers.

    For every ``coef`` the blended score is ``coef * first + (1 - coef) *
    second``, computed for both the less toxic and the more toxic side. The
    weight and the resulting ``compute_accuracy`` value are printed, and the
    best pair is returned so the caller can reuse it.

    Args:
        l1: pandas Series with the first scorer's "less toxic" scores.
        l2: pandas Series with the second scorer's "less toxic" scores.
        m1: pandas Series with the first scorer's "more toxic" scores.
        m2: pandas Series with the second scorer's "more toxic" scores.
        coefs: Iterable of weights to try for the first scorer.

    Returns:
        ``(best_coef, best_accuracy)``; the first weight wins on ties.
        ``(None, None)`` when ``coefs`` is empty.
    """
    best_coef = None
    best_accuracy = None
    for coef in coefs:
        # Vectorized Series arithmetic replaces the per-element apply(lambda).
        new_least = l1 * coef + l2 * (1 - coef)
        new_most = m1 * coef + m2 * (1 - coef)
        accuracy = compute_accuracy(new_least, new_most)
        print(coef, accuracy)
        if best_accuracy is None or accuracy > best_accuracy:
            best_coef, best_accuracy = coef, accuracy
    return best_coef, best_accuracy

In [None]:
# Blend Detoxify scores (first scorer) with the pipeline predictions (second scorer).
find_best_coef(less_toxic_scores,pd.Series(less_preds),more_toxic_scores,pd.Series(more_preds))

In [None]:
# Pipeline predictions for the comments to score; these go into the submission.
scoresP = pipeline.predict(comment_to_score["text"])

In [None]:
# Display the pipeline predictions.
scoresP

In [None]:
import pandas as pd
# Submission file: one row per comment with its pipeline score (the Detoxify
# scores in scoresD are not used here).
mysub = pd.DataFrame({"comment_id": comment_to_score["comment_id"], "score":scoresP})
mysub.to_csv("submission.csv", index=False)

In [None]:
# Display the submission frame.
mysub