A variation of https://www.kaggle.com/julian3833/jigsaw-incredibly-simple-naive-bayes-0-768 

In [None]:
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.stats import rankdata

# Create train data

Using data from [Toxic Comment Classification Challenge](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge)

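The original target is multi-label (six binary toxicity flags per comment). We collapse it into a single continuous severity score: the maximum of the six flags plus a weighted sum of the flags.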

The types of toxicity are: 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'

In [None]:
# Build the training frame `df` (columns include 'text' and the regression
# target 'y') from the Toxic Comment Classification Challenge data.
JC_DIR = "../input/jigsaw-toxic-comment-classification-challenge"

# The six binary toxicity label columns used for both the flag count and the target.
LABEL_COLS = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

jc_train_df = pd.read_csv(f"{JC_DIR}/train.csv")
print(f"jc_train_df:{jc_train_df.shape}")

# Test comments and their labels live in separate files; join them on "id".
jc_test_df = pd.read_csv(f"{JC_DIR}/test.csv")
temp_df = pd.read_csv(f"{JC_DIR}/test_labels.csv")
jc_test_df = jc_test_df.merge(temp_df, on="id")
print(f"jc_test_df:{jc_test_df.shape}")

# Drop test rows whose "toxic" label is -1.
# NOTE(review): presumably -1 marks unlabelled rows -- confirm against the dataset description.
jc_test_df = jc_test_df.query("toxic != -1")
print(f"jc_test_df:{jc_test_df.shape}")

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way
# to stack frames. ignore_index=True avoids duplicate index labels between the
# train and test rows (row order, and therefore the sample below, is unchanged).
df = pd.concat([jc_train_df, jc_test_df], ignore_index=True)

# Number of toxicity labels set on each comment (0 means non-toxic).
df["toxic_flag"] = df[LABEL_COLS].sum(axis=1)
df = df.rename(columns={"comment_text": "text"})

# Undersample non-toxic comments: keep every comment with at least one label
# and at most 2.5x as many non-toxic ones (seeded for reproducibility).
is_toxic = df["toxic_flag"] >= 1
min_len = is_toxic.sum()
non_toxic_df = df[df["toxic_flag"] == 0]
# Clamp so sample() cannot raise if fewer non-toxic rows exist than requested.
n_non_toxic = min(int(min_len * 2.5), len(non_toxic_df))
df_y0_undersample = non_toxic_df.sample(n=n_non_toxic, random_state=201)
df = pd.concat([df[is_toxic], df_y0_undersample])

# Per-label weights added on top of the baseline below.
toxic = 0.71
severe_toxic = 0.75
obscene = 1.47
threat = 0.0
insult = 0.66
identity_hate = 1.36

label_weights = dict(
    zip(LABEL_COLS, [toxic, severe_toxic, obscene, threat, insult, identity_hate])
)

# Target = max of the six flags (1 for any labelled comment, 0 otherwise)
# plus the weighted sum of the flags. Weights are added one label at a time,
# in LABEL_COLS order, so the floating-point result matches a manual chain of
# additions exactly.
target = df[LABEL_COLS].max(axis=1)
for label, weight in label_weights.items():
    target = target + df[label] * weight
df["y"] = target
y = df["y"].values

# TF-IDF

In [None]:
# Character n-gram TF-IDF: 3- to 5-character grams taken inside word
# boundaries ('char_wb'), ignoring grams that appear in fewer than 3 documents
# or in more than half of them.
tfidf_params = dict(
    analyzer='char_wb',
    ngram_range=(3, 5),
    min_df=3,
    max_df=0.5,
)
vec = TfidfVectorizer(**tfidf_params)

# Learn the vocabulary on the training text and vectorise it in one pass.
X = vec.fit_transform(df['text'])

# (n_documents, n_features) of the resulting sparse matrix.
X.shape

# Model

In [None]:
# Fit a ridge regressor on the TF-IDF features (fit() returns the estimator itself).
model = Ridge(alpha=1.0).fit(X, df['y'])

# Validation: score the 'less_toxic' and 'more_toxic' columns of the
# validation file with the vectoriser fitted on the training text.
df_val = pd.read_csv("../input/jigsaw-toxic-severity-rating/validation_data.csv")

X_less_toxic, X_more_toxic = (
    vec.transform(df_val[side]) for side in ('less_toxic', 'more_toxic')
)

p1, p2 = model.predict(X_less_toxic), model.predict(X_more_toxic)

# Validation accuracy: share of rows where the 'more_toxic' text gets the
# strictly higher predicted score.
(p1 < p2).mean()

# Submission

In [None]:
# Score the competition comments with the fitted vectoriser and model.
df_sub = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")
X_test = vec.transform(df_sub['text'])
score = model.predict(X_test)

# Replace raw predictions with ordinal ranks so every comment gets a
# distinct score (ties are broken by order of appearance).
df_sub['score'] = rankdata(score, method='ordinal')

# Write only the id and score columns, without the index.
submission_cols = ['comment_id', 'score']
df_sub.loc[:, submission_cols].to_csv("submission.csv", index=False)

df_sub.head(5)