Baselines:

LightGBM: https://www.kaggle.com/yshiml/jigsaw-baseline-lightgbm

Naive Bayes: https://www.kaggle.com/julian3833/jigsaw-incredibly-simple-naive-bayes-0-768

# Imports

In [None]:
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

#lightGBM
import optuna
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error

In [None]:
import re
import string

import numpy as np

from textblob import TextBlob
#from afinn import Afinn
#from textstat.textstat import textstat

from nltk.corpus import stopwords

#from preprocess import normalize_texttall

from datetime import datetime

In [None]:
# Basic features
# Copied and adapted from: https://github.com/peterhurford/kaggle-toxic_comment/blob/master/feature_engineering.py

def print_step(step):
    """Print ``step`` prefixed with the current timestamp in square brackets."""
    prefix = f'[{datetime.now()}] '
    print(prefix + step)
    
def basic_feature(train):
    """Return a copy of ``train`` with hand-crafted text features appended.

    Args:
        train: DataFrame with a ``comment_text`` column. The input frame is
            not modified; all work happens on a copy.

    Returns:
        A copy of ``train`` with 29 feature columns appended in a fixed order,
        from ``num_words`` through ``has_ip_address``. Callers slice on that
        order, so it must stay stable.

    Notes:
        Every feature is computed from ``str(value)``, so a non-string cell
        (e.g. NaN) is handled uniformly instead of raising in the features
        that iterate or regex-search the raw value.
        The ``*_per_char`` and ``chars_per_word`` ratios divide by raw counts
        without smoothing, as before.
    """
    train2 = train.copy()

    # Coerce once so that every feature below sees a real string.
    text = train2['comment_text'].apply(str)

    def regex_flag(*patterns):
        # 1 where at least one of the patterns is found in the comment, else 0.
        compiled = [re.compile(p) for p in patterns]
        return text.apply(lambda x: 1 if any(rx.search(x) for rx in compiled) else 0)

    print_step('BASIC FE 1/30')
    train2['num_words'] = text.apply(lambda x: len(x.split()))

    print_step('BASIC FE 2/30')
    train2['num_unique_words'] = text.apply(lambda x: len(set(x.lower().split())))

    print_step('BASIC FE 3/30')
    # The small constant keeps the ratio finite for comments without words.
    train2['unique_words_per_word'] = train2['num_unique_words'] / (train2['num_words'] + 0.0001)

    print_step('BASIC FE 4/30')
    train2['num_chars'] = text.apply(lambda x: len(x))

    print_step('BASIC FE 5/30')
    train2['num_capital'] = text.apply(lambda x: sum(1 for c in x if c.isupper()))

    print_step('BASIC FE 6/30')
    train2['num_lowercase'] = text.apply(lambda x: sum(1 for c in x if c.islower()))

    print_step('BASIC FE 7/30')
    train2['capital_per_char'] = train2['num_capital'] / train2['num_chars']

    print_step('BASIC FE 8/30')
    train2['lowercase_per_char'] = train2['num_lowercase'] / train2['num_chars']

    print_step('BASIC FE 9/30')
    stop_words = set(stopwords.words('english'))
    train2['num_stopwords'] = text.apply(lambda x: sum(1 for w in x.lower().split() if w in stop_words))

    print_step('BASIC FE 10/30')
    punctuation = set(string.punctuation)
    train2['num_punctuations'] = text.apply(lambda x: sum(1 for c in x if c in punctuation))

    print_step('BASIC FE 11/30')
    train2['punctuation_per_char'] = train2['num_punctuations'] / train2['num_chars']

    print_step('BASIC FE 12/30')
    train2['num_words_upper'] = text.apply(lambda x: sum(1 for w in x.split() if w.isupper()))

    print_step('BASIC FE 13/30')
    train2['num_words_lower'] = text.apply(lambda x: sum(1 for w in x.split() if w.islower()))

    print_step('BASIC FE 14/30')
    train2['num_words_title'] = text.apply(lambda x: sum(1 for w in x.split() if w.istitle()))

    print_step('BASIC FE 15/30')
    train2['chars_per_word'] = train2['num_chars'] / train2['num_words']

    print_step('BASIC FE 16/30')
    # Sentences are kept in local Series (not as a temporary column) and their
    # lengths are computed once for all the sentence_* statistics below.
    sentences = text.apply(lambda x: re.split(r'[.!?\n]+', x))
    sentence_lengths = sentences.apply(lambda xs: [len(s) for s in xs])

    print_step('BASIC FE 17/30')
    train2['num_sentence'] = sentences.apply(lambda xs: len(xs))

    print_step('BASIC FE 18/30')
    train2['sentence_mean'] = sentence_lengths.apply(lambda x: np.mean(x))

    print_step('BASIC FE 19/30')
    train2['sentence_max'] = sentence_lengths.apply(lambda x: max(x) if len(x) > 0 else 0)

    print_step('BASIC FE 20/30')
    train2['sentence_min'] = sentence_lengths.apply(lambda x: min(x) if len(x) > 0 else 0)

    print_step('BASIC FE 21/30')
    train2['sentence_std'] = sentence_lengths.apply(lambda x: np.std(x))

    print_step('BASIC FE 22/30')
    train2['words_per_sentence'] = train2['num_words'] / train2['num_sentence']

    print_step('BASIC FE 23/30')
    train2['num_repeated_sentences'] = sentences.apply(lambda xs: len(xs) - len(set(xs)))

    # From https://www.kaggle.com/ogrellier/lgbm-with-words-and-chars-n-gram
    print_step('BASIC FE 24/30')
    train2['start_with_columns'] = regex_flag(r'^\:+')

    print_step('BASIC FE 25/30')
    # NOTE(review): this pattern also matches any two consecutive digits, which
    # looks broader than a timestamp -- kept unchanged, confirm the intent.
    train2['has_timestamp'] = regex_flag(r'\d{2}|:\d{2}')

    print_step('BASIC FE 26/30')
    train2['has_date_long'] = regex_flag(r'\D\d{2}:\d{2}, \d{1,2} \w+ \d{4}')

    print_step('BASIC FE 27/30')
    train2['has_date_short'] = regex_flag(r'\D\d{1,2} \w+ \d{4}')

    print_step('BASIC FE 28/30')
    train2['has_link'] = regex_flag(r'http[s]{0,1}://\S+', r'www\.\S+')

    print_step('BASIC FE 29/30')
    train2['has_email'] = regex_flag(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+')

    print_step('BASIC FE 30/30')
    train2['has_ip_address'] = regex_flag(r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b')

    return train2


# Create train data

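The original competition was multi-label: each comment carries six binary toxicity labels.

We collapse them into a binary toxic / non-toxic target (`y`) and also keep the number of positive labels per comment (`y5`) as a severity score.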

In [None]:
df = pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/train.csv")
df.head()
# Number of positive toxicity labels per comment, computed once and reused.
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
label_count = df[label_cols].sum(axis=1)
df['y5'] = label_count.astype(int)        # severity: how many labels are set
df['y'] = (label_count > 0).astype(int)   # binary: any label set
# 'comment_text' keeps its original name because basic_feature reads that column.
df.sample(5)

In [None]:
# Build the hand-crafted text features for every row of the training frame.
df2 = basic_feature(df)

In [None]:
df2.head()

In [None]:
# Feature matrix: every column from 'num_words' through 'has_ip_address'
# (a label-based slice, inclusive on both ends).
X2 = df2.loc[:,'num_words':'has_ip_address']
# Regression target: number of positive toxicity labels per comment.
y2 = df2['y5']

# Undersample

The dataset is very unbalanced. Here we undersample the majority class. Other strategies might work better.

In [None]:
# Class balance of the binary target before undersampling.
df['y'].value_counts(normalize=True)

In [None]:
# Number of positive (y == 1) rows; used as the sample size for the negatives.
min_len = (df['y'] == 1).sum()

In [None]:
# Draw the same number of negative rows, with a fixed seed.
df_y0_undersample = df[df['y'] == 0].sample(n=min_len, random_state=201)

In [None]:
# Balanced frame: all positives followed by the sampled negatives.
# From here on `df` is this subset; `df2`, `X2` and `y2` were built earlier
# from the full data.
df = pd.concat([df[df['y'] == 1], df_y0_undersample])

In [None]:
df['y'].value_counts()

In [None]:
df

# TF-IDF

In [None]:
# TF-IDF vectorizer with default settings; fitted on the balanced comments below.
vec = TfidfVectorizer()

In [None]:
X = vec.fit_transform(df['comment_text'])
X

In [None]:
# Binary toxic / non-toxic target for the same rows as X.
y = df['y']

# Fit Naive Bayes

In [None]:
# Binary model: toxic vs. non-toxic.
model = MultinomialNB()
model.fit(X, df['y'])

In [None]:
# Multi-class model on `y5`, the number of positive labels per comment.
modelb = MultinomialNB()
modelb.fit(X,df['y5'])

# Validate data

In [None]:
# Validation pairs; the 'less_toxic' and 'more_toxic' columns are used below.
df_val = pd.read_csv("../input/jigsaw-toxic-severity-rating/validation_data.csv")

In [None]:
# Reuse the vectorizer fitted on the training comments (transform only, no refit).
X_less_toxic = vec.transform(df_val['less_toxic'])
X_more_toxic = vec.transform(df_val['more_toxic'])

In [None]:
# Expected severity under the multi-class NB model for each side of a pair.
pb1 = modelb.predict_proba(X_less_toxic)
pb2 = modelb.predict_proba(X_more_toxic)
# Weight each probability column by its class label + 1. The columns of
# predict_proba follow modelb.classes_, so taking the weights from there stays
# correct even if some label count is absent from the training sample
# (a hard-coded range of 7 would then fail to broadcast). With all seven
# classes 0..6 present the weights are 1..7.
severity_weights = modelb.classes_ + 1
lb1 = (pb1 * severity_weights).sum(axis=1)
lb2 = (pb2 * severity_weights).sum(axis=1)
# Share of pairs where the 'more_toxic' comment gets the higher expected severity.
(lb1<lb2).mean()

In [None]:
# Binary NB: class probabilities for each side of every validation pair.
p11, p12 = (model.predict_proba(side) for side in (X_less_toxic, X_more_toxic))
p11.shape
# Share of pairs where 'more_toxic' receives the higher class-1 probability.
(p11[:, 1] < p12[:, 1]).mean()

In [None]:
# basic_feature reads a 'comment_text' column, so expose each side of the
# validation pairs under that name before extracting the hand-crafted features.
text_less = df_val[['less_toxic']].rename(columns={'less_toxic': 'comment_text'})
text_more = df_val[['more_toxic']].rename(columns={'more_toxic': 'comment_text'})
X_basic_less = basic_feature(text_less)
X_basic_more = basic_feature(text_more)

* * *

# LightGBM with basic features

In [None]:
# Hold out 20% of the hand-crafted feature rows; Optuna trials are scored on it.
X_tr, X_val, y_tr, y_val = train_test_split(X2, y2, test_size=0.2, random_state=233)

def create_model(trial):
    """Build an ``LGBMRegressor`` with hyper-parameters sampled from ``trial``.

    Args:
        trial: Optuna trial used to sample each hyper-parameter.

    Returns:
        An unfitted ``lgb.LGBMRegressor`` seeded with ``random_state=666``.

    Notes:
        ``min_child_samples``, ``subsample`` and ``colsample_bytree`` are
        LightGBM aliases of ``min_data_in_leaf``, ``bagging_fraction`` and
        ``feature_fraction``. Sampling both names of a pair added search
        dimensions that LightGBM ignores (with a warning), so only the
        primary names are sampled here, over their original ranges.
        ``suggest_float`` replaces the deprecated ``suggest_uniform``.
    """
    params = {
        'num_leaves': trial.suggest_int('num_leaves', 2, 31),
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'learning_rate': trial.suggest_float('learning_rate', 0.0001, 0.99),
        'max_depth': trial.suggest_int('max_depth', 3, 8),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 5, 90),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.0001, 1.0),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.0001, 1.0),
    }
    return lgb.LGBMRegressor(random_state=666, **params)

def objective(trial):
    """Score one Optuna trial: negated MSE of the trial's model on the hold-out split."""
    candidate = create_model(trial)
    candidate.fit(X_tr, y_tr)
    # Negated so that a study maximizing this value minimizes the error.
    return -mean_squared_error(y_val, candidate.predict(X_val))

# `objective` returns the negated MSE, so maximizing it minimizes the error.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=40)
# Best sampled hyper-parameters, keyed by the names passed to trial.suggest_*.
params2 = study.best_params

print(params2)

In [None]:
# Display the best hyper-parameters found for the basic-feature regressor.
params2

In [None]:
# Refit on all rows with the best hyper-parameters. study.best_params holds
# only the sampled values, so the seed used by create_model during the search
# is passed again here; otherwise the final model would be seeded differently
# from the configuration that was actually tuned.
model2 = lgb.LGBMRegressor(**params2, random_state=666)
model2.fit(X2, y2)

In [None]:
# Training MSE of model2 relative to always predicting the mean of y2
# (values below 1 beat that constant baseline).
mean_baseline = np.repeat(np.mean(y2), len(y2))
mean_squared_error(model2.predict(X2), y2) / mean_squared_error(y2, mean_baseline)

In [None]:
def evaluateLGB(model,xless,xmore):
    """Return the share of pairs that ``model`` ranks in the expected order.

    Args:
        model: Object exposing ``predict``.
        xless: Features of the comments expected to score lower.
        xmore: Features of the comments expected to score higher, row-aligned
            with ``xless``.

    Returns:
        Mean of ``predict(xless) < predict(xmore)`` over all rows.
    """
    less_scores, more_scores = (model.predict(feats) for feats in (xless, xmore))
    return (less_scores < more_scores).mean()

In [None]:
# Pairwise accuracy of model2 on the validation pairs; iloc[:, 1:] drops the
# leading 'comment_text' column so only the engineered features are passed.
evaluateLGB(model2,X_basic_less.iloc[:,1:],X_basic_more.iloc[:,1:])

## LightGBM on TF-IDF features

In [None]:
# Hold out 20% of the TF-IDF rows; this rebinds X_tr / X_val / y_tr / y_val
# used by `objective`.
X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.2, random_state=666)

def create_model(trial):
    """Build an ``LGBMClassifier`` with hyper-parameters sampled from ``trial``.

    Args:
        trial: Optuna trial used to sample each hyper-parameter.

    Returns:
        An unfitted ``lgb.LGBMClassifier`` seeded with ``random_state=666``.

    Notes:
        ``min_child_samples``, ``subsample`` and ``colsample_bytree`` are
        LightGBM aliases of ``min_data_in_leaf``, ``bagging_fraction`` and
        ``feature_fraction``. Sampling both names of a pair added search
        dimensions that LightGBM ignores (with a warning), so only the
        primary names are sampled here, over their original ranges.
        ``suggest_float`` replaces the deprecated ``suggest_uniform``.
    """
    params = {
        'num_leaves': trial.suggest_int('num_leaves', 2, 31),
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'learning_rate': trial.suggest_float('learning_rate', 0.0001, 0.99),
        'max_depth': trial.suggest_int('max_depth', 3, 8),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 5, 90),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.0001, 1.0),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.0001, 1.0),
    }
    return lgb.LGBMClassifier(random_state=666, **params)

def objective(trial):
    """Score one Optuna trial: accuracy of the trial's model on the hold-out split."""
    candidate = create_model(trial)
    candidate.fit(X_tr, y_tr)
    return accuracy_score(y_val, candidate.predict(X_val))

# `objective` returns hold-out accuracy, so the study maximizes it.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=40)
# Best sampled hyper-parameters, keyed by the names passed to trial.suggest_*.
params = study.best_params

print(params)

# Refit on all rows. study.best_params holds only the sampled values, so the
# seed used by create_model during the search is passed again here; otherwise
# the final model would be seeded differently from the tuned configuration.
model3 = lgb.LGBMClassifier(**params, random_state=666)
model3.fit(X, y)

In [None]:
# TF-IDF matrices for both sides of the validation pairs, using the
# already-fitted vectorizer.
X_less_toxic = vec.transform(df_val['less_toxic'])
X_more_toxic = vec.transform(df_val['more_toxic'])

In [None]:
params

In [None]:
# Accuracy on the rows the model was fitted on (not a held-out estimate).
accuracy_score(y, y_pred = model3.predict(X))
p1 = model3.predict_proba(X_less_toxic)
p2 = model3.predict_proba(X_more_toxic)
# Share of pairs where 'more_toxic' receives the higher class-1 probability.
(p1[:, 1] < p2[:, 1]).mean()
# Possibly overfitting; try tuning the hyperparameters...

In [None]:
# Disabled experiment, kept as a bare string so it never runs. `adjparams` is
# not defined anywhere in the visible part of this file.
'''
model4 = lgb.LGBMClassifier(**adjparams)
model4.fit(X, y)

accuracy_score(y, y_pred = model4.predict(X))
p1 = model4.predict_proba(X_less_toxic)
p2 = model4.predict_proba(X_more_toxic)
(p1[:, 1] < p2[:, 1]).mean()
'''

# Ensemble validation

In [None]:
# Per-model scores, measured on each model's own training rows and used
# below as blending weights.
acc1 = accuracy_score(model.predict(X), y)
constant_pred = np.repeat(np.mean(y2), len(y2))
model2_mse = mean_squared_error(y2, model2.predict(X2))
# NOTE(review): acc2 is an error ratio (MSE relative to predicting the mean of
# y2), so lower is better, whereas acc1 and acc3 are accuracies. Confirm that
# using it directly as a weight is intended.
acc2 = model2_mse / mean_squared_error(y2, constant_pred)
acc3 = accuracy_score(y, model3.predict(X))
acc1,acc2,acc3

In [None]:
def _blend(tfidf_rows, basic_rows):
    # Weighted sum of the three models' scores for one set of comments.
    nb_part = acc1 * model.predict_proba(tfidf_rows)[:,1]
    # iloc[:, 1:] drops the leading 'comment_text' column before predicting.
    basic_part = 0.5 * acc2 * model2.predict(basic_rows.iloc[:,1:])
    lgb_part = acc3 * model3.predict_proba(tfidf_rows)[:,1]
    return nb_part + basic_part + lgb_part

p1 = _blend(X_less_toxic, X_basic_less)
p2 = _blend(X_more_toxic, X_basic_more)
# Share of validation pairs ranked in the expected order by the blend.
(p1 < p2).mean()

# Submission

In [None]:
df_sub = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")
df_sub.head()
# Vectorize with the TF-IDF vocabulary fitted on the training comments.
X_test = vec.transform(df_sub['text'])

In [None]:
# basic_feature reads the 'comment_text' column, so rename 'text' accordingly.
text_sub = df_sub.text.to_frame().rename(columns={'text':'comment_text'})
X_basic_sub = basic_feature(text_sub)

In [None]:
# Same weighted blend as used on the validation pairs, applied to the
# comments to score.
nb_part = acc1 * model.predict_proba(X_test)[:,1]
# iloc[:, 1:] drops the leading 'comment_text' column before predicting.
basic_part = 0.5 * acc2 * model2.predict(X_basic_sub.iloc[:,1:])
lgb_part = acc3 * model3.predict_proba(X_test)[:,1]
p3 = nb_part + basic_part + lgb_part

In [None]:
# Attach the blended score to each comment.
df_sub['score'] = p3

In [None]:
df_sub['score'].count()

In [None]:
# Non-null scores minus distinct scores, i.e. how many scores duplicate another one.
# 2 comments will fail if compared one with the other
df_sub['score'].count()-df_sub['score'].nunique()

In [None]:
# Only the id and score columns are written, without the index.
df_sub[['comment_id', 'score']].to_csv('submission3.csv', index=False)

In [None]:
df_sub.head()