In [None]:
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import gensim

from wordcloud import WordCloud, STOPWORDS
import nltk
nltk.download('stopwords')
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Flatten, Dropout, Dense, LSTM, Embedding
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.callbacks import EarlyStopping

from sklearn.metrics import confusion_matrix, accuracy_score

In [None]:
# Root directory that every dataset path built with `base_path + ...` is resolved against.
base_path = '../input/'

# Ridge Regression Ensemble

In [None]:
import pandas as pd
import numpy as np

from sklearn.linear_model import Ridge
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error
from scipy.stats import rankdata

def ridge_cv (vec, X, y, X_test, folds, stratified ):
    """Fit one Ridge model per stratified fold and average the fold predictions.

    Parameters
    ----------
    vec : fitted vectorizer
        Its ``transform`` featurizes the ``less_toxic`` / ``more_toxic``
        columns of the module-level ``df_val`` frame.
    X : feature matrix
        Training features; rows are selected with the fold index arrays.
    y : array
        Regression targets, indexed with the same fold index arrays.
    X_test : feature matrix
        Features of the comments to score.
    folds : int
        Number of cross-validation splits.
    stratified : array
        Labels handed to ``StratifiedKFold.split`` to balance the folds.

    Returns
    -------
    tuple
        ``(p1, p2, preds)``: predictions averaged over all fold models for
        ``df_val['less_toxic']``, ``df_val['more_toxic']`` and ``X_test``.
    """
    # Use the `folds` argument rather than the module-level FOLDS constant.
    kf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=123)
    val_scores = []
    rmse_scores = []
    X_less_toxics = []
    X_more_toxics = []
    preds = []

    # The validation pairs do not depend on the fold, so featurize them once.
    X_less_toxic = vec.transform(df_val['less_toxic'])
    X_more_toxic = vec.transform(df_val['more_toxic'])

    for fold, (train_index, val_index) in enumerate(kf.split(X, stratified)):
        X_train, y_train = X[train_index], y[train_index]
        X_val, y_val = X[val_index], y[val_index]
        model = Ridge()
        model.fit(X_train, y_train)

        # RMSE computed as sqrt(MSE) so no `squared=` keyword is needed
        # (that keyword is deprecated in recent scikit-learn releases).
        rmse_score = np.sqrt(mean_squared_error(y_val, model.predict(X_val)))
        rmse_scores.append(rmse_score)

        p1 = model.predict(X_less_toxic)
        p2 = model.predict(X_more_toxic)

        X_less_toxics.append(p1)
        X_more_toxics.append(p2)

        # Validation accuracy: share of pairs where the "less toxic" comment
        # receives the strictly lower score.
        val_acc = (p1 < p2).mean()
        val_scores.append(val_acc)

        preds.append(model.predict(X_test))

        print(f"FOLD:{fold}, rmse_fold:{rmse_score:.5f}, val_acc:{val_acc:.5f}")

    mean_val_acc = np.mean(val_scores)
    mean_rmse_score = np.mean(rmse_scores)

    # Average the per-fold predictions on df_val. Despite the "OOF" label in
    # the message below, these are fold-model averages, not out-of-fold values.
    p1 = np.mean(np.vstack(X_less_toxics), axis=0)
    p2 = np.mean(np.vstack(X_more_toxics), axis=0)

    val_acc = (p1 < p2).mean()

    print(f"OOF: val_acc:{val_acc:.5f}, mean val_acc:{mean_val_acc:.5f}, mean rmse_score:{mean_rmse_score:.5f}")

    preds = np.mean(np.vstack(preds), axis=0)

    return p1, p2, preds

In [None]:
# Validation pairs: ridge_cv and the LSTM section read its 'less_toxic' / 'more_toxic' columns.
df_val = pd.read_csv(base_path + 'jigsaw-toxic-severity-rating/validation_data.csv')
# Comments to score for the submission: 'text' is vectorized, 'comment_id' is written out.
df_test = pd.read_csv(base_path + 'jigsaw-toxic-severity-rating/comments_to_score.csv')

In [None]:
# Number of cross-validation splits passed to ridge_cv.
FOLDS = 5


def TfidfVec(df):
    """Fit a character n-gram TF-IDF vectorizer on ``df['text']``.

    Returns ``(vec, X, y, X_test)``: the fitted vectorizer, the TF-IDF matrix
    of ``df['text']``, the values of ``df['y']`` and the TF-IDF matrix of the
    module-level ``df_test['text']`` produced by the same vectorizer.
    """
    vectorizer = TfidfVectorizer(
        analyzer='char_wb',
        ngram_range=(4, 6),
        min_df=3,
        max_df=0.5,
    )
    train_matrix = vectorizer.fit_transform(df['text'])
    targets = df["y"].values
    test_matrix = vectorizer.transform(df_test['text'])

    return vectorizer, train_matrix, targets, test_matrix

In [None]:
# Training data of the Toxic Comment Classification Challenge; the shape is printed as a sanity check.
jf_train_df = pd.read_csv(base_path + "jigsaw-toxic-comment-classification-challenge/train.csv")
print(f"jf_train_df:{jf_train_df.shape}")

In [None]:
# Per-label weights added on top of the base score in create_train.
# NOTE(review): `toxic` is defined but never applied below; the base score is
# the row-wise max over all six label columns instead. Confirm this is intended.
toxic = 1.0
severe_toxic = 2.0
obscene = 1.0
threat = 1.0
insult = 1.0
identity_hate = 2.0

def create_train (df):
    """Build a severity-scored, undersampled training frame.

    The target ``y`` is the row-wise maximum of the six label columns plus the
    weighted ``severe_toxic``, ``obscene``, ``threat``, ``insult`` and
    ``identity_hate`` columns. Rows with ``y >= 1`` are all kept; rows with
    ``y == 0`` are sampled down to at most 1.5x the number of kept rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``comment_text`` and the six label columns. It is not
        modified.

    Returns
    -------
    pandas.DataFrame
        Columns ``text``, ``y`` and the six label columns; scored rows first,
        followed by the sampled ``y == 0`` rows.
    """
    label_cols = ["toxic","severe_toxic","obscene","threat","insult","identity_hate"]

    score = df[label_cols].max(axis=1)
    score = score + df['severe_toxic']*severe_toxic
    score = score + df['obscene']*obscene
    score = score + df['threat']*threat
    score = score + df['insult']*insult
    score = score + df['identity_hate']*identity_hate

    # assign() returns a new frame, so the caller's DataFrame is left untouched.
    out = df.assign(y=score)[['comment_text', 'y'] + label_cols].rename(columns={'comment_text': 'text'})

    #undersample non toxic comments  on Toxic Comment Classification Challenge
    scored = out[out['y'] >= 1]
    unscored = out[out['y'] == 0]
    # Cap the request at the rows actually available; sample() raises otherwise.
    n_keep = min(int(len(scored)*1.5), len(unscored))
    df_y0_undersample = unscored.sample(n=n_keep, random_state=201)

    return pd.concat([scored, df_y0_undersample])
 
# Replace the raw frame with the scored, undersampled one and show the target distribution.
jf_train_df = create_train (jf_train_df)
print(jf_train_df['y'].value_counts())

In [None]:
# Ridge ensemble member 1: Toxic Comment Classification Challenge data.
vec, X, y, X_test = TfidfVec(jf_train_df)
# Round the target to whole numbers so it can serve as the stratification label.
stratified = np.around ( y )
jf_p1, jf_p2, jf_preds =  ridge_cv (vec, X, y, X_test, FOLDS, stratified )

In [None]:
js_train_df = pd.read_csv(base_path + "jigsaw-unintended-bias-in-toxicity-classification/train.csv")
print(f"js_train_df:{js_train_df.shape}")
# Keep rows rated by more than 5 annotators. The explicit copy makes the column
# assignments below write to an independent frame (no SettingWithCopyWarning).
js_train_df = js_train_df.query ("toxicity_annotator_count > 5").copy()
print(f"juc_train_df:{js_train_df.shape}")

# Candidate target: sum of the six subtype scores.
js_train_df['y'] = js_train_df[[ 'severe_toxicity', 'obscene', 'sexual_explicit','identity_attack', 'insult', 'threat']].sum(axis=1)

# Use `target` itself where target <= 0.5, otherwise keep the subtype sum.
# Vectorized equivalent of a row-wise apply, without the per-row Python call.
js_train_df['y'] = js_train_df['target'].where(js_train_df['target'] <= 0.5, js_train_df['y'])
js_train_df = js_train_df[['comment_text', 'y']].rename(columns={'comment_text': 'text'})

# Undersample the y <= 0.5 rows to 1.5x the number of y > 0.5 rows.
min_len = (js_train_df['y'] > 0.5).sum()
df_y0_undersample = js_train_df[js_train_df['y'] <= 0.5].sample(n=int(min_len*1.5),random_state=201)
js_train_df = pd.concat([js_train_df[js_train_df['y'] > 0.5], df_y0_undersample])

print(js_train_df['y'].value_counts())

In [None]:
# Ridge ensemble member 2: Unintended Bias data.
vec, X, y, X_test = TfidfVec(js_train_df)

# Stratify on the target rounded to one decimal, scaled by 10 and cast to int.
stratified = (np.around (y, decimals=1)*10).astype(int)
js_p1, js_p2, js_preds =  ridge_cv (vec, X, y, X_test, FOLDS, stratified )

In [None]:
# Ruddit data: non-positive offensiveness scores are mapped to 0.0, positive ones are kept as the target.
rud_df = pd.read_csv(base_path + "ruddit-jigsaw-dataset/Dataset/ruddit_with_text.csv")
print(f"rudd_df:{rud_df.shape}")
rud_df['y'] = rud_df['offensiveness_score'].map(lambda x: 0.0 if x <=0 else x)
rud_df = rud_df[['txt', 'y']].rename(columns={'txt': 'text'})
# NOTE(review): min_len is computed here but no undersampling follows in this cell.
min_len = (rud_df['y'] < 0.5).sum()
print(rud_df['y'].value_counts())

In [None]:
# Ridge ensemble member 3: Ruddit data.
vec, X, y, X_test = TfidfVec(rud_df)

# Stratify on the target rounded to one decimal, scaled by 10 and cast to int.
stratified = (np.around ( y, decimals = 1  )*10).astype(int)
rud_p1, rud_p2, rud_preds =  ridge_cv (vec, X, y, X_test, FOLDS, stratified )

# jigsaw-toxic-comment-classification-challenge Dataset

In [None]:
# Reload the raw challenge data; this rebinds jf_train_df, discarding the scored frame built for the Ridge model.
jf_train_df = pd.read_csv(base_path + 'jigsaw-toxic-comment-classification-challenge/train.csv')
jf_train_df.head()

In [None]:
# (rows, columns) of the reloaded frame.
jf_train_df.shape

In [None]:
# Binary label: 1 when the six label columns sum to more than 0, else 0.
jf_train_df['toxicity'] = (jf_train_df[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].sum(axis=1) > 0).astype(int)
jf_train_df = jf_train_df[['comment_text', 'toxicity']].rename(columns={'comment_text': 'text'})
jf_train_df.toxicity.value_counts()

In [None]:
# Column selection; the frame already holds exactly these two columns after the previous cell.
jf_train_df = jf_train_df[['text', 'toxicity']]
jf_train_df.head()

# Jigsaw Unintended Bias in Toxicity Classification Dataset

In [None]:
# Reload the raw Unintended Bias data; this rebinds js_train_df, discarding the frame built for the Ridge model.
js_train_df = pd.read_csv(base_path + 'jigsaw-unintended-bias-in-toxicity-classification/train.csv')
js_train_df.head()

In [None]:
# (rows, columns) of the reloaded frame.
js_train_df.shape

In [None]:
# Binary label: 1 when `target` plus the six subtype scores sum to more than 0, else 0.
js_train_df['toxicity'] = (js_train_df[['target', 'severe_toxicity', 'obscene', 'threat', 'insult', 'identity_attack', 'sexual_explicit']].sum(axis=1) > 0).astype(int)
js_train_df = js_train_df[['comment_text', 'toxicity']].rename(columns={'comment_text': 'text'})
js_train_df.toxicity.value_counts()

In [None]:
# Column selection; the frame already holds exactly these two columns after the previous cell.
js_train_df = js_train_df[['text', 'toxicity']]
js_train_df.head()

# Ruddit: Norms of Offensiveness for English Reddit Comments Dataset

In [None]:
# Reload the raw Ruddit data (rebinds rud_df). The path is built from base_path
# like every other dataset in this notebook so the data root is set in one place.
rud_df = pd.read_csv(base_path + "ruddit-jigsaw-dataset/Dataset/ruddit_with_text.csv")
rud_df.head()

In [None]:
# (rows, columns) of the reloaded frame.
rud_df.shape

In [None]:
# Binary label: 0 for non-positive offensiveness scores, 1 otherwise.
rud_df['toxicity'] = rud_df['offensiveness_score'].map(lambda x: 0 if x <=0 else 1)
rud_df = rud_df[['txt', 'toxicity']].rename(columns={'txt': 'text'})
rud_df.toxicity.value_counts()

In [None]:
# Column selection; the frame already holds exactly these two columns after the previous cell.
rud_df = rud_df[['text', 'toxicity']]
rud_df.head()

# Make one train data set

In [None]:
# Stack the three binary-labelled frames; the original row indices are kept, so index values can repeat.
df = pd.concat([jf_train_df, js_train_df, rud_df])
df.toxicity.value_counts()

In [None]:
# (rows, columns) of the combined frame.
df.shape

In [None]:
# sampling
min_len = (df['toxicity'] == 1).sum()
df_undersample = df[df['toxicity'] == 0].sample(n=min_len, random_state=201)
df = pd.concat([df_undersample, df[df['toxicity'] == 1]])
df = shuffle(df)

In [None]:
# Class counts after balancing.
df.toxicity.value_counts()

In [None]:
# Replace newlines with spaces in every comment; the lambda calls str.replace, so each value must be a string.
df.text = df.text.map(lambda x:x.replace('\n', ' '))
df.text[:2]

# Text Pre-Processing

In [None]:
# Split label and features. `y` is rebound here: it previously held the Ridge regression targets.
y = df.toxicity
x = df.drop('toxicity', axis=1)

In [None]:
# Independent copy of the feature frame with a fresh 0..n-1 index so rows can be addressed by position.
texts = x.reset_index(drop=True)
texts.head()

In [None]:
# Show the interpreter's current recursion limit.
print(sys.getrecursionlimit())

In [None]:
# Raise the recursion limit to 6000.
# NOTE(review): nothing visible in this notebook recurses; confirm this is still needed.
sys.setrecursionlimit(6000)

# Stemming

In [None]:
ps = PorterStemmer()
# Build the stop-word set once: calling stopwords.words('english') for every
# token rebuilds the list and scans it linearly each time.
stop_words = set(stopwords.words('english'))
corpus = []

for raw_text in tqdm(texts['text']):
    # Keep letters only, lower-case, split on whitespace.
    tokens = re.sub('[^a-zA-Z]', ' ', raw_text).lower().split()
    # Drop stop words, stem what remains and re-join into one string per comment.
    corpus.append(' '.join(ps.stem(word) for word in tokens if word not in stop_words))

# Embedding

In [None]:
# Dimensionality of the word vectors; reused for the embedding matrix and the Embedding layer.
DIM = 100

# `X` is rebound to the tokenized corpus (one list of tokens per comment).
X = [d.split() for d in corpus]
# min_count = 1 keeps every token of the corpus in the Word2Vec vocabulary.
w2v_model = gensim.models.Word2Vec(sentences = X, vector_size = DIM, window = 10, min_count = 1)

In [None]:
# Number of distinct tokens known to the Word2Vec model.
len(w2v_model.wv.key_to_index.keys())

In [None]:
# Sanity check of the learned vectors: nearest neighbours of the token 'toxic'.
w2v_model.wv.most_similar('toxic')

In [None]:
# Build the word -> integer index from the tokenized corpus (X holds lists of tokens at this point).
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)

In [None]:
# Rebind X to the integer sequences produced by the tokenizer.
X = tokenizer.texts_to_sequences(X)
X[:3]

In [None]:
# Force every sequence to length 20, padding at the front; 20 matches input_length of the Embedding layer.
X = pad_sequences(X, padding='pre', maxlen=20)
X[:3]

In [None]:
# One more than the number of indexed words, so that the largest word index is a valid row.
vocab_size = len(tokenizer.word_index) + 1
# word -> integer index mapping, read by get_weights_matrix.
vocab = tokenizer.word_index

In [None]:
def get_weights_matrix(model):
    """Build the embedding matrix for the tokenizer vocabulary.

    Row ``i`` holds the Word2Vec vector of the word whose tokenizer index is
    ``i``. Rows for indices without a word, and for words the Word2Vec model
    does not know, stay all-zero.

    Parameters
    ----------
    model : trained gensim Word2Vec model (its ``wv`` attribute is read).

    Returns
    -------
    numpy.ndarray of shape ``(vocab_size, DIM)``; ``vocab_size``, ``DIM`` and
    ``vocab`` are module-level names.
    """
    weights_matrix = np.zeros((vocab_size, DIM))
    known_words = model.wv.key_to_index

    for word, i in vocab.items():
        # Skip words missing from the Word2Vec vocabulary instead of raising KeyError.
        if word in known_words:
            weights_matrix[i] = model.wv[word]

    return weights_matrix

embedding_vectors = get_weights_matrix(w2v_model)

# Modeling & Training

In [None]:
# Embedding (initialised from the Word2Vec matrix) -> LSTM -> two dense blocks
# -> single linear output, with dropout of 0.2 after every stage.
model = Sequential([
    Embedding(vocab_size, output_dim=DIM, weights=[embedding_vectors], input_length=20),
    Dropout(0.2),
    LSTM(64),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='linear'),
])

In [None]:
# The single linear output is trained as a regression (MSE) on the 0/1 toxicity label; accuracy is tracked as a metric.
model.compile(loss='mean_squared_error', optimizer='adam', metrics='accuracy')
model.summary()

In [None]:
# Hold out 20% of the rows for validation (fixed seed).
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)

# Stop on the held-out loss rather than the training loss, so that training
# halts, and the best weights are restored, once validation stops improving.
es = EarlyStopping(patience=3,
                  monitor='val_loss',
                  restore_best_weights=True,
                  mode='min',
                  verbose=1)

hist = model.fit(X_train, y_train, validation_data = (X_val, y_val), epochs=10, callbacks=[es], batch_size=32)

In [None]:
plt.style.use('fivethirtyeight')

# Training vs. validation accuracy per epoch.
fig, ax = plt.subplots()
ax.plot(hist.history['accuracy'])
ax.plot(hist.history['val_accuracy'])
ax.set_ylabel('accuracy')
ax.set_xlabel('epoch')
ax.legend(['train', 'val'], loc='upper left')
plt.show()

# Preparing LSTM predictions for the ensemble

In [None]:
# tokenizer for LSTM
X_less_toxic = tokenizer.texts_to_sequences(df_val['less_toxic'])
X_less_toxic = pad_sequences(X_less_toxic, maxlen=20)
X_more_toxic = tokenizer.texts_to_sequences(df_val['more_toxic'])
X_more_toxic = pad_sequences(X_more_toxic, maxlen=20)
new_text = tokenizer.texts_to_sequences(df_test.text)
new_text = pad_sequences(new_text, maxlen=20)

In [None]:
# make predict value to list for Ensemble
lstm_p1 = model.predict(X_less_toxic)
lstm_p2 = model.predict(X_more_toxic)
lstm_preds = np.hstack(model.predict(new_text))

# Submission

In [None]:
# Make sure the LSTM outputs are 1-D: an (n, 1) array added to the (n,) Ridge
# outputs would broadcast to an (n, n) matrix and corrupt the accuracy below.
lstm_p1 = np.ravel(lstm_p1)
lstm_p2 = np.ravel(lstm_p2)

# Per-model scale: the largest prediction seen on either side of the validation pairs.
jf_max = max(jf_p1.max() , jf_p2.max())
js_max = max(js_p1.max() , js_p2.max())
rud_max = max(rud_p1.max() , rud_p2.max())
lstm_max = max(lstm_p1.max(), lstm_p2.max())


# Equal-weight sum of the max-scaled predictions of the four models.
p1 = jf_p1/jf_max + js_p1/js_max + rud_p1/rud_max + lstm_p1/lstm_max
p2 = jf_p2/jf_max + js_p2/js_max + rud_p2/rud_max + lstm_p2/lstm_max

# Share of validation pairs where the "less toxic" comment scores strictly lower.
val_acc = (p1 < p2).mean()
print(f"Ensemble: val_acc:{val_acc:.5f}")

In [None]:
# Ensemble score for the comments to submit, using the same per-model scaling as the validation cell.
score = jf_preds/jf_max + js_preds/js_max + rud_preds/rud_max + lstm_preds/lstm_max
# Convert the scores to ordinal ranks so every comment gets a distinct value (ties are broken by position).
df_test['score'] = rankdata(score, method='ordinal')

# Write the submission file with the comment id and its rank.
df_test[['comment_id', 'score']].to_csv("submission.csv", index=False)

df_test.head()