In [None]:
import numpy as np
import pandas as pd
import os
import time
import gc
import random
import nltk

Using TensorFlow backend.


In [None]:
# Locations of the pretrained word-vector files, relative to the notebook's
# working directory. Both are passed to `build_matrix` further down.
CRAWL_EMBEDDING_PATH = '../input/fasttext-crawl-300d-2m/crawl-300d-2M.vec'
GLOVE_EMBEDDING_PATH = '../input/glove840b300dtxt/glove.840B.300d.txt'

In [None]:
# None is a placeholder: the value is filled in from the tokenizer's vocabulary
# size (len(word_index) + 1) once the tokenizer has been fitted.
max_features=None

In [None]:
def get_coefs(word, *arr):
    """Pair an embedding token with its coefficients as a float32 array.

    Args:
        word: The token (first field of an embedding-file line).
        *arr: The remaining fields, each convertible to a float.

    Returns:
        A ``(word, vector)`` tuple where ``vector`` is a float32 NumPy array
        built from ``arr``.
    """
    vector = np.asarray(arr, dtype=np.float32)
    return word, vector

def load_embeddings(path):
    """Read a text-format embedding file into a ``{token: vector}`` dict.

    Each line is stripped, split on single spaces and handed to ``get_coefs``:
    the first field becomes the key, the remaining fields the float32 vector.

    Args:
        path: Path to a whitespace-separated embedding file.

    Returns:
        dict mapping each token to its float32 NumPy vector. If a token occurs
        more than once, the last occurrence wins.
    """
    # The encoding is given explicitly: these files contain non-ASCII tokens,
    # and relying on the platform default can raise UnicodeDecodeError or
    # produce mangled keys on systems whose locale is not UTF-8.
    with open(path, encoding='utf-8') as f:
        return dict(get_coefs(*line.strip().split(' ')) for line in tqdm(f))

def build_matrix(word_index, path, embedding_dim=300):
    """Build an embedding matrix whose rows follow ``word_index``.

    Args:
        word_index: Mapping of token -> integer row index. Indices are expected
            to lie in ``1..len(word_index)`` (row 0 is left as zeros).
        path: Embedding file to load via ``load_embeddings``.
        embedding_dim: Expected length of every embedding vector. Defaults to
            300, the value that was previously hard-coded.

    Returns:
        Tuple ``(embedding_matrix, unknown_words)``:
        * ``embedding_matrix`` has shape ``(len(word_index) + 1, embedding_dim)``;
          rows of tokens without a usable vector stay all-zero.
        * ``unknown_words`` lists those tokens, in ``word_index`` iteration order.
    """
    embedding_index = load_embeddings(path)
    embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
    unknown_words = []

    for word, i in word_index.items():
        vector = embedding_index.get(word)
        # A vector of the wrong length must not be assigned: NumPy would
        # silently broadcast a 1-element vector across the whole row. That
        # happens, for example, when the "<count> <dim>" header line of a
        # fastText .vec file is parsed as a token with a single coefficient.
        # Such entries are treated as unknown instead.
        if vector is None or vector.shape != (embedding_dim,):
            unknown_words.append(word)
            continue
        embedding_matrix[i] = vector
    return embedding_matrix, unknown_words

In [None]:
def preprocess(data):
    '''
    Replace punctuation and special symbols in every entry with a space.

    Credit goes to https://www.kaggle.com/gpreda/jigsaw-fast-compact-solution

    Args:
        data: pandas Series of texts. Entries are converted with ``astype(str)``
            first, so non-string values are cleaned in their string form.

    Returns:
        A Series with the same index in which every character listed in
        ``punct`` has been replaced by a single space. The replacement is one
        for one, so the string length is unchanged.
    '''
    # The backslash before '×' is written as an explicit '\\'. The previous
    # '\×' was an invalid escape sequence (a warning on current Python) that
    # produced the same two characters.
    punct = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~`" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\\×™√²—–&'

    # One translation table gives a single pass per text instead of one
    # str.replace pass per punctuation character.
    to_space = {ord(ch): ' ' for ch in punct}

    data = data.astype(str).apply(lambda x: x.translate(to_space))
    return data

# Preprocessing

In [None]:
valid_size = 100000
num_to_load = 200000
aux_columns = ['target', 'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat']

train = pd.read_csv('../input/jigsaw-unintended-bias-in-toxicity-classification/train.csv')

# The evaluation set is carved out of train.csv (its last `valid_size` rows)
# rather than read from the competition's test.csv, so labels are available.
# It is copied so prediction columns can be added to it later on.
test = train.tail(valid_size).copy()
train = train.head(num_to_load)

# Training split: cleaned text, binary label (target >= 0.5) and auxiliary targets.
x_train = preprocess(train['comment_text'])
y_train = (train['target'].values >= 0.5).astype(int)
y_aux_train = train[aux_columns]

# Held-out split, prepared the same way.
x_test = preprocess(test['comment_text'])
y_test = (test['target'].values >= 0.5).astype(int)
y_aux_test = test[aux_columns]

In [None]:
# Check that texts, binary labels and auxiliary targets have matching row counts.
x_train.shape, y_train.shape, y_aux_train.shape, x_test.shape, y_test.shape, y_aux_test.shape, 

((200000,), (200000,), (200000, 6), (100000,), (100000,), (100000, 6))

In [None]:
# Inspect the cleaned training comments returned by `preprocess`.
x_train

0         This is so cool  It s like   would you want yo...
1         Thank you   This would make my life a lot less...
2         This is such an urgent design problem  kudos t...
3         Is this something I ll be able to install on m...
4                      haha you guys are a bunch of losers 
5                                      ur a sh tty comment 
6                               hahahahahahahahhha suck it 
7                                       FFFFUUUUUUUUUUUUUUU
8         The ranchers seem motivated by mostly by greed...
9         It was a great show  Not a combo I d of expect...
10                                  Wow  that sounds great 
11        This is a great story  Man  I wonder if the pe...
12           This seems like a step in the right direction 
13        It s ridiculous that these guys are being call...
14        This story gets more ridiculous by the hour  A...
15        I agree  I don t want to grant them the legiti...
16        Interesting  I ll be curious t

In [None]:
# The vocabulary is built from both splits, so every token in either split
# gets an index.
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(list(x_train) + list(x_test))

# Texts -> integer id sequences -> padded to MAX_LEN.
# NOTE: x_train / x_test are overwritten here, so this cell is not re-runnable
# without re-running the preprocessing cell first.
x_train = sequence.pad_sequences(tokenizer.texts_to_sequences(x_train), maxlen=MAX_LEN)
x_test = sequence.pad_sequences(tokenizer.texts_to_sequences(x_test), maxlen=MAX_LEN)

In [None]:
# Fall back to the full vocabulary size (+1 for the unused index 0) when no
# explicit cap was configured.
if not max_features:
    max_features = len(tokenizer.word_index) + 1
max_features

120755

In [None]:
# fastText (crawl) vectors for the tokenizer vocabulary, plus the tokens
# that had no vector.
crawl_matrix, unknown_words_crawl = build_matrix(
    tokenizer.word_index,
    CRAWL_EMBEDDING_PATH,
)
print('n unknown words (crawl): ', len(unknown_words_crawl))

n unknown words (crawl):  36875


In [None]:
# GloVe vectors for the tokenizer vocabulary, plus the tokens that had no vector.
glove_matrix, unknown_words_glove = build_matrix(
    tokenizer.word_index,
    GLOVE_EMBEDDING_PATH,
)
print('n unknown words (glove): ', len(unknown_words_glove))

n unknown words (glove):  35435


In [None]:
# Place the two embedding matrices side by side: each vocabulary row holds the
# fastText vector followed by the GloVe vector.
embedding_matrix = np.concatenate((crawl_matrix, glove_matrix), axis=1)

# The individual matrices are no longer needed; release their memory.
del crawl_matrix, glove_matrix
gc.collect()

0

# Training

# 1. TF-IDF featurization

### 1.1. Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# The vectorizer is fitted on the training comments only; the held-out
# comments are projected onto that same vocabulary.
tf_model = TfidfVectorizer(
    min_df=5,
    max_features=50000,
)
tf_train = tf_model.fit_transform(train['comment_text'])
tf_test = tf_model.transform(test['comment_text'])

In [None]:
# Both matrices must have the same number of columns (the fitted vocabulary size).
tf_train.shape, tf_test.shape

((200000, 34053), (100000, 34053))

In [None]:
# A fixed random_state makes the forest, and every metric derived from it,
# reproducible across kernel restarts. n_jobs=-1 builds the trees on all
# available cores and does not change the result.
rf = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
rf.fit(tf_train, y_train)
rf_pred = rf.predict_proba(tf_test)
# y_train is 0/1, so column 1 of predict_proba is the probability of label 1.
test['rf'] = rf_pred[:, 1]

In [None]:
from sklearn import metrics

MODEL_NAME = 'rf'
# Kept as a module-level name in case the metric helpers read it as a global.
TOXICITY_COLUMN = 'target'

# test['rf'] already holds class-1 probabilities from predict_proba, so it is
# scored as is. The earlier torch.sigmoid pass squashed those probabilities a
# second time and overwrote the column, which made the cell non-idempotent
# (every re-run changed the stored predictions) and required an otherwise
# unused torch import. AUC-style metrics depend only on the ranking of the
# scores, so they should be unaffected by dropping that monotonic transform.
bias_metrics_df = compute_bias_metrics_for_model(test, identity_columns, MODEL_NAME, TOXICITY_COLUMN)
get_final_metric(bias_metrics_df, calculate_overall_auc(test, MODEL_NAME))

0.8712983144401405

In [None]:
# Per-subgroup bias metrics computed in the previous cell (MODEL_NAME = 'rf').
bias_metrics_df

Unnamed: 0,bnsp_auc,bpsn_auc,subgroup,subgroup_auc,subgroup_size
6,0.921959,0.765978,black,0.753905,759
7,0.925372,0.775918,white,0.766114,1389
2,0.929267,0.772655,homosexual_gay_or_lesbian,0.771203,735
5,0.930215,0.787986,muslim,0.830109,814
8,0.938255,0.804347,psychiatric_or_mental_illness,0.8411,315
1,0.88177,0.906542,female,0.859397,3501
4,0.893053,0.896062,jewish,0.870143,277
0,0.905913,0.890344,male,0.880728,2560
3,0.891497,0.910346,christian,0.89025,1896


### 1.2. Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression on the TF-IDF features. fit() returns the estimator
# itself, so construction and training are chained.
lr = LogisticRegression(C=10).fit(tf_train, y_train)
lr_pred = lr.predict_proba(tf_test)
# Keep only the probability of the positive class (column 1).
test['lr'] = lr_pred[:, 1]

In [None]:
MODEL_NAME = 'lr'
# Kept as a module-level name in case the metric helpers read it as a global.
TOXICITY_COLUMN = 'target'

# test['lr'] already holds class-1 probabilities from predict_proba, so it is
# scored as is. The earlier torch.sigmoid pass squashed those probabilities a
# second time and overwrote the column, which made the cell non-idempotent and
# required an otherwise unused torch import. AUC-style metrics depend only on
# the ranking of the scores, so they should be unaffected.
bias_metrics_df = compute_bias_metrics_for_model(test, identity_columns, MODEL_NAME, TOXICITY_COLUMN)
get_final_metric(bias_metrics_df, calculate_overall_auc(test, MODEL_NAME))

0.8797335758375741

In [None]:
# Per-subgroup bias metrics computed in the previous cell (MODEL_NAME = 'lr').
bias_metrics_df

Unnamed: 0,bnsp_auc,bpsn_auc,subgroup,subgroup_auc,subgroup_size
2,0.930951,0.761614,homosexual_gay_or_lesbian,0.773135,735
6,0.930414,0.762808,black,0.786658,759
5,0.911243,0.81369,muslim,0.799155,814
7,0.941146,0.777626,white,0.811635,1389
1,0.918119,0.884744,female,0.882017,3501
0,0.927893,0.87064,male,0.882635,2560
4,0.936426,0.861494,jewish,0.887701,277
8,0.953043,0.827293,psychiatric_or_mental_illness,0.888695,315
3,0.911464,0.898412,christian,0.892147,1896


# 2. GloVe Embeddings

In [None]:
# Re-declared here with the same value as near the top of the notebook, so this
# section can be run without the earlier configuration cell.
GLOVE_EMBEDDING_PATH = '../input/glove840b300dtxt/glove.840B.300d.txt'

In [None]:
# token -> float32 vector for every line of the GloVe file that parses cleanly.
embeddings_dict = {}
# Raw lines that could not be parsed, kept for inspection.
errors = []

# The encoding is explicit because the file contains non-ASCII tokens; the
# platform default may not be UTF-8.
with open(GLOVE_EMBEDDING_PATH, 'r', encoding='utf-8') as f:
    for line in f:
        try:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], "float32")
        except (IndexError, ValueError):
            # IndexError: blank line (no fields at all).
            # ValueError: a field after the first is not numeric, e.g. when the
            # token itself contains whitespace.
            # Only these parse failures are recorded; anything else (such as
            # KeyboardInterrupt or MemoryError) is no longer swallowed.
            errors.append(line)
        else:
            embeddings_dict[word] = vector

In [None]:
def generate_embeddings_from_text(texts, embeddings=None, tokenize=None):
    """Embed each text as the mean of the vectors of its known tokens.

    Args:
        texts: Iterable of strings.
        embeddings: Optional mapping of token -> 1-D vector. Defaults to the
            module-level ``embeddings_dict``. It must contain the key ``'no'``,
            which is used to read the vector length.
        tokenize: Optional callable turning a string into an iterable of
            tokens. Defaults to ``nltk.tokenize.word_tokenize``.

    Returns:
        float64 array of shape ``(n_texts, dim)``. A text with no token present
        in ``embeddings`` gets an all-zero row. Empty ``texts`` yields an array
        of shape ``(0, dim)``, so it can still be passed on as a 2-D feature
        matrix.

    Raises:
        KeyError: if ``embeddings`` has no ``'no'`` entry.
    """
    if embeddings is None:
        embeddings = embeddings_dict
    if tokenize is None:
        tokenize = nltk.tokenize.word_tokenize

    # The vector length is the same for every text, so it is looked up once
    # instead of once per text.
    dim = embeddings['no'].shape[0]

    emb_mat = []
    for doc in texts:
        known_vectors = [embeddings[token] for token in tokenize(doc) if token in embeddings]
        # Accumulate in float64, in token order.
        sentence_emb = np.zeros(dim)
        for vec in known_vectors:
            sentence_emb += vec
        if known_vectors:
            sentence_emb = sentence_emb / len(known_vectors)
        emb_mat.append(sentence_emb)

    if not emb_mat:
        # np.array([]) would be 1-D with shape (0,); keep the result 2-D.
        return np.zeros((0, dim))
    return np.array(emb_mat)

In [None]:
# Mean-of-word-vectors features for the raw (uncleaned) comments of each split.
glove_train = generate_embeddings_from_text(texts=train['comment_text'])
glove_test = generate_embeddings_from_text(texts=test['comment_text'])

### 2.1. Random Forest

In [None]:
# A fixed random_state makes the forest, and every metric derived from it,
# reproducible across kernel restarts. n_jobs=-1 builds the trees on all
# available cores and does not change the result.
rf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42, n_jobs=-1)
rf.fit(glove_train, y_train)
rf_pred = rf.predict_proba(glove_test)
# y_train is 0/1, so column 1 of predict_proba is the probability of label 1.
test['rf_glove'] = rf_pred[:, 1]

In [None]:
MODEL_NAME = 'rf_glove'
# Kept as a module-level name in case the metric helpers read it as a global.
TOXICITY_COLUMN = 'target'

# test['rf_glove'] already holds class-1 probabilities from predict_proba, so
# it is scored as is. The earlier torch.sigmoid pass squashed those
# probabilities a second time and overwrote the column, which made the cell
# non-idempotent and required an otherwise unused torch import. AUC-style
# metrics depend only on the ranking of the scores, so they should be unaffected.
bias_metrics_df = compute_bias_metrics_for_model(test, identity_columns, MODEL_NAME, TOXICITY_COLUMN)
get_final_metric(bias_metrics_df, calculate_overall_auc(test, MODEL_NAME))

0.8070393202733446

In [None]:
# Per-subgroup bias metrics computed in the previous cell (MODEL_NAME = 'rf_glove').
bias_metrics_df

Unnamed: 0,bnsp_auc,bpsn_auc,subgroup,subgroup_auc,subgroup_size
2,0.828783,0.736331,homosexual_gay_or_lesbian,0.732546,735
7,0.887964,0.666783,white,0.756811,1389
6,0.871147,0.715847,black,0.774645,759
5,0.875299,0.723779,muslim,0.795292,814
0,0.866337,0.75465,male,0.805607,2560
1,0.858083,0.780047,female,0.821051,3501
4,0.845941,0.801471,jewish,0.829947,277
8,0.839903,0.823579,psychiatric_or_mental_illness,0.843681,315
3,0.865707,0.802261,christian,0.851496,1896


### 2.2. Logistic Regression

In [None]:
# Logistic regression on the averaged GloVe features. fit() returns the
# estimator itself, so construction and training are chained.
lr = LogisticRegression(C=10).fit(glove_train, y_train)
lr_pred = lr.predict_proba(glove_test)
# Keep only the probability of the positive class (column 1).
test['lr_glove'] = lr_pred[:, 1]



In [None]:
MODEL_NAME = 'lr_glove'
# Kept as a module-level name in case the metric helpers read it as a global.
TOXICITY_COLUMN = 'target'

# test['lr_glove'] already holds class-1 probabilities from predict_proba, so
# it is scored as is. The earlier torch.sigmoid pass squashed those
# probabilities a second time and overwrote the column, which made the cell
# non-idempotent and required an otherwise unused torch import. AUC-style
# metrics depend only on the ranking of the scores, so they should be unaffected.
bias_metrics_df = compute_bias_metrics_for_model(test, identity_columns, MODEL_NAME, TOXICITY_COLUMN)
get_final_metric(bias_metrics_df, calculate_overall_auc(test, MODEL_NAME))

0.8332227519593121

In [None]:
# Per-subgroup bias metrics computed in the previous cell (MODEL_NAME = 'lr_glove').
bias_metrics_df

Unnamed: 0,bnsp_auc,bpsn_auc,subgroup,subgroup_auc,subgroup_size
6,0.873948,0.728746,black,0.749976,759
2,0.863542,0.759076,homosexual_gay_or_lesbian,0.759111,735
7,0.898582,0.71988,white,0.780258,1389
0,0.891513,0.77191,male,0.816787,2560
5,0.883134,0.772659,muslim,0.818569,814
8,0.866037,0.819641,psychiatric_or_mental_illness,0.831931,315
1,0.891176,0.790959,female,0.834102,3501
4,0.850176,0.858997,jewish,0.85918,277
3,0.861721,0.867139,christian,0.879738,1896
