In [None]:
from __future__ import absolute_import, division, print_function, unicode_literals
import tensorflow as tf
print(tf.__version__)
import json
import os
import numpy as np
import pandas as pd
from datetime import datetime as dt

# ---- Data sizing ----
# Number of rows requested from train.csv (passed as `nrows` when loading).
TRAIN_SIZE = 1804874
# Fraction of the training frame held out for validation.
TEST_PORTION = 0.05
# Every tokenised comment is padded/truncated to this many tokens.
MAX_LENGTH = 200

# ---- Model / training hyper-parameters ----
embedding_dim = 100
conv_feature_size = 256
LSTM_UNITS = 128
DENSE_HIDDEN_UNITS = 4 * LSTM_UNITS
BATCH_SIZE = 1024
num_epochs = 30

# ---- Label columns ----
# Identity annotations used for sample weighting and the bias metrics.
IDENTITY_COLUMNS = [
    'male',
    'female',
    'homosexual_gay_or_lesbian',
    'christian',
    'jewish',
    'muslim',
    'black',
    'white',
    'psychiatric_or_mental_illness',
]
# Auxiliary toxicity sub-type labels trained alongside 'target'.
AUX_COLUMNS = [
    'severe_toxicity',
    'obscene',
    'identity_attack',
    'insult',
    'threat',
]

**Load data**

In [None]:
# Kaggle input locations of the competition CSV files.
train_csv = "../input/jigsaw-unintended-bias-in-toxicity-classification/train.csv"
test_csv = "../input/jigsaw-unintended-bias-in-toxicity-classification/test.csv"

# At most TRAIN_SIZE training rows are read; the test file is read in full.
train_csv_df = pd.read_csv(train_csv, nrows=TRAIN_SIZE)
test_csv_df = pd.read_csv(test_csv)
print('loaded %d records' % len(train_csv_df))

# Coerce comment_text to str in both frames so the text pipeline never
# receives a non-string value.
train_csv_df = train_csv_df.assign(comment_text=train_csv_df['comment_text'].astype(str))
test_csv_df = test_csv_df.assign(comment_text=test_csv_df['comment_text'].astype(str))

Explore the data: inspect what the records look like, how the labels are distributed, and whether some columns are more useful than others.

In [None]:
# see what the data looks like
print(train_csv_df.head())


import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

# plot the distribution for labels
sns.distplot(train_csv_df['target'].values)

# A comment counts as a positive ("true") label when target >= 0.5.
sum_true_label = (train_csv_df['target'].values>=0.5).sum()
print("number of true labels = ",sum_true_label)


# From here on missing annotations are treated as 0 (this rebinds train_csv_df).
train_csv_df = train_csv_df.fillna(0)
sum_aux_label = (train_csv_df[AUX_COLUMNS]>0).sum(axis=0)
print("number of auxiliary labels = \n",sum_aux_label)

# The counts below are taken per ROW (sum over axis=1), so the printed labels
# say "rows"; they previously said "columns", which misdescribed the numbers.
num_of_col_without_aux = ((train_csv_df[AUX_COLUMNS]>0).sum(axis=1)==0).sum()
print("number of rows without auxiliary labels = ",num_of_col_without_aux)


sum_identity_label = (train_csv_df[IDENTITY_COLUMNS]>0).sum(axis=0)
print(IDENTITY_COLUMNS)
print("\nnumber of identity labels = \n",sum_identity_label)

num_of_col_without_identity = ((train_csv_df[IDENTITY_COLUMNS]>0).sum(axis=1)==0).sum()
print("number of rows without identity labels = ",num_of_col_without_identity)


num_of_col_without_other_label = ((train_csv_df[AUX_COLUMNS+ IDENTITY_COLUMNS]>0).sum(axis=1)==0).sum()
print("\nnumber of rows without identity/auxiliary labels = ", num_of_col_without_other_label)

num_of_col_without_any_label = ((train_csv_df[['target']+AUX_COLUMNS+ IDENTITY_COLUMNS]>0.0).sum(axis=1)==0).sum()
print("number of rows without any labels = ", num_of_col_without_any_label)

In [None]:
# Drop the rows in which 'target', every auxiliary label and every identity
# label are all exactly 0: the competition focuses on unintended bias rather
# than absolute label accuracy.
all_label_columns = ['target'] + AUX_COLUMNS + IDENTITY_COLUMNS
has_no_label = (train_csv_df[all_label_columns] == 0.0).all(axis=1)
train_csv_df = train_csv_df.loc[~has_no_label]

print("length of fitlered dataset = ", len(train_csv_df))

**Data preparation**

In [None]:

from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
import re
from nltk.corpus import stopwords
import spacy
nlp = spacy.load('en', parse = True, tag=True, entity=True)
import emoji
import multiprocessing
from sklearn import model_selection

# Lemmatization with spacy
def lemmatize_text(text):
    """Return `text` with every token replaced by its spaCy lemma.

    Tokens whose lemma is the placeholder '-PRON-' keep their original
    surface form. The resulting tokens are joined with single spaces.
    """
    lemmas = []
    for token in nlp(text):
        if token.lemma_ == '-PRON-':
            lemmas.append(token.text)
        else:
            lemmas.append(token.lemma_)
    return ' '.join(lemmas)

# Remove everything apart from number and english letters
def remove_special_characters(text, remove_digits=False):
    """Keep only ASCII letters, (optionally) digits and whitespace.

    Args:
        text: the string to clean.
        remove_digits: when True, digits are deleted as well.

    Returns:
        The cleaned string. Line breaks become a space, every disallowed
        character is deleted (not replaced), and each run of whitespace is
        collapsed to a single space. Leading/trailing spaces are not stripped.
    """
    # Line breaks become a space. A literal backslash followed by "n" is
    # treated the same way: backslashes are deleted by the filter below, so
    # without this an escaped newline would glue the two words together
    # with a stray "n".
    text = re.sub(r'(?:\n|\\n)+', ' ', text)
    # The range must be A-Z: the previous "A-z" range also covered the ASCII
    # characters between 'Z' and 'a' ( [ \ ] ^ _ ` ), which therefore
    # survived the clean-up.
    allowed = r'a-zA-Z\s' if remove_digits else r'a-zA-Z0-9\s'
    text = re.sub('[^' + allowed + ']', '', text)
    return re.sub(r'\s+', ' ', text)

# Stopwords from nltk library, exempting a list of negation words due to their sentimental value
def stopwords_removal(sentence):
    """Drop NLTK English stop words from `sentence`, keeping negation words.

    Note from experiments: stop words removal actually lowers the submission
    score as it potentially removes sentimental information.

    Args:
        sentence: whitespace-separated text.

    Returns:
        The kept words, each followed by a single space (so a non-empty
        result ends with a trailing space); '' when nothing is kept.
    """
    negation_words = {
        'not', 'but', "mightn", "haven't", "hadn", "isn", "didn", "shan't", "weren",
        "don't", "doesn't", "mustn't", "hadn't", "shouldn", "wouldn't", "ain", "mightn't",
        "no", "won't", "hasn", "needn't", "didn't", "doesn", "against", "aren't", "hasn't", "don",
        "nor", "wasn't", "shouldn't", "weren't", "couldn't", "couldn", "won",
    }
    # Set difference instead of set.remove(): a negation word that is absent
    # from the installed stop-word list is simply ignored rather than raising
    # KeyError.
    stop_words = set(stopwords.words('english')) - negation_words
    # Join once instead of growing a string with += inside the loop.
    return ''.join(word + ' ' for word in sentence.split() if word not in stop_words)

# Some comments use alternative spellings for agressive words
WORDS_REPLACER = [
    ("sh*t", "shit"),
    ("s**t", "shit"),
    ("f*ck", "fuck"),
    ("fu*k", "fuck"),
    ("f**k", "fuck"),
    #("f*****g", "fucking"),
    ("f***ing", "fucking"),
    ("f**king", "fucking"),
    ("p*ssy", "pussy"),
    ("p***y", "pussy"),
    ("pu**y", "pussy"),
    ("p*ss", "piss"),
    ("b*tch", "bitch"),
    ("bit*h", "bitch"),
    ("h*ll", "hell"),
    ("h**l", "hell"),
    ("cr*p", "crap"),
    ("d*mn", "damn"),
    ("stu*pid", "stupid"),
    ("st*pid", "stupid"),
    ("n*gger", "nigger"),
    ("n***ga", "nigger"),
    ("f*ggot", "faggot"),
    ("scr*w", "screw"),
    ("pr*ck", "prick"),
    ("g*d", "god"),
    ("s*x", "sex"),
    ("a*s", "ass"),
    ("a**hole", "asshole"),
    ("a***ole", "asshole"),
    #("a**", " ass"),
]

REGEX_REPLACER = [
    (re.compile('\W'+pat.replace("*", "\S")+'\W', flags=re.IGNORECASE), ' '+repl+' ')
    for pat, repl in WORDS_REPLACER
]

# Single whitespace character.
RE_SPACE = re.compile(r"\s")
# Run of one or more whitespace characters.
RE_MULTI_SPACE = re.compile(r"\s+")

# Compiled pattern returned by the `emoji` package.
# NOTE(review): get_emoji_regexp() and UNICODE_EMOJI_ALIAS may not be available
# in newer releases of the `emoji` package — confirm the installed version.
EMOJI_REGEXP = emoji.get_emoji_regexp()

# Maps each key of emoji.UNICODE_EMOJI_ALIAS to " EMJ <alias> ", where <alias>
# is the mapped value with surrounding ':' stripped and '_' turned into spaces.
UNICODE_EMOJI_MY = {
    k: f" EMJ {v.strip(':').replace('_', ' ')} "
    for k, v in emoji.UNICODE_EMOJI_ALIAS.items()
}


# Convert emoji to words/phrases
def my_demojize(string: str) -> str:
    """Replace every EMOJI_REGEXP match in `string` with its text phrase.

    A match that has no entry in UNICODE_EMOJI_MY is left unchanged. Any
    U+FE0F character left after the substitution is removed.
    """
    def _to_phrase(match):
        symbol = match.group(0)
        return UNICODE_EMOJI_MY.get(symbol, symbol)

    demojized = EMOJI_REGEXP.sub(_to_phrase, string)
    return demojized.replace("\ufe0f", "")

# function to call the above processings, lemmatization is left out
# as it cost too much time and in some cases harms the performance
def text_preprocess(sentence):
    """Clean one comment for tokenisation.

    Steps, in order: emoji -> text, restore censored spellings
    (REGEX_REPLACER), strip special characters and digits, lower-case,
    normalise whitespace.

    Args:
        sentence: raw comment text.

    Returns:
        The cleaned, lower-cased text with single spaces and no leading or
        trailing whitespace.
    """
    # Pad with spaces so a censored word at the very start or end of the
    # comment still has a separator on both sides.
    sentence = ' ' + my_demojize(sentence) + ' '
    # Restore censored spellings BEFORE stripping special characters: the
    # spellings in WORDS_REPLACER are written with '*', and
    # remove_special_characters deletes '*', so running the replacer
    # afterwards can never see a censored word.
    for pattern, repl in REGEX_REPLACER:
        sentence = pattern.sub(repl, sentence)
    sentence = remove_special_characters(sentence, True)
    sentence = sentence.lower()
    # Replace literal backslash-n sequences first, then collapse and strip,
    # so the replacement cannot leave double or trailing spaces behind.
    sentence = sentence.replace(r'\n',  ' ')
    return RE_MULTI_SPACE.sub(" ", sentence).strip()

# Process the loaded data, tokenization, fitting and padding
def generate_dataset(train_set_df, test_set_df):
    """Shuffle, weight, clean, tokenise and pad the train/test comments.

    Args:
        train_set_df: training frame with 'comment_text', 'target' and the
            IDENTITY_COLUMNS.
        test_set_df: test frame with 'comment_text'.

    Returns:
        Tuple (no_of_vocab, word_index, test_set_df, training_df, validation_df):
        the vocabulary size, the tokenizer's word -> index mapping, the test
        frame with a 'padded_sequences' column, and the train/validation
        split of the shuffled training frame (both carrying 'sample_weights'
        and 'padded_sequences' columns).
    """
    print("Generating dataset")
    start_time = dt.now()

    train_set_df = train_set_df.sample(frac=1).reset_index(drop=True)
    # Scale the test set by TRAIN_SIZE/1804874 (presumably the full train.csv
    # row count — TODO confirm). .copy() makes the slice an independent frame,
    # so adding 'padded_sequences' below neither warns nor depends on the
    # caller's frame.
    test_set_df = test_set_df[:int(test_set_df.shape[0]*TRAIN_SIZE/1804874)].copy()

    trunc_type = 'post'
    padding_type = 'post'
    oov_tok = "<OOV>"

    # Sample weight = 1 + target * (sum of identity annotations), normalised
    # to mean 1. Plain numpy arrays are used so the arithmetic does not rely
    # on list/Series operator dispatch or index alignment.
    target_labels = train_set_df["target"].to_numpy()
    identity_totals = train_set_df[IDENTITY_COLUMNS].fillna(0).sum(axis=1).to_numpy()
    sample_weights = np.ones(len(train_set_df), dtype=np.float32)
    sample_weights += target_labels * identity_totals
    sample_weights /= sample_weights.mean()
    train_set_df["sample_weights"] = sample_weights

    sentences = train_set_df["comment_text"].values.tolist()
    test_sentences = test_set_df["comment_text"].values.tolist()
    print("pre-process text")
    with multiprocessing.Pool(processes=2) as pool:
        processed_sentences = pool.map(text_preprocess, sentences)
        processed_test_sentences = pool.map(text_preprocess, test_sentences)

    print("tokenizing...")
    tokenizer = Tokenizer(oov_token=oov_tok)
    # The vocabulary is fitted on train and test text together.
    tokenizer.fit_on_texts(processed_sentences+processed_test_sentences)

    word_index = tokenizer.word_index
    no_of_vocab = len(word_index)

    print("fitting...")
    sequences = tokenizer.texts_to_sequences(processed_sentences)
    print("padding...")
    padded = pad_sequences(sequences, padding=padding_type, truncating=trunc_type, maxlen=MAX_LENGTH)
    train_set_df["padded_sequences"] = pd.Series(list(padded), index=train_set_df.index)

    training_df, validation_df = model_selection.train_test_split(train_set_df, test_size=TEST_PORTION)
    print('%d train comments, %d validate comments' % (len(training_df), len(validation_df)))

    # The full-size train/validation sequence arrays that used to be built
    # here were never used or returned, so they are no longer materialised.

    test_sequences = tokenizer.texts_to_sequences(processed_test_sentences)
    test_sequences = pad_sequences(test_sequences, padding=padding_type, truncating=trunc_type,
                                   maxlen=MAX_LENGTH)

    test_set_df["padded_sequences"] = pd.Series(list(test_sequences), index=test_set_df.index)

    print("Dataset generated, time elapsed =", dt.now()-start_time)

    return no_of_vocab, word_index, test_set_df, training_df, validation_df





# Build the dataset once; everything below only reshapes its outputs into the
# arrays used for training, validation and submission.
vocab_size, word_indices, test_df, train_df, val_df = generate_dataset(train_csv_df, test_csv_df)

# The 'padded_sequences' column holds one array per row; rebuild arrays from it.
val_sequences = np.array(val_df["padded_sequences"].to_numpy().tolist())
train_sequences = np.array(train_df["padded_sequences"].to_numpy().tolist())
# Main label alone, and main label followed by the auxiliary labels
# ('target' is therefore column 0 of the aux_* arrays).
val_labels = val_df['target'].values
aux_val_labels = val_df[['target']+AUX_COLUMNS].values
train_labels = train_df['target'].values
aux_train_labels = train_df[['target']+AUX_COLUMNS].values

# Test inputs and the ids written to the submission file.
padded_test_sequences = np.array(test_df["padded_sequences"].to_numpy().tolist())
test_id = test_df["id"].values.tolist()

In [None]:
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding, Dense, Dropout, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D,\
                                    Conv1D, CuDNNLSTM, SpatialDropout1D, add, Input, concatenate

# Setup embedding model from GloVe
def create_embedding_matrix(word_index, embedding_dimension=100,
                            glove_path='../input/glove-global-vectors-for-word-representation/glove.6B.100d.txt'):
    """Build the embedding weight matrix for `word_index` from a GloVe text file.

    Args:
        word_index: mapping word -> row index (indices are expected to lie in
            1..len(word_index)).
        embedding_dimension: length of every vector in the GloVe file.
        glove_path: path to the GloVe text file, one "word v1 v2 ..." entry
            per line. Defaults to the 100-dimensional 6B file.

    Returns:
        Array of shape (len(word_index) + 1, embedding_dimension). Row i holds
        the vector of the word whose index is i; rows of words missing from
        the file (and row 0) stay all-zero.

    Raises:
        ValueError: if a needed vector does not have `embedding_dimension`
            components.
    """
    embeddings_matrix = np.zeros((len(word_index)+1, embedding_dimension))
    # GloVe files are UTF-8; state it explicitly instead of relying on the
    # platform default encoding.
    with open(glove_path, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                continue
            # Only words present in the vocabulary are parsed and kept, so the
            # whole GloVe table is never held in memory.
            row = word_index.get(values[0])
            if row is None:
                continue
            coefs = np.asarray(values[1:], dtype='float32')
            if coefs.shape[0] != embedding_dimension:
                raise ValueError(
                    "GloVe vector for %r has %d dimensions, expected %d"
                    % (values[0], coefs.shape[0], embedding_dimension))
            embeddings_matrix[row] = coefs

    return embeddings_matrix


# Build the convolutional LSTM model, using the auxiliary labels as loss to blend in the information
def create_model_functional_api(no_of_vocab, embedding_dimension, word_index):
    """Build and compile the Conv1D + bidirectional-LSTM classifier.

    Args:
        no_of_vocab: vocabulary size; the embedding layer gets no_of_vocab + 1 rows.
        embedding_dimension: width of the embedding vectors.
        word_index: word -> index mapping handed to create_embedding_matrix.

    Returns:
        A compiled Keras Model (binary cross-entropy loss, Adam optimizer)
        with a single 6-unit sigmoid output. The model summary is printed as
        a side effect.
    """
    # Variable-length sequences of token ids.
    words = Input(shape=(None,))

    embedding_matrix = create_embedding_matrix(word_index, embedding_dimension)

    # Pre-trained embedding weights, kept frozen (trainable=False).
    x = Embedding(no_of_vocab+1, embedding_dimension, weights=[embedding_matrix], trainable=False)(words)
    x = SpatialDropout1D(0.2)(x)
    # 128 filters, kernel size 5, no activation argument given.
    # NOTE(review): the conv_feature_size constant is not used here — confirm intent.
    x = Conv1D(128,5)(x)
    x = Bidirectional(CuDNNLSTM(LSTM_UNITS, return_sequences=True))(x)
    x = Bidirectional(CuDNNLSTM(LSTM_UNITS, return_sequences=True))(x)

    # Summarise the sequence with both max- and average-pooling over time.
    hidden = concatenate([
        GlobalMaxPooling1D()(x),
        GlobalAveragePooling1D()(x),
    ])
    # Two residual dense blocks: each adds the Dense output back onto its input.
    # Presumably the pooled width equals DENSE_HIDDEN_UNITS (LSTM_UNITS*4) so
    # the add() shapes match — TODO confirm.
    hidden = add([hidden, Dense(DENSE_HIDDEN_UNITS, activation='relu')(hidden)])
    hidden = add([hidden, Dense(DENSE_HIDDEN_UNITS, activation='relu')(hidden)])
    # 6 outputs; the notebook trains them on ['target'] + AUX_COLUMNS.
    aux_result = Dense(6, activation='sigmoid')(hidden)

    tf_model = Model(inputs=words, outputs=[aux_result])
    tf_model.compile(loss='binary_crossentropy', optimizer='adam')
    tf_model.summary()

    return tf_model

model = create_model_functional_api(vocab_size, embedding_dim, word_indices)

In [None]:
# Fit the model on the 6-column label array (target + auxiliary labels),
# weighting each training row by its 'sample_weights' value, and report the
# elapsed wall-clock time.
print("Training Start")
start_time = dt.now()

# One weight array per model output; the model has a single output.
sample_weights = train_df["sample_weights"].values
combined_weigths =[sample_weights]

history = model.fit(train_sequences, [aux_train_labels], epochs=num_epochs, batch_size=BATCH_SIZE,
                    validation_data=(val_sequences, [aux_val_labels]),
                    sample_weight=combined_weigths
                    )

print("Training Complete, time elapsed =", dt.now()-start_time)

In [None]:

from sklearn import metrics

# Convert target and identity columns to booleans
def convert_to_bool(df, col_name):
    """Overwrite df[col_name] in place with the boolean `value >= 0.5`."""
    is_positive = (df[col_name] >= 0.5).values
    df[col_name] = is_positive
    
def convert_dataframe_to_bool(df):
    """Return a copy of `df` whose 'target' and identity columns are booleans.

    The input frame is left untouched; each converted column becomes
    `value >= 0.5`.
    """
    bool_df = df.copy()
    for column in ['target', *IDENTITY_COLUMNS]:
        convert_to_bool(bool_df, column)
    return bool_df

# Keep only column 0 of every prediction row; the model is trained on
# ['target'] + AUX_COLUMNS, so column 0 is the 'target' score.
flat_list = [row[0] for row in model.predict(val_sequences)]
val_df["prediction"] = flat_list

booled_val_df = convert_dataframe_to_bool(val_df)

# Column names of the per-subgroup bias metrics.
SUBGROUP_AUC = 'subgroup_auc'
# BPSN: background positive, subgroup negative
BPSN_AUC = 'bpsn_auc'
# BNSP: background negative, subgroup positive
BNSP_AUC = 'bnsp_auc'

def compute_auc(y_true, y_pred):
    """Return the ROC AUC of `y_pred` against `y_true`, or NaN if scoring raises ValueError."""
    try:
        auc = metrics.roc_auc_score(y_true, y_pred)
    except ValueError:
        auc = np.nan
    return auc

def compute_subgroup_auc(df, subgroup, label, model_name):
    """Computes the AUC restricted to the rows where the boolean `subgroup` column is True."""
    members = df[df[subgroup]]
    y_true = members[label]
    y_score = members[model_name]
    return compute_auc(y_true, y_score)

def compute_bpsn_auc(df, subgroup, label, model_name):
    """Computes the AUC of the within-subgroup negative examples and the background positive examples.

    Args:
        df: frame with boolean `subgroup` and `label` columns and a score column.
        subgroup: name of the boolean identity column.
        label: name of the boolean ground-truth column.
        model_name: name of the column holding the model scores.

    Returns:
        The AUC over the combined rows, as returned by compute_auc.
    """
    subgroup_negative_examples = df[df[subgroup] & ~df[label]]
    non_subgroup_positive_examples = df[~df[subgroup] & df[label]]
    # pd.concat instead of DataFrame.append, which was removed in pandas 2.0.
    examples = pd.concat([subgroup_negative_examples, non_subgroup_positive_examples])
    return compute_auc(examples[label], examples[model_name])

def compute_bnsp_auc(df, subgroup, label, model_name):
    """Computes the AUC of the within-subgroup positive examples and the background negative examples.

    Args:
        df: frame with boolean `subgroup` and `label` columns and a score column.
        subgroup: name of the boolean identity column.
        label: name of the boolean ground-truth column.
        model_name: name of the column holding the model scores.

    Returns:
        The AUC over the combined rows, as returned by compute_auc.
    """
    subgroup_positive_examples = df[df[subgroup] & df[label]]
    non_subgroup_negative_examples = df[~df[subgroup] & ~df[label]]
    # pd.concat instead of DataFrame.append, which was removed in pandas 2.0.
    examples = pd.concat([subgroup_positive_examples, non_subgroup_negative_examples])
    return compute_auc(examples[label], examples[model_name])

def compute_bias_metrics_for_model(dataset,
                                   subgroups,
                                   model,
                                   label_col,
                                   include_asegs=False):
    """Computes per-subgroup metrics for all subgroups and one model.

    Returns one row per subgroup (its size plus the subgroup, BPSN and BNSP
    AUCs), sorted by ascending subgroup AUC. `include_asegs` is accepted but
    not used.
    """
    def _record_for(subgroup):
        # Key order determines the column order of the resulting frame.
        return {
            'subgroup': subgroup,
            'subgroup_size': len(dataset[dataset[subgroup]]),
            SUBGROUP_AUC: compute_subgroup_auc(dataset, subgroup, label_col, model),
            BPSN_AUC: compute_bpsn_auc(dataset, subgroup, label_col, model),
            BNSP_AUC: compute_bnsp_auc(dataset, subgroup, label_col, model),
        }

    records = [_record_for(subgroup) for subgroup in subgroups]
    return pd.DataFrame(records).sort_values('subgroup_auc', ascending=True)

bias_metrics_df = compute_bias_metrics_for_model(booled_val_df, IDENTITY_COLUMNS, 'prediction', 'target')
bias_metrics_df

In [None]:
def calculate_overall_auc(df, model_name):
    """Return the ROC AUC of the `model_name` score column against df['target']."""
    return metrics.roc_auc_score(df['target'], df[model_name])

def power_mean(series, p):
    """Return the generalised (power) mean of `series` with exponent `p`.

    Computed as (sum(x ** p) / len(series)) ** (1 / p).
    """
    powered = np.power(series, p)
    mean_of_powers = sum(powered) / len(series)
    return np.power(mean_of_powers, 1 / p)

def get_final_metric(bias_df, overall_auc, POWER=-5, OVERALL_MODEL_WEIGHT=0.25):
    """Blend the overall AUC with the power means of the three bias AUC columns.

    Prints the three power means and the overall AUC, then returns
    OVERALL_MODEL_WEIGHT * overall_auc + (1 - OVERALL_MODEL_WEIGHT) * bias_score,
    where bias_score is the plain average of the three power means.
    """
    subgroup_mean = power_mean(bias_df[SUBGROUP_AUC], POWER)
    bpsn_mean = power_mean(bias_df[BPSN_AUC], POWER)
    bnsp_mean = power_mean(bias_df[BNSP_AUC], POWER)

    print("SUBGROUP_AUC mean =",subgroup_mean)
    print("BPSN_AUC mean =",bpsn_mean)
    print("BNSP_AUC mean =",bnsp_mean)
    print("OVERALL_AUC mean =",overall_auc)

    bias_score = np.average([subgroup_mean, bpsn_mean, bnsp_mean])
    overall_part = OVERALL_MODEL_WEIGHT * overall_auc
    bias_part = (1 - OVERALL_MODEL_WEIGHT) * bias_score
    return overall_part + bias_part

get_final_metric(bias_metrics_df, calculate_overall_auc(booled_val_df, 'prediction'))

In [None]:
# Score the test set and keep column 0 of every prediction row, i.e. the
# 'target' output (the model is trained on ['target'] + AUX_COLUMNS).
prediction = model.predict(padded_test_sequences)
flat_list = [row[0] for row in prediction]

# Write the id/prediction pairs to the submission file.
pd_submission = pd.DataFrame({"id": test_id, "prediction": flat_list})
pd_submission.to_csv("submission.csv", index=False)