# Imports

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import pickle

# Load and Preprocess Train and Test data

In [None]:
# Read the competition training set. The test set is only read later, in the
# "Predictions" section, so the test load below is left commented out.
train_data = pd.read_csv("../input/jigsaw-unintended-bias-in-toxicity-classification/train.csv")
# test_data = pd.read_csv("../input/jigsaw-unintended-bias-in-toxicity-classification/test.csv")

## Cleaning

In [None]:
# Cast every comment to str so that each value handed to the tokenizer is text.
train_data['comment_text'] = train_data['comment_text'].astype(str) 
# test_data['comment_text'] = test_data['comment_text'].astype(str) 

In [None]:
# Identity subgroup columns: these are binarised together with the target and
# later passed as the subgroups for the per-subgroup bias metrics.
identity_columns = [
    'male', 'female', 'homosexual_gay_or_lesbian', 'christian', 'jewish',
    'muslim', 'black', 'white', 'psychiatric_or_mental_illness']

In [None]:
def convert_to_bool(df, col_name):
    """Binarise one column of ``df`` in place at the 0.5 threshold.

    Values ``>= 0.5`` become ``True``; everything else (including NaN, for
    which the comparison is false) becomes ``False``.

    Args:
        df: DataFrame to modify; the column is overwritten in place.
        col_name: Name of the column to binarise.

    Returns:
        None. The result is written back into ``df[col_name]``.
    """
    at_or_above_threshold = df[col_name] >= 0.5
    # Assign the raw boolean array (not the Series) so no index alignment occurs.
    df[col_name] = at_or_above_threshold.to_numpy()
    
def convert_dataframe_to_bool(df):
    """Return a copy of ``df`` with the label and identity columns binarised.

    The ``'target'`` column and every column named in the module-level
    ``identity_columns`` list are thresholded at 0.5 via ``convert_to_bool``.
    The input frame itself is left untouched.

    Args:
        df: DataFrame containing ``'target'`` and all identity columns.

    Returns:
        A new DataFrame with those columns replaced by booleans.
    """
    converted = df.copy()
    columns_to_binarise = ['target', *identity_columns]
    for column in columns_to_binarise:
        convert_to_bool(converted, column)
    return converted

# Replace train_data with its binarised copy: 'target' and the identity columns
# become booleans (True when the original value is >= 0.5).
train_data = convert_dataframe_to_bool(train_data)

# Train / Validation Split

In [None]:
from sklearn.model_selection import train_test_split

# Fixed seed so the 80/20 train/validation partition (and therefore the
# tokenizer vocabulary and every metric computed on validate_df) is the same
# on each Restart & Run All.
SPLIT_SEED = 42
train_df, validate_df = train_test_split(train_data, test_size=0.2, random_state=SPLIT_SEED)
print('%d train comments, %d validate comments' % (len(train_df), len(validate_df)))

# Tokenizer and Embedding Matrix

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Conv1D, MaxPool1D, Dense, Dropout, Flatten, Embedding, concatenate, LSTM, BatchNormalization
from tensorflow.keras import Model
from tensorflow.keras.utils import to_categorical

In [None]:
# Fit the tokenizer on the training split only, so the validation split does
# not influence the vocabulary.
# MAX_VOCAB_SIZE is passed to the Tokenizer as its num_words limit.
MAX_VOCAB_SIZE = 10000
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(train_df['comment_text'])

# Fixed length every tokenised comment is padded/truncated to; also used as the
# model's input length.
MAX_SEQUENCE_LENGTH = 250


def pad_text(texts, tokenizer):
    """Turn raw texts into fixed-length integer sequences.

    Args:
        texts: Iterable of strings to encode.
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences``.

    Returns:
        The result of ``pad_sequences`` with ``maxlen=MAX_SEQUENCE_LENGTH``
        applied to the tokenised texts.
    """
    token_id_sequences = tokenizer.texts_to_sequences(texts)
    return pad_sequences(token_id_sequences, maxlen=MAX_SEQUENCE_LENGTH)

# Custom Loss and Utility Functions

In [None]:
def custom_loss(y_true,y_pred):
    """Per-sample sum of two binary cross-entropies, one per output slice.

    The first term is computed on column 0 (``[:, :1]``) and the second on the
    columns from index 1 onward (``[:, 1:]``). Each term currently has weight 1.

    NOTE(review): the cross-entropy is configured with ``from_logits=True``,
    while the model defined in this notebook ends in a softmax layer and is
    compiled with 'categorical_crossentropy'; this function is not referenced
    by the visible training code. Confirm the intended usage before wiring it in.

    Args:
        y_true: Ground-truth tensor, indexed as ``[batch, column]``.
        y_pred: Prediction tensor with the same layout, treated as logits.

    Returns:
        Unreduced loss tensor (``Reduction.NONE``), one value per sample.
    """
    # A single loss object is reused for both slices; it only carries config.
    unreduced_bce = tf.keras.losses.BinaryCrossentropy(
        from_logits=True,
        reduction=tf.keras.losses.Reduction.NONE,
    )
    class0_loss = unreduced_bce(y_true[:, :1], y_pred[:, :1])
    class1_loss = unreduced_bce(y_true[:, 1:], y_pred[:, 1:])

    # The explicit "* 1" factors are the per-class weights.
    return class0_loss * 1 + class1_loss * 1

In [None]:
from sklearn.metrics import roc_auc_score

In [None]:
# Column names under which the three per-subgroup metrics are stored in the
# bias-metrics DataFrame.
SUBGROUP_AUC = 'subgroup_auc'  # AUC restricted to rows belonging to the subgroup
BPSN_AUC = 'bpsn_auc'  # stands for background positive, subgroup negative
BNSP_AUC = 'bnsp_auc'  # stands for background negative, subgroup positive

def compute_auc(y_true, y_pred):
    """ROC AUC that degrades to NaN instead of raising.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted scores.

    Returns:
        The value of ``roc_auc_score(y_true, y_pred)``, or ``np.nan`` when that
        call raises ``ValueError`` (presumably when the AUC is undefined for
        the given labels, e.g. only one class present).
    """
    try:
        auc = roc_auc_score(y_true, y_pred)
    except ValueError:
        auc = np.nan
    return auc

def compute_subgroup_auc(df, subgroup, label, model_name):
    """AUC computed only on the rows that belong to ``subgroup``.

    Args:
        df: DataFrame holding the subgroup mask, label and score columns.
        subgroup: Name of the column used as a row mask (assumed boolean).
        label: Name of the ground-truth label column.
        model_name: Name of the column with the model's scores.

    Returns:
        ``compute_auc`` of label vs. score over the selected rows.
    """
    member_mask = df[subgroup]
    members = df[member_mask]
    y_true = members[label]
    y_score = members[model_name]
    return compute_auc(y_true, y_score)

def compute_bpsn_auc(df, subgroup, label, model_name):
    """Computes the AUC of the within-subgroup negative examples and the background positive examples.

    Args:
        df: DataFrame holding the subgroup mask, label and score columns.
        subgroup: Name of the subgroup column (must be boolean: it is combined
            with ``&`` and ``~``).
        label: Name of the ground-truth label column (must be boolean).
        model_name: Name of the column with the model's scores.

    Returns:
        ``compute_auc`` of label vs. score over the union of
        (subgroup AND NOT label) rows and (NOT subgroup AND label) rows.
    """
    subgroup_negative_examples = df[df[subgroup] & ~df[label]]
    non_subgroup_positive_examples = df[~df[subgroup] & df[label]]
    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
    examples = pd.concat([subgroup_negative_examples, non_subgroup_positive_examples])
    return compute_auc(examples[label], examples[model_name])

def compute_bnsp_auc(df, subgroup, label, model_name):
    """Computes the AUC of the within-subgroup positive examples and the background negative examples.

    Args:
        df: DataFrame holding the subgroup mask, label and score columns.
        subgroup: Name of the subgroup column (must be boolean: it is combined
            with ``&`` and ``~``).
        label: Name of the ground-truth label column (must be boolean).
        model_name: Name of the column with the model's scores.

    Returns:
        ``compute_auc`` of label vs. score over the union of
        (subgroup AND label) rows and (NOT subgroup AND NOT label) rows.
    """
    subgroup_positive_examples = df[df[subgroup] & df[label]]
    non_subgroup_negative_examples = df[~df[subgroup] & ~df[label]]
    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
    examples = pd.concat([subgroup_positive_examples, non_subgroup_negative_examples])
    return compute_auc(examples[label], examples[model_name])

def compute_bias_metrics_for_model(dataset,
                                   subgroups,
                                   model,
                                   label_col,
                                   include_asegs=False):
    """Computes per-subgroup metrics for all subgroups and one model.

    Args:
        dataset: DataFrame with one boolean column per subgroup, the label
            column and the model's score column.
        subgroups: Iterable of subgroup column names.
        model: Name of the column holding the model's scores.
        label_col: Name of the ground-truth label column.
        include_asegs: Unused; kept so existing callers keep working.

    Returns:
        DataFrame with one row per subgroup and the columns ``'subgroup'``,
        ``'subgroup_size'``, ``SUBGROUP_AUC``, ``BPSN_AUC`` and ``BNSP_AUC``,
        sorted ascending by ``SUBGROUP_AUC``. An empty ``subgroups`` yields an
        empty frame with those columns.
    """
    # Declaring the columns up front keeps sort_values from raising KeyError
    # when there are no subgroups (and therefore no records).
    columns = ['subgroup', 'subgroup_size', SUBGROUP_AUC, BPSN_AUC, BNSP_AUC]
    records = []
    for subgroup in subgroups:
        records.append({
            'subgroup': subgroup,
            'subgroup_size': len(dataset[dataset[subgroup]]),
            SUBGROUP_AUC: compute_subgroup_auc(dataset, subgroup, label_col, model),
            BPSN_AUC: compute_bpsn_auc(dataset, subgroup, label_col, model),
            BNSP_AUC: compute_bnsp_auc(dataset, subgroup, label_col, model),
        })
    # Sort by the shared constant rather than a duplicated string literal.
    return pd.DataFrame(records, columns=columns).sort_values(SUBGROUP_AUC, ascending=True)

def calculate_overall_auc(df, model_name):
    """ROC AUC of one model over the whole frame.

    Unlike ``compute_auc`` this does not catch ``ValueError``; any error from
    ``roc_auc_score`` propagates to the caller.

    Args:
        df: DataFrame containing the ``TOXICITY_COLUMN`` labels and the scores.
        model_name: Name of the column with the model's scores.

    Returns:
        ``roc_auc_score`` of ``df[TOXICITY_COLUMN]`` against ``df[model_name]``.
    """
    return roc_auc_score(df[TOXICITY_COLUMN], df[model_name])

def power_mean(series, p):
    """Generalised (power) mean: ``(mean(x ** p)) ** (1 / p)``.

    Args:
        series: Sized sequence of numbers (list, ndarray or Series).
        p: Exponent of the mean; must be non-zero because ``1 / p`` is taken.

    Returns:
        The power mean of ``series`` with exponent ``p``.
    """
    raised = np.power(series, p)
    mean_of_powers = sum(raised) / len(series)
    return np.power(mean_of_powers, 1 / p)

def get_final_metric(bias_df, overall_auc, POWER=-5, OVERALL_MODEL_WEIGHT=0.25):
    """Blend the overall AUC with the averaged per-subgroup bias AUCs.

    Args:
        bias_df: DataFrame with ``SUBGROUP_AUC``, ``BPSN_AUC`` and ``BNSP_AUC``
            columns (one row per subgroup).
        overall_auc: AUC of the model over the full dataset.
        POWER: Exponent passed to ``power_mean`` for each bias column.
        OVERALL_MODEL_WEIGHT: Weight of ``overall_auc``; the bias score gets
            ``1 - OVERALL_MODEL_WEIGHT``.

    Returns:
        ``OVERALL_MODEL_WEIGHT * overall_auc + (1 - OVERALL_MODEL_WEIGHT) * bias_score``
        where ``bias_score`` is the plain average of the three power means.
    """
    bias_columns = (SUBGROUP_AUC, BPSN_AUC, BNSP_AUC)
    per_metric_means = [power_mean(bias_df[column], POWER) for column in bias_columns]
    bias_score = np.average(per_metric_means)

    overall_part = OVERALL_MODEL_WEIGHT * overall_auc
    bias_part = (1 - OVERALL_MODEL_WEIGHT) * bias_score
    return overall_part + bias_part

# Model 1: Simple CNN + Custom Loss

In [None]:
# Text file of pre-trained word vectors: one word per line followed by its
# coefficients. EMBEDDINGS_DIMENSION is the width of the embedding matrix and
# has to equal the number of coefficients per line in that file.
EMBEDDINGS_PATH = '../input/glove6b/glove.6B.200d.txt'
EMBEDDINGS_DIMENSION = 200
# Dropout applied after the flattened convolutional features.
DROPOUT_RATE = 0.5
# NOTE(review): LEARNING_RATE is not referenced by the visible code; train_model
# compiles with optimizer='adam' (string form). Confirm whether it should apply.
LEARNING_RATE = 0.00001
NUM_EPOCHS = 10
BATCH_SIZE = 128
# Names of the text and label columns in the train/validation/test frames.
TEXT_COLUMN='comment_text'
TOXICITY_COLUMN='target'

In [None]:
def get_embedding_matrix(train_df, validate_df):
    """Prepare model inputs and the pre-trained embedding matrix.

    Pads the tokenised text of both splits, one-hot encodes their labels with
    ``to_categorical``, and builds an embedding matrix whose row ``i`` holds the
    pre-trained vector of the word with tokenizer index ``i`` (rows for words
    missing from the embeddings file stay all-zero, as does row 0).

    Relies on the module-level ``tokenizer``, ``TEXT_COLUMN``,
    ``TOXICITY_COLUMN``, ``EMBEDDINGS_PATH`` and ``EMBEDDINGS_DIMENSION``.

    Args:
        train_df: Training split with the text and label columns.
        validate_df: Validation split with the same columns.

    Returns:
        Tuple ``(train_text, train_labels, validate_text, validate_labels,
        embedding_matrix)``; ``embedding_matrix`` has shape
        ``(len(tokenizer.word_index) + 1, EMBEDDINGS_DIMENSION)``.
    """
    print("Padding tokenized sequences...")
    train_text = pad_text(train_df[TEXT_COLUMN], tokenizer)
    train_labels = to_categorical(train_df[TOXICITY_COLUMN])
    validate_text = pad_text(validate_df[TEXT_COLUMN], tokenizer)
    validate_labels = to_categorical(validate_df[TOXICITY_COLUMN])

    print("Loading Embeddings...")
    word_index = tokenizer.word_index

    # Start from all zeros; +1 because tokenizer indices start at 1.
    embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDINGS_DIMENSION))

    # Explicit UTF-8 so decoding does not depend on the platform default.
    # Rows are filled while streaming the file, so vectors for words outside
    # the tokenizer vocabulary are never parsed or kept in memory.
    with open(EMBEDDINGS_PATH, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                continue  # tolerate blank lines
            row = word_index.get(values[0])
            if row is not None:
                embedding_matrix[row] = np.asarray(values[1:], dtype='float32')
    return train_text, train_labels, validate_text, validate_labels,embedding_matrix

In [None]:
# Build the padded inputs, one-hot labels and embedding matrix for both splits.
train_text, train_labels, validate_text, validate_labels,embedding_matrix = get_embedding_matrix(train_df, validate_df)

In [None]:
def modelone_architecture(embedding_matrix):
    '''Build the (uncompiled) three-block 1-D CNN classifier.

    Architecture: frozen pre-trained embedding -> three Conv1D(128)/MaxPool1D
    stages (kernel sizes 2, 3, 4; pool sizes 5, 5, 40) -> Flatten -> Dropout
    -> Dense(128, relu) -> Dense(2, softmax).

    Side effects: prints the model summary and writes a diagram to
    ``./model_1.png`` via ``tf.keras.utils.plot_model``.

    Args:
        embedding_matrix: 2-D array of shape (vocabulary rows, embedding
            dimension) used as the fixed weights of the embedding layer.

    Returns:
        The ``Model`` mapping ``MAX_SEQUENCE_LENGTH`` token ids to two class
        probabilities.
    '''
    # Size the embedding layer from the matrix actually passed in, so the layer
    # and its weights cannot disagree.
    vocab_rows, embedding_dim = embedding_matrix.shape

    # shape must be a tuple: "(n)" is just the int n, "(n,)" is a 1-tuple.
    input_layer = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    embedding_layer = Embedding(vocab_rows, embedding_dim,
                                weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH,
                                trainable=False)(input_layer)
    conv_layer_1 = Conv1D(128, 2, activation='relu', padding='same')(embedding_layer)
    max_pool_1 = MaxPool1D(5, padding='same')(conv_layer_1)
    conv_layer_2 = Conv1D(128, 3, activation='relu', padding='same')(max_pool_1)
    max_pool_2 = MaxPool1D(5, padding='same')(conv_layer_2)
    conv_layer_3 = Conv1D(128, 4, activation='relu', padding='same')(max_pool_2)
    max_pool_3 = MaxPool1D(40, padding='same')(conv_layer_3)
    flat_layer = Flatten()(max_pool_3)
    drop_layer = Dropout(DROPOUT_RATE)(flat_layer)
    dense_layer = Dense(128, activation='relu')(drop_layer)
    output_layer = Dense(2, activation='softmax')(dense_layer)

    model = Model(input_layer, output_layer)
    model.summary()
    dot_img_file = './model_1.png'
    tf.keras.utils.plot_model(model, to_file=dot_img_file, show_shapes=True)
    return model
    

In [None]:
def train_model(train_text, train_labels, validate_text, validate_labels, model):
    """Compile ``model`` and fit it on the training data.

    The model is compiled with categorical cross-entropy, the ``'adam'``
    optimizer and an accuracy metric, then trained for ``NUM_EPOCHS`` epochs
    with batches of ``BATCH_SIZE``, evaluating on the validation data.

    NOTE(review): the optimizer is given by name, so the module-level
    LEARNING_RATE constant is not applied here — confirm that is intended.

    Args:
        train_text: Padded training sequences.
        train_labels: One-hot training labels.
        validate_text: Padded validation sequences.
        validate_labels: One-hot validation labels.
        model: Uncompiled Keras model; it is compiled and trained in place.

    Returns:
        The same ``model`` object after training.
    """
    compile_options = {
        'loss': 'categorical_crossentropy',
        'optimizer': 'adam',
        'metrics': ['accuracy'],
    }
    model.compile(**compile_options)

    print("Training model now...")
    fit_options = {
        'batch_size': BATCH_SIZE,
        'epochs': NUM_EPOCHS,
        'validation_data': (validate_text, validate_labels),
        'verbose': 2,
    }
    model.fit(train_text, train_labels, **fit_options)

    return model

In [None]:
# Build the uncompiled CNN; this also prints its summary and writes ./model_1.png.
model_1_arch = modelone_architecture(embedding_matrix)

In [None]:
# Compile and train. train_model returns the object it was given, so model_1
# and model_1_arch refer to the same trained model.
model_1 = train_model(train_text, train_labels, validate_text, validate_labels, model_1_arch)

In [None]:
# Store the model's score for each validation comment in a column named after
# the model. The labels were one-hot encoded from the boolean target, so output
# column 1 is the score for target == True.
MODEL_NAME = 'model_1'
validate_df[MODEL_NAME] = model_1.predict(pad_text(validate_df[TEXT_COLUMN], tokenizer))[:, 1]

In [None]:
# Per-identity-subgroup AUCs on the validation split, sorted so the subgroup
# with the lowest subgroup AUC comes first.
bias_metrics_df = compute_bias_metrics_for_model(validate_df, identity_columns, MODEL_NAME, TOXICITY_COLUMN)
bias_metrics_df

In [None]:
# Single summary score: weighted blend of the overall validation AUC and the
# power-mean-averaged bias AUCs.
print(get_final_metric(bias_metrics_df, calculate_overall_auc(validate_df, MODEL_NAME)))

# Predictions

In [None]:
# Load the test comments and the sample submission (indexed by 'id') that will
# be filled with predictions.
test = pd.read_csv('../input/jigsaw-unintended-bias-in-toxicity-classification/test.csv')
submission = pd.read_csv('../input/jigsaw-unintended-bias-in-toxicity-classification/sample_submission.csv', index_col='id')

In [None]:
# Score the test comments (output column 1, as for validation) and write the
# submission file.
# NOTE(review): the predictions are assigned by position, so this assumes
# test.csv and sample_submission.csv list the ids in the same order — confirm.
submission['prediction'] = model_1.predict(pad_text(test[TEXT_COLUMN], tokenizer))[:, 1]
submission.to_csv('submission.csv')