In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    # Print the full path of every file found directly inside this directory.
    for path in map(lambda name: os.path.join(dirname, name), filenames):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 1. Imports

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC 
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import precision_recall_curve, auc, roc_curve, roc_auc_score
from tqdm import tqdm
import pandas as pd
import numpy as np
from tqdm import tqdm
import re
import matplotlib.pyplot as plt
import pickle

# 2. Import data

In [None]:
train = pd.read_csv("../input/jigsaw-unintended-bias-in-toxicity-classification/train.csv")

In [None]:
# Identity subgroup columns. Together with 'target' they are thresholded to
# booleans by convert_dataframe_to_bool, and they are the subgroups passed to
# the bias-metric computation at the end of the notebook.
IDENTITY_COLUMNS = [
    'male', 'female', 'homosexual_gay_or_lesbian', 'christian', 'jewish',
    'muslim', 'black', 'white', 'psychiatric_or_mental_illness']

In [None]:
def convert_to_bool(df, col_name):
    """Binarize ``df[col_name]`` in place.

    Values greater than or equal to 0.5 become True; everything else
    (including NaN, which fails the comparison) becomes False.
    """
    is_positive = df[col_name] >= 0.5
    df[col_name] = np.where(is_positive, True, False)
    
def convert_dataframe_to_bool(df):
    """Return a copy of ``df`` with 'target' and every identity column binarized.

    The input frame is left untouched; the thresholding itself is delegated
    to ``convert_to_bool``.
    """
    result = df.copy()
    columns_to_binarize = ['target', *IDENTITY_COLUMNS]
    for name in columns_to_binarize:
        convert_to_bool(result, name)
    return result

train = convert_dataframe_to_bool(train)

# 3. Cleaning of comment text

In [None]:
def decontracted(phrase):
    """Expand common English contractions in ``phrase``.

    Matching is case-insensitive, so "Won't" and "CAN'T" are expanded the
    same way as their lowercase forms (a case-sensitive match turned them
    into "Wo not" / "CA not"). Typographic apostrophes (U+2019) are treated
    like ASCII apostrophes.

    Args:
        phrase: the text to expand.

    Returns:
        The text with contractions replaced by their expanded forms. The
        replacement text is always lowercase.
    """
    # Fold curly apostrophes to ASCII so the rules below apply to them too.
    phrase = phrase.replace("\u2019", "'")

    # Order matters: the specific rules must run before the generic "n't" rule.
    rules = (
        # specific
        (r"won't", "will not"),
        (r"can't", "can not"),
        # general
        (r"n't", " not"),
        (r"'re", " are"),
        (r"'s", " is"),
        (r"'d", " would"),
        (r"'ll", " will"),
        (r"'t", " not"),
        (r"'ve", " have"),
        (r"'m", " am"),
    )
    for pattern, expansion in rules:
        phrase = re.sub(pattern, expansion, phrase, flags=re.IGNORECASE)
    return phrase

# Stop words removed by preprocess_text (matched against lowercased tokens).
# NOTE(review): preprocess_text replaces every non-alphanumeric character with a
# space before filtering, so the entries that contain an apostrophe ("you're",
# "don't", ...) can never match a token there; only their fragments can.
stopwords= ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've",\
            "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', \
            'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their',\
            'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', \
            'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', \
            'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', \
            'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after',\
            'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further',\
            'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',\
            'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very', \
            's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', \
            've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn',\
            "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn',\
            "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", \
            'won', "won't", 'wouldn', "wouldn't"]


def preprocess_text(text_data):
    '''Clean a collection of raw comments.

    Each comment has its contractions expanded, escaped line-break and quote
    sequences and all non-alphanumeric characters replaced by spaces, stop
    words removed, and is then lowercased and stripped.

    Args:
        text_data: iterable of comment strings.

    Returns:
        A list of cleaned strings, one per input comment, in input order.
    '''
    # Set membership is O(1); scanning the stop-word list for every token of
    # every comment was a needless linear search.
    stopword_set = frozenset(stopwords)
    preprocessed_text = []
    # tqdm is for printing the status bar
    for sentence in tqdm(text_data):
        sent = decontracted(sentence)
        # These are literal two-character sequences (a backslash followed by
        # r, n or a double quote), not control characters; real line breaks
        # are removed by the regex below.
        for escaped in ('\\r', '\\n', '\\"'):
            sent = sent.replace(escaped, ' ')
        sent = re.sub('[^A-Za-z0-9]+', ' ', sent)
        # https://gist.github.com/sebleier/554280
        sent = ' '.join(tok for tok in sent.split() if tok.lower() not in stopword_set)
        preprocessed_text.append(sent.lower().strip())
    return preprocessed_text

In [None]:
train['comment_text'] = preprocess_text(train['comment_text'].values)

# 4. Train-Test split (Main split 80-20 | Then split 80 in equal 40-40)

In [None]:
# y = train['target'].values
# X = train.drop(['target'], axis=1)
# Fixed seed so the 80/20 split, and every result derived from it, is
# reproducible across kernel restarts.
train_df, test_df = train_test_split(train, test_size=0.20, random_state=42)

In [None]:
# Fixed seed: d1 trains the base models and d2 trains the meta-model, so the
# two halves must be the same on every run.
train_d1, train_d2 = train_test_split(train_df, test_size=0.50, random_state=42)

In [None]:
print("Number of points in d1",len(train_d1))
# The second line reports train_d2, so label it d2 (it previously said d1).
print("Number of points in d2",len(train_d2))

# 5. Create k more datasets (sampling with replacement) from train_d1

In [None]:
# Number of bootstrap samples drawn from train_d1; each base-model family
# (Naive Bayes, Logistic Regression, Decision Tree) trains one model per sample.
K=50

In [None]:
train_d1.head()

In [None]:
# Draw K bootstrap samples (2% of train_d1 each, with replacement). Each
# sample gets its own seed so the samples differ from one another but are
# identical on every run.
sample_list = [
    train_d1.sample(frac=0.02, replace=True, random_state=seed)
    for seed in range(K)
]

In [None]:
print('distributions of target is as follows:')
for i in range(0, K):
    # 'target' is boolean, so its mean is the fraction of True rows. Indexing
    # value_counts() by position mislabels the classes (it sorts by frequency,
    # not by label) and raises IndexError when a sample holds a single class.
    true_fraction = sample_list[i]['target'].mean()
    print('Sample', i, ': False fraction =', 1 - true_fraction,
          '| True fraction =', true_fraction)


In [None]:
sample_list[0].shape

# 6. Vectorize comment text on train_d1 and use the same vectorizer all across to avoid data leaking

In [None]:
comment_vectorizer = TfidfVectorizer(ngram_range=(1,2), max_features=10000)
comment_vectorizer.fit(train_d1['comment_text'].values) # fit has to happen only on train data

#---------------------------------------------------------------------------------------------------
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when available and fall back for older installations.
if hasattr(comment_vectorizer, 'get_feature_names_out'):
    comment_tfidf_features = list(comment_vectorizer.get_feature_names_out()) #We will need it later
else:
    comment_tfidf_features = comment_vectorizer.get_feature_names() #We will need it later
#---------------------------------------------------------------------------------------------------

# we use the fitted TfidfVectorizer to convert the text to vectors
# X_train_comment_tfidf = vectorizer.transform(X_train['preprocessed_comment_text'].values)
# X_test_comment_tfidf = vectorizer.transform(X_test['preprocessed_comment_text'].values)
# test_comment_tfidf = vectorizer.transform(test_data['preprocessed_comment_text'].values)

# print("After vectorizations")
# print(X_train_comment_tfidf.shape, y_train.shape)
# print(X_test_comment_tfidf.shape, y_test.shape)
# print(test_comment_tfidf.shape)
# print("="*100)


# 7. Create 'm' models, and train each of them on all k sample datasets above

In [None]:
# Model 1 : Naive Bayes
# Model 2 : LR
# Model 3 : Decision Tree

> Vectorize comment text for all samples

> sample_list[i] is the i'th sample

In [None]:
# Vectorize every bootstrap sample with the shared TF-IDF vectorizer and keep
# the matching labels in a parallel list.
X_train_sample = [
    comment_vectorizer.transform(sample_list[i]['comment_text'].values)
    for i in range(K)
]
y_train_sample = [sample_list[i]['target'] for i in range(K)]

In [None]:
#Check sizes
# One line per sample: feature-matrix shape next to label shape.
for features, labels in zip(X_train_sample[:K], y_train_sample[:K]):
    print(features.shape, labels.shape)

### 7.1. Naive Bayes 

> Train Naive Bayes on all samples

> nb_sample_i means Naive Bayes model for sample i

In [None]:
def train_naive_bayes(train_data, target_data):
    """Tune a MultinomialNB with 5-fold grid search and return the best model.

    The grid covers the smoothing parameter ``alpha`` and several
    ``class_prior`` settings; candidates are ranked by ROC AUC.

    Args:
        train_data: feature matrix passed to ``GridSearchCV.fit``.
        target_data: labels aligned with ``train_data``.

    Returns:
        The best estimator found, refit on all of ``train_data``.
    """
    alphas = [0.0001, 0.001, 0.01, 0.1, 1, 10]
    p_grid_NB = {'alpha': alphas, 'class_prior' : [None, [.1,.9],[.2, .8],[.5, .5],[.7 , .3]]}
    # refit=True (the default, spelled out here) already retrains the winning
    # configuration on the full data, so a second fit() call is redundant.
    # Train scores were never read, so they are no longer computed.
    grid = GridSearchCV(estimator=MultinomialNB(), param_grid=p_grid_NB,
                        scoring='roc_auc', cv=5, refit=True, verbose=0)
    grid.fit(train_data, target_data)
    return grid.best_estimator_

In [None]:
%%time
print("Training...")
# One tuned Naive Bayes model per bootstrap sample.
nb_model_lists = [
    train_naive_bayes(X_train_sample[i], y_train_sample[i])
    for i in tqdm(range(0, K))
]
print("Training done..")

### 7.2. Logistic Regression

In [None]:
def train_lr(train_data, target_data):
    """Tune a liblinear LogisticRegression with 5-fold grid search.

    ``C`` is searched over powers of ten from 1e-5 to 1e1; candidates are
    ranked by ROC AUC.

    Args:
        train_data: feature matrix passed to ``GridSearchCV.fit``.
        target_data: labels aligned with ``train_data``.

    Returns:
        The best estimator found, refit on all of ``train_data``.
    """
    p_grid_LR = {'C': [10 ** x for x in range(-5, 2)]}
    # refit=True (the default, spelled out here) already retrains the winning
    # configuration on the full data, so a second fit() call is redundant.
    # Train scores were never read, so they are no longer computed.
    grid = GridSearchCV(estimator=LogisticRegression(solver='liblinear'),
                        param_grid=p_grid_LR, scoring='roc_auc', cv=5,
                        refit=True, verbose=0)
    grid.fit(train_data, target_data)
    return grid.best_estimator_

In [None]:
%%time
print("Training...")
# One tuned Logistic Regression model per bootstrap sample.
lr_model_lists = [
    train_lr(X_train_sample[i], y_train_sample[i])
    for i in tqdm(range(0, K))
]
print("Training done..")

### 7.3. Decision Tree

In [None]:
def train_dt(train_data, target_data):
    """Tune a class-balanced DecisionTreeClassifier with 5-fold grid search.

    ``max_depth`` and ``min_samples_split`` are searched; candidates are
    ranked by ROC AUC.

    Args:
        train_data: feature matrix passed to ``GridSearchCV.fit``.
        target_data: labels aligned with ``train_data``.

    Returns:
        The best estimator found, refit on all of ``train_data``.
    """
    p_grid_DT = {'max_depth': [1, 5, 10, 50], 'min_samples_split' : [5, 10, 100, 500],\
                  'class_weight': ['balanced']}
    # A fixed random_state makes the fitted tree repeatable for the same input
    # sample. refit=True (the default, spelled out here) already retrains the
    # winner on the full data, so a second fit() call is redundant, and the
    # unused train scores are no longer computed.
    grid = GridSearchCV(estimator=DecisionTreeClassifier(random_state=42),
                        param_grid=p_grid_DT, scoring='roc_auc', cv=5,
                        refit=True, verbose=0)
    grid.fit(train_data, target_data)
    return grid.best_estimator_

In [None]:
%%time
print("Training...")
# One tuned Decision Tree model per bootstrap sample.
dt_model_lists = [
    train_dt(X_train_sample[i], y_train_sample[i])
    for i in tqdm(range(0, K))
]
print("Training done..")

> Now we have 150 models. dt_model_lists, lr_model_lists, nb_model_lists

> We need to get predictions of each of these models for dataset train_d2

# 8. Make predictions on train_d2

In [None]:
#Let's first preprocess train_d2
X_train_d2=comment_vectorizer.transform(train_d2['comment_text'].values)
y_train_d2=train_d2['target']

In [None]:
X_train_d2.shape

In [None]:
#Get the best estimators we have the list
#In lists you have all the best models trained.
#Predict proba on X_train_d2, the o/p will be the new dataset col.

# Collect the positive-class probability of every base model first and attach
# all 3*K columns in one step. Inserting them one at a time into train_d2
# (which is derived from train_df by a split) can trigger SettingWithCopyWarning
# and leaves the frame heavily fragmented.
# Column order is nb0..nb<K-1>, lr0..lr<K-1>, dt0..dt<K-1>.
train_d2_meta_features = {}
for prefix, models in (('nb', nb_model_lists), ('lr', lr_model_lists), ('dt', dt_model_lists)):
    for i in tqdm(range(0, K)):
        train_d2_meta_features[prefix + str(i)] = models[i].predict_proba(X_train_d2)[:, 1]
train_d2 = train_d2.assign(**train_d2_meta_features)

In [None]:
train_d2.head()

In [None]:
train_d2.columns

In [None]:
train_d1.columns
#We can remove these columns from train_d2, and save rest of the df to train a new model

In [None]:
# Keep only the label and the base-model probability columns (nb<i>, lr<i>,
# dt<i>). Selecting what is needed avoids a hard-coded list of every raw
# column, which raises KeyError as soon as the input schema differs and lets
# any unexpected extra column leak into the meta-features.
train_meta_feature_cols = [
    col for col in train_d2.columns if re.fullmatch(r'(nb|lr|dt)\d+', col)
]
new_dataset = train_d2[['target'] + train_meta_feature_cols]

In [None]:
new_dataset.head()

In [None]:
new_dataset.to_csv('metamodel_dataset.csv',index=True)

In [None]:
metamodel_data=pd.read_csv('./metamodel_dataset.csv')

In [None]:
metamodel_data.head()

# 9. Let's train an LR on metamodel data

In [None]:
metamodel_data.columns

In [None]:
y_metamodel = metamodel_data['target']
X_metamodel = metamodel_data.drop(columns=['Unnamed: 0', 'target'])

In [None]:
# x_cfl=XGBClassifier()

# prams={
#     'learning_rate':[0.01,0.03,0.05],
#      'n_estimators':[100,200,500],
#      'max_depth':[3,5]
# }
# random_cfl=RandomizedSearchCV(x_cfl,param_distributions=prams,verbose=10,n_jobs=-1,)
# random_cfl.fit(X_metamodel, y_metamodel)

In [None]:
# Candidate inverse-regularization strengths: 1e-5, 1e-4, ..., 1e1.
C = [10 ** x for x in range(-5, 2)]
p_grid_LR = {'C': C}

LR_model= LogisticRegression(solver='liblinear')

# 5-fold grid search over C on the meta-features, ranked by ROC AUC.
grid = GridSearchCV(estimator = LR_model, param_grid = p_grid_LR, scoring = 'roc_auc', cv = 5, return_train_score=True)
grid.fit(X_metamodel, y_metamodel)

In [None]:
grid.best_estimator_

In [None]:
#Train the best estimator, name it as our metamodel
# Use the C value the grid search actually selected; a hard-coded value
# silently goes stale whenever the data, the split or the samples change.
metamodel = LogisticRegression(C=grid.best_params_['C'], solver='liblinear')
metamodel.fit(X_metamodel, y_metamodel)

# 10. Metamodel is ready, prepare test_df data to get predictions

In [None]:
X_test_df = comment_vectorizer.transform(test_df['comment_text'].values)
#Pass this X_test_df to get predictions from all 150 models, 
#create a new X_test, pass it to metamodel

In [None]:
# A real copy: plain assignment only creates a second name for the same frame,
# so the columns added below would also appear in test_df.
test_df_copy = test_df.copy()

In [None]:
# Collect the positive-class probability of every base model first and attach
# all 3*K columns in one step instead of inserting them one at a time.
# Column order is nb0..nb<K-1>, lr0..lr<K-1>, dt0..dt<K-1>, matching the
# meta-model's training features.
test_df_meta_features = {}
for prefix, models in (('nb', nb_model_lists), ('lr', lr_model_lists), ('dt', dt_model_lists)):
    for i in tqdm(range(0, K)):
        test_df_meta_features[prefix + str(i)] = models[i].predict_proba(X_test_df)[:, 1]
test_df_copy = test_df_copy.assign(**test_df_meta_features)

In [None]:
test_df_copy.shape

In [None]:
# Keep only the label and the base-model probability columns (nb<i>, lr<i>,
# dt<i>). Selecting what is needed avoids a hard-coded list of every raw
# column, which raises KeyError as soon as the input schema differs and lets
# any unexpected extra column leak into the meta-features.
test_meta_feature_cols = [
    col for col in test_df_copy.columns if re.fullmatch(r'(nb|lr|dt)\d+', col)
]
dataset_for_metamodel = test_df_copy[['target'] + test_meta_feature_cols]

In [None]:
dataset_for_metamodel.columns

In [None]:
# Held-out labels and meta-features used to score the meta-model on test_df.
# NOTE: this rebinds y_metamodel / X_metamodel, which until this cell held the
# meta-model's training data.
y_metamodel = dataset_for_metamodel['target']
X_metamodel = dataset_for_metamodel.drop(columns='target')

In [None]:
y_meta_pred = metamodel.predict_proba(X_metamodel)

In [None]:
y_meta_pred =np.argmax(y_meta_pred,axis=1)

# 11. Evaluation of final metric

In [None]:
# ROC AUC ranks examples by score, so evaluate the positive-class probability
# itself. Thresholded 0/1 predictions collapse the ranking to two levels and
# understate every AUC computed below.
test_df = test_df.assign(metamodel_lr=metamodel.predict_proba(X_metamodel)[:, 1])

In [None]:
SUBGROUP_AUC = 'subgroup_auc'
BPSN_AUC = 'bpsn_auc'  # stands for background positive, subgroup negative
BNSP_AUC = 'bnsp_auc'  # stands for background negative, subgroup positive

def compute_auc(y_true, y_pred):
    """Return the ROC AUC of ``y_pred`` against ``y_true``.

    If ``roc_auc_score`` raises ValueError, NaN is returned instead of
    propagating the error.
    """
    try:
        score = roc_auc_score(y_true, y_pred)
    except ValueError:
        score = np.nan
    return score

def compute_subgroup_auc(df, subgroup, label, model_name):
    """AUC restricted to the rows where the boolean ``subgroup`` column is True."""
    in_subgroup = df.loc[df[subgroup]]
    y_true, y_score = in_subgroup[label], in_subgroup[model_name]
    return compute_auc(y_true, y_score)

def compute_bpsn_auc(df, subgroup, label, model_name):
    """Computes the AUC of the within-subgroup negative examples and the background positive examples.

    Args:
        df: frame with boolean ``subgroup`` and ``label`` columns and a
            ``model_name`` score column.
        subgroup: name of the boolean subgroup column.
        label: name of the boolean label column.
        model_name: name of the column holding the model's scores.

    Returns:
        Whatever ``compute_auc`` returns for the combined examples.
    """
    subgroup_negative_examples = df[df[subgroup] & ~df[label]]
    non_subgroup_positive_examples = df[~df[subgroup] & df[label]]
    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
    examples = pd.concat([subgroup_negative_examples, non_subgroup_positive_examples])
    return compute_auc(examples[label], examples[model_name])

def compute_bnsp_auc(df, subgroup, label, model_name):
    """Computes the AUC of the within-subgroup positive examples and the background negative examples.

    Args:
        df: frame with boolean ``subgroup`` and ``label`` columns and a
            ``model_name`` score column.
        subgroup: name of the boolean subgroup column.
        label: name of the boolean label column.
        model_name: name of the column holding the model's scores.

    Returns:
        Whatever ``compute_auc`` returns for the combined examples.
    """
    subgroup_positive_examples = df[df[subgroup] & df[label]]
    non_subgroup_negative_examples = df[~df[subgroup] & ~df[label]]
    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
    examples = pd.concat([subgroup_positive_examples, non_subgroup_negative_examples])
    return compute_auc(examples[label], examples[model_name])

def compute_bias_metrics_for_model(dataset,
                                   subgroups,
                                   model,
                                   label_col,
                                   include_asegs=False):
    """Computes per-subgroup metrics for all subgroups and one model.

    Args:
        dataset: frame holding the boolean subgroup columns, the label column
            and the model's score column.
        subgroups: names of the boolean subgroup columns to evaluate.
        model: name of the column holding the model's scores.
        label_col: name of the boolean label column.
        include_asegs: accepted for interface compatibility; not used here.

    Returns:
        A DataFrame with one row per subgroup (name, size, subgroup AUC,
        BPSN AUC, BNSP AUC), sorted by subgroup AUC in ascending order.
    """
    columns = ['subgroup', 'subgroup_size', SUBGROUP_AUC, BPSN_AUC, BNSP_AUC]
    records = []
    for subgroup in subgroups:
        records.append({
            'subgroup': subgroup,
            'subgroup_size': len(dataset[dataset[subgroup]]),
            SUBGROUP_AUC: compute_subgroup_auc(dataset, subgroup, label_col, model),
            BPSN_AUC: compute_bpsn_auc(dataset, subgroup, label_col, model),
            BNSP_AUC: compute_bnsp_auc(dataset, subgroup, label_col, model),
        })
    # Passing the columns explicitly keeps the sort valid when `subgroups` is
    # empty; sorting by the constant keeps it in step with the record keys.
    return pd.DataFrame(records, columns=columns).sort_values(SUBGROUP_AUC, ascending=True)

In [None]:
def calculate_overall_auc(df, model_name):
    """ROC AUC of the ``model_name`` column against the ``TOXICITY_COLUMN`` labels.

    ``TOXICITY_COLUMN`` is read from module scope when the function is called.
    """
    return roc_auc_score(df[TOXICITY_COLUMN], df[model_name])

def power_mean(series, p):
    """Generalized (power) mean of ``series`` with exponent ``p``."""
    powered = np.power(series, p)
    mean_of_powers = sum(powered) / len(series)
    return np.power(mean_of_powers, 1 / p)

def get_final_metric(bias_df, overall_auc, POWER=-5, OVERALL_MODEL_WEIGHT=0.25):
    """Blend the overall AUC with the averaged per-subgroup bias AUCs.

    The bias score is the plain average of the power means (exponent
    ``POWER``) of the subgroup, BPSN and BNSP AUC columns of ``bias_df``.
    The result weights ``overall_auc`` by ``OVERALL_MODEL_WEIGHT`` and the
    bias score by the remainder.
    """
    per_metric_means = [
        power_mean(bias_df[column], POWER)
        for column in (SUBGROUP_AUC, BPSN_AUC, BNSP_AUC)
    ]
    bias_score = np.average(per_metric_means)
    overall_part = OVERALL_MODEL_WEIGHT * overall_auc
    bias_part = (1 - OVERALL_MODEL_WEIGHT) * bias_score
    return overall_part + bias_part

In [None]:
MODEL_NAME = 'metamodel_lr'
TOXICITY_COLUMN = 'target'
bias_metrics_df = compute_bias_metrics_for_model(test_df, IDENTITY_COLUMNS, MODEL_NAME, TOXICITY_COLUMN)
bias_metrics_df

In [None]:
get_final_metric(bias_metrics_df, calculate_overall_auc(test_df, MODEL_NAME))

# 12. Submission.csv 

In [None]:
test_data = pd.read_csv("../input/jigsaw-unintended-bias-in-toxicity-classification/test.csv")
submission_data = pd.read_csv("../input/jigsaw-unintended-bias-in-toxicity-classification/sample_submission.csv")

In [None]:
test_data['comment_text'] = preprocess_text(test_data['comment_text'].values)
X_test_submission = comment_vectorizer.transform(test_data['comment_text'].values)

In [None]:
# Collect the positive-class probability of every base model first and attach
# all 3*K columns in one step instead of inserting them one at a time.
# Column order is nb0..nb<K-1>, lr0..lr<K-1>, dt0..dt<K-1>, matching the
# meta-model's training features.
submission_meta_features = {}
for prefix, models in (('nb', nb_model_lists), ('lr', lr_model_lists), ('dt', dt_model_lists)):
    for i in tqdm(range(0, K)):
        submission_meta_features[prefix + str(i)] = models[i].predict_proba(X_test_submission)[:, 1]
test_data = test_data.assign(**submission_meta_features)

In [None]:
dataset_for_metamodel=test_data.drop(columns=['comment_text', 'id'])

In [None]:
y_meta_sub = metamodel.predict_proba(dataset_for_metamodel)

In [None]:
len(y_meta_sub[:,1])

In [None]:
submission_data['prediction']=y_meta_sub[:,1]

In [None]:
submission_data.head()

In [None]:
submission_data.to_csv('submission.csv', index=False)