In [1]:
#basics
import pandas as pd 
import numpy as np

#misc
import gc
import time
import warnings

#nlp 
import re   
import nltk 
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer #Tokenizer for preprocessing
preprop_tokenizer = RegexpTokenizer(r'\w+')
from sklearn.model_selection import train_test_split
from nltk.stem.wordnet import WordNetLemmatizer 
from nltk.tokenize import word_tokenize
# TweetTokenizer does not split at apostrophes, which is what we want
from nltk.tokenize import TweetTokenizer  
from collections import defaultdict, Counter

# WordNet-based lemmatizer shared by the cleaning function
lem = WordNetLemmatizer()
# Tokenizer shared by the cleaning function
tokenizer=TweetTokenizer()

#settings
# Timestamp taken when this cell runs
start_time=time.time()
# Silence all warnings for the rest of the session
warnings.filterwarnings("ignore")
# English stop words, stored as a set for membership tests
eng_stopwords = set(stopwords.words("english"))


%matplotlib inline

In [2]:
# Load the raw training data from the working directory.
# Later cells assume a `comment_text` column followed by the six toxicity label columns.
data = pd.read_csv('train.csv')

## Pre-Processing

In [3]:
# Peek at the first few raw comments before any cleaning is applied
data['comment_text'].head()

0    Explanation\nWhy the edits made under my usern...
1    D'aww! He matches this background colour I'm s...
2    Hey man, I'm really not trying to edit war. It...
3    "\nMore\nI can't make any real suggestions on ...
4    You, sir, are my hero. Any chance you remember...
Name: comment_text, dtype: object

In [4]:
#https://drive.google.com/file/d/0B1yuv8YaUVlZZ1RzMFJmc1ZsQmM/view
appos = {
"aren't" : "are not",
"can't" : "cannot",
"couldn't" : "could not",
"didn't" : "did not",
"doesn't" : "does not",
"don't" : "do not",
"hadn't" : "had not",
"hasn't" : "has not",
"haven't" : "have not",
"he'd" : "he would",
"he'll" : "he will",
"he's" : "he is",
"i'd" : "I would",
"i'd" : "I had",
"i'll" : "I will",
"i'm" : "I am",
"isn't" : "is not",
"it's" : "it is",
"it'll":"it will",
"i've" : "I have",
"let's" : "let us",
"mightn't" : "might not",
"mustn't" : "must not",
"shan't" : "shall not",
"she'd" : "she would",
"she'll" : "she will",
"she's" : "she is",
"shouldn't" : "should not",
"that's" : "that is",
"there's" : "there is",
"they'd" : "they would",
"they'll" : "they will",
"they're" : "they are",
"they've" : "they have",
"we'd" : "we would",
"we're" : "we are",
"weren't" : "were not",
"we've" : "we have",
"what'll" : "what will",
"what're" : "what are",
"what's" : "what is",
"what've" : "what have",
"where's" : "where is",
"who'd" : "who would",
"who'll" : "who will",
"who're" : "who are",
"who's" : "who is",
"who've" : "who have",
"won't" : "will not",
"wouldn't" : "would not",
"you'd" : "you would",
"you'll" : "you will",
"you're" : "you are",
"you've" : "you have",
"'re": " are",
"wasn't": "was not",
"we'll":" will",
"didn't": "did not"
}

In [5]:
def clean(comment):
    """
    Normalise one raw comment and return it as a single cleaned string.

    Steps, in order: lower-case, replace newlines with spaces, strip
    IPv4-looking addresses and wiki-style ``[[...]]`` links, tokenize,
    expand contractions via ``appos``, lemmatize each word as a verb,
    drop English stop words, and finally collapse every run of
    non-word characters into a single space.

    Parameters
    ----------
    comment : str
        The raw comment text.

    Returns
    -------
    str
        Space-separated cleaned words without leading/trailing whitespace.
    """
    # Convert to lower case, so that Hi and hi are the same
    comment = comment.lower()
    # Replace newlines with spaces
    comment = re.sub(r"\n", " ", comment)
    # Remove leaky elements like IP addresses
    comment = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", "", comment)
    # Remove [[...]] user links. The match is non-greedy: a greedy ".*" would
    # delete everything between the first "[[" and the last "]" of the comment.
    comment = re.sub(r"\[\[.*?\]+", "", comment)

    # Split the sentences into words
    tokens = tokenizer.tokenize(comment)

    # Apostrophe replacement, e.g. you're --> you are.
    # Expansions are lower-cased and split into separate words so that each
    # word goes through lemmatization and stop-word removal like any other.
    words = []
    for token in tokens:
        if token in appos:
            words.extend(appos[token].lower().split())
        else:
            words.append(token)
    words = [lem.lemmatize(word, "v") for word in words]
    # Remove stop words
    words = [w for w in words if w not in eng_stopwords]

    clean_sent = " ".join(words)
    # Collapse every run of non-word characters (spaces included) into one
    # space, so no double spaces can remain afterwards
    clean_sent = re.sub(r"\W+", " ", clean_sent)

    return clean_sent.strip()

In [6]:
# Clean every comment; the column is selected by name rather than through a
# positional slice, and `clean` is passed directly instead of via a lambda.
data['comment_text'] = data['comment_text'].apply(clean)

### Split into train, validation, and test sets using stratified sampling and oversampling

In [7]:
def get_train_val_test_sizes(data_size, val_ratio, test_ratio):
    """
    Work out how many rows go to each of the three splits.

    The validation and test counts are the truncated products of
    ``data_size`` with their ratio; training receives whatever is left.

    Returns
    -------
    tuple
        ``(train_size, val_size, test_size)``.
    """
    n_val, n_test = (int(data_size * ratio) for ratio in (val_ratio, test_ratio))
    return data_size - n_val - n_test, n_val, n_test

In [8]:
def create_sub_sample(df, index_list, n_duplicates):
    """
    Return the rows of ``df`` whose index is in ``index_list``, repeated
    ``n_duplicates`` times.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame.
    index_list : list-like
        Index labels of the rows to select.
    n_duplicates : int
        Number of stacked copies of the selection. A value of zero or less
        yields an empty frame with the same columns.

    Returns
    -------
    pandas.DataFrame
        The stacked copies; original index labels are kept, so they repeat.
    """
    subset_df = df[df.index.isin(index_list)]
    if n_duplicates <= 0:
        # pd.concat raises ValueError on an empty list of frames
        return subset_df.iloc[0:0]
    return pd.concat([subset_df] * n_duplicates, axis=0, ignore_index=False)

In [9]:
def get_combo_labels_dict(df):
    """
    Bucket the rows of ``df`` by label combination.

    ``df`` must carry the six label columns and a ``toxic_count`` column.
    Each bucket holds the index of its matching rows, shuffled with a
    fixed seed (``random_state=9``).

    Returns
    -------
    defaultdict(list)
        Bucket name -> shuffled pandas Index, in insertion order.
    """
    partner_labels = ["severe_toxic", "obscene", "threat", "insult", "identity_hate"]
    n_flags = df["toxic_count"]

    def shuffled_index(mask):
        # Rows selected by ``mask`` in a reproducible random order
        return df[mask].sample(frac=1, random_state=9).index

    # Create different groups of toxic comments
    groups = defaultdict(list)
    groups["clean"] = shuffled_index(n_flags == 0)
    groups["combo_6_toxic"] = shuffled_index(n_flags == 6)
    groups["toxic"] = shuffled_index((n_flags == 1) & (df["toxic"] == 1))
    groups["combo_2_obscene_and_insult"] = shuffled_index(
        (n_flags == 2) & (df["obscene"] == 1) & (df["insult"] == 1)
    )
    groups["combo_2_hate_and_insult"] = shuffled_index(
        (n_flags == 2) & (df["identity_hate"] == 1) & (df["insult"] == 1)
    )
    groups["combo_3_obscene_and_insult"] = shuffled_index(
        (n_flags == 3) & (df["obscene"] == 1) & (df["insult"] == 1)
    )
    groups["combo_4_wo_threat"] = shuffled_index((n_flags == 4) & (df["threat"] == 0))
    # "toxic" paired with exactly one other label
    for partner in partner_labels:
        groups[f"combo_2_toxic_and_{partner}"] = shuffled_index(
            (n_flags == 2) & (df[partner] == 1) & (df["toxic"] == 1)
        )
    return groups

In [10]:
def print_labels_combo_dict(labels_idx_dict):
    """
    Print one line per group: the group name left-justified to 40
    characters, a space, then the number of indices in that group.
    """
    for group_name in labels_idx_dict:
        n_rows = len(labels_idx_dict[group_name])
        print("{} {}".format(group_name.ljust(40), n_rows))

In [11]:
def _split_shuffled_index(index_list, val_ratio, test_ratio):
    """Cut an already-shuffled index into consecutive train / val / test slices."""
    train_size, val_size, _ = get_train_val_test_sizes(len(index_list), val_ratio, test_ratio)
    val_end = train_size + val_size
    # The test slice is taken from ``val_end`` onwards instead of with
    # ``[-test_size:]``: when test_size is 0 the negative slice becomes
    # ``[0:]`` and would hand the whole group to the test set.
    return index_list[:train_size], index_list[train_size:val_end], index_list[val_end:]


def _oversampling_factor(portion_size, target_size):
    """Number of copies needed to bring ``portion_size`` rows up to about ``target_size``."""
    if 0 < portion_size < target_size:
        return target_size // portion_size
    # Already large enough, or empty (avoids dividing by zero)
    return 1


def generate_stratified_train_val_test(df, val_ratio, test_ratio):
    """
    Split ``df`` into train / validation / test sets, stratified by label
    combination, and oversample the rarer combinations in train and validation.

    Every group returned by ``get_combo_labels_dict`` (plus one group holding
    all rows that fall in none of them) is split separately with the given
    ratios. Groups named in the oversampling list, and the remainder group,
    get extra duplicated rows appended to the train and validation sets.
    The test set is never oversampled.

    Side effects: adds (or overwrites) a ``toxic_count`` column on ``df``
    and prints the group sizes and final dataset sizes.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the six label columns listed in ``labels``.
    val_ratio, test_ratio : float
        Fraction of each group assigned to validation and test.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, val, test)``, each shuffled with ``random_state=9``.
    """
    labels = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
    OVERSAMPLING_TRAIN_MAX = 5000
    OVERSAMPLING_VAL_MAX = 500
    # Count the positive labels per row. Summing the label columns by name
    # (rather than "every column from position 2") keeps the result correct
    # when this function is re-run on a frame that already has `toxic_count`.
    df["toxic_count"] = df[labels].sum(axis=1)

    labels_idx_dict = get_combo_labels_dict(df)
    print(f"\n{' ' * 10 } DATA DISTRIBUTION")

    over_sampling_group_list = ["combo_6_toxic", "combo_2_obscene_and_insult", "combo_2_hate_and_insult", "combo_3_obscene_and_insult", "combo_2_toxic_and_severe_toxic", "combo_2_toxic_and_obscene", "combo_2_toxic_and_threat", "combo_2_toxic_and_insult", "combo_2_toxic_and_identity_hate", "combo_4_wo_threat"]
    train_indices, val_indices, test_indices, total_index_list = [], [], [], []
    train_df_subset_list, val_df_subset_list = [], []

    def add_group(index_list, oversample):
        # Split one group and record its indices (and optional duplicates)
        train_portion, val_portion, test_portion = _split_shuffled_index(index_list, val_ratio, test_ratio)

        train_indices.extend(train_portion)
        val_indices.extend(val_portion)
        test_indices.extend(test_portion)

        if oversample:
            train_n_duplicates = _oversampling_factor(len(train_portion), OVERSAMPLING_TRAIN_MAX)
            train_df_subset_list.append(create_sub_sample(df, train_portion, train_n_duplicates))
            val_n_duplicates = _oversampling_factor(len(val_portion), OVERSAMPLING_VAL_MAX)
            val_df_subset_list.append(create_sub_sample(df, val_portion, val_n_duplicates))

    for key_name, index_list in labels_idx_dict.items():
        print(f"{key_name.ljust(40,' ' )} {len(index_list)}")
        total_index_list.extend(index_list)
        add_group(index_list, key_name in over_sampling_group_list)

    # Get the remaining group of data: rows that belong to none of the groups above
    other_toxic_indices = df[~df.index.isin(total_index_list)].sample(frac=1,random_state=9).index
    key_name = "remaing_toxic"
    print(f"{key_name.ljust(40,' ' )} {len(other_toxic_indices)}")
    # The remainder group is always oversampled
    add_group(other_toxic_indices, True)

    # Create the datasets from the indices
    train_df = df[df.index.isin(train_indices)]
    val_df = df[df.index.isin(val_indices)]
    test_df = df[df.index.isin(test_indices)]

    # Every row must land in exactly one split
    assert len(df) == len(train_df) + len(val_df) + len(test_df)

    # Add oversampling data on top of the base train / val sets
    train_df_subset_list.append(train_df)
    combined_train_df = pd.concat(train_df_subset_list, axis = 0, ignore_index = False)
    val_df_subset_list.append(val_df)
    combined_val_df = pd.concat(val_df_subset_list, axis = 0, ignore_index = False)

    print(f"\n{' ' * 10 } TRAIN DATA DISTRIBUTION")
    labels_idx_dict = get_combo_labels_dict(combined_train_df)
    print_labels_combo_dict(labels_idx_dict)

    print(f"\n{' ' * 10 } DATASET")

    print(f"TRAIN SIZE   {len(combined_train_df)}")
    print(f"VAL SIZE     {len(combined_val_df)}")
    print(f"TEST SIZE    {len(test_df)}")

    return combined_train_df.sample(frac=1,random_state=9), combined_val_df.sample(frac=1,random_state=9), test_df.sample(frac=1,random_state=9)

In [12]:
# 10% of each group goes to validation, 10% to test, the remainder to training
train_df, val_df, test_df = generate_stratified_train_val_test(data,val_ratio=0.1,test_ratio=0.1)


           DATA DISTRIBUTION
clean                                    143346
combo_6_toxic                            31
toxic                                    5666
combo_2_obscene_and_insult               181
combo_2_hate_and_insult                  28
combo_3_obscene_and_insult               3820
combo_4_wo_threat                        1620
combo_2_toxic_and_severe_toxic           41
combo_2_toxic_and_obscene                1758
combo_2_toxic_and_threat                 113
combo_2_toxic_and_insult                 1215
combo_2_toxic_and_identity_hate          136
remaing_toxic                            1616

           TRAIN DATA DISTRIBUTION
clean                                    114678
combo_6_toxic                            5025
toxic                                    4534
combo_2_obscene_and_insult               5075
combo_2_hate_and_insult                  5016
combo_3_obscene_and_insult               6112
combo_4_wo_threat                        5184
combo_2_toxic_and_s

In [13]:
# Write the training split as CSV to a file named 'train_df' (no extension) in the working directory
train_df.to_csv('train_df')

In [14]:
# Write the validation split as CSV to a file named 'val_df' (no extension) in the working directory
val_df.to_csv('val_df')

In [15]:
# Write the test split as CSV to a file named 'test_df' (no extension) in the working directory
test_df.to_csv('test_df')