*Hi, thank you for visiting this notebook. If you like any part of it, please upvote :)*

In [None]:
import re
import gc
import string
from tqdm import tqdm
from nltk import pos_tag
from nltk.corpus import wordnet, stopwords
from nltk.stem.wordnet import WordNetLemmatizer

import seaborn as sns
color = sns.color_palette()
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec 

import numpy as np 
import pandas as pd
from PIL import Image
from wordcloud import WordCloud, STOPWORDS

import spacy
from collections import Counter, defaultdict
import en_core_web_sm

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Dropout, SpatialDropout1D, Embedding, add, concatenate, Concatenate, Input
from tensorflow.keras.layers import LSTM, Dense, Bidirectional, GlobalMaxPooling1D, GlobalAveragePooling1D, GlobalMaxPool1D
from tensorflow.compat.v1.keras.layers import CuDNNLSTM

Now let's load train and test data to start with EDA and all.

In [None]:
train_data = pd.read_csv('/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip')
test_data = pd.read_csv('/kaggle/input/jigsaw-toxic-comment-classification-challenge/test.csv.zip')

<a id=eda ></a>
# EDA

In [None]:
train_data.head()

In [None]:
test_data.head()

In [None]:
train_data.info()

Check whether there are any null values.

In [None]:
train_data.isna().sum()

There are no null values :)

In [None]:
train_data.shape, test_data.shape

Check which category has most of comments and which has least,
basically check category counts

In [None]:
comments = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
comments_count = [train_data.toxic.sum(), train_data.severe_toxic.sum(), 
         train_data.obscene.sum(), train_data.threat.sum(), 
         train_data.insult.sum(), train_data.identity_hate.sum() ]

plt.bar(comments, comments_count, width=0.6)

* It turns out that the dataset is imbalanced. Most of the comments are in the toxic category, while threat, identity_hate and severe_toxic have far fewer.
* Note that a comment which is in severe_toxic will be in the toxic category too.

In [None]:
sum(comments_count)

Let's see where none of the labels (toxic, severe_toxic, obscene, threat, insult, identity_hate) is set for a given comment, i.e. how many comments are clean of toxicity/hate.

In [None]:
normal_comments = train_data.loc[ 
    (train_data["severe_toxic"]==0)
   &(train_data["toxic"]==0)
   &(train_data["obscene"]==0)
   &(train_data["threat"]==0)
   &(train_data["insult"]==0)
   &(train_data["identity_hate"]==0)
]

In [None]:
pd.set_option('display.max_colwidth', None)
normal_comments

In [None]:
print("There are {0} comments which are not toxic in the given categories".format(len(normal_comments)))

In [None]:
print("Summation of normal comments and toxic comments exceed the total number of comments we have {0}".format(len(normal_comments) + sum(comments_count)))

There are 143346 normal comments and the summation of toxic tags is 35098, which together exceed the number of rows; this means that some toxic comments lie in multiple categories.

Now let's check the comments which lie in multiple categories.

In [None]:
row_sum = train_data.iloc[:, 2:].sum(axis=1)
plt.bar(row_sum.value_counts().index, row_sum.value_counts().values)

Now let's check which categories overlap the most.
Note that a comment which lies in the severe_toxic category should belong to toxic as well.

In [None]:
df = pd.DataFrame()

In [None]:
# For every possible number of labels (1..6), flag the comments carrying exactly
# that many labels and report how many such comments exist.
for n_labels, word in enumerate(["unique", "two", "three", "four", "five", "six"], start=1):
    # The single-label column is called "unique"; the others are "<word>_cat".
    column = word if n_labels == 1 else word + "_cat"
    df[column] = (row_sum == n_labels)
    message = "no of comments which lie in {0} category/tagged with {0} category".format(word)
    print(message, df.loc[df[column]].shape[0])

In [None]:
del df
gc.collect()

In [None]:
pd.crosstab(train_data.loc[row_sum==3]["toxic"], 
[
    train_data.loc[row_sum==3]["threat"],
    train_data.loc[row_sum==3]["severe_toxic"],
    train_data.loc[row_sum==3]["identity_hate"],
    train_data.loc[row_sum==3]["obscene"], 
    train_data.loc[row_sum==3]["insult"],   
],
            rownames=["toxic"], 
            colnames=["threat", "severe_toxic", "identity_hate", "obscene", "insult"]
)

* The highest number of comments tagged with three categories belongs to the group (obscene, insult, toxic).
* The lowest number of comments tagged with three categories belongs to the group (threat, obscene, insult).

Let's explore the comments which are tagged with the threat category (alone or together with other categories).

In [None]:
threat_comments = train_data.loc[ (train_data["threat"]==1) ]

In [None]:
threat_comments

In [None]:
pd.set_option('display.max_colwidth', None)
pd.crosstab(threat_comments["threat"]==1, 
[
    threat_comments["toxic"]==1,
    threat_comments["severe_toxic"]==1,
    threat_comments["identity_hate"]==1,
    threat_comments["obscene"]==1, 
    threat_comments["insult"]==1,   
],
            rownames=["threat"], 
            colnames=["toxic", "severe_toxic", "identity_hate", "obscene", "insult"]
)

Most of the threat comments are also tagged with the (insult, obscene, toxic) categories.
There are also 22 comments which are tagged only as threat.

In [None]:
del threat_comments
gc.collect()

<a id="word_cloud"></a>
# Word Cloud

In [None]:
wc_stopwords=set(STOPWORDS)

In [None]:
normal_comments["comment_text"].values

In [None]:
normal_mask=np.array(Image.open("../input/wordcloud-mask/flower.png"))
wc_normal = WordCloud(max_words=4000, min_font_size=5,
                      mask=normal_mask, 
                      stopwords=wc_stopwords, background_color="black", 
                      margin=10, random_state=1).generate(" ".join(normal_comments["comment_text"].values))
plt.figure(figsize=(12, 15))
plt.title("Word cloud for normal/clean comments")
plt.axis("off")
plt.imshow(wc_normal, interpolation = 'bilinear')

In [None]:
normal_mask=np.array(Image.open("../input/wordcloud-mask/flower.png"))
plt.figure(figsize=(20, 18))

for i, category in enumerate(["toxic", "severe_toxic", "threat", "obscene", "insult", "identity_hate"]):
    plt.subplot(3,2,i+1)
    wc_normal = WordCloud(max_words=4000, min_font_size=5,
                          mask=normal_mask, 
                          stopwords=wc_stopwords, background_color="black", 
                          margin=10, random_state=1).generate(" ".join(train_data[train_data[category]==1].comment_text.values))
    plt.title("Word cloud for {0} comments".format(category).upper(), fontsize=22)
    plt.axis("off")
    plt.imshow(wc_normal, interpolation = 'bilinear')

In [None]:
nlp = en_core_web_sm.load()
# Look the NER component up by name: the `nlp.entity` shortcut only exists in
# spaCy v2, whereas get_pipe("ner") is available in both v2 and v3.
ner = nlp.get_pipe("ner")
print(ner.labels)
print(ner.cfg)

In [None]:
%%time

#
#Named entities for  comments
#list which are in ORG,
#NORP Nationalities or religious or political group
#

texts = train_data["comment_text"].values
orgs = []

for doc in nlp.pipe(texts, disable=["tagger", "parser"]):
    orgs.extend([ent.text for ent in doc.ents if not ent.text.isspace() and ent.label_=="ORG"])

In [None]:
#now list out the most frequent ORG used in the data
Counter(orgs).most_common(50)

<a id="data_clean"></a>
# Data Cleaning

In [None]:
#Lemmatize Words


def get_pos_tag(tag):
    """Translate a POS tag (as returned by nltk.pos_tag) into a WordNet POS constant.

    The first letter of the tag decides the result: J -> adjective, V -> verb,
    N -> noun, R -> adverb. Any other tag falls back to noun, which is also the
    lemmatizer's own default part of speech.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if tag.startswith(prefix):
            return pos
    # As default pos in lemmatization is Noun
    return wordnet.NOUN

lemmatizer = WordNetLemmatizer()

In [None]:
# Lower-cased contraction -> expanded form, applied token by token in clean_text().
# Each key appears exactly once: a repeated key in a dict literal silently
# overrides the earlier entry, which previously hid the "i'd" -> "I would" mapping.
APPO = {
    "aren't": "are not",
    "can't": "cannot",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "he's": "he is",
    # "'d" is ambiguous (would/had); use "would" like every other pronoun below
    "i'd": "I would",
    "i'll": "I will",
    "i'm": "I am",
    "isn't": "is not",
    "it's": "it is",
    "it'll": "it will",
    "i've": "I have",
    "let's": "let us",
    "mightn't": "might not",
    "mustn't": "must not",
    "shan't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "shouldn't": "should not",
    "that's": "that is",
    "there's": "there is",
    "they'd": "they would",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "we'd": "we would",
    "we're": "we are",
    "weren't": "were not",
    "we've": "we have",
    "what'll": "what will",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "where's": "where is",
    "who'd": "who would",
    "who'll": "who will",
    "who're": "who are",
    "who's": "who is",
    "who've": "who have",
    "won't": "will not",
    "wouldn't": "would not",
    "you'd": "you would",
    "you'll": "you will",
    "you're": "you are",
    "you've": "you have",
    "'re": " are",
    "wasn't": "was not",
    # was " will", which dropped the pronoun
    "we'll": "we will",
    "tryin'": "trying",
}

In [None]:

# Pre-compiled patterns used by clean_text().
REPLACE_URLS = re.compile(r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+')
REPLACE_HASH = re.compile(r'#(\w+)')
REPLACE_AT = re.compile(r'@(\w+)')
REPLACE_HTML_TAGS = re.compile(r'<[^>]+>')
REPLACE_DIGITS = re.compile(r'\d+')

# Characters treated as punctuation/noise. They must be wrapped in a character
# class ([...]): written as a bare sequence the pattern only matches that exact
# run of characters, i.e. effectively never.
_PUNCTUATION_CHARS = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n“”’‘∞θ÷α•à−β∅³π₹´°£€×™√²—'
REPLACE_PUNCTUATION = re.compile('[' + re.escape(_PUNCTUATION_CHARS + "'") + ']')
# Same set minus the ASCII apostrophe, so contractions survive until they are expanded.
_PUNCTUATION_EXCEPT_APOSTROPHE = re.compile('[' + re.escape(_PUNCTUATION_CHARS) + ']')

# IPv4-looking strings (dotted quads).
LEAKY_FEATURE = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}")

# NOTE: this rebinds STOPWORDS, which the import cell took from `wordcloud`;
# cells that need the word-cloud stop words must run before this one.
STOPWORDS = set(stopwords.words('english'))

sentences = [] #for Word2Vec model

def clean_text(text):
    """Normalise one raw comment into a space-joined string of lemmas.

    Steps, in order: lower-case; drop HTML tags, URLs, @mentions, IP-like
    strings and digits; strip punctuation while keeping apostrophes; expand
    contractions via APPO; strip the remaining apostrophes; POS-tag the token
    list; drop stop words and lemmatise what is left.

    Parameters
    ----------
    text : str
        Raw comment text.

    Returns
    -------
    str
        Cleaned comment; an empty string when nothing survives the cleaning.
    """
    text = text.lower()
    text = REPLACE_HTML_TAGS.sub(' ', text)
    text = re.sub(r'&amp;', '&', text)
    text = REPLACE_URLS.sub('', text)
    text = REPLACE_AT.sub('', text)
    # IP addresses must go before the digits do, otherwise there is nothing left to match
    text = LEAKY_FEATURE.sub('', text)
    text = REPLACE_DIGITS.sub('', text)

    # Treat typographic apostrophes like ASCII ones so "don’t" is found in APPO
    text = text.replace('’', "'").replace('‘', "'")
    text = _PUNCTUATION_EXCEPT_APOSTROPHE.sub(' ', text)

    # Expand contractions, then re-split: an expansion such as "are not" has to be
    # filtered and lemmatised word by word, not as a single two-word token.
    expanded = " ".join(APPO.get(word, word) for word in text.split()).lower()
    tokens = REPLACE_PUNCTUATION.sub(' ', expanded).split()
    if not tokens:
        return ""

    # Tag the whole token list in one call so the tagger sees the neighbouring
    # words (and is not invoked once per word).
    tagged = pos_tag(tokens)
    return " ".join(
        lemmatizer.lemmatize(word, get_pos_tag(tag))
        for word, tag in tagged
        if word not in STOPWORDS
    )

Now let's clean the data in the train and test datasets.

In [None]:
%%time
train_data["comment"] = train_data["comment_text"].apply(clean_text)


In [None]:
#train_data.to_csv("clean_jigsaw.csv")

In [None]:
%%time
test_data["comment"] = test_data["comment_text"].apply(clean_text)

<a id="word_count"> </a>
# Word Count Analysis

In [None]:
df_count = train_data.loc[:, ["id", "comment_text", "comment"]]
df_count["word_count"] = df_count["comment_text"].apply(lambda x: len(str(x).split()))
df_count["sent_count"] = df_count["comment_text"].apply(lambda x: len(re.findall("\n", str(x))) +1)
df_count["unique_words"] = df_count["comment_text"].apply(lambda x : len(set(str(x).split())))

In [None]:
#count in comments after data cleaning
df_count["word_count_clean"] = df_count["comment"].apply(lambda x: len(str(x).split()))
#df_count["sent_count_clean"] = df_count["comment"].apply(lambda x: len(re.findall("\n", str(x))) +1)
df_count["unique_words_clean"] = df_count["comment"].apply(lambda x : len(set(str(x).split())))

Word Count per category 

In [None]:
print( "min and max word count in comment before cleaning >> ", min(df_count["word_count"]), max(df_count["word_count"]) )
print( "min and max unique words in comment before cleaning >> ", min(df_count["unique_words"]), max(df_count["unique_words"]) )
print( "min and max  word count in comment after cleaning >> ", min(df_count["word_count_clean"]), max(df_count["word_count_clean"]) )
print( "min and max  unique word in comment after cleaning >> ", min(df_count["unique_words_clean"]), max(df_count["unique_words_clean"]) )

Show the average word count and the average unique-word count of each category ("toxic", "severe_toxic", "threat", "obscene", "insult", "identity_hate", "normal"), as well as the percentage of unique words.

In [None]:
toxic_index = train_data[train_data["toxic"]==1]["id"]
severe_toxic_index = train_data[train_data["severe_toxic"]==1]["id"]
threat_index = train_data[train_data["threat"]==1]["id"]
obscene_index = train_data[train_data["obscene"]==1]["id"]
insult_index = train_data[train_data["insult"]==1]["id"]
identity_hate_index = train_data[train_data["identity_hate"]==1]["id"]

In [None]:
category_list = ["toxic", "severe_toxic", "threat", "obscene", "insult", "identity_hate"]
category_ids = [toxic_index, severe_toxic_index, threat_index,
                obscene_index, insult_index, identity_hate_index]

avg_count = []          # mean word count per group
avg_unique_count = []   # mean number of distinct words per group
unique_to_all = []      # mean of (distinct words / words) per group, in percent

# One pass per group; the clean ("normal") comments come last so that the three
# result lists line up with category_list after "normal" is appended below.
for ids in category_ids + [normal_comments.id]:
    group = df_count[df_count.id.isin(ids)]
    avg_count.append(group["word_count"].mean())
    avg_unique_count.append(group["unique_words"].mean())
    unique_to_all.append((group["unique_words"] / group["word_count"]).mean() * 100)

category_list.append("normal")


In [None]:
x = np.arange(len(category_list))
width = 0.35  # the width of the bars

fig, ax = plt.subplots(figsize=(15,6))
count_bar = ax.bar(x - width/2, avg_count, width, label='avg word count')
unique_count_bar = ax.bar(x + width/2, avg_unique_count, width, label='avg unique word count')
ax.set_ylabel('Counts')
ax.set_title("Avg word count by total and unique")
ax.set_xticks(x)
ax.set_xticklabels(category_list)
ax.legend()

plt.figure(figsize=(15,6))
plt.bar(category_list, unique_to_all, width=0.8)
plt.title("% of unique counts")
plt.show()
    

Most of the gap is in severe_toxic category 

Check the distribution of comments with word_count and unique words 

In [None]:
f, (ax_wc, ax_uwc) = plt.subplots(1, 2, figsize=(22, 10))

# Select the clean comments once instead of re-filtering df_count for every statistic
normal_counts = df_count[df_count.id.isin(normal_comments["id"])]

mean_wc = normal_counts["word_count"].mean()
median_wc = normal_counts["word_count"].median()
mode_wc = normal_counts["word_count"].mode()

mean_uwc = normal_counts["unique_words"].mean()
median_uwc = normal_counts["unique_words"].median()
mode_uwc = normal_counts["unique_words"].mode()

for ax, column, mean_value, median_value in [
    (ax_wc, "word_count", mean_wc, median_wc),
    (ax_uwc, "unique_words", mean_uwc, median_uwc),
]:
    # NOTE(review): distplot is deprecated in recent seaborn releases; kept so the
    # cell still runs on the seaborn version this notebook was written against.
    sns.distplot(normal_counts[column], ax=ax)
    # Label the reference lines themselves and build one legend per axes; the
    # previous single plt.legend({...}) call only touched the last axes and
    # attached the labels to whichever artists happened to come first.
    ax.axvline(mean_value, color='r', linestyle='--', label='Mean')
    ax.axvline(median_value, color='g', linestyle='-', label='Median')
    ax.legend()

plt.show()

word_count is spread till 1200 while unique_count is limited to 800

In [None]:
del df_count
del toxic_index
del severe_toxic_index
del threat_index
del obscene_index
del insult_index
del identity_hate_index
gc.collect()

<a id=n-gram></a>
# N-gram Analysis

In [None]:
#input_list = ['all', 'this', 'happened', 'more', 'or', 'less']

def find_ngrams(input_list, n):
    """Return an iterator over the consecutive n-grams of input_list, as tuples.

    The list is sliced at offsets 0..n-1 and the slices are zipped together, so
    iteration stops as soon as the shortest slice is exhausted.
    """
    shifted_views = [input_list[offset:] for offset in range(n)]
    return zip(*shifted_views)

In [None]:
train_tags = train_data.loc[:, ["id", "toxic", "severe_toxic", "threat", "obscene", "insult", "identity_hate"]]

In [None]:
train_data.loc[:, ["id", "toxic", "severe_toxic", "threat", "obscene", "insult", "identity_hate"]].columns[1:]

In [None]:
train_data.iloc[12235]["comment_text"]

In [None]:
train_data.iloc[12235]["comment"]

In [None]:
# Bigram TF-IDF vocabulary learned from the cleaned train and test comments.
# The idf/tf switches are booleans; recent scikit-learn releases validate
# parameter types, so pass True rather than 1.
tfidf_vectorizer_bi = TfidfVectorizer(min_df=100, max_features=80000,
                                      strip_accents='unicode', analyzer='word', ngram_range=(2, 2),
                                      use_idf=True, smooth_idf=True, sublinear_tf=True,
                                      stop_words='english')

tfidf_vectorizer_bi.fit(pd.concat([train_data["comment"], test_data["comment"]]))

# get_feature_names() was removed from newer scikit-learn releases in favour of
# get_feature_names_out(); use whichever the installed version provides.
if hasattr(tfidf_vectorizer_bi, "get_feature_names_out"):
    features = np.array(tfidf_vectorizer_bi.get_feature_names_out())
else:
    features = np.array(tfidf_vectorizer_bi.get_feature_names())

train_bigrams = tfidf_vectorizer_bi.transform(train_data["comment"])
test_bigrams = tfidf_vectorizer_bi.transform(test_data["comment"])

In [None]:
##https://buhrmann.github.io/tfidf-analysis.html
##https://www.kaggle.com/jagangupta/stop-the-s-toxic-comments-eda#Feature-engineering:

def top_tfidf_feats(row, features, top_n=25):
    """Return the top_n highest-scoring features of one tf-idf row.

    Parameters
    ----------
    row : sequence of float
        Tf-idf scores, one per feature.
    features : sequence of str
        Feature names aligned with `row`.
    top_n : int
        Maximum number of features to return.

    Returns
    -------
    pandas.DataFrame
        Columns 'feature' and 'tfidf', sorted by descending score.
    """
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    # Name the columns at construction time: assigning df.columns afterwards
    # raises a length-mismatch error when no feature was selected.
    return pd.DataFrame(top_feats, columns=['feature', 'tfidf'])

def top_feats_in_doc(Xtr, features, row_id, top_n=25):
    """Return the top_n tf-idf features of a single document (row `row_id` of Xtr)."""
    dense_row = Xtr[row_id].toarray()
    # Drop the leading singleton dimension before ranking the scores
    return top_tfidf_feats(np.squeeze(dense_row), features, top_n)

def top_mean_feats(Xtr, features, grp_ids, min_tfidf=0.1, top_n=25):
    """Return the top_n features with the highest mean tf-idf over a group of rows.

    The rows selected by `grp_ids` are converted to a dense array, scores below
    `min_tfidf` are zeroed, and the column-wise mean is ranked.
    """
    dense_rows = Xtr[grp_ids].toarray()
    below_threshold = dense_rows < min_tfidf
    dense_rows[below_threshold] = 0
    mean_scores = np.mean(dense_rows, axis=0)
    return top_tfidf_feats(mean_scores, features, top_n)

# modified for multilabel milticlass
def top_feats_by_class(Xtr, features, min_tfidf=0.1, top_n=20):
    """Return one DataFrame of top tf-idf features per label, plus one for clean comments.

    Reads the module-level `train_tags` (label columns after the leading id
    column) and `normal_comments`. The result holds one frame per label column,
    in column order, followed by a final frame for the clean comments. Each
    frame carries a `label` attribute naming its class.

    NOTE(review): row selection uses train_tags.index values as row positions of
    Xtr, which assumes the default 0..n-1 index — confirm if train_tags is ever
    filtered or re-indexed.
    """
    dfs = []
    for col in train_tags.columns[1:]:
        ids = train_tags.index[train_tags[col] == 1]
        feats_df = top_mean_feats(Xtr, features, ids, min_tfidf=min_tfidf, top_n=top_n)
        # Tag the frame with its own class name (previously the whole label list
        # `comments` was attached to every frame).
        feats_df.label = col
        dfs.append(feats_df)

    clean_ids = train_tags.index[train_tags.id.isin(normal_comments.id)]
    clean_df = top_mean_feats(Xtr, features, clean_ids, min_tfidf=min_tfidf, top_n=top_n)
    clean_df.label = "clean"
    dfs.append(clean_df)
    return dfs

In [None]:
%%time
tfidf_top_n_per_lass=top_feats_by_class(train_bigrams,features)

In [None]:
plt.figure(figsize=(16, 22))
plt.suptitle("TF_IDF Top words per class(Bigrams)", fontsize=20)

# Titles follow the order in which top_feats_by_class() emits its frames, i.e.
# the label-column order of train_tags: toxic, severe_toxic, threat, obscene,
# insult, identity_hate. (Panels 3 and 4 were previously titled the wrong way round.)
class_titles = ["Toxic", "Severe toxic", "Threat", "Obscene", "Insult", "Identity hate"]

for i, title in enumerate(class_titles):
    # Fill the 4x2 grid row by row: panel i sits at (i // 2, i % 2)
    plt.subplot2grid((4, 2), divmod(i, 2))
    top = tfidf_top_n_per_lass[i].iloc[0:5]
    # x/y are passed by keyword: positional data arguments are rejected by newer seaborn
    sns.barplot(x=top["feature"], y=top["tfidf"], color=color[i])
    plt.title("class : " + title, fontsize=15)
    plt.ylabel('TF-IDF score', fontsize=12)

# The clean comments get the full-width bottom row
plt.subplot2grid((4, 2), (3, 0), colspan=2)
top_clean = tfidf_top_n_per_lass[6].iloc[0:9]
sns.barplot(x=top_clean["feature"], y=top_clean["tfidf"])
plt.title("class : Clean", fontsize=15)
plt.ylabel('TF-IDF score', fontsize=12)

plt.show()

In [None]:
del normal_comments
del tfidf_vectorizer_bi
del train_bigrams
del test_bigrams
gc.collect()

<a id="lr_model"></a>
# Logistic Regression

* Tfidf --> Logistic Regression
* Tfidf --> SMOTE --> Logistic Regression
* Tfidf --> Weights --> Logistic Regression

<a id="smote"></a>
> SMOTE --> Synthetic Minority Oversampling Technique

In [None]:
# Combine SMOTE over-sampling of the minority class with random under-sampling
# of the majority class, as the original SMOTE paper suggests.
def apply_SMOTE(X_vec, y_vec):
    """Rebalance a binary-labelled data set and return the resampled (X, y).

    Two steps run in sequence: SMOTE with sampling_strategy=0.2, followed by
    RandomUnderSampler with sampling_strategy=0.5. Both use random_state=777.
    """
    resampler = Pipeline(steps=[
        ('o', SMOTE(sampling_strategy=0.2, random_state=777)),
        ('u', RandomUnderSampler(sampling_strategy=0.5, random_state=777)),
    ])
    X_resampled, y_resampled = resampler.fit_resample(X_vec, y_vec)
    return (X_resampled, y_resampled)

In [None]:
#all_text = pd.concat([train_data["comment"], test_data["comment"] ])

# Uni- to tri-gram TF-IDF vocabulary learned from the cleaned train and test comments.
# The idf/tf switches are booleans; recent scikit-learn releases validate
# parameter types, so pass True rather than 1.
tfidf_vectorizer = TfidfVectorizer(min_df=100, max_features=80000,
                                   strip_accents='unicode', analyzer='word', ngram_range=(1, 3),
                                   use_idf=True, smooth_idf=True, sublinear_tf=True,
                                   stop_words='english')

tfidf_vectorizer.fit(pd.concat([train_data["comment"], test_data["comment"]]))

# get_feature_names() was removed from newer scikit-learn releases in favour of
# get_feature_names_out(); use whichever the installed version provides.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    features = np.array(tfidf_vectorizer.get_feature_names_out())
else:
    features = np.array(tfidf_vectorizer.get_feature_names())

train_tfidf = tfidf_vectorizer.transform(train_data["comment"])
test_tfidf = tfidf_vectorizer.transform(test_data["comment"])

In [None]:
def apply_model(splits, X, y, model, average_method): #average_method='macro'
    """Cross-validate `model` with stratified k-fold and summarise the scores.

    Parameters
    ----------
    splits : int
        Number of folds.
    X : indexable feature matrix
        Must support row selection with an integer index array (X[idx]).
    y : array-like
        Target labels, one per row of X.
    model : estimator
        Object exposing fit / predict / score; it is re-fitted on every fold.
    average_method : str
        `average` argument forwarded to precision/recall/f1.

    Returns
    -------
    pandas.DataFrame
        One row with "mean% (+/- std%)" strings for accuracy, precision, recall
        and f1 score.
    """
    # Work on a plain array so the fold indices are always positional;
    # indexing a pandas Series with y[idx] is label-based.
    y = np.asarray(y)

    kfold = StratifiedKFold(n_splits=splits, shuffle=True, random_state=777)
    accuracy = []
    precision = []
    recall = []
    f1 = []
    for train, test in kfold.split(X, y):
        model.fit(X[train], y[train])
        prediction = model.predict(X[test])

        accuracy.append(model.score(X[test], y[test]) * 100)
        precision.append(precision_score(y[test], prediction, average=average_method) * 100)
        recall.append(recall_score(y[test], prediction, average=average_method) * 100)
        f1.append(f1_score(y[test], prediction, average=average_method) * 100)

    result = {
        "accuracy": [" %.2f%% (+/- %.2f%%)"% (np.mean(accuracy), np.std(accuracy))],
        "precision": ["%.2f%% (+/- %.2f%%)"% (np.mean(precision), np.std(precision))],
        "recall": ["%.2f%% (+/- %.2f%%)" % (np.mean(recall), np.std(recall))],
        "f1 score": ["%.2f%% (+/- %.2f%%)" % (np.mean(f1), np.std(f1))]
    }

    return pd.DataFrame(result)

In [None]:
#apply SMOTE after cross validation split only 
#what has been seen, can not not be unseen  --> overfitting 

def apply_model_with_smote(splits, X, y, model, average_method): #average_method='macro'
    """Cross-validate `model`, resampling only the training part of each fold.

    Same protocol and return value as a plain stratified k-fold evaluation,
    except that apply_SMOTE() is applied to the training rows of every fold
    before fitting. The held-out rows are never resampled, so the scores are
    measured on the original class distribution.

    Parameters
    ----------
    splits : int
        Number of folds.
    X : indexable feature matrix
        Must support row selection with an integer index array (X[idx]).
    y : array-like
        Target labels, one per row of X.
    model : estimator
        Object exposing fit / predict / score; it is re-fitted on every fold.
    average_method : str
        `average` argument forwarded to precision/recall/f1.

    Returns
    -------
    pandas.DataFrame
        One row with "mean% (+/- std%)" strings for accuracy, precision, recall
        and f1 score.
    """
    # Work on a plain array so the fold indices are always positional;
    # indexing a pandas Series with y[idx] is label-based.
    y = np.asarray(y)

    kfold = StratifiedKFold(n_splits=splits, shuffle=True, random_state=777)
    accuracy = []
    precision = []
    recall = []
    f1 = []
    for train, test in kfold.split(X, y):
        X_smote, y_smote = apply_SMOTE(X[train], y[train])

        model.fit(X_smote, y_smote)
        prediction = model.predict(X[test])

        accuracy.append(model.score(X[test], y[test]) * 100)
        precision.append(precision_score(y[test], prediction, average=average_method) * 100)
        recall.append(recall_score(y[test], prediction, average=average_method) * 100)
        f1.append(f1_score(y[test], prediction, average=average_method) * 100)

    result = {
        "accuracy": [" %.2f%% (+/- %.2f%%)"% (np.mean(accuracy), np.std(accuracy))],
        "precision": ["%.2f%% (+/- %.2f%%)"% (np.mean(precision), np.std(precision))],
        "recall": ["%.2f%% (+/- %.2f%%)" % (np.mean(recall), np.std(recall))],
        "f1 score": ["%.2f%% (+/- %.2f%%)" % (np.mean(f1), np.std(f1))]
    }

    return pd.DataFrame(result)

In [None]:
lr_model = LogisticRegression(C=0.1, solver='sag')
print("~~~~~~~~~~~~ Logistic Regression ~~~~~~~~~~~~")

# Keep the metrics of every class: previously each iteration overwrote the
# previous result, so nothing was shown and only the last class survived.
per_class_metrics = []
for class_name in category_list[:-1]:   # the last entry is the synthetic "normal" group
    y = train_data[class_name]
    print("------RESULT FOR  {0}---------".format(class_name))
    class_metrics = apply_model(5, train_tfidf, y, lr_model, 'macro')
    class_metrics["model"] = ["Logistic Regression"]
    class_metrics["class"] = [class_name]
    print(class_metrics.to_string(index=False))
    per_class_metrics.append(class_metrics)

model_metrics = pd.concat(per_class_metrics, ignore_index=True)

In [None]:
#from sklearn.utils.class_weight import compute_class_weight
#class_weights = compute_class_weight('balanced', np.unique(y), y)

In [None]:
lr_model_weights = LogisticRegression(C=0.1, solver='lbfgs',
                                      class_weight={1: 0.8, 0: 0.2})
print("~~~~~~~~~~~~ Logistic Regression with weights ~~~~~~~~~~~~")

# Keep the metrics of every class: previously each iteration overwrote the
# previous result, so nothing was shown and only the last class survived.
per_class_metrics = []
for class_name in category_list[:-1]:   # the last entry is the synthetic "normal" group
    y = train_data[class_name]
    print("------RESULT FOR  {0}---------".format(class_name))
    class_metrics = apply_model(5, train_tfidf, y, lr_model_weights, 'macro')
    class_metrics["model"] = ["LR with weights"]
    class_metrics["class"] = [class_name]
    print(class_metrics.to_string(index=False))
    per_class_metrics.append(class_metrics)

weights_metrics = pd.concat(per_class_metrics, ignore_index=True)

**Note:**
> You can also derive the class weights with GridSearchCV or with
> compute_class_weight from sklearn.utils.class_weight.

In [None]:
#Cross check this approach i.e. applying SMOTE for every class one by one 
lr_model_smote = LogisticRegression(C=0.1, solver='sag')
print("~~~~~~~~~~~~ Logistic Regression with SMOTE~~~~~~~~~~~~")

# Keep the metrics of every class: previously each iteration overwrote the
# previous result, so nothing was shown and only the last class survived.
per_class_metrics = []
for class_name in category_list[:-1]:   # the last entry is the synthetic "normal" group
    y = train_data[class_name]
    print("------RESULT FOR  {0}---------".format(class_name))
    class_metrics = apply_model_with_smote(5, train_tfidf, y, lr_model_smote, 'macro')
    class_metrics["model"] = ["LR with smote"]
    class_metrics["class"] = [class_name]
    print(class_metrics.to_string(index=False))
    per_class_metrics.append(class_metrics)

smote_metrics = pd.concat(per_class_metrics, ignore_index=True)

In [None]:
#Comparsion 
lr_metrics = pd.concat([model_metrics, weights_metrics, smote_metrics])
lr_metrics

> Precision = true positives / predicted positives --> TP/(TP+FP) --> the fraction of predicted positives that are actually relevant
> Recall = true positives / actual positives --> TP/(TP+FN) --> the fraction of the relevant data points that the model identifies

From the above comparison of F1 scores, it is visible that after treating the data imbalance the model performed better.

In [None]:
del tfidf_vectorizer
del train_tfidf
del test_tfidf
gc.collect()

<a id="lstm" ></a>
# LSTM

* > Tokenize the data
* > Text to sequence 
* > padding --> all seq have same shape
* > build embedding matrix
* > build model with CuDNNLSTM



In [None]:
# Reload the cached cleaned data when it exists. The cell that writes
# clean_jigsaw.csv is commented out, so on a fresh run the file is missing;
# in that case keep the cleaned frame that is already in memory.
try:
    train_data = pd.read_csv("clean_jigsaw.csv")
except FileNotFoundError:
    print("clean_jigsaw.csv not found - using the in-memory train_data")

# A CSV round trip reads empty cleaned comments back as NaN; restore them to ""
train_data["comment"] = train_data["comment"].fillna("")

In [None]:
train_data.info()

In [None]:
# Longest cleaned comment measured in whitespace-separated tokens. max_len is used
# as a sequence length, so it must count words rather than characters.
# fillna("") guards against comments that were read back from CSV as NaN.
max_len = int(train_data["comment"].fillna("").astype(str).str.split().str.len().max())
max_len

In [None]:
REMOVE_CHARS = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n“”’\'∞θ÷α•à−β∅³π‘₹´°£€\×™√²—'
max_features = 100000

In [None]:
# Vocabulary size comes from the max_features constant instead of a repeated literal
tokenizer = Tokenizer(filters=REMOVE_CHARS, num_words=max_features)

# Empty comments must stay empty: astype(str) alone would turn NaN into the token "nan"
comment_texts = train_data["comment"].fillna("").astype(str)
tokenizer.fit_on_texts(list(comment_texts))

vocab_size = len(tokenizer.word_index) + 1
X_tokenized = tokenizer.texts_to_sequences(comment_texts)
# Pad to max_len so the sequences agree with the input length the model is built with
X_tokenized = pad_sequences(X_tokenized, maxlen=max_len, padding='post')

In [None]:
#pre process test data with param of train data
#X_test_tokenize = tokenizer.texts_to_sequences(test_data["comment"].values)
#X_test_tokenize = pad_sequences(X_test_tokenize, maxlen=max_len, padding='post')

In [None]:
# word -> 300-d GloVe vector, parsed from the space-separated text file
embeddings_index = {}
# The context manager closes the file even if parsing a line raises
with open('../input/glove42b300dtxt/glove.42B.300d.txt', 'r', encoding='utf-8') as glovefile:
    for line in tqdm(glovefile):
        values = line.rstrip("\n").split(" ")
        word = values[0]
        embeddings_index[word] = np.asarray(values[1:], dtype='float32')

print('Found %s word vectors.' % len(embeddings_index))

In [None]:
# create an embedding matrix for the words we have in the dataset
num_words = min(max_features, len(tokenizer.word_index) + 1)
embedding_matrix = np.zeros((num_words, 300))
for word, index in tqdm(tokenizer.word_index.items()):
    # Words whose index falls outside the matrix are skipped explicitly rather
    # than by catching the IndexError the assignment would raise.
    if index >= num_words:
        continue
    embedding_vector = embeddings_index.get(word)
    # Words without a pre-trained vector keep their all-zero row
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

In [None]:
del embeddings_index
gc.collect()

In [None]:
LSTM_UNITS = 128
DENSE_HIDDEN_UNITS = 4 * LSTM_UNITS

In [None]:
y_train = train_data[["toxic", "severe_toxic","threat","obscene", "insult", "identity_hate"]].values

In [None]:
embedding_matrix.shape

In [None]:
X_tokenized.shape, y_train.shape

In [None]:
# def build_model(embedding_matrix):
#     words = Input(shape=(5000,))
    
#     x = Embedding(*embedding_matrix.shape, weights=[embedding_matrix], trainable=False)(words)
#     x = SpatialDropout1D(0.2)(x)
#     x1 = Bidirectional(CuDNNLSTM(256, return_sequences=True))(x)
#     x2 = Bidirectional(CuDNNLSTM(128, return_sequences=True))(x1)
    
#     #avg_pool = GlobalAveragePooling1D()(x)
#     max_pool1 = GlobalMaxPooling1D()(x1)
#     max_pool2 = GlobalMaxPooling1D()(x2)
#     conc = Concatenate()([max_pool1, max_pool2])
    
# #     hidden = add([conc, Dense(DENSE_HIDDEN_UNITS, activation='relu')(conc)])
# #     hidden = Dropout(0.2)(hidden)
# #     hidden = add([conc, Dense(DENSE_HIDDEN_UNITS, activation='relu')(hidden)])
    
#     result = Dense(6, activation='softmax')(conc)
    
#     model = Model(inputs=words, outputs=result)
#     model.compile(loss='binary_crossentropy', optimizer='adam')
    
#     return model




* Not able to fit the above model due to memory constraint here. 
* * "Your notebook tried to allocate more memory than is available. It has restarted"

In [None]:
def build_model(embedding_matrix, seq_len=None):
    """Build and compile the bidirectional-LSTM multi-label classifier.

    Parameters
    ----------
    embedding_matrix : numpy.ndarray
        (vocabulary size, embedding dim) matrix used to initialise the
        embedding layer.
    seq_len : int or None
        Length of the padded input sequences. None (the default) leaves the
        time dimension unspecified, so the model accepts whatever length the
        data was padded to instead of depending on a global that may disagree
        with it.

    Returns
    -------
    tensorflow.keras.Model
        Compiled model with six sigmoid outputs and binary cross-entropy loss.
    """
    inp = Input(shape=(seq_len,))
    x = Embedding(*embedding_matrix.shape, weights=[embedding_matrix])(inp)
    x = Bidirectional(LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
    # Max over the time axis collapses the sequence to a fixed-size vector
    x = GlobalMaxPool1D()(x)
    x = Dense(50, activation="relu")(x)
    x = Dropout(0.1)(x)
    # One independent sigmoid per label: a comment can carry several labels at once
    x = Dense(6, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [None]:
model_lstm = build_model(embedding_matrix)
model_lstm.summary()

In [None]:
epochs = 100
batch_size = 512

In [None]:
#  "toxic":train_data.toxic,
#  "severe_toxic":train_data.severe_toxic,
#  "threat":train_data.threat,
#  "obscene":train_data.obscene,
#  "insult":train_data.insult,
#  "identity_hate":train_data.identity_hate
    

In [None]:
# Train the LSTM on the padded sequences, holding out the last 20% of rows for validation.
# NOTE(review): the `epochs` (100) and `batch_size` (512) variables defined in an
# earlier cell are not used here; the literals below take precedence.
# NOTE(review): with epochs=3, an EarlyStopping patience of 3 presumably never
# triggers — confirm which of the two settings is intended.
history_lstm = model_lstm.fit(
    X_tokenized, 
    y_train,
    validation_split=0.2,
    epochs=3,
    batch_size=512,
    callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])

In [None]:
# Drop the notebook's reference to the trained LSTM model.
del model_lstm
# NOTE(review): `history_lstm` is kept alive (its deletion is disabled below). The
# History object returned by `fit` presumably still references the model through
# its `.model` attribute, which would keep the weights in memory - confirm, and
# delete it as well if it is no longer needed.
#del history_lstm
# Reset Keras' global state so resources tied to the discarded model can be
# released before the BERT model is built in the following section.
tf.keras.backend.clear_session()
gc.collect()

In [None]:
# Release the remaining LSTM-stage objects in one statement, then force a
# garbage-collection pass to reclaim their memory.
del embedding_matrix, tokenizer, X_tokenized
#del X_test_tokenize
gc.collect()

<a id="bert"> </a>
# BERT 

In [None]:
#https://www.kaggle.com/user123454321/bert-starter-inference
#https://tfhub.dev/s?q=bert



In [None]:
# Get the tokenization script from the official TensorFlow Models repo.
# NOTE(review): the URL tracks the `master` branch, so the file can move or change
# between runs - consider pinning it to a release tag or commit.
# `--quiet` suppresses wget's output, so a failed download only becomes visible
# when the `import tokenization` below raises.
!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py
    
# Import the module that was just downloaded into the working directory.
import tokenization

In [None]:
def bert_encode(texts, tokenizer, max_len=512):
    """Encode raw texts into fixed-length BERT input arrays.

    Each text is tokenized, truncated to ``max_len - 2`` tokens, wrapped as
    ``[CLS] ... [SEP]`` and right-padded with id 0 up to ``max_len``.

    Args:
        texts: Iterable of strings to encode.
        tokenizer: Object exposing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.
        max_len: Total sequence length, including the two special tokens.
            Must be at least 2.

    Returns:
        A tuple ``(token_ids, masks, segment_ids)`` of integer arrays, each of
        shape ``(number of texts, max_len)``. ``masks`` is 1 on real tokens and
        0 on padding; ``segment_ids`` is all zeros (single-segment input).
        For empty ``texts`` the arrays have shape ``(0, max_len)``.

    Raises:
        ValueError: If ``max_len`` is smaller than 2, since there would be no
            room for the ``[CLS]`` and ``[SEP]`` tokens.
    """
    if max_len < 2:
        raise ValueError(
            "max_len must be at least 2 to hold [CLS] and [SEP], got %r" % (max_len,)
        )

    all_tokens = []
    all_masks = []

    for text in texts:
        # Reserve two positions for the special tokens added below.
        word_pieces = tokenizer.tokenize(text)[:max_len - 2]
        input_sequence = ["[CLS]"] + word_pieces + ["[SEP]"]
        pad_len = max_len - len(input_sequence)

        token_ids = list(tokenizer.convert_tokens_to_ids(input_sequence))
        all_tokens.append(token_ids + [0] * pad_len)
        all_masks.append([1] * len(input_sequence) + [0] * pad_len)

    # An explicit dtype and shape keep the result 2-D and integer-typed even when
    # `texts` is empty (a bare np.array([]) would be 1-D float64).
    n_rows = len(all_tokens)
    token_array = np.array(all_tokens, dtype=int).reshape(n_rows, max_len)
    mask_array = np.array(all_masks, dtype=int).reshape(n_rows, max_len)
    # Every input is a single segment, so the segment ids are all zero.
    segment_array = np.zeros_like(token_array)

    return token_array, mask_array, segment_array

In [None]:
def build_model(bert_layer, max_len=512, initializer_range=0.02):
    """Build and compile a six-headed toxicity classifier on top of a BERT layer.

    Note: this redefines ``build_model``; an earlier cell in this notebook
    defines an LSTM builder with the same name, which is shadowed once this
    cell runs.

    Args:
        bert_layer: Keras layer that is called with
            ``[input_word_ids, input_mask, segment_ids]`` and returns a pair
            whose second element is the per-token sequence output.
        max_len: Sequence length of the three integer inputs.
        initializer_range: Standard deviation of the truncated-normal
            initializer used for the classification heads. The default of
            0.02 is assumed to match the standard BERT configuration - TODO
            confirm against the checkpoint in use.

    Returns:
        A compiled Keras ``Model`` whose inputs are the list
        ``[input_word_ids, input_mask, segment_ids]`` and whose outputs are a
        dict with one sigmoid probability per label name.
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    # A list keeps the model's input order identical to the
    # (tokens, masks, segments) tuple returned by `bert_encode`, so that tuple
    # can be passed to `fit` / `predict` positionally.
    inputs = [input_word_ids, input_mask, segment_ids]

    _, sequence_output = bert_layer(inputs)
    # Use the representation at position 0 (the [CLS] token) as the sentence vector.
    clf_output = sequence_output[:, 0, :]

    label_names = ("toxic", "severe_toxic", "threat", "obscene", "insult", "identity_hate")
    # One independent binary head per label. The sigmoid activation is required
    # because the model is compiled with 'binary_crossentropy', which expects
    # probabilities rather than raw logits by default.
    out = {
        label: Dense(
            1,
            activation="sigmoid",
            kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=initializer_range),
            name=label,
        )(clf_output)
        for label in label_names
    }

    model = Model(inputs=inputs, outputs=out, name="Toxic_BERT_model")
    model.compile(Adam(learning_rate=1e-5), loss='binary_crossentropy', metrics=['accuracy'])

    return model

In [None]:
%%time
#Load BERT from TF
# Load the pre-trained BERT encoder from TensorFlow Hub as a Keras layer.
# Per the module name this is the uncased English model with L-24 / H-1024 / A-16.
# trainable=True is passed so the encoder weights are fine-tuned with the classifier.

module_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1"
bert_layer = hub.KerasLayer(module_url, trainable=True)



In [None]:
#load tokenizer from BERT layer
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

In [None]:
# Encode the train and test texts into (token ids, masks, segment ids) tuples.
# max_len=160 must match the `max_len` passed to `build_model` for the BERT model.
# NOTE(review): assumes a `comment` column was created on both frames earlier in
# the notebook - TODO confirm.
train_input = bert_encode(train_data.comment.values, tokenizer, max_len=160)
test_input = bert_encode(test_data.comment.values, tokenizer, max_len=160)

# Single-label target, currently disabled.
#train_labels = train_data.toxic.values

In [None]:
# Build the BERT classifier for sequences of length 160 (the same length used
# when encoding the inputs) and print its layer summary.
model = build_model(bert_layer, max_len=160)
model.summary()

In [None]:
# Checkpoint callback: writes to 'model.h5', monitoring val_loss and keeping
# only the best result (save_best_only=True).
checkpoint = ModelCheckpoint('model.h5', monitor='val_loss', save_best_only=True)

# Training is currently disabled; two variants are kept for reference.

# Variant 1: single-label run. Requires `train_labels`, whose definition is
# itself commented out in an earlier cell.
# train_history = model.fit(
#     train_input, train_labels,
#     validation_split=0.2,
#     epochs=3,
#     callbacks=[checkpoint],
#     batch_size=16
# )

# Variant 2: multi-label run with one target per named output head.
# (Snippet repaired: column names are quoted and the comma after the `y` dict
# was added, so it is valid Python when uncommented.)
# train_history = model.fit(
#     x = train_input,
#     y = {
#         "toxic":train_data["toxic"],
#         "severe_toxic":train_data["severe_toxic"],
#         "threat":train_data["threat"],
#         "obscene":train_data["obscene"],
#         "insult":train_data["insult"],
#         "identity_hate":train_data["identity_hate"]
#     },
#     validation_split=0.2,
#     epochs=3,
#     callbacks=[checkpoint],
#     batch_size=16
# )

In [None]:
# model.load_weights('model.h5')
# test_pred = model.predict(test_input)