In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import TransformerMixin, BaseEstimator
import re 
import scipy
from scipy import sparse
import gc 
import umap
from IPython.display import display, HTML
from pprint import pprint
import scipy.optimize as optimize
from matplotlib import pyplot as plt
import warnings
import seaborn as sns

# Suppress all warnings for the rest of the session: the "ignore" action is
# installed without any message/category/module restriction.
warnings.filterwarnings("ignore")

# Read the toxic comments data

In [None]:
# The test comments and their labels are stored in two files that share `id`.
df_test = pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/test.csv")
df_test_l = pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/test_labels.csv")

# Keep only label rows whose six label values add up to a non-negative number.
# NOTE(review): presumably this drops rows marked -1 (not scored) — confirm
# against the dataset description.
label_total = df_test_l[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].sum(axis=1)
df_test_l = df_test_l.loc[label_total >= 0]
print(df_test_l.shape)
print(df_test.shape)

# Inner join on `id`: comments without a surviving label row are discarded.
df_test = df_test.merge(df_test_l, how="inner", on="id")
df_test.shape

In [None]:
# Load the training split and append the labelled test rows to it.
df = pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/train.csv")
print(df.shape)

# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# the row labels of the two inputs overlap and label-based lookups become
# ambiguous.
df = pd.concat([df, df_test], ignore_index=True)
print(df.shape)

# Show a few example comments for every label.
for col in ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']:
    print(f'****** {col} *******')
    positives = df.loc[df[col] == 1, ['comment_text', col]]
    # Fixed seed -> the same examples are shown on every run; min() keeps
    # sample() from raising when a label has fewer than 10 positive rows.
    display(positives.sample(min(10, len(positives)), random_state=0))

# Find IMPORTANT words for each type of toxicity

In [None]:
# For every label, fit TF-IDF + Ridge on the comment text and keep the words
# whose (rounded) regression weight is above 0.25.
labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
feature_wts_all = {}  # label -> [(word, weight), ...] sorted by weight, descending
print(df.shape)

# Rows with exactly one label set; does not depend on the loop variable, so it
# is computed once.
single_label = df[labels].sum(axis=1) == 1

# FeatureUnion prefixes every feature name with "<transformer name>__".
name_prefix = 'vect1__'

for lbl in labels:
    print("*"*30 + lbl.upper() + "*"*30)
    features_tmp = FeatureUnion([
        ("vect1", TfidfVectorizer(min_df=3,
                                  max_df=0.5,
                                  analyzer='word',
                                  )),
    ])
    pipeline_tmp = Pipeline(
        [
            ("features", features_tmp),
            ("clf", Ridge()),
        ]
    )
    print("\nTrain:")
    # Training rows: every row positive for `lbl`, plus every row with exactly
    # one label set.
    # NOTE(review): the negatives are therefore rows whose single label is a
    # *different* one; rows with no label at all are excluded — confirm that
    # `== 1` (rather than `== 0`) is intended.
    train_df = df[(df[lbl] > 0) | single_label]
    pipeline_tmp.fit(train_df['comment_text'], train_df[lbl])

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back for releases that predate it.
    features_step = pipeline_tmp['features']
    if hasattr(features_step, 'get_feature_names_out'):
        feature_names = features_step.get_feature_names_out()
    else:
        feature_names = features_step.get_feature_names()

    print('\nTotal number of features:', len(feature_names))

    feature_wts = sorted(zip(feature_names, np.round(pipeline_tmp['clf'].coef_, 2)),
                         key=lambda x: x[1],
                         reverse=True)

    print("High score features")
    important = []
    for name, wt in feature_wts:
        # Strip the union prefix only where it is a prefix, not inside a token.
        word = name[len(name_prefix):] if name.startswith(name_prefix) else name
        # Keep purely alphabetic words longer than two characters.
        if wt > 0.25 and word.isalpha() and len(word) > 2:
            important.append((word, wt))
    feature_wts_all[lbl] = important

pprint(feature_wts_all)

In [None]:
# Put the words into a DataFrame with one toxicity-score column per category

In [None]:
# One long-format frame (word, wt, label) per label.
df_wts = []
for k, word_wts in feature_wts_all.items():
    per_label = pd.DataFrame(word_wts, columns=["word", "wt"])
    df_wts.append(per_label.assign(label=k))

# Wide format: one row per word, one column per label; a word that was not
# selected for a label gets weight 0.
imp_words_long = pd.concat(df_wts)
imp_words_df = imp_words_long.pivot(index='word', columns='label', values='wt').fillna(0)
print(imp_words_df.shape)

imp_words_df.head()


# Bar chart showing words with highest toxicity within each group

In [None]:
# Plot only the per-label weight columns, so the chart is the same even when
# this cell is re-run after helper columns were added to imp_words_df.
score_cols = [c for c in imp_words_df.columns if c in labels]

for lbl in labels:
    # Labels for which no word passed the weight threshold have no column.
    if len(feature_wts_all[lbl]) > 0:
        print(lbl.upper())
        # Top 30 words for this label, re-sorted ascending so that the largest
        # bar ends up at the top of the horizontal chart.
        top_words = imp_words_df[score_cols]\
                        .sort_values(lbl, ascending=False)\
                        .head(30)\
                        .sort_values(lbl, ascending=True)
        ax = top_words.plot.barh(rot=0, width=1, figsize=(12, 12))
        ax.set_title(f'Top words by weight: {lbl}')
        ax.set_xlabel('weight')
        plt.show()
        # Release the figure; one figure per label otherwise stays in memory.
        plt.close(ax.figure)

# Load fasttext vectors

In [None]:
def load_fasttext_model(path):
    """Read word vectors from a text file into a dict.

    Every data line is expected to be ``<word> <v1> <v2> ...`` separated by
    single spaces.

    Parameters
    ----------
    path : str
        Path to a UTF-8 encoded text file of word vectors.

    Returns
    -------
    dict
        Maps each word to a 1-D ``float32`` numpy array of its coefficients.

    Raises
    ------
    ValueError
        If a coefficient cannot be parsed as a float.

    Notes
    -----
    * fastText ``.vec`` files start with a ``<vocab size> <dimension>`` header;
      a first line made of exactly two integers is treated as that header and
      skipped instead of being stored as a one-element vector.
    * Blank lines and lines without any coefficient are skipped.
    """
    embeddings = {}
    # `with` closes the file even when a malformed line raises.
    with open(path, encoding='utf-8') as f:
        for line_no, line in enumerate(f):
            values = line.strip().split(' ')
            if line_no == 0 and len(values) == 2 and all(v.isdigit() for v in values):
                continue  # header line, not an embedding
            if len(values) < 2:
                continue  # nothing to store for this line
            embeddings[values[0]] = np.asarray(values[1:], dtype='float32')
    return embeddings

In [None]:
# Parse the crawl-300d-2M vector file into a word -> vector dict using
# load_fasttext_model (every line of the file is read into memory).
ft_model = load_fasttext_model('../input/fasttext-crawl-300d-2m/crawl-300d-2M.vec')

In [None]:
# Assign to every word the label with the highest weight, plus that weight.
# Both values are computed over the weight columns only: once 'label_max'
# (strings) is part of the frame, a row-wise max over *all* columns mixes str
# and float and raises TypeError on pandas >= 2.0. Excluding the two helper
# columns also makes this cell safe to re-run.
score_cols = [c for c in imp_words_df.columns if c not in ('label_max', 'val_max')]
imp_words_df['label_max'] = imp_words_df[score_cols].idxmax(axis=1)
imp_words_df['val_max'] = imp_words_df[score_cols].max(axis=1)


# Extract sample of IMPORTANT words

In [None]:
# The 300 words with the largest top-label weight.
imp_words_df_tmp = (
    imp_words_df
    .sort_values('val_max', ascending=False)
    .head(300)
    .copy()
)

# Parallel lists holding the vector, the word and the dominant label of every
# selected word that is present in ft_model; words missing from it are skipped.
vect, ids, lbls = [], [], []
for idx, top_label in imp_words_df_tmp['label_max'].items():
    if idx not in ft_model:
        continue
    vect.append(ft_model[idx])
    ids.append(idx)
    lbls.append(top_label)


# Reduce dimensionality with UMAP

In [None]:
# Stack the selected word vectors into one array, one row per word.
vect_arr = np.array(vect)
# UMAP is stochastic; a fixed random_state makes the 2-D layout (and therefore
# the plot that is interpreted afterwards) reproducible across runs.
reducer = umap.UMAP(random_state=42)
vect_arr_red = reducer.fit_transform(vect_arr)
vect_arr_red.shape

# Visualize

## Some clear groups are visible among the highly toxic words of each category

In [None]:
# One colour per toxicity label. "toxic" is listed as well so that a word whose
# strongest label is "toxic" gets a valid colour instead of NaN.
color_map = {"identity_hate": "blue",
             "insult": "green",
             "obscene": "red",
             "severe_toxic": "black",
             "threat": "yellow",
             "toxic": "orange"}
fallback_color = "gray"  # used for any label missing from color_map

fig, ax = plt.subplots(figsize=(15, 15))

# One point per word, coloured by the word's dominant label.
point_colors = [color_map.get(word_label, fallback_color) for word_label in lbls]
ax.scatter(vect_arr_red[:, 0], vect_arr_red[:, 1], c=point_colors)

# Empty proxy artists, drawn only to obtain one legend handle per colour.
handlelist = [ax.plot([], [], marker="o", ls="", color=color)[0] for color in color_map.values()]
ax.legend(handlelist, list(color_map), loc='upper left')

ax.set_title('UMAP projection of the important words', fontsize=24)

# Write each word next to its point.
for i, txt in enumerate(ids):
    ax.annotate(txt,
                (vect_arr_red[i, 0], vect_arr_red[i, 1]),
                textcoords="offset points",  # xytext is an offset from the point
                size=10,
                xytext=(0, 0.3),  # (x, y) offset of the text, in points
                ha='left')
plt.show()