### Import libraries to use

In [1]:
import pandas as pd
import numpy as np
import re
from gensim.models import Word2Vec
import os
import itertools
from collections import Counter
import plotly.graph_objects as go
import time

### Data Preparation

In [2]:
train = pd.read_csv('data/train.csv.zip')

In [3]:
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [4]:
def clean_text(x):
    res = None
    res = re.sub('\t', ' ', x)
    res = re.sub('\n', ' ', res)
    res = re.sub('\r', ' ', res)
    res = re.sub('"', '', res)
    res = re.sub("'", '', res)
    res = re.sub("[#0-9()\"+-,&.\[\]@*/?!:%}{;`#=|$^\\\\]", " ", res)
    res = re.sub("[\s]+", " ", res)
    res = res.lower()
    res = res.strip()
    return res

In [5]:
train['clean_comment'] = train['comment_text'].apply(clean_text)

In [6]:
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,clean_comment
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,explanation why the edits made under my userna...
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,daww he matches this background colour im seem...
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,hey man im really not trying to edit war its j...
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,more i cant make any real suggestions on impro...
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,you sir are my hero any chance you remember wh...


In [7]:
comment_data = train['clean_comment'].values

In [8]:
comment_data.shape

(159571,)

In [9]:
comment_data[:5]

array(['explanation why the edits made under my username hardcore metallica fan were reverted they werent vandalisms just closure on some gas after i voted at new york dolls fac and please dont remove the template from the talk page since im retired now',
       'daww he matches this background colour im seemingly stuck with thanks talk january utc',
       'hey man im really not trying to edit war its just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page he seems to care more about the formatting than the actual info',
       'more i cant make any real suggestions on improvement - i wondered if the section statistics should be later on or a subsection of types of accidents -i think the references may need tidying so that they are all in the exact same format ie date format etc i can do that later on if no-one else does first - if you have any preferences for formatting style on references or want to do it yourself please

In [14]:
comment_data = [x.split(' ') for x in comment_data]
vocab = Counter(itertools.chain(*comment_data))
vocab_size = len(vocab)

In [15]:
df_vocab_summary = pd.DataFrame(vocab.items(), columns=['word', 'count']).sort_values('count', ascending=False)

In [16]:
nwords = 25
print(pd.concat([df_vocab_summary.head(nwords).reset_index(drop=True).rename(columns={'word':'top_word','count':'top_word_count'}), df_vocab_summary.tail(nwords).reset_index(drop=True)], ignore_index=False, axis=1).rename(columns={'word':'bottom_word','count':'bottom_word_count'}))



   top_word  top_word_count         bottom_word  bottom_word_count
0       the          496024      spam_blacklist                  1
1        to          296967         imageboards                  1
2        of          224123             tukaram                  1
3       and          223669             quanify                  1
4         a          216486             outdrew                  1
5       you          205978             wherent                  1
6         i          204294            negivate                  1
7        is          176272              avoide                  1
8      that          154488        genealogydna                  1
9        in          144733  theworldthrumyeyes                  1
10       it          130245           old-world                  1
11      for          102528                wstg                  1
12     this           97561                wqwv                  1
13      not           93617                geok               

In [17]:
nwords = 5000
nwords_start = 5000
nwords_end = nwords_start + nwords
fig = go.Figure()
# Create and style traces
fig.add_trace(go.Scatter(x=df_vocab_summary.loc[nwords_start:nwords_end, 'word'], y=df_vocab_summary.loc[nwords_start:nwords_end, 'count'], name='Word Counts',
                         line=dict(color='firebrick', width=4)))

In [19]:
comment_data[:2]

[['explanation',
  'why',
  'the',
  'edits',
  'made',
  'under',
  'my',
  'username',
  'hardcore',
  'metallica',
  'fan',
  'were',
  'reverted',
  'they',
  'werent',
  'vandalisms',
  'just',
  'closure',
  'on',
  'some',
  'gas',
  'after',
  'i',
  'voted',
  'at',
  'new',
  'york',
  'dolls',
  'fac',
  'and',
  'please',
  'dont',
  'remove',
  'the',
  'template',
  'from',
  'the',
  'talk',
  'page',
  'since',
  'im',
  'retired',
  'now'],
 ['daww',
  'he',
  'matches',
  'this',
  'background',
  'colour',
  'im',
  'seemingly',
  'stuck',
  'with',
  'thanks',
  'talk',
  'january',
  'utc']]

In [20]:
help(Word2Vec)

Help on class Word2Vec in module gensim.models.word2vec:

class Word2Vec(gensim.models.base_any2vec.BaseWordEmbeddingsModel)
 |  Word2Vec(sentences=None, corpus_file=None, size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=0.001, seed=1, workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=<built-in function hash>, iter=5, null_word=0, trim_rule=None, sorted_vocab=1, batch_words=10000, compute_loss=False, callbacks=(), max_final_vocab=None)
 |  
 |  Train, use and evaluate neural networks described in https://code.google.com/p/word2vec/.
 |  
 |  Once you're finished training a model (=no more updates, only querying)
 |  store and use only the :class:`~gensim.models.keyedvectors.KeyedVectors` instance in `self.wv` to reduce memory.
 |  
 |  The model can be stored/loaded via its :meth:`~gensim.models.word2vec.Word2Vec.save` and
 |  :meth:`~gensim.models.word2vec.Word2Vec.load` methods.
 |  
 |  The trained word vectors can a

In [28]:
start=time.time()

In [29]:
# %%time
model = Word2Vec(sentences=comment_data, size=64, sg=0, window=5, negative=5, min_count=5, iter=5, seed=42, workers=4)

In [30]:
print('it took {} seconds'.format(time.time()-start))

it took 31.601624250411987 seconds


In [33]:
model_write_dir = 'w2v_model'
if not os.path.exists(model_write_dir):
    os.makedirs(model_write_dir)
    
model_name = 'toxic_word_model.model'
model_file_path = os.path.join(model_write_path, model_name)

In [34]:
model.save(model_file_path, separately=[], sep_limit=4294967296)

In [20]:
model.wv.most_similar('january', topn=25)

[('december', 0.955875813961029),
 ('october', 0.949402928352356),
 ('april', 0.9484162330627441),
 ('august', 0.9440429210662842),
 ('june', 0.9433221817016602),
 ('november', 0.940642237663269),
 ('july', 0.939682126045227),
 ('february', 0.9385157823562622),
 ('march', 0.9355096220970154),
 ('september', 0.9180770516395569),
 ('jan', 0.8576573729515076),
 ('oct', 0.8331053256988525),
 ('nov', 0.8313380479812622),
 ('feb', 0.8000282049179077),
 ('sep', 0.7769359946250916),
 ('jul', 0.7731983661651611),
 ('dec', 0.7691308259963989),
 ('aug', 0.7483060359954834),
 ('pm', 0.7481406927108765),
 ('sept', 0.7375351190567017),
 ('monday', 0.7247934341430664),
 ('wednesday', 0.7190449237823486),
 ('mar', 0.7176545858383179),
 ('friday', 0.7147082090377808),
 ('thursday', 0.7023900151252747)]

In [21]:
model.wv.most_similar('fuck', topn=25)

[('fucking', 0.6825143098831177),
 ('bitch', 0.6590875387191772),
 ('hell', 0.6548470258712769),
 ('cunt', 0.6499595642089844),
 ('damn', 0.6468105912208557),
 ('motherfucker', 0.6291096210479736),
 ('ass', 0.6108208298683167),
 ('motherfucking', 0.5905293226242065),
 ('jerk', 0.582370400428772),
 ('hahahah', 0.5796732902526855),
 ('piss', 0.5767008662223816),
 ('fascists', 0.5726990699768066),
 ('facists', 0.5722130537033081),
 ('goddamn', 0.5682483911514282),
 ('retard', 0.5644976496696472),
 ('screw', 0.5634350776672363),
 ('shit', 0.5631744265556335),
 ('dumbass', 0.5623525977134705),
 ('faggot', 0.5609458088874817),
 ('fuckin', 0.5602768063545227),
 ('supressing', 0.5559513568878174),
 ('twat', 0.5535650253295898),
 ('guys', 0.5514557957649231),
 ('motherfuckers', 0.543911874294281),
 ('asshole', 0.5414113998413086)]

In [22]:
model.wv.most_similar('sad', topn=25)

[('pathetic', 0.7854645252227783),
 ('funny', 0.7731180191040039),
 ('terrible', 0.7695800065994263),
 ('crazy', 0.7410961985588074),
 ('disgusting', 0.7401156425476074),
 ('cute', 0.7400022149085999),
 ('annoying', 0.7315208911895752),
 ('sick', 0.7297760248184204),
 ('stupid', 0.7281953692436218),
 ('clever', 0.7232526540756226),
 ('amazing', 0.7205700278282166),
 ('tough', 0.719685971736908),
 ('hilarious', 0.7085663080215454),
 ('lonely', 0.7005751729011536),
 ('irritating', 0.6976435780525208),
 ('wow', 0.6973145008087158),
 ('smart', 0.6950938701629639),
 ('silly', 0.6840213537216187),
 ('horrible', 0.68216872215271),
 ('dumb', 0.6801586151123047),
 ('laughable', 0.6673109531402588),
 ('brilliant', 0.6642475128173828),
 ('pity', 0.6626632213592529),
 ('frustrating', 0.6624851822853088),
 ('joke', 0.6619576215744019)]

In [23]:
model.wv.most_similar('moron', topn=25)

[('retard', 0.7224112749099731),
 ('oxymoron', 0.7155731320381165),
 ('hypocrite', 0.7099846601486206),
 ('idiot', 0.6996086239814758),
 ('douchebag', 0.6991301774978638),
 ('asshole', 0.6974294781684875),
 ('queer', 0.6960211396217346),
 ('arsehole', 0.6887933611869812),
 ('coward', 0.6876004338264465),
 ('liar', 0.6863622069358826),
 ('dumb', 0.6860212087631226),
 ('douche', 0.6813151240348816),
 ('bigot', 0.6774395704269409),
 ('pedophile', 0.6756231784820557),
 ('bastard', 0.6731732487678528),
 ('yank', 0.6701412200927734),
 ('deletionist', 0.6677676439285278),
 ('twat', 0.6671178936958313),
 ('fool', 0.6666972637176514),
 ('shill', 0.6651448011398315),
 ('loser', 0.6633561849594116),
 ('deluded', 0.6595900654792786),
 ('fucker', 0.6549838781356812),
 ('retards', 0.653609037399292),
 ('jerk', 0.6487358808517456)]

In [24]:
model.wv.most_similar('man', topn=25)

[('girl', 0.837913990020752),
 ('woman', 0.8136935234069824),
 ('kid', 0.811394453048706),
 ('boy', 0.7758640646934509),
 ('guy', 0.7588706612586975),
 ('boss', 0.6997487545013428),
 ('baby', 0.6980083584785461),
 ('chick', 0.6937598586082458),
 ('cop', 0.6896435022354126),
 ('child', 0.6792041063308716),
 ('genius', 0.6531341075897217),
 ('shot', 0.650833785533905),
 ('douche', 0.650515079498291),
 ('hero', 0.6464110612869263),
 ('jerk', 0.645368218421936),
 ('nerd', 0.6380337476730347),
 ('kettle', 0.6335887908935547),
 ('dictator', 0.6335843205451965),
 ('dying', 0.6299604773521423),
 ('mom', 0.6270955801010132),
 ('bastard', 0.6240851879119873),
 ('mother', 0.6239056587219238),
 ('gentleman', 0.6227595210075378),
 ('fruit', 0.622679591178894),
 ('loser', 0.62100750207901)]

In [25]:
model.wv.most_similar('apple', topn=25)

[('mp', 0.8115946650505066),
 ('clip', 0.8051564693450928),
 ('cafe', 0.7946009635925293),
 ('cbs', 0.7852007746696472),
 ('jacket', 0.7754191756248474),
 ('sonic', 0.7742930054664612),
 ('cd', 0.7716450095176697),
 ('mac', 0.7671593427658081),
 ('allmusic', 0.7558950185775757),
 ('cable', 0.7540907859802246),
 ('emily', 0.7538907527923584),
 ('trailer', 0.7456364631652832),
 ('store', 0.7447930574417114),
 ('producer', 0.7436193227767944),
 ('cards', 0.7423871755599976),
 ('hd', 0.740828275680542),
 ('platinum', 0.7401224970817566),
 ('sky', 0.7388432025909424),
 ('explorer', 0.7385932207107544),
 ('sony', 0.7351382970809937),
 ('soundtrack', 0.7348278760910034),
 ('nintendo', 0.7340067625045776),
 ('compilation', 0.7323013544082642),
 ('eminem', 0.7311521172523499),
 ('icon', 0.7303788661956787)]

In [27]:
model.wv.most_similar('india', topn=25)

[('pakistan', 0.8631523847579956),
 ('turkey', 0.8332419395446777),
 ('afghanistan', 0.8225812911987305),
 ('europe', 0.8163744211196899),
 ('bangladesh', 0.8153061270713806),
 ('albania', 0.8138001561164856),
 ('greece', 0.8089674711227417),
 ('ukraine', 0.8056944608688354),
 ('punjab', 0.8056362271308899),
 ('southern', 0.8035520315170288),
 ('iran', 0.8025564551353455),
 ('africa', 0.7993097901344299),
 ('bulgaria', 0.7985544800758362),
 ('western', 0.7975499629974365),
 ('japan', 0.7973318099975586),
 ('netherlands', 0.795556366443634),
 ('asia', 0.7947572469711304),
 ('egypt', 0.7900804877281189),
 ('canada', 0.7875267267227173),
 ('province', 0.7873293161392212),
 ('bosnia', 0.7843958139419556),
 ('sweden', 0.7834525108337402),
 ('spain', 0.780659556388855),
 ('region', 0.7788646221160889),
 ('thrace', 0.7762434482574463)]

In [44]:
vocab = model.wv.vocab.keys()

In [67]:
def _infer_sentence_vector(x, model):
    res = []
    x = x.split(' ')
    for x_ in x:
        if x_ in vocab:
            res.append(model.wv[x_])
    res = np.mean(res, axis=0)
    return res

In [68]:
train['clean_comment'][1]

'daww he matches this background colour im seemingly stuck with thanks talk january utc'

In [69]:
infer_sentence_vector(train['clean_comment'][1], model)

array([ 0.36160344,  0.8097772 ,  0.1470389 ,  0.21784781, -0.60439837,
       -0.40367442, -1.2552695 ,  0.08163906, -0.23760645, -0.8146129 ,
        0.46737766, -0.1110723 ,  0.4806608 ,  0.35990328,  0.06697381,
       -0.12287373, -0.25773987, -0.06202003,  0.2763552 , -1.0298489 ,
        0.11018182,  0.02407448,  0.02629358,  0.87825054, -1.0181624 ,
        1.1217054 ,  0.00189011, -0.60360044,  0.8187238 , -0.16542107,
        0.41726562, -0.4276864 , -0.2758148 , -0.44891387, -0.8496718 ,
       -0.26610214,  0.7104458 ,  0.23154587, -0.5646891 ,  1.1096057 ,
       -0.02788468, -0.52030265, -0.53515196,  0.06570676,  0.47985464,
        0.69134086, -0.01517652,  0.39734063, -0.74918586,  0.17266813,
        0.49026185, -0.04598913,  0.5647172 , -0.7259926 , -0.44719192,
       -0.03694589,  0.24585494,  0.03238839,  0.18511264, -0.3258923 ,
       -0.05498526,  0.5739397 ,  0.6492077 ,  0.2773991 ], dtype=float32)

In [73]:
train['embedding'] = train['clean_comment'].apply(infer_sentence_vector, model=model)


Mean of empty slice.



https://stackoverflow.com/questions/50492676/visualize-gensim-word2vec-embeddings-in-tensorboard-projector

In [77]:
train.shape

(159571, 10)

In [79]:
train.dropna(axis=0, inplace=True)

In [80]:
train.shape

(159553, 10)

In [133]:
col_names = np.array(['toxic','severe_toxic','obscene','threat','insult', 'identity_hate'])    
train['label'] = train.loc[:, ['toxic','severe_toxic','obscene','threat','insult', 'identity_hate']].apply(lambda x: 'safe' if np.sum(x==1) == 0 else '_'.join(col_names[x==1]), axis = 1)

In [134]:
Counter(train['label'].tolist())

Counter({'safe': 143330,
         'toxic_severe_toxic_obscene_insult': 989,
         'toxic': 5666,
         'toxic_obscene_insult_identity_hate': 618,
         'toxic_obscene_insult': 3798,
         'toxic_obscene': 1758,
         'toxic_threat': 113,
         'toxic_insult': 1215,
         'toxic_obscene_threat_insult_identity_hate': 56,
         'insult': 301,
         'obscene': 317,
         'toxic_severe_toxic_obscene': 158,
         'toxic_obscene_threat_insult': 131,
         'toxic_severe_toxic_obscene_insult_identity_hate': 265,
         'toxic_severe_toxic_obscene_threat_insult_identity_hate': 31,
         'toxic_insult_identity_hate': 134,
         'toxic_identity_hate': 136,
         'obscene_insult': 181,
         'toxic_severe_toxic_obscene_identity_hate': 6,
         'toxic_severe_toxic_obscene_threat_insult': 64,
         'identity_hate': 54,
         'toxic_obscene_identity_hate': 35,
         'threat': 22,
         'obscene_insult_identity_hate': 18,
         'toxic_

In [2]:
model = Word2Vec.load('w2v_model/toxic_word_model.model')

In [15]:
embeddings_vectors = model.wv.vectors
print(embeddings_vectors.shape)

(43179, 64)


In [16]:
embeddings_vectors = embeddings_vectors / np.sqrt(np.sum(embeddings_vectors**2, axis=1, keepdims=True))

In [17]:
vocab = model.wv.vocab.keys()
len(vocab)

43179

In [18]:
# Create some variables.
emb = tf.Variable(embeddings_vectors, name='word_embeddings')

# Add an op to initialize the variable.
init_op = tf.global_variables_initializer()

# Add ops to save and restore all the variables.
saver = tf.train.Saver()

# Later, launch the model, initialize the variables and save the
# variables to disk.
with tf.Session() as sess:
    sess.run(init_op)
    # Save the variables to disk.
    save_path = saver.save(sess, "model_dir/model.ckpt")
    print("Model saved in path: %s" % save_path)

Model saved in path: model_dir/model.ckpt


In [19]:
words = '\n'.join(vocab)
with open(os.path.join('model_dir', 'metadata.tsv'), 'w') as f:
    f.write(words)

# .tsv file written in model_dir/metadata.tsv

In [142]:
df_visualize = train.sample(frac=0.1, random_state=42)

In [143]:
df_visualize.shape

(15955, 11)

In [144]:
embeddings_vectors = np.stack(list(df_visualize['embedding'].values), axis=0)

In [149]:
embeddings_vectors.shape

(15955, 64)

In [146]:
# Create some variables.
emb = tf.Variable(embeddings_vectors, name='sentence_embeddings')

# Add an op to initialize the variable.
init_op = tf.global_variables_initializer()

# Add ops to save and restore all the variables.
saver = tf.train.Saver()

# Later, launch the model, initialize the variables and save the
# variables to disk.
with tf.Session() as sess:
    sess.run(init_op)
    # Save the variables to disk.
    save_path = saver.save(sess, "model_dir/model.ckpt")
    print("Model saved in path: %s" % save_path)

Model saved in path: model_dir/model.ckpt


In [147]:
words = '\n'.join(df_visualize['label'].tolist())
with open(os.path.join('model_dir', 'metadata.tsv'), 'w') as f:
    f.write(words)

# .tsv file written in model_dir/metadata.tsv

In [148]:
df_visualize.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,clean_comment,embedding,label
41499,6eb2262a75f486e1,Font face should be removed per WP:FONTFAMILY.,0,0,0,0,0,0,font face should be removed per wp fontfamily,"[1.2263709, -0.26516834, 1.2580038, -0.0015020...",safe
112777,5b4d7bbb89e06737,Name \n\nWhy is there a citation needed for th...,0,0,0,0,0,0,name why is there a citation needed for the gu...,"[-0.00084633095, 1.4145118, 0.48286387, -0.504...",safe
43946,754ae43b4408a6ae,"""\n\n Working It Out Tour 2010-2011 \n\nThe Wo...",0,0,0,0,0,0,working it out tour - the working it out tour ...,"[0.34258842, 0.87824535, -0.16977233, 0.397243...",safe
102202,22ed14cee7af208f,I'm not sure about a lot of the comments on th...,0,0,0,0,0,0,im not sure about a lot of the comments on thi...,"[-0.059329335, 1.0171596, 0.32411808, 0.362021...",safe
130622,bac2a3efc6b547bd,THE CHINESE MAN SUCxS PRAWN BALLS lol,1,0,0,0,0,0,the chinese man sucxs prawn balls lol,"[-0.90913755, 0.7893841, 0.3654409, -0.4070989...",toxic
