I had some issues to load certain packages in the cloud instance I had, so this notebook will save the embeddings in a separate file, which will be loaded to the cloud instance to train the network

# Import and load dataset

In [1]:
import pandas as pd
import numpy as np
np.random.seed(42)
from sklearn.model_selection import train_test_split
from keras.preprocessing import text, sequence
from keras.callbacks import Callback
import os
import gensim
import pickle
import matplotlib.pyplot as plt

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
#This dataset contains new features, ascii text converted to unicode and standardized
nrows= 5000
train = pd.read_csv('Data/train_processed_stage1.csv')
print('Train dataset contains {} rows and {} columns'.format(*train.shape))

Train dataset contains 159571 rows and 39 columns


In [3]:
class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
meta_features_cols = ['total_length', 'capitals', 'caps_vs_length', 'num_exclamation_marks', 'num_question_marks', \
 'num_punctuation', 'num_symbols', 'num_words', 'num_unique_words', 'words_vs_unique', 'num_smilies', 'ant_slash_n', \
 'nb_fk', 'nb_sk', 'nb_dk', 'nb_you', 'nb_ng', 'nb_mother', 'start_with_columns', 'has_timestamp', 'has_date_long', \
 'has_date_short', 'has_http', 'has_mail', 'has_image', 'has_ip', 'has_emphasize_equal', 'has_emphasize_quotes', \
 'has_star', 'unknown_fasttext']

X_clean_text_df = train[['id','clean_text']].fillna('something')
X_meta_feats_df = train[meta_features_cols].fillna('something')
y = train[class_names]
print ('X_clean_text_df contains {} rows and {} columns'.format(*X_clean_text_df.shape))
print ('X_meta_feats_df contains {} rows and {} columns'.format(*X_meta_feats_df.shape))
print ('y contains {} rows and {} columns'.format(*y.shape))

X_clean_text_df contains 159571 rows and 2 columns
X_meta_feats_df contains 159571 rows and 30 columns
y contains 159571 rows and 6 columns


In [4]:
X_clean_text_df.head()

Unnamed: 0,id,clean_text
0,0000997932d777bf,explanation why the edits made under my userna...
1,000103f0d9cfb60f,d aww ! he matches this background colour i am...
2,000113f07ec002fd,hey man i am really not trying to edit war it ...
3,0001b41b1c6bb37e,more i can not make any real suggestions on im...
4,0001d958c54c6e35,you sir are my hero any chance you remember wh...


In [5]:
X_meta_feats_df.head()

Unnamed: 0,total_length,capitals,caps_vs_length,num_exclamation_marks,num_question_marks,num_punctuation,num_symbols,num_words,num_unique_words,words_vs_unique,...,has_date_long,has_date_short,has_http,has_mail,has_image,has_ip,has_emphasize_equal,has_emphasize_quotes,has_star,unknown_fasttext
0,-0.220195,-0.001654,0.137572,-0.025546,0.346388,-0.154587,-0.107151,-0.244618,-0.130379,0.768235,...,-0.081772,-0.112575,-0.129103,-0.028225,-0.008183,1.340625,-0.044091,-0.00468,-0.051793,-0.20777
1,-0.477509,-0.094891,0.213346,0.013065,-0.282584,-0.232894,-0.107151,-0.506634,-0.571261,1.131109,...,-0.081772,-0.112575,-0.129103,-0.028225,-0.008183,-0.172487,-0.044091,-0.00468,-0.051793,-0.20777
2,-0.272673,-0.13633,-0.371132,-0.025546,-0.282584,-0.3112,-0.107151,-0.254695,-0.167119,0.573838,...,-0.081772,-0.112575,-0.129103,-0.028225,-0.008183,-0.172487,-0.044091,-0.00468,-0.051793,-0.20777
3,0.385847,-0.063812,-0.365558,-0.025546,-0.282584,-0.154587,-0.107151,0.460811,0.622796,-1.009204,...,-0.081772,-0.112575,-0.129103,-0.028225,-0.008183,-0.172487,-0.044091,-0.00468,-0.051793,-0.20777
4,-0.553687,-0.157049,-0.234513,-0.025546,0.346388,-0.389507,-0.107151,-0.546945,-0.644741,1.131109,...,-0.081772,-0.112575,-0.129103,-0.028225,-0.008183,-0.172487,-0.044091,-0.00468,-0.051793,-0.20777


In [6]:
y.head(8)

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0,0,0,0,0,0
1,0,0,0,0,0,0
2,0,0,0,0,0,0
3,0,0,0,0,0,0
4,0,0,0,0,0,0
5,0,0,0,0,0,0
6,1,1,1,0,1,0
7,0,0,0,0,0,0


# Convert clean_text to vectors/tokenize

In [7]:
#This allows to vectorize a text corpus, by turning each text into a sequence of integers 
# src_vocab_size = 10000
src_vocab_size = 283759
def create_tokenizer():
    #consider only the top 10000 words in the dataset
    tokenizer = text.Tokenizer(num_words=src_vocab_size)
    tokenizer.fit_on_texts(list(X_clean_text_df['clean_text']))
    return tokenizer

def find_max_len(percentile=0.9):
    #since each of the reviews can be of varying length. we will convert them to be of same length by padding.
    #lets find out what should be the length of each reviews
    #find the length
    X_clean_text_df['text_length'] = X_clean_text_df['clean_text'].apply(len)

    # use the 90th percentile to pad sequences
    maxlen = int(X_clean_text_df.text_length.quantile(percentile)) # 90th percentile
    return maxlen

# encode and pad sequences
def encode_sequences(tokenizer, length):
    #map words to integers as needed for modelling
    X_train_sequence = tokenizer.texts_to_sequences(X_clean_text_df['clean_text'])

    # pad sequences with 0 values
    #it will take each review and make them to be of 836 words.
    #if the review is less than 836 words it will pad zeroes at the start and append the reviews to the end.
    #if the review is more than 836 words, the words beyond 600 will be truncated
    X_train_padded_seq = sequence.pad_sequences(X_train_sequence, maxlen=length)
    return X_train_padded_seq

In [9]:
tokenizer = create_tokenizer()
print('Number of unique tokens created from the training dataset is {}'.format(len(tokenizer.word_index)))

maxlen = find_max_len(0.91)
print('Sequences will be of length - {}'.format(maxlen))

X_train_padded_seq = encode_sequences(tokenizer, maxlen)

Number of unique tokens created from the training dataset is 169494
Sequences will be of length - 913


In [10]:
X_train_padded_seq[0:2]

array([[   0,    0,    0, ...,   41, 3220,   88],
       [   0,    0,    0, ...,   37,  934,  179]])

# Preprocessing
Build an embedding matrix that can be loaded into an Embedding layer. It must be a matrix of shape (src_vocab_size, embedding_dim)

In [11]:
#understand what the file contains
EMBEDDING_FILE_FASTTEXT="Data/embeddings/crawl-300d-2M.vec"
def get_coefs(word, *arr): 
    return word, np.asarray(arr, dtype='float32')
cnt = 1
for o in open(EMBEDDING_FILE_FASTTEXT,encoding='utf-8'):
    if cnt > 4:
        print(o)
        print(get_coefs(*o.rstrip().rsplit(' ')))
    cnt += 1
    if cnt > 6: break

and -0.1081 0.0191 0.0354 0.0127 0.0664 -0.0126 -0.1882 0.0631 -0.2306 0.0095 0.0917 0.1513 0.0558 -0.0643 -0.0288 -0.0447 0.1603 0.0613 0.0349 -0.0578 0.0003 -0.1399 0.0163 -0.0419 -0.0487 0.0057 0.0017 -0.0268 -0.0170 0.1045 0.0815 -0.0060 0.0635 -0.1710 0.0276 -0.0230 0.0824 0.0304 -0.1595 0.0851 -0.0556 0.0329 -0.0921 0.0196 -0.0404 0.0361 -0.0843 0.1014 -0.0393 -0.1805 -0.0073 0.2247 -0.0621 -0.0575 -0.0316 0.0198 0.0602 0.1312 -0.1278 0.0177 -0.0600 -0.2904 -0.0465 -0.1078 -0.0701 -0.0497 -0.1102 0.0156 -0.0595 0.0762 0.2638 0.0353 0.0745 0.0361 -0.0561 -0.1179 -0.4751 0.0458 -0.0672 -0.0022 0.9264 0.0101 -0.1085 0.0669 -0.0417 -0.0428 0.0391 -0.0581 0.0107 -0.0873 -0.0016 0.5711 0.0345 0.0930 -0.0647 -0.0820 0.0587 -0.1307 0.0480 0.0226 0.0115 0.0296 -0.1675 0.0399 0.2511 0.0423 0.1299 -0.0336 -0.0977 0.0146 0.3527 0.0122 0.1031 -0.0657 -0.3704 0.0445 0.1242 -0.0031 0.0255 -0.0109 0.1575 0.0288 0.2549 -0.0280 0.0462 -0.0282 -0.0154 0.0405 0.3191 0.0113 -0.0136 0.0182 -0.0385 -0.

In [11]:
def get_coefs(word, *arr): 
    return word, np.asarray(arr, dtype='float32')

def create_embeddings(source_file):
    embeddings = dict(get_coefs(*o.rstrip().rsplit(' ')) for o in open(source_file,encoding='utf-8'))
    return embeddings

def save_embeddings_to_file(source_file, target_file_name):
    with open(target_file_name, 'wb') as handle:
        pickle.dump(source_file, handle)

In [12]:
def load_embeddings_from_pickle_file():
    if os.path.isfile(EMBEDDING_DICT_FASTTEXT):
        #load file
        with open(EMBEDDING_DICT_FASTTEXT, 'rb') as handle:
            embeddings_index_ft = pickle.load(handle)
    else:
        embeddings_index_ft = create_embeddings(EMBEDDING_FILE_FASTTEXT)
        save_embeddings_to_file(embeddings_index_ft, EMBEDDING_DICT_FASTTEXT)
        
    if os.path.isfile(EMBEDDING_DICT_TWITTER):
        #load file
        with open(EMBEDDING_DICT_TWITTER, 'rb') as handle:
            embeddings_index_tw = pickle.load(handle)
    else:
        embeddings_index_tw = create_embeddings(EMBEDDING_FILE_TWITTER)
        save_embeddings_to_file(embeddings_index_tw, EMBEDDING_DICT_TWITTER)
    return embeddings_index_ft, embeddings_index_tw

In [13]:
# Load the FastText Web Crawl vectors
# The emebedding file contains words and its corresponding vectors. 
# For instance the word and will be stored in embedding file as
# (and -0.1081 0.0191 0.0354 0.0127 0.0664 -0.0126 -0.1882 0.0631 -0.2306 0.0095 0.0917 0.1513)
# we will create a dictionary where the word will be the key and the vectors will be an array
EMBEDDING_FILE_FASTTEXT="Data/embeddings/crawl-300d-2M.vec"
EMBEDDING_FILE_TWITTER="Data/embeddings/glove.twitter.27B.200d.txt"

EMBEDDING_DICT_FASTTEXT="Data/embeddings/embeddings_index_ft.pickle"
EMBEDDING_DICT_TWITTER="Data/embeddings/embeddings_index_tw.pickle"

# load the fasttext and twitter word embeddings file into memory as a dictionary of word to embedding array.
#This process is slow so it is better to save the files
embeddings_index_ft, embeddings_index_tw = load_embeddings_from_pickle_file()
print('Loaded {} word vectors from fasttext embeddings.'.format(len(embeddings_index_ft)))
print('Loaded {} word vectors from twitter embeddings.'.format(len(embeddings_index_tw)))

Loaded 2000000 word vectors from fasttext embeddings.
Loaded 1193514 word vectors from twitter embeddings.


In [15]:
#load spelling model. This will used to correct words that are not in the fasttext embeddings. 
import datetime
currentDT = datetime.datetime.now()
print('Start time is {}'.format(currentDT.strftime("%I:%M:%S %p")))

spell_model = gensim.models.KeyedVectors.load_word2vec_format(EMBEDDING_FILE_FASTTEXT)

currentDT = datetime.datetime.now()
print('End time is {}'.format(currentDT.strftime("%I:%M:%S %p")))

Start time is 11:41:57 AM
End time is 11:51:52 AM


In [16]:
type(spell_model)

gensim.models.keyedvectors.Word2VecKeyedVectors

In [17]:
# This code is  based on: Spellchecker using Word2vec by CPMP
# https://www.kaggle.com/cpmpml/spell-checker-using-word2vec

#get the list of words
words = spell_model.index2word

w_rank = {}
for i,word in enumerate(words):
    w_rank[word] = i

#word and an index as its value
WORDS = w_rank

# Use fast text as vocabulary
def words(text): return re.findall(r'\w+', text.lower())

def P(word): 
    "Probability of `word`."
    # use inverse of rank as proxy
    # returns 0 if the word isn't in the dictionary
    return - WORDS.get(word, 0)

def correction(word): 
    "Most probable spelling correction for word."
    return max(candidates(word), key=P)

def candidates(word): 
    "Generate possible spelling corrections for word."
    return (known([word]) or known(edits1(word)) or known(edits2(word)) or [word])

def known(words): 
    "The subset of `words` that appear in the dictionary of WORDS."
    return set(w for w in words if w in WORDS)

def edits1(word):
    "All edits that are one edit away from `word`."
    letters    = 'abcdefghijklmnopqrstuvwxyz'
    splits     = [(word[:i], word[i:])    for i in range(len(word) + 1)]
    deletes    = [L + R[1:]               for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]
    replaces   = [L + c + R[1:]           for L, R in splits if R for c in letters]
    inserts    = [L + c + R               for L, R in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)

def edits2(word): 
    "All edits that are two edits away from `word`."
    return (e2 for e1 in edits1(word) for e2 in edits1(e1))

def singlify(word):
    return "".join([letter for i,letter in enumerate(word) if i == 0 or letter != word[i-1]])

In [None]:
#test
word_index = tokenizer.word_index
i = 1
for k,v in word_index.items():
    print(k,v)
    i +=1
    if i > 3: break

In [18]:
len(tokenizer.word_index), src_vocab_size

(169494, 283759)

In [19]:
import datetime
currentDT = datetime.datetime.now()
print('Start time is {}'.format(currentDT.strftime("%I:%M:%S %p")))


#this contain the word and its index, from the training dataset such as
# the 1
# to 2
# i 3
word_index = tokenizer.word_index

#len(word_index) is 22966 and src_vocab_size is 10000
nb_words = min(src_vocab_size, len(word_index)) + 1


# we need to create a matrix of one embedding for each word in the training dataset. We can do that by enumerating 
# all unique words in the word_index and locating the embedding weight vector from the loaded fasttext embedding. 
# The result is a matrix of weights only for words we will see during training.
# if the word is not found in the fasttext embedding we will try to correct the spelling and check
embedding_matrix = np.zeros((nb_words, 501))

#get embeddings for the word 'something' from twitter 
something_tw = embeddings_index_tw.get("something")

#get embeddings for the word 'something' from fasttext 
something_ft = embeddings_index_ft.get("something")

something = np.zeros((501,))
something[:300,] = something_ft
something[300:500,] = something_tw
something[500,] = 0

def all_caps(word):
    return len(word) > 1 and word.isupper()

def embed_word(embedding_matrix, i, word):
    embedding_vector_ft = embeddings_index_ft.get(word)
    if embedding_vector_ft is not None: 
        if all_caps(word):
            last_value = np.array([1])
        else:
            last_value = np.array([0])
        embedding_matrix[i,:300] = embedding_vector_ft
        embedding_matrix[i,500] = last_value
        embedding_vector_tw = embeddings_index_tw.get(word)
        if embedding_vector_tw is not None:
            embedding_matrix[i,300:500] = embedding_vector_tw

            
# Fasttext vector is used by itself if there is no glove vector but not the other way around.
ctr = len(word_index)
for word, i in word_index.items():
    
    if i >= src_vocab_size: continue
    
    if (ctr % 25000==0): print('{} more to go'.format(ctr))
    ctr -= 1
    
    #is the word available in fasttext, create embeddings from fasttext vectors
    #if not try to correct the word
    if embeddings_index_ft.get(word) is not None:
        embed_word(embedding_matrix,i,word)
    else:
        # change to > 20 for better score.
        if len(word) > 20:
            #use the something vectors
            embedding_matrix[i] = something
        else:
            word2 = correction(word)
            if embeddings_index_ft.get(word2) is not None:
                embed_word(embedding_matrix,i,word2)
            else:
                word2 = correction(singlify(word))
                if embeddings_index_ft.get(word2) is not None:
                    embed_word(embedding_matrix,i,word2)
                else:
                    embedding_matrix[i] = something   

currentDT = datetime.datetime.now()
print('End time is {}'.format(currentDT.strftime("%I:%M:%S %p")))

Start time is 12:04:31 PM
150000 more to go
125000 more to go
100000 more to go
75000 more to go
50000 more to go
25000 more to go
End time is 02:21:47 PM


In [21]:
currentDT = datetime.datetime.now()
print('Start time is {}'.format(currentDT.strftime("%I:%M:%S %p")))

target_file_name = "Data/embeddings/embeddings_matrix.pickle"
source_file = embedding_matrix

with open(target_file_name, 'wb') as handle:
    pickle.dump(source_file, handle)

currentDT = datetime.datetime.now()
print('End time is {}'.format(currentDT.strftime("%I:%M:%S %p")))

Start time is 02:42:56 PM
End time is 02:43:03 PM
