In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for root, _subdirs, files in os.walk('/kaggle/input'):
    # Print the full path of every file found below the input directory.
    for name in files:
        print(os.path.join(root, name))

# Any results you write to the current directory are saved as output.

## What we are going to do?
Pre-Processing the data, so as to make it similar to BERT Vocab.

Much of the material in this kernel is adapted from [Theo Viel](https://www.kaggle.com/theoviel)'s kernel, [Improve your Score with Text Preprocessing](https://www.kaggle.com/theoviel/improve-your-score-with-text-preprocessing-v2). <br>

**Any feedback would be greatly appreciated. Thank you**

In [None]:
import pandas as pd
import numpy as np
import operator 
import re
import gc
import keras
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_style('whitegrid')

In [None]:
# Load the three competition CSV files from the "nlp-getting-started" input directory.
# No index column is specified, so each frame gets pandas' default row index.
sample_submission = pd.read_csv("../input/nlp-getting-started/sample_submission.csv")
test = pd.read_csv("../input/nlp-getting-started/test.csv")
train = pd.read_csv("../input/nlp-getting-started/train.csv")

In [None]:
# Stack train and test into one frame so the vocabulary statistics below cover
# every text row. ignore_index=True gives the combined frame a fresh, unique
# 0..n-1 index; without it both frames' default indices (each starting at 0)
# would be kept and row labels would be duplicated.
df = pd.concat([train, test], ignore_index=True)

In [None]:
# Number of rows in the combined train + test frame.
print("Total number of examples: ", len(df))

## Importing BERT Vocab.

In [None]:
# NOTE(review): unpinned install — consider `%pip install pytorch-pretrained-bert==<version>`
# so the install targets the kernel's environment and the notebook stays reproducible.
!pip install pytorch-pretrained-bert

In [None]:
import torch
from pytorch_pretrained_bert import BertTokenizer

# Load the pre-trained tokenizer (and its vocabulary) for 'bert-base-uncased'.
# NOTE(review): presumably fetches the vocabulary file on first use — confirm network access is enabled.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# Dump the BERT vocabulary to "vocabulary.txt", one token per line.
# The encoding is pinned to UTF-8 so the write does not depend on the platform
# default encoding, which can raise UnicodeEncodeError for non-ASCII tokens.
# Tokens are written verbatim; nothing is escaped.
with open("vocabulary.txt", 'w', encoding='utf-8') as f:

    # For each token...
    for token in tokenizer.vocab.keys():

        # ...write it on its own line.
        f.write(token + '\n')

In [None]:
def build_vocab(texts):
    """Build the vocabulary of our dataset.

    Each entry of ``texts`` (a pandas Series of strings) is split on
    whitespace and every resulting word is counted.

    Returns a dict mapping word -> number of occurrences, with keys in
    order of first appearance.
    """
    word_counts = {}
    for tokens in texts.apply(lambda text: text.split()).values:
        for token in tokens:
            word_counts[token] = word_counts.get(token, 0) + 1
    return word_counts

In [None]:
vocab = build_vocab(df['text'])

In [None]:
list(vocab.keys())[:10]

In [None]:
def check_coverage(vocab, embeddings_index):
    """Check how much of our vocab is covered by ``embeddings_index``.

    Parameters
    ----------
    vocab : dict
        Maps word -> occurrence count (as returned by ``build_vocab``).
    embeddings_index : mapping
        Looked up with ``embeddings_index[word]``; a ``KeyError`` marks the
        word as unknown.

    Prints the share of distinct words found and the share of all word
    occurrences found (both reported as 0% when ``vocab`` is empty).

    Returns
    -------
    list of (word, count) tuples for the unknown words, highest count first.
    """
    known_words = {}
    unknown_words = {}
    nb_known_words = 0
    nb_unknown_words = 0
    for word, count in vocab.items():
        try:
            known_words[word] = embeddings_index[word]
            nb_known_words += count
        except KeyError:
            # Only a missing key means "out of vocabulary"; any other error
            # is a real problem and is allowed to propagate.
            unknown_words[word] = count
            nb_unknown_words += count

    # Guard both ratios so an empty vocab does not raise ZeroDivisionError.
    total_occurrences = nb_known_words + nb_unknown_words
    vocab_ratio = len(known_words) / len(vocab) if vocab else 0.0
    text_ratio = nb_known_words / total_occurrences if total_occurrences else 0.0

    print('Found embeddings for {:.3%} of vocab'.format(vocab_ratio))
    print('Found embeddings for  {:.3%} of all text'.format(text_ratio))

    # Ascending stable sort followed by a reversal (rather than reverse=True)
    # keeps the original ordering among words with equal counts.
    unknown_words = sorted(unknown_words.items(), key=operator.itemgetter(1))[::-1]

    return unknown_words

In [None]:
print("BERT")
oov_bert = check_coverage(vocab, tokenizer.vocab)

In [None]:
oov_bert[:10]

In [None]:
# Membership test instead of tokenizer.vocab["I"]: indexing a token that is not
# in the vocab raises KeyError, which would stop a Restart & Run All at this cell.
"I" in tokenizer.vocab

In [None]:
tokenizer.vocab["i"]

The example above shows that the word **I** is not in the vocab, while the word **i** is. This is because we loaded the uncased model, `bert-base-uncased`. Let's lowercase the text.

## Lowering the text

In [None]:
df['lowered_text'] = df['text'].apply(lambda x: x.lower())

In [None]:
vocab_lower = build_vocab(df['lowered_text'])
print("BERT EMBEDDINGS")
oov_bert = check_coverage(vocab_lower, tokenizer.vocab)

That's a significant amount of improvement. 

Now let us check what's missing in our vocab. 

In [None]:
oov_bert[:25]

First Faults appearing are: 
* Contractions 
* Punctuations
* Some words like **\x89ûò**
> Let's correct that.

## Dealing with Contractions

In [None]:
contraction_mapping = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", 
"we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have" }

In [None]:
def known_contractions(embed, mapping=None):
    """Return the contractions that are present in ``embed``.

    Parameters
    ----------
    embed : container
        Anything supporting ``in`` (e.g. the tokenizer vocabulary).
    mapping : iterable of str, optional
        Contractions to look up (a dict keyed by contraction works).
        Defaults to the notebook-level ``contraction_mapping``.

    Returns
    -------
    list of the contractions found in ``embed``, in ``mapping``'s iteration order.
    """
    if mapping is None:
        # Backward-compatible default: the notebook's global mapping.
        mapping = contraction_mapping
    return [contract for contract in mapping if contract in embed]

In [None]:
print("- Known Contractions -")
print("   BERT :")
print(known_contractions(tokenizer.vocab))

Oh no! Contractions don't exist in the BERT embeddings. <br>
This is a big problem for us. Let's fix it!

In [None]:
def clean_contractions(text, mapping):
    """Expand contractions in ``text`` using ``mapping``.

    Apostrophe look-alikes (’ ‘ ´ `) are first normalised to a plain ``'``.
    The text is then cut at whitespace and every piece that is a key of
    ``mapping`` is replaced by its value. All original whitespace is kept.

    Parameters
    ----------
    text : str
    mapping : dict mapping contraction -> expansion

    Returns
    -------
    str with the matching whitespace-delimited tokens expanded.
    """
    for quote in ("’", "‘", "´", "`"):
        text = text.replace(quote, "'")
    # Split on runs of any whitespace, not just a single space, so that a
    # contraction next to a newline or tab is still seen as its own token.
    # The capturing group keeps the separators, so joining with '' restores
    # the original spacing exactly.
    pieces = re.split(r'(\s+)', text)
    return ''.join(mapping[piece] if piece in mapping else piece for piece in pieces)

In [None]:
df['treated_text'] = df['lowered_text'].apply(lambda x: clean_contractions(x, contraction_mapping))

In [None]:
vocab = build_vocab(df['treated_text'])
print("BERT : ")
oov_bert = check_coverage(vocab, tokenizer.vocab)

## Now let us clean the special Characters.

In [None]:
# Punctuation and symbol characters to check against the BERT vocab and to pad
# with spaces later. '\\×' spells the backslash explicitly: the previous '\×'
# was an invalid escape sequence (it triggers a warning and is slated to become
# an error) that produced the same two characters, so the value is unchanged.
punct = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\\×™√²—–&'

In [None]:
def unknown_punct(embed, punct):
    """Return the characters of ``punct`` that are missing from ``embed``.

    The result is a single string in which every missing character is
    followed by one space (so it ends with a space when non-empty).
    """
    missing = [symbol for symbol in punct if symbol not in embed]
    return ''.join(symbol + ' ' for symbol in missing)

In [None]:
print("BERT :")
print(unknown_punct(tokenizer.vocab, punct))

That's great: only 1 punctuation mark from our set is unknown.

In [None]:
punct_mapping = { 'à': 'a'}

In [None]:
oov_bert[:10]

Most of the work will be done by the tokenizer, which breaks words into pieces. For example, thunderstorm can be broken up as thunder + storm.

In [None]:
tokenizer.vocab["amp"]

In [None]:
print(tokenizer.tokenize("thunderstorm"))
print(tokenizer.tokenize("11-year-old"))
print(tokenizer.tokenize("@youtube"))
print(tokenizer.tokenize("\x89û_"))

Now let us delete some words like \x89...

In [None]:
# Keep every out-of-vocabulary (word, count) entry whose word starts with "\x89".
bad_words = [entry for entry in oov_bert if entry[0][0] == "\x89"]

In [None]:
# Map each bad word to the empty string so it can be stripped from the text.
bad_dict = {entry[0]: "" for entry in bad_words}

In [None]:
def clean_special_chars(text, punct, mapping, specials=None):
    """Normalise special characters in ``text``.

    Steps, in order:
      1. replace every key of ``mapping`` by its value;
      2. surround every character of ``punct`` with spaces;
      3. replace every key of ``specials`` by its value.

    Parameters
    ----------
    text : str
    punct : iterable of single characters to pad with spaces
    mapping : dict mapping substring -> replacement
    specials : dict mapping substring -> replacement, optional
        Defaults to the notebook-level ``bad_dict``, so existing
        three-argument calls behave as before.

    Returns
    -------
    str, the cleaned text.
    """
    if specials is None:
        # Backward-compatible default: the notebook's global bad-word table.
        specials = bad_dict

    for p in mapping:
        text = text.replace(p, mapping[p])

    for p in punct:
        text = text.replace(p, f' {p} ')

    # NOTE(review): this runs after the punctuation padding above, so a key of
    # `specials` that itself contains a character from `punct` can no longer
    # match (the padding has already split it with spaces). The order is kept
    # as-is to preserve existing output.
    for s in specials:
        text = text.replace(s, specials[s])

    return text

In [None]:
df['treated_text'] = df['treated_text'].apply(lambda x: clean_special_chars(x, punct, punct_mapping))

In [None]:
vocab = build_vocab(df['treated_text'])
print("BERT : ")
oov_bert = check_coverage(vocab, tokenizer.vocab)

Again, that's a great improvement.

In [None]:
oov_bert[:25]

For those top 25 words let us do something.

In [None]:
# Show how the tokenizer splits each of the 25 most frequent unknown words.
for word, _count in oov_bert[:25]:
    print(tokenizer.tokenize(word))

As we can see above, some of them are understood by the tokenizer; for the rest, let us explicitly make a mapping dictionary.

In [None]:
# Hand-written replacements for frequent out-of-vocabulary strings.
# These are applied by explicit_changes() with plain substring replacement, in
# dict order, so an earlier entry can feed a later one: "obliterate" -> "destroy"
# turns "obliterated" into "destroyd", which the final entry repairs to "destroyed".
explicit_mapping = {"\x89û": "", "mh370" : "flight", "legionnaires": "pneumonia", 
                   "derailment": "railway accident", "inundated": "flood", "deluged": "flood", 
                   "curfew": "stay at home","obliteration": "destruction", 
                   "quarantine": "prevent the spread of disease", "lol": "laugh", 
                   "obliterate": "destroy", "hijacking": "seize", "detonation": "explosion", 
                   "electrocuted": "killed", "destroyd": "destroyed"}

In [None]:
def explicit_changes(text, mapping):
    """Apply every ``old -> new`` pair of ``mapping`` to ``text``.

    Replacements are plain substring replacements performed one after the
    other in the mapping's iteration order, so later pairs see the result
    of earlier ones. Returns the rewritten string.
    """
    for old, new in mapping.items():
        text = text.replace(old, new)
    return text

In [None]:
df['treated_text'] = df['treated_text'].apply(lambda x: explicit_changes(x, explicit_mapping))

In [None]:
vocab = build_vocab(df['treated_text'])
print("BERT : ")
oov_bert = check_coverage(vocab, tokenizer.vocab)

In [None]:
oov_bert[:20]

Now we have done most of our job in preprocessing our data.

In [None]:
# Full preprocessing of the training text in a single pass per row:
#   lower-case -> clean contractions -> clean special chars -> explicit word fixes.
# (Cleaning special chars is optional, as most of the punct. are in BERT embed.)
train['treated_text'] = train['text'].apply(
    lambda raw: explicit_changes(
        clean_special_chars(
            clean_contractions(raw.lower(), contraction_mapping),
            punct,
            punct_mapping,
        ),
        explicit_mapping,
    )
)

In [None]:
train.head()

In [None]:
# Same preprocessing as for the training set, applied to the test text:
#   lower-case -> clean contractions -> clean special chars -> explicit word fixes.
test['treated_text'] = test['text'].apply(
    lambda raw: explicit_changes(
        clean_special_chars(
            clean_contractions(raw.lower(), contraction_mapping),
            punct,
            punct_mapping,
        ),
        explicit_mapping,
    )
)

In [None]:
test.head()

In [None]:
# Saving our work.
# index=False: the frames only carry pandas' default row index, so writing it
# would just add a redundant unnamed first column to each CSV.
train.to_csv("train_BERT_preprocessed.csv", index=False)
test.to_csv("test_BERT_preprocessed.csv", index=False)

<h2 style = "color:red" >Please Upvote, if you like this kernel.</h2>

**I will be adding more stuff soon!** <br>
So stay in touch.