## Importing Packages

In [1]:
import pandas as pd
import re
from gensim.models import KeyedVectors
from tqdm import tqdm
import operator 
tqdm.pandas()

## Loading Data

In [2]:
# Read the train and test splits from their relative paths under Data/.
train_data = pd.read_csv('Data/quora-insincere-questions-classification/train.csv')
test_data = pd.read_csv('Data/quora-insincere-questions-classification/test.csv')

# Print (rows, columns) for each split, train first.
for split in (train_data, test_data):
    print(split.shape)

(1306122, 3)
(375806, 2)


In [3]:
def build_vocab(sentences, verbose=True):
    """
    Count how often each word occurs across all sentences.

    :param sentences: iterable of word lists (one list per sentence)
    :param verbose: when False, the tqdm progress bar is disabled
    :return: dict mapping each word to its number of occurrences
    """
    counts = {}
    progress = tqdm(sentences, disable=not verbose)
    for tokens in progress:
        for token in tokens:
            # First sighting starts at 0, then every sighting adds one.
            counts[token] = counts.get(token, 0) + 1
    return counts

# Split every question on whitespace and count token frequencies.
sentences = train_data["question_text"].progress_apply(str.split).values
vocab = build_vocab(sentences)
# Preview the first five vocabulary entries (in insertion order).
print(dict(list(vocab.items())[:5]))

100%|██████████| 1306122/1306122 [00:06<00:00, 212657.95it/s]
100%|██████████| 1306122/1306122 [00:05<00:00, 231014.70it/s]

{'How': 261930, 'did': 33489, 'Quebec': 97, 'nationalists': 91, 'see': 9003}





In [4]:
# Path to the GoogleNews word2vec vectors file (a .bin file, read in binary mode below).
news_path = 'Data/quora-insincere-questions-classification/embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin'
# Load the vectors through gensim's KeyedVectors; binary=True selects the binary word2vec format.
embeddings_index = KeyedVectors.load_word2vec_format(news_path, binary=True)

In [5]:
def check_coverage(vocab,embeddings_index):
    """
    Report how much of ``vocab`` has an entry in ``embeddings_index``.

    Prints two percentages: the share of distinct words that have an
    embedding, and the share of all word occurrences (weighted by count)
    that have one. Both are reported as 0.00% for an empty vocabulary.

    :param vocab: dict mapping word -> occurrence count
    :param embeddings_index: object supporting ``embeddings_index[word]``
        that raises ``KeyError`` for unknown words
    :return: list of ``(word, count)`` tuples for the words without an
        embedding, ordered by count from highest to lowest
    """
    no_embeddings = {}
    covered_words = 0   # distinct words that have an embedding
    covered_count = 0   # occurrences of words that have an embedding
    missing_count = 0   # occurrences of words that have none
    for word in tqdm(vocab):
        try:
            # Only the lookup's success matters; the vector itself is not kept.
            embeddings_index[word]
        except KeyError:
            # Catch only "word not found"; any other error must surface.
            no_embeddings[word] = vocab[word]
            missing_count += vocab[word]
        else:
            covered_words += 1
            covered_count += vocab[word]

    total_count = covered_count + missing_count
    # Guard both ratios so an empty vocabulary does not divide by zero.
    vocab_ratio = covered_words / len(vocab) if vocab else 0.0
    text_ratio = covered_count / total_count if total_count else 0.0

    print('Found embeddings for {:.2%} of vocab'.format(vocab_ratio))
    print('Found embeddings for  {:.2%} of all text'.format(text_ratio))
    # Ascending sort followed by a reversal, kept so ties order exactly as before.
    sorted_x = sorted(no_embeddings.items(), key=operator.itemgetter(1))[::-1]

    return sorted_x

# Measure coverage of the uncleaned vocabulary; keeps the uncovered words, most frequent first.
no_embed_words = check_coverage(vocab,embeddings_index)

100%|██████████| 508823/508823 [00:01<00:00, 262272.22it/s]


Found embeddings for 24.31% of vocab
Found embeddings for  78.75% of all text


In [6]:
# Ten most frequent words that have no embedding.
no_embed_words[0:10]

[('to', 403183),
 ('a', 402682),
 ('of', 330825),
 ('and', 251973),
 ('India?', 16384),
 ('it?', 12900),
 ('do?', 8753),
 ('life?', 7753),
 ('you?', 6295),
 ('me?', 6202)]

In [7]:
# Characters deleted outright by clean_text. '-' and '/' are not listed here
# because they are mapped to a space instead; the backslash is listed once.
_CLEAN_TEXT_DROP = '?!.,"#$%\'()*+:;<=>@[\\]^_`{|}~' + '“”’' + '∞θ÷α•à−β∅³π‘₹´°£€×™√²'

# Single-pass translation table: drop the characters above, turn '/' and '-'
# into a space, and pad '&' with spaces so it becomes its own token.
_CLEAN_TEXT_TABLE = str.maketrans({
    **{char: None for char in _CLEAN_TEXT_DROP},
    '/': ' ',
    '-': ' ',
    '&': ' & ',
})


def clean_text(x):
    """
    Strip or normalise punctuation in a piece of text.

    '/' and '-' become a single space, '&' is surrounded by spaces, and every
    character in ``_CLEAN_TEXT_DROP`` is removed. All other characters are
    kept unchanged.

    :param x: value to clean; it is converted with ``str(x)`` first
    :return: the cleaned string
    """
    return str(x).translate(_CLEAN_TEXT_TABLE)

# Clean the punctuation in every question, then re-tokenise and recount.
train_data["question_text"] = train_data["question_text"].progress_apply(clean_text)
sentences = train_data["question_text"].apply(str.split)
vocab = build_vocab(sentences)

100%|██████████| 1306122/1306122 [00:22<00:00, 58156.27it/s]
100%|██████████| 1306122/1306122 [00:05<00:00, 236508.37it/s]


In [8]:
# Re-measure coverage on the vocabulary built after punctuation cleaning.
no_embed_words = check_coverage(vocab,embeddings_index)

100%|██████████| 257503/257503 [00:00<00:00, 268226.82it/s]

Found embeddings for 56.92% of vocab
Found embeddings for  89.77% of all text





Nice! We were able to increase the share of the vocabulary that has an embedding from 24% to 57% just by handling punctuation. OK, let's check on those OOV (out-of-vocabulary) words.

In [9]:
# Twenty most frequent words that still have no embedding.
no_embed_words[0:20]

[('to', 406301),
 ('a', 403830),
 ('of', 332964),
 ('and', 254077),
 ('2017', 8771),
 ('2018', 7364),
 ('doesnt', 6786),
 ('10', 6612),
 ('didnt', 3886),
 ('12', 3698),
 ('100', 2897),
 ('20', 2885),
 ('isnt', 2793),
 ('15', 2766),
 ('12th', 2551),
 ('11', 2314),
 ('30', 2131),
 ('18', 2046),
 ('50', 1989),
 ('16', 1582)]

In [10]:
## deal with numbers
def clean_numbers(x):
    """
    Mask multi-digit numbers in ``x`` with runs of '#'.

    A 'th' that directly follows two digits is removed first. Digit runs of
    five or more then become '#####', and remaining groups of four, three and
    two digits become '####', '###' and '##'. Single digits are left as is.

    :param x: input string
    :return: the string with numbers masked
    """
    # Applied strictly in this order: longer digit runs must be masked
    # before the shorter patterns get a chance to match inside them.
    substitutions = (
        ('(?<=[0-9][0-9])th', ''),
        ('[0-9]{5,}', '#####'),
        ('[0-9]{4}', '####'),
        ('[0-9]{3}', '###'),
        ('[0-9]{2}', '##'),
    )
    for pattern, replacement in substitutions:
        x = re.sub(pattern, replacement, x)
    return x

# Mask the numbers in every question, rebuild the vocabulary and re-measure coverage.
train_data["question_text"] = train_data["question_text"].progress_apply(clean_numbers)
sentences = train_data["question_text"].progress_apply(str.split)
vocab = build_vocab(sentences)

no_embed_words = check_coverage(vocab, embeddings_index)

100%|██████████| 1306122/1306122 [00:24<00:00, 53157.04it/s]
100%|██████████| 1306122/1306122 [00:08<00:00, 155764.54it/s]
100%|██████████| 1306122/1306122 [00:05<00:00, 234078.54it/s]
100%|██████████| 246996/246996 [00:01<00:00, 244688.66it/s]


Found embeddings for 59.86% of vocab
Found embeddings for  90.55% of all text


In [11]:
# Twenty most frequent uncovered words after number masking.
no_embed_words[0:20]

[('to', 406301),
 ('a', 403830),
 ('of', 332964),
 ('and', 254077),
 ('doesnt', 6786),
 ('didnt', 3886),
 ('isnt', 2793),
 ('Isnt', 1432),
 ('favourite', 1246),
 ('bitcoin', 980),
 ('colour', 976),
 ('centre', 884),
 ('Quorans', 879),
 ('cryptocurrency', 821),
 ('shouldnt', 797),
 ('hasnt', 786),
 ('Snapchat', 785),
 ('wasnt', 743),
 ('travelling', 705),
 ('counselling', 634)]

OK, now we take care of common spelling differences between American and British English, and replace a few "modern" words (such as the names of social networks) with "social medium". For this task I use a multi-regex script I found some time ago on Stack Overflow. Additionally, we will simply remove the words "a", "to", "and" and "of", since those have obviously been downsampled when training the GoogleNews embeddings.

In [12]:
def _get_mispell(mispell_dict):
    mispell_re = re.compile('(%s)' % '|'.join(mispell_dict.keys()))
    return mispell_dict, mispell_re


mispell_dict = {'colour':'color', 'centre':'center', 'didnt':'did not', 'doesnt':'does not', 'isnt':'is not', 'shouldnt':'should not', 'favourite':'favorite',
                'travelling':'traveling', 'counselling':'counseling', 'theatre':'theater', 'cancelled':'canceled', 'labour':'labor', 'organisation':'organization',
                'wwii':'world war 2', 'citicise':'criticize', 'instagram': 'social medium', 'whatsapp': 'social medium', 'snapchat': 'social medium', 
                'dont': 'do not', 'Isnt':'is not', 'Doesnt':'does not', 'hasnt':'has not', 'wasnt':'was not', 'behaviour': 'behavior', 
                'cryptocurrencies': 'crypto currency', 'programme': 'program', 'organisations': 'organization', 'licence': 'license',  'organisation': 'organization',
               'Whatis': 'what is', 'favour': 'favor', 'Pinterest': 'social medium', 'learnt': 'learn', 'defence': 'defense', 'recognise': 'recognize',
               'recognised': 'recognize', 'practise': 'practice', 'neighbour': 'neighbor', 'programr': 'programmer', 'realise': 'realize', 'Didnt':'did not',
               'theatre': 'theater', 'travelling': 'traveling', 'watsapp': 'whatsapp', 'demonitisation': 'demonetization', 'demonitization': 'demonetization', 
                'demonetisation': 'demonetization', 'narcissit': 'narcissist', 'bigdata': 'big data', 'Qoura': 'Quora', 'sallary': 'salary',
               'analyse': 'analyze'}
mispellings, mispellings_re = _get_mispell(mispell_dict)

def replace_typical_misspell(text):
    def replace(match):
        return mispellings[match.group(0)]

    return mispellings_re.sub(replace, text)

In [13]:
# Normalise spellings in every question, tokenise, then drop four stop words
# from the token lists only (the DataFrame column is not filtered here).
train_data["question_text"] = train_data["question_text"].progress_apply(replace_typical_misspell)
sentences = train_data["question_text"].progress_apply(str.split)
to_remove = ['a','to','of','and']
sentences = [
    [token for token in tokens if token not in to_remove]
    for tokens in tqdm(sentences)
]
vocab = build_vocab(sentences)

100%|██████████| 1306122/1306122 [00:16<00:00, 79967.69it/s]
100%|██████████| 1306122/1306122 [00:06<00:00, 197695.59it/s]
100%|██████████| 1306122/1306122 [00:03<00:00, 328521.85it/s]
100%|██████████| 1306122/1306122 [00:05<00:00, 245018.53it/s]


In [15]:
# Re-measure coverage after spelling normalisation and stop-word removal.
no_embed_words = check_coverage(vocab,embeddings_index)

100%|██████████| 246849/246849 [00:01<00:00, 181430.46it/s]

Found embeddings for 59.88% of vocab
Found embeddings for  98.94% of all text





In [16]:
# Twenty most frequent words that remain without an embedding.
no_embed_words[:20]

[('bitcoin', 980),
 ('Quorans', 880),
 ('cryptocurrency', 821),
 ('Snapchat', 785),
 ('programr', 684),
 ('btech', 632),
 ('Brexit', 492),
 ('Shouldnt', 478),
 ('blockchain', 474),
 ('upvotes', 433),
 ('Redmi', 378),
 ('KVPY', 349),
 ('programrs', 337),
 ('Paytm', 331),
 ('grey', 299),
 ('Quoras', 292),
 ('mtech', 280),
 ('Btech', 263),
 ('bitcoins', 261),
 ('honours', 254)]

In [17]:
# Lower-case every question, overwriting the column.
train_data["question_text"] = train_data["question_text"].str.lower()

In [18]:
# Display the column after lower-casing.
train_data["question_text"]

0          how did quebec nationalists see their province...
1          do you have an adopted dog how would you encou...
2          why does velocity affect time does velocity af...
3          how did otto von guericke used the magdeburg h...
4          can i convert montra helicon d to a mountain b...
                                 ...                        
1306117    what other technical skills do you need as a c...
1306118    does ms in ece have good job prospects in usa ...
1306119                             is foam insulation toxic
1306120    how can one start a research project based on ...
1306121    who wins in a battle between a wolverine and a...
Name: question_text, Length: 1306122, dtype: object

In [19]:
def remove_words(text):
    """
    Drop the tokens 'a', 'to', 'of' and 'and' from a string.

    The text is split on single spaces and re-joined with single spaces, so
    every other token (including empty ones from repeated spaces) is kept.

    :param text: input string
    :return: the string without the removed tokens
    """
    to_remove = ['a','to','of','and']
    kept = []
    for piece in text.split(' '):
        if piece in to_remove:
            continue
        kept.append(piece)
    return ' '.join(kept)

In [20]:
# Remove the four stop words from every question, overwriting the column.
train_data["question_text"] = train_data["question_text"].apply(remove_words)

In [21]:
# Display the column after stop-word removal.
train_data["question_text"] 

0          how did quebec nationalists see their province...
1          do you have an adopted dog how would you encou...
2          why does velocity affect time does velocity af...
3          how did otto von guericke used the magdeburg h...
4          can i convert montra helicon d mountain bike b...
                                 ...                        
1306117    what other technical skills do you need as com...
1306118    does ms in ece have good job prospects in usa ...
1306119                             is foam insulation toxic
1306120    how can one start research project based on bi...
1306121            who wins in battle between wolverine puma
Name: question_text, Length: 1306122, dtype: object

In [22]:
# Build the vocabulary of the fully preprocessed questions.
sentences = train_data["question_text"].progress_apply(str.split)
vocab = build_vocab(sentences)

100%|██████████| 1306122/1306122 [00:08<00:00, 150574.65it/s]
100%|██████████| 1306122/1306122 [00:06<00:00, 214852.51it/s]


In [23]:
# Number of distinct words in the final vocabulary.
len(vocab)

203064

In [26]:
# Write the preprocessed training frame to CSV in the working directory, without the index column.
train_data.to_csv('preprocessed_data_deep_learning.csv', index=False)