In [1]:
import pandas as pd
from tqdm import tqdm
import string
tqdm.pandas()
import numpy as np

In [3]:
# Read the pre-cleaned train and test splits, then report each frame's (rows, columns).
train, test = (pd.read_csv(csv_path) for csv_path in ("cleaned_train.csv", "cleaned_test.csv"))
print("Train shape : ", train.shape)
print("Test shape : ", test.shape)

Train shape :  (51, 110)
Test shape :  (26, 110)


In [4]:
def build_vocab(sentences, verbose=True):
    """
    Count how often each whitespace-separated token occurs across ``sentences``.

    :param sentences: iterable of strings; each item is tokenised with ``.split()``
    :param verbose: when False the tqdm progress bar is disabled
    :return: dictionary mapping each token to its total number of occurrences
    """
    counts = {}
    progress = tqdm(sentences, disable=(not verbose))
    for text in progress:
        for token in text.split():
            # Tokens are inserted on first sight, so dict order is first-occurrence order.
            counts[token] = counts.get(token, 0) + 1
    return counts

In [6]:
# Count token frequencies over the training text, then preview the first five
# (token, count) pairs in insertion order.
vocab = build_vocab(train['text'])
print(dict(list(vocab.items())[:5]))

100%|██████████| 51/51 [00:00<00:00, 4260.98it/s]


{'20': 11, 'bonus': 4, 'for': 286, 'registered': 3, 'developers': 10}


In [21]:
EMBEDDING_FILE = 'glove.840B.300d.txt'
def get_coefs(word,*arr):
    """Turn one split embedding line into a ``(word, vector)`` pair.

    :param word: first field of the line, used as the dictionary key
    :param arr: remaining fields, converted to a float32 numpy array
    :return: tuple ``(word, np.ndarray)``
    """
    return word, np.asarray(arr, dtype='float32')

# Use a context manager so the (very large) embedding file is closed as soon as
# the dictionary is built instead of leaking the handle until garbage collection.
# Fields are split on a literal space rather than arbitrary whitespace.
# NOTE(review): presumably some tokens in this file contain other whitespace
# characters - confirm before switching to a bare split().
with open(EMBEDDING_FILE, encoding = 'utf-8') as embedding_file:
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in embedding_file)

In [8]:
import operator 

def check_coverage(vocab,embeddings_index):
    """
    Report how much of ``vocab`` is covered by ``embeddings_index``.

    Prints two percentages: the share of distinct words that have an embedding,
    and the share of all word occurrences (weighted by count) that do.

    :param vocab: dictionary mapping each word to its occurrence count
    :param embeddings_index: mapping from word to embedding vector
    :return: list of ``(word, count)`` pairs for words without an embedding,
             ordered by descending count
    """
    covered = {}
    oov = {}
    covered_count = 0
    oov_count = 0
    for word in tqdm(vocab):
        try:
            vector = embeddings_index[word]
        except KeyError:
            # Only a missing key means "out of vocabulary"; any other error
            # (e.g. embeddings_index is not a mapping) must surface instead of
            # silently showing up as 0% coverage.
            oov[word] = vocab[word]
            oov_count += vocab[word]
        else:
            covered[word] = vector
            covered_count += vocab[word]

    # Guard the ratios so an empty vocabulary reports 0% instead of dividing by zero.
    total_count = covered_count + oov_count
    vocab_ratio = len(covered) / len(vocab) if len(vocab) else 0.0
    text_ratio = covered_count / total_count if total_count else 0.0

    print('Found embeddings for {:.2%} of vocab'.format(vocab_ratio))
    print('Found embeddings for  {:.2%} of all text'.format(text_ratio))
    # Ascending stable sort followed by a reversal: ties come out in reverse insertion order.
    sorted_x = sorted(oov.items(), key=operator.itemgetter(1))[::-1]

    return sorted_x

In [9]:
# Measure embedding coverage of the vocabulary and keep the uncovered words.
oov = check_coverage(vocab, embeddings_index)

100%|██████████| 5731/5731 [00:00<00:00, 718332.37it/s]


Found embeddings for 0.00% of vocab
Found embeddings for  0.00% of all text


In [10]:
# Display the first 50 (word, count) pairs returned by check_coverage.
oov[:50]

[('.', 1848),
 ('the', 1414),
 ('in', 487),
 ('is', 398),
 ('you', 370),
 ('that', 358),
 ('i', 337),
 ('it', 314),
 ('for', 286),
 ('with', 271),
 ('we', 218),
 ('on', 213),
 ('as', 212),
 ('this', 196),
 ('be', 196),
 ('are', 188),
 ('can', 164),
 ('your', 158),
 ('have', 155),
 ('my', 148),
 (')', 145),
 ('(', 145),
 ('but', 140),
 ('at', 136),
 ('or', 135),
 ('from', 126),
 ('will', 120),
 ('all', 120),
 ('our', 120),
 ('was', 116),
 ('not', 116),
 ('its', 107),
 ('they', 102),
 ('an', 100),
 ('so', 98),
 ('he', 97),
 ('if', 91),
 ('what', 90),
 ('his', 87),
 ('by', 87),
 ('more', 86),
 ('time', 83),
 ('their', 82),
 ('would', 80),
 ('—', 79),
 ('about', 79),
 ('how', 78),
 ('when', 75),
 ('out', 74),
 ('do', 72)]

In [11]:
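# Disabled debugging aid: print, for each ASCII punctuation character,
# whether it has an entry in embeddings_index.
#for punct in string.punctuation:
#    print(punct,punct in embeddings_index)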

In [12]:
to_remove = ['a','to','of','and']
def remove_words(x):
    """Drop every whitespace-separated token of ``x`` that is listed in ``to_remove``.

    ``x`` is coerced with ``str`` first. Matching is exact and case-sensitive,
    and the surviving tokens are re-joined with single spaces.
    """
    kept = [token for token in str(x).split() if token not in to_remove]
    return " ".join(kept)

def remove_punc(x):
    """Pad punctuation with spaces, flatten line breaks and lower-case the text.

    ``x`` is coerced with ``str``. Newlines and carriage returns become a single
    space, every character of ``string.punctuation`` is surrounded by one space
    on each side, and the result is lower-cased.
    """
    # One translation table applies all substitutions in a single pass; this is
    # equivalent to replacing them one after another because no replacement
    # text contains a character that another rule rewrites.
    table = {ord('\n'): ' ', ord('\r'): ' '}
    for punct in string.punctuation:
        table[ord(punct)] = f' {punct} '
    return str(x).translate(table).lower()

def clean_text(x):
    """Run the full cleaning pipeline: ``remove_punc`` first, then ``remove_words``."""
    return remove_words(remove_punc(x))

In [14]:
# Clean the text column of both splits (overwriting it), then rebuild the
# vocabulary from the cleaned training text.
train["text"] = train["text"].progress_apply(clean_text)
test["text"] = test["text"].progress_apply(clean_text)
vocab = build_vocab(train["text"])

100%|██████████| 51/51 [00:00<00:00, 3007.90it/s]
100%|██████████| 26/26 [00:00<00:00, 2172.57it/s]
100%|██████████| 51/51 [00:00<00:00, 5115.00it/s]


In [17]:
# Re-measure embedding coverage on the rebuilt vocabulary.
oov = check_coverage(vocab, embeddings_index)

100%|██████████| 5731/5731 [00:00<00:00, 718396.78it/s]


Found embeddings for 0.00% of vocab
Found embeddings for  0.00% of all text


In [18]:
# Display the first 50 (word, count) pairs returned by check_coverage.
oov[:50]

[('.', 1848),
 ('the', 1414),
 ('in', 487),
 ('is', 398),
 ('you', 370),
 ('that', 358),
 ('i', 337),
 ('it', 314),
 ('for', 286),
 ('with', 271),
 ('we', 218),
 ('on', 213),
 ('as', 212),
 ('this', 196),
 ('be', 196),
 ('are', 188),
 ('can', 164),
 ('your', 158),
 ('have', 155),
 ('my', 148),
 (')', 145),
 ('(', 145),
 ('but', 140),
 ('at', 136),
 ('or', 135),
 ('from', 126),
 ('will', 120),
 ('all', 120),
 ('our', 120),
 ('was', 116),
 ('not', 116),
 ('its', 107),
 ('they', 102),
 ('an', 100),
 ('so', 98),
 ('he', 97),
 ('if', 91),
 ('what', 90),
 ('his', 87),
 ('by', 87),
 ('more', 86),
 ('time', 83),
 ('their', 82),
 ('would', 80),
 ('—', 79),
 ('about', 79),
 ('how', 78),
 ('when', 75),
 ('out', 74),
 ('do', 72)]

In [19]:
# Write both cleaned splits to CSV, leaving out the DataFrame index column.
train.to_csv('cleaned_train_punc_brute.csv', index=False)
test.to_csv('cleaned_test_punc_brute.csv', index=False)