In [2]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm_notebook as tqdm
import re
import difflib

%matplotlib inline
import matplotlib.pyplot as plt

# Directory that holds the dataset CSV files (relative to the notebook).
DOWNLOAD_PATH = "./datasets/"
# Training set file name; the default file read by load_data().
TRAIN_FILE = "train.csv"
# File name used when the corrected training set is written back to DOWNLOAD_PATH.
CORRECTED_TRAIN_FILE = "corrected_train.csv"
# Test set file name.
TEST_FILE = "test.csv"

# Useful functions
(in house)

In [3]:
def dist_embeddings(embedding1, embedding2):
    """Cosine similarity between two embedding vectors.

    Despite the name this is a similarity, not a distance: the dot product
    divided by the product of the two Euclidean norms. The value is 1 for
    vectors pointing the same way, 0 for orthogonal vectors and -1 for
    opposite vectors. There is no guard against zero-norm inputs.

    :param embedding1: first vector (must support element-wise ``*``)
    :param embedding2: second vector, same shape as ``embedding1``
    :return: the cosine of the angle between the two vectors
    """
    dot_product = np.sum(embedding1 * embedding2)
    norm1 = np.sqrt(np.sum(embedding1 * embedding1))
    norm2 = np.sqrt(np.sum(embedding2 * embedding2))
    return dot_product / (norm1 * norm2)

def max_min_len(sentence_list):
    """Find the longest and the shortest item of ``sentence_list``.

    Comparisons are strict, so on ties the first occurrence wins.

    :param sentence_list: iterable of sized items (e.g. tokenized sentences)
    :return: ``(max_len, index_max, min_len, index_min)``; for an empty input
             this is ``(0, 0, np.inf, 0)``
    """
    longest, longest_at = 0, 0
    shortest, shortest_at = np.inf, 0
    for position, sentence in enumerate(sentence_list):
        size = len(sentence)
        if size > longest:
            longest, longest_at = size, position
        if size < shortest:
            shortest, shortest_at = size, position
    return longest, longest_at, shortest, shortest_at

def load_data(path = DOWNLOAD_PATH, file = TRAIN_FILE):
    """Read a CSV file into a DataFrame.

    :param path: directory containing the file (defaults to DOWNLOAD_PATH)
    :param file: file name inside ``path`` (defaults to TRAIN_FILE)
    :return: the DataFrame produced by ``pd.read_csv``
    """
    return pd.read_csv(os.path.join(path, file))

def len_in_range(r_min, r_max):
    """Build a predicate that tests whether ``len(X)`` lies in ``[r_min, r_max]``.

    :param r_min: smallest accepted length (inclusive)
    :param r_max: largest accepted length (inclusive)
    :return: a one-argument function returning True or False
    """
    def length_ok(X):
        return bool(r_min <= len(X) <= r_max)
    return length_ok

def prefix_comparator(oov, prefix_length = 1):
    """Build a predicate that tests whether a word shares ``oov``'s prefix.

    The comparison is case-insensitive and looks at the first
    ``prefix_length`` characters of each word.

    :param oov: reference word whose prefix is compared against
    :param prefix_length: number of leading characters to compare
    :return: a one-argument function returning True or False
    """
    def same_prefix(voc):
        return voc[:prefix_length].lower() == oov[:prefix_length].lower()
    return same_prefix

def oov_checker(oov_word, vocab_list, unknown_token = 'UNK', ratio_threshold = 0.9, hard_check = False):
    """Find the vocabulary entry most similar to an out-of-vocabulary word.

    Similarity is ``difflib.SequenceMatcher(None, oov_word, voc).ratio()``.
    A candidate is kept when its ratio reaches the acceptance threshold and is
    strictly greater than the best ratio seen so far (first best wins on ties).

    :param oov_word: the out-of-vocabulary word to look up
    :param vocab_list: iterable of candidate vocabulary words
    :param unknown_token: value returned when no candidate is accepted
    :param ratio_threshold: minimum similarity ratio for a candidate
    :param hard_check: if True, ``ratio_threshold`` is used as is. If False,
        the threshold is ``min((len(oov_word) - 1) / len(oov_word),
        ratio_threshold)``, which lowers it for short words.
    :return: ``(best_voc, best_ratio)``, or ``(unknown_token, 0.)`` when
        nothing is accepted
    """
    # An empty word has no length-based relaxation to apply; dividing by
    # len(oov_word) would raise ZeroDivisionError, so fall back to the plain
    # threshold in that case.
    if hard_check or not oov_word:
        threshold = ratio_threshold
    else:
        word_len = len(oov_word)
        threshold = min((word_len - 1) / word_len, ratio_threshold)

    best_voc = unknown_token
    best_ratio = 0.

    for voc in vocab_list:
        matcher = difflib.SequenceMatcher(None, oov_word, voc)

        # real_quick_ratio() and quick_ratio() are cheap upper bounds on
        # ratio(); a candidate whose bound cannot pass the acceptance test
        # below is skipped without running the full matching.
        upper = matcher.real_quick_ratio()
        if upper < threshold or upper <= best_ratio:
            continue
        upper = matcher.quick_ratio()
        if upper < threshold or upper <= best_ratio:
            continue

        ratio = matcher.ratio()
        if ratio >= threshold and ratio > best_ratio:
            best_ratio = ratio
            best_voc = voc

    return best_voc, best_ratio

def correction_score_generator(oov_list, vocabulary, unknown_token = 'UNK', len_window = 1):
    """Score the best vocabulary replacement for every out-of-vocabulary word.

    For each OOV word the candidates are the vocabulary words whose length is
    within ``len_window`` of the word's length and whose first character
    matches it (case-insensitively). The best candidate is chosen by
    ``oov_checker`` with ``ratio_threshold = 0.``, so any candidate with a
    non-zero similarity can be returned; filtering by score is left to the
    caller.

    :param oov_list: iterable of out-of-vocabulary words
    :param vocabulary: iterable of known words (a list, or a dict whose keys
        are the words)
    :param unknown_token: token stored when no candidate is found
    :param len_window: accepted length difference between word and candidate
    :return: dict mapping each OOV word to ``(best_candidate, best_ratio)``
    """
    correction_list = {}

    # sorted() accepts any iterable; iterating a dict yields its keys.
    sorted_vocab_list = sorted(vocabulary, key=len)
    # OOV words are processed by increasing length so the length-window
    # filter is recomputed only when the length changes.
    sorted_oov_list = sorted(oov_list, key=len)

    # None (not 0) as sentinel: with 0 an empty first word would skip the
    # window computation and leave filtered_vocab_list undefined.
    length = None
    filtered_vocab_list = []

    for oov in tqdm(sorted_oov_list):
        if not oov:
            # Nothing to match against for an empty word.
            correction_list[oov] = (unknown_token, 0.)
            continue
        if length != len(oov):
            length = len(oov)
            vocab_window = len_in_range(length - len_window, length + len_window)
            filtered_vocab_list = list(filter(vocab_window, sorted_vocab_list))
        prefix_comp = prefix_comparator(oov)
        filtered_vocab = list(filter(prefix_comp, filtered_vocab_list))
        correction_list[oov] = oov_checker(oov, filtered_vocab, unknown_token, ratio_threshold = 0.)

    return correction_list


def sentence_correcter(sentences, embeddings, correction_dict, threshold = 0.9, unknown_token = "UNK"):
    """Replace words without an embedding by their correction or an unknown token.

    Words found in ``embeddings`` are kept unchanged. Any other word is
    replaced by its entry in ``correction_dict`` when that entry's score
    reaches ``threshold``; otherwise (low score or no entry) it is replaced
    by ``unknown_token``.

    :param sentences: iterable of tokenized sentences (iterables of words)
    :param embeddings: container of known words; only ``in`` is used
    :param correction_dict: maps a word to ``(replacement, score)``
    :param threshold: minimum score for a replacement to be applied
    :param unknown_token: token used when no replacement is applied
    :return: list of corrected sentences, each a list of words
    """
    corrected_sentences = []
    for sentence in tqdm(sentences):
        corrected_sentence = []
        for word in sentence:
            if word in embeddings:
                corrected_sentence.append(word)
                continue
            correction = correction_dict.get(word)
            if correction is not None and correction[1] >= threshold:
                corrected_sentence.append(correction[0])
            else:
                corrected_sentence.append(unknown_token)
        corrected_sentences.append(corrected_sentence)

    return corrected_sentences

def de_tokenize(sentences_list):
    """Turn tokenized sentences back into strings.

    Every token is followed by a single space, so a non-empty sentence ends
    with a trailing space and an empty sentence becomes ``""``.

    :param sentences_list: iterable of sentences, each an iterable of strings
    :return: list of strings, one per sentence
    """
    return ["".join(token + " " for token in tokens) for tokens in tqdm(sentences_list)]

(inspired by https://www.kaggle.com/alhalimi/tokenization-and-word-embedding-compatibility/notebook)

In [4]:
# Only extract GloVe embeddings as a first approach
def glove_embeddings(gloveFile = "./embeddings/glove.840B.300d/glove.840B.300d.txt", extract = -1):
    """Load GloVe vectors from a text file into a dict.

    Each line is split on single spaces: the first field is the token, the
    remaining fields are converted to a float32 array. If a token appears on
    several lines, the last one read wins.

    :param gloveFile: path to the GloVe text file (read as UTF-8)
    :param extract: number of lines to read; -1 (the default) reads the
        whole file
    :return: dict mapping token -> numpy float32 vector
    """
    def get_coefs(word, *arr):
        return word, np.asarray(arr, dtype='float32')

    embeddings_index = {}
    increment = 0
    # The progress bar length is only a display hint: the requested line
    # count when one is given, otherwise the line count hard-coded for the
    # default file.
    expected_lines = extract if extract > 0 else 2196017

    # Context manager so the file handle is closed even if parsing fails or
    # the loop stops early.
    with open(gloveFile, 'r', encoding="utf8") as f:
        for line in tqdm(f, total = expected_lines):
            word, vect = get_coefs(*line.split(" "))
            embeddings_index[word] = vect
            if increment == extract - 1:
                break
            elif extract != -1:
                increment += 1

    return embeddings_index

def tokenize(sentences_list):
    """Split each sentence into tokens with a regular expression.

    A token is a run of word characters, a single apostrophe, or one of the
    punctuation marks ``. , ! ? ;``. All other characters are dropped. No
    end-of-sentence token is appended.

    :param sentences_list: iterable of strings
    :return: list of token lists, one per input sentence
    """
    token_pattern = re.compile(r"[\w]+|[']|[.,!?;]")
    tokenized = []
    for sentence in tqdm(sentences_list):
        tokenized.append(token_pattern.findall(sentence))
    return tokenized

def get_vocab(sentences):
    """Count how often each word occurs across tokenized sentences.

    :param sentences: iterable of sentences, each an iterable of words
    :return: dict mapping each distinct word to its number of occurrences,
             in order of first appearance
    """
    counts = {}
    for tokens in tqdm(sentences):
        for token in tokens:
            counts[token] = counts.get(token, 0) + 1
    return counts

def compare_vocab_and_embeddings(vocab, embeddings_index):
    """Split the corpus vocabulary into words with and without an embedding.

    Prints the share of distinct words and the share of word occurrences
    covered by the embeddings (0% when there is nothing to count).

    :param vocab: corpus vocabulary, a dict mapping word -> frequency
    :param embeddings_index: container of embedded words; only ``in`` is used
    :return: ``(in_common, oov, in_common_freq, oov_freq)`` where
        ``in_common`` are the words found in ``embeddings_index`` and ``oov``
        the words not found (both sorted in descending order),
        ``in_common_freq`` is the summed frequency of the common words and
        ``oov_freq`` the summed frequency of the out-of-vocabulary words
    """
    oov = []
    in_common = []
    in_common_freq = 0
    oov_freq = 0

    for word in tqdm(vocab):
        if word in embeddings_index:
            in_common.append(word)
            in_common_freq += vocab[word]
        else:
            oov.append(word)
            oov_freq += vocab[word]

    # Guard both ratios so an empty vocabulary (or all-zero frequencies)
    # reports 0% instead of raising ZeroDivisionError.
    vocab_size = len(vocab)
    total_freq = in_common_freq + oov_freq
    vocab_share = len(in_common) / vocab_size if vocab_size else 0.
    text_share = in_common_freq / total_freq if total_freq else 0.

    print('Found embeddings for {:.2%} of vocab'.format(vocab_share))
    print('Found embeddings for  {:.2%} of all text'.format(text_share))

    return sorted(in_common, reverse=True), sorted(oov, reverse=True), in_common_freq, oov_freq

def sort_oov_words(oov, vocab, threshold = 0.5, min_len = 5):
    """Select the most frequent out-of-vocabulary words worth correcting.

    Words shorter than ``min_len`` or containing a digit are discarded. The
    remaining words are taken in decreasing order of frequency until their
    cumulated frequency reaches ``threshold`` times the total frequency of
    the remaining words. A summary is printed.

    :param oov: iterable of out-of-vocabulary words
    :param vocab: dict mapping word -> frequency in the corpus
    :param threshold: share of the pruned OOV occurrences to cover
    :param min_len: minimum word length kept
    :return: list of selected words, most frequent first
    """
    by_frequency = sorted(oov, key =lambda x: vocab[x], reverse=True )
    pruned_sorted_oov = []
    nr_tokens = 0
    i = 0
    ratio = 0.

    if len(by_frequency) > 0:
        # Keep only words that are long enough and contain no digit.
        for candidate in by_frequency:
            if len(candidate) < min_len:
                continue
            if re.search(r'[0-9]+', candidate, flags=0) is not None:
                continue
            nr_tokens += vocab[candidate]
            pruned_sorted_oov.append(candidate)
        print("Total number of oov instances: {}".format(nr_tokens))
        # Accumulate occurrences until the requested share is covered;
        # i ends up as the number of words selected.
        for i, word in enumerate(pruned_sorted_oov, start=1):
            ratio += vocab[word]
            if ratio/nr_tokens >= threshold:
                break
    else:
        print("No words were out of vocabulary.")
    print("Number of oov words selected: {}/{} corresponding to {} instances".format(i, len(pruned_sorted_oov),ratio))
    return pruned_sorted_oov[:i]

# Extraction of datasets

In [5]:
# Load the test set (TEST_FILE under the default DOWNLOAD_PATH) and preview its first rows.
test_data = load_data(file=TEST_FILE)
test_data.head()

Unnamed: 0,qid,question_text
0,00014894849d00ba98a9,My voice range is A2-C5. My chest voice goes u...
1,000156468431f09b3cae,How much does a tutor earn in Bangalore?
2,000227734433360e1aae,What are the best made pocket knives under $20...
3,0005e06fbe3045bd2a92,Why would they add a hypothetical scenario tha...
4,00068a0f7f41f50fc399,What is the dresscode for Techmahindra freshers?


In [6]:
# Load the training set (load_data defaults: DOWNLOAD_PATH / TRAIN_FILE) and preview its first rows.
train_data = load_data()
train_data.head()

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0


In [7]:
# Raw question strings of the training set; this is the input of tokenize() further down.
questions_list = train_data["question_text"].values

In [7]:
# Column dtypes, non-null counts and memory footprint of the training frame.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1306122 entries, 0 to 1306121
Data columns (total 3 columns):
qid              1306122 non-null object
question_text    1306122 non-null object
target           1306122 non-null int64
dtypes: int64(1), object(2)
memory usage: 29.9+ MB


In [8]:
# Number of training rows per value of the target column.
train_data["target"].value_counts()

0    1225312
1      80810
Name: target, dtype: int64

# Extraction of embeddings

In [9]:
# Load the whole GloVe file (default path, extract = -1) into a {token: vector} dict.
embeddings = glove_embeddings()

HBox(children=(IntProgress(value=0, max=2196017), HTML(value='')))




In [10]:
# Number of distinct tokens loaded from the GloVe file.
print("The GloVe embedding contains {} unique tokens".format(len(embeddings)))

The GloVe embedding contains 2196016 unique tokens


# Prepare train_data

In [8]:
# Tokenize every training question, then report the longest and the shortest
# token list together with their positions in the dataset.
tokenized_questions = tokenize(questions_list)
max_length, index_max, min_length, index_min = max_min_len(tokenized_questions)
print("Max sentence length: {} at index {}, min sentence length: {} at index {}".format(max_length, index_max, min_length, index_min))

HBox(children=(IntProgress(value=0, max=1306122), HTML(value='')))


Max sentence length: 203 at index 443216, min sentence length: 0 at index 420816


In [11]:
# Frequency of every distinct token in the tokenized training questions.
token_dict = get_vocab(tokenized_questions)
print("The training dataset contains {} unique tokens".format(len(token_dict)))

HBox(children=(IntProgress(value=0, max=1306122), HTML(value='')))


The training dataset contains 238306 unique tokens


In [12]:
# Split the corpus tokens into those with a GloVe vector (in_common) and those
# without (oov); the two frequency totals returned are not needed here.
in_common, oov, _, _ = compare_vocab_and_embeddings(token_dict, embeddings)

HBox(children=(IntProgress(value=0, max=238306), HTML(value='')))


Found embeddings for 75.25% of vocab
Found embeddings for  99.58% of all text


In [13]:
# Keep the most frequent OOV words (default min_len = 5, no digits) that together
# account for 20% of the pruned OOV occurrences.
oov_words = sort_oov_words(oov, token_dict, threshold = 0.2)

Total number of oov instances: 64726
Number of oov words selected: 1194/48571 corresponding to 12948.0 instances


In [14]:
# For each selected OOV word, find the closest GloVe token and its similarity score.
correction_scored_dict = correction_score_generator(oov_words, embeddings)

HBox(children=(IntProgress(value=0, max=1194), HTML(value='')))




In [15]:
# Summary statistics of the best similarity scores (second element of each
# correction entry): mean, standard deviation, min, max and 20th percentile.
val = list(correction_scored_dict.values())
val_ = [entry[1] for entry in val]

np.mean(val_), np.std(val_), np.min(val_), np.max(val_), np.percentile(val_, 20)

(0.8506893721145692, 0.08027703067439415, 0.0, 0.967741935483871, 0.8)

In [16]:
# Apply the corrections whose similarity score is at least 0.8; every other
# word without an embedding becomes the unknown token.
corrected_questions = sentence_correcter(tokenized_questions, embeddings, correction_scored_dict, threshold = 0.8)

HBox(children=(IntProgress(value=0, max=1306122), HTML(value='')))




In [17]:
# Join the corrected tokens back into one string per question.
de_corrected_questions = de_tokenize(corrected_questions)

HBox(children=(IntProgress(value=0, max=1306122), HTML(value='')))




In [18]:
# Preview only the first few corrected questions; displaying the whole list
# (one entry per training row) floods the notebook output.
de_corrected_questions[:10]

['How did Quebec nationalists see their province as a nation in the 1960s ? ',
 'Do you have an adopted dog , how would you encourage people to adopt and not shop ? ',
 'Why does velocity affect time ? Does velocity affect space geometry ? ',
 'How did Otto von Guericke used the Magdeburg hemispheres ? ',
 'Can I convert montra helicon D to a mountain bike by just changing the tyres ? ',
 'Is Gaza slowly becoming Auschwitz , Dachau or Treblinka for Palestinians ? ',
 'Why does Quora automatically ban conservative opinions when reported , but does not do the same for liberal views ? ',
 'Is it crazy if I wash or wipe my groceries off ? Germs are everywhere . ',
 'Is there such a thing as dressing moderately , and if so , how is that different than dressing modestly ? ',
 'Is it just me or have you ever been in this phase wherein you became ignorant to the people you once loved , completely disregarding their feelings lives so you get to have something go your way and feel temporarily at

In [19]:
# Work on a copy so the original train_data frame stays untouched.
corrected_train_data = train_data.copy()

In [20]:
# Attach the corrected text as a new column, aligned row by row with the original questions.
corrected_train_data['corrected_question_text'] = de_corrected_questions

In [1]:
# Preview the frame with the new column.
# NOTE(review): the saved output of this cell is a NameError from a run on a fresh
# kernel; re-run it after the cells above so the stored output matches the code.
corrected_train_data.head()

NameError: name 'corrected_train_data' is not defined

In [22]:
# Drop the original text so only the corrected version is saved.
# errors="ignore" keeps the cell re-runnable: without it a second run raises
# KeyError because the column is already gone.
corrected_train_data = corrected_train_data.drop(columns= ["question_text"], errors="ignore")

In [23]:
# Save the corrected training set next to the raw files. os.path.join (as in
# load_data) works whether or not DOWNLOAD_PATH ends with a separator.
corrected_train_data.to_csv(os.path.join(DOWNLOAD_PATH, CORRECTED_TRAIN_FILE), index = False)

# Laboratory

In [19]:
# Scratch check: similarity ratio (as a percentage) between a misspelling
# and the intended word.
a, b = "whatsap", "whatsapp"
seq = difflib.SequenceMatcher(a=a, b=b)
d = 100 * seq.ratio()
print(d)

93.33333333333333


In [20]:
# Scratch check: a list stored as a dict value can be extended in place.
new_dict = {word: [letter] for word, letter in zip(["je", "mange", "une", "pomme"], "abcd")}
new_dict["pomme"] += ["e"]
new_dict

{'je': ['a'], 'mange': ['b'], 'une': ['c'], 'pomme': ['d', 'e']}

In [21]:
# Scratch check: turning a dict into a list yields its keys in insertion order.
keys = list(new_dict)
keys

['je', 'mange', 'une', 'pomme']

In [22]:
# Scratch check: oov is a list. isinstance is the idiomatic type test and
# also accepts list subclasses.
if isinstance(oov, list):
    print("ok")

ok


In [23]:
# Scratch check: len_in_range(4, 8) keeps only the strings of length 4 to 8.
ret = ["ab", "abcd", "abcdef", "abcdefgh", "abcdefghij", "abcdefghijklmn"]
test_in_range = len_in_range(4,8)
# The predicate can be handed to filter() directly.
filtered = filter(test_in_range, ret)
for i in filtered:
    print(i)

abcd
abcdef
abcdefgh


In [24]:
 # Scratch check: the digit pattern used by sort_oov_words matches a word containing a number.
 re.search(r'[0-9]+', "se8nse", flags=0)

<_sre.SRE_Match object; span=(2, 3), match='8'>