# Imports

In [1]:
import numpy as np;
import pandas as pd;

# Data Loads.

We only need to look at the Text column.

In [421]:
quora_train_text = pd.read_csv('data/train.csv');
train_text = quora_train_text['question_text'];

# Word Embeddings

We are using the 100D Glove Word Embeddings. Load into a dictionary.

In [22]:
embeddings_dict = {};
with open('../Embeddings/glove.6B/glove.6B.100d.txt', 'rb') as f:
    for line in f:
        values = line.split()
        word = values[0].decode('utf-8')
        vector = np.asarray(values[1:], "float32")
        embeddings_dict[word] = vector

Example of a word embedding

In [27]:
%%time
embeddings_dict['how']

Wall time: 0 ns


array([-0.23769  ,  0.59392  ,  0.58697  , -0.041788 , -0.86803  ,
       -0.0051122, -0.4493   , -0.027985 ,  0.065674 , -0.37901  ,
        0.31175  ,  0.20528  ,  0.431    , -0.30913  , -0.084523 ,
        0.056902 ,  0.34704  ,  0.32332  , -0.60413  ,  0.23123  ,
       -0.095476 , -0.50009  ,  0.024528 , -0.5819   ,  0.0075945,
       -0.13871  , -0.26417  , -0.7827   , -0.24024  ,  0.081827 ,
       -0.1867   ,  0.95404  , -0.057412 ,  0.0089747, -0.20492  ,
        0.060902 , -0.018265 ,  0.3349   ,  0.2373   , -0.17605  ,
       -0.54013  , -0.32745  , -0.088963 , -0.38417  , -1.0903   ,
        0.17028  , -0.019269 , -0.35687  , -0.0084772, -0.81525  ,
        0.23428  ,  0.11118  ,  0.42356  ,  1.111    , -0.35535  ,
       -2.3889   ,  0.6203   , -0.16352  ,  1.445    ,  0.58333  ,
       -0.22946  ,  1.4375   , -0.063997 , -0.31511  ,  1.0875   ,
        0.39221  ,  0.73147  ,  0.57236  , -0.050781 , -0.027736 ,
        0.41085  , -0.25076  , -0.26331  , -0.38682  ,  0.5066

# Check Coverage

A function to check how many of the words in the corpus are covered by embedings.
We want to get as close to 100% as possible.

In [30]:
def check_coverage(text, embeddings_dict):
    known_words, unknown_words = {}, {};
    total_known, total_unknown = 0, 0;
    
    for sentence in text:
        for word in sentence.split(' '):
            if word in known_words:
                total_known = total_known + 1;
            elif word in embeddings_dict:
                known_words[word] = embeddings_dict[word];
                total_known = total_known + 1;
            else:
                unknown_words[word] = None;
                total_unknown = total_unknown + 1;
            
    print('Total coverage of Vocabulary %.2f'%(len(known_words) / len(embeddings_dict)))
    print('Total coverage of Dataset %.2f'%(total_known / (total_known + total_unknown)));
    
    return known_words, unknown_words;

In [31]:
%%time
known_words, unknown_words = check_coverage(train_text, embeddings_dict);

Total coverage of Vocabulary 0.18
Total coverage of Dataset 0.71


Smaller Sample.

Sampling 10% of the dataset for quicker testing.

In [34]:
rand_sample = np.random.choice(np.arange(len(quora_train_text)), int(0.10 * len(quora_train_text)))
train_text = quora_train_text.iloc[rand_sample]['question_text']

In [36]:
%%time
known_words, unknown_words = check_coverage(train_text, embeddings_dict);

Total coverage of Vocabulary 0.08
Total coverage of Dataset 0.71
Wall time: 565 ms


# Preprocessing

## Step 1: Lower Casing

In [238]:
def lower_case(texts):
    lower_texts = texts.apply(lambda text: text.lower())
    return lower_texts;

In [239]:
def preprocess(texts):
    lower_texts = lower_case(texts);
    return lower_texts;

In [240]:
cleaned_text = preprocess(train_text);

In [41]:
check_coverage(cleaned_text, embeddings_dict);

Total coverage of Vocabulary 0.10
Total coverage of Dataset 0.87


Coverage is up from 71% to 87% with lower casing.

## Step 2. Unpunctuating Text

Find special non-alphanumeric characters that are in the dataset.

In [96]:
import re;

def find_special_chars(text_sentences):
    pattern = r'[^a-zA-Z0-9 ]';

    special_char_matches = set();
    for text in text_sentences:
        re_matches = re.findall(pattern, text);
        special_char_matches.update(re_matches);
    return special_char_matches;

In [84]:
special_char_matches = find_special_chars(train_text)
pd.DataFrame([(char, ord(char), char.encode('unicode_escape')) for char in special_char_matches]).sort_values(0)

Unnamed: 0,0,1,2
40,\n,10,b'\\n'
191,!,33,b'!'
338,"""",34,"b'""'"
176,#,35,b'#'
328,$,36,b'$'
2,%,37,b'%'
209,&,38,b'&'
6,',39,"b""'"""
146,(,40,b'('
136,),41,b')'


There are a lot of Asian and Cyrrylic characters. Since we don't have embeddings for those, we can filter them out.

In [113]:
eastasian_re_match = r'[\u30a2-\ud588]+';
num_nonenglish_sentences = 0;
for text in train_text:
    re_matches = re.findall(eastasian_re_match, text);
    if len(re_matches) > 0:
        num_nonenglish_sentences = num_nonenglish_sentences + 1;
num_nonenglish_sentences

26

In [114]:
southasian_re_match = r'[\u05ea-\ude48]+'
for text in train_text:
    re_matches = re.findall(southasian_re_match, text);
    if len(re_matches) > 0:
        num_nonenglish_sentences = num_nonenglish_sentences + 1;
num_nonenglish_sentences

2390

In [118]:
cyrrylic_re_match = r'[\u0400-\u04ff]+';
for text in train_text:
    re_matches = re.findall(cyrrylic_re_match, text);
    if len(re_matches) > 0:
        num_nonenglish_sentences = num_nonenglish_sentences + 1;
num_nonenglish_sentences

2395

In [155]:
special_char_matches = find_special_chars(filtered_train_text)
pd.options.display.max_rows = 435;
pd.DataFrame([(char, ord(char), char.encode('unicode_escape')) for char in special_char_matches]).sort_values(0)[0].values

array(['\n', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',',
       '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']',
       '^', '_', '`', '{', '|', '}', '~', '\x7f', '\xa0', '¢', '£', '§',
       '©', '«', '®', '¯', '°', '±', '²', '³', '´', 'º', '»', '½', '¿',
       'Á', 'Ã', 'Ç', 'Î', 'Ö', '×', 'ß', 'à', 'á', 'â', 'ã', 'ä', 'æ',
       'ç', 'è', 'é', 'ë', 'í', 'î', 'ð', 'ñ', 'ò', 'ó', 'ö', '÷', 'ú',
       'ü', 'Ā', 'ā', 'ă', 'ć', 'Č', 'č', 'ğ', 'ī', 'İ', 'ı', 'ō', 'ř',
       'ş', 'š', 'ū', 'ž', 'ș', 'ɖ', 'ɾ', 'ʒ', 'ʻ', '˚', '̃', 'Δ', 'Σ',
       'Φ', 'ά', 'α', 'δ', 'θ', 'ι', 'μ', 'ξ', 'ο', 'π', 'ρ', 'σ', 'χ',
       'ω', 'ϵ', 'ח', 'ר', 'ﷺ', '️', '\ufeff', '，', '？'], dtype=object)

The special character set is much smaller now (although it must be remembered we are only using 10% of the dataset).

From the above list, we can separate out ones we want to keep and could be useful, such as greek charcters.

In [357]:
replacing_chars_map = {'\n' : "",
                       '"' : '"',
                       "'" : "'",
                       '`' : "'",
                       '´' : "'",
                       '\x7f' : "",
                       '\xa0' : "",
                       'ș' : "cedillia",
                       'ʒ' : "ezh",
                       'ʻ' : "'",
                       '˚' : "degree", 
                        "α" : "alpha",
                        "β" : "beta",
                        "γ" : "gamma",
                        "Γ" : "gamma",
                        "δ" : "delta",
                        "Δ" : "delta",
                        "ϵ" : "epsilon",
                        "ζ" : "zeta",
                        "η" : "eta",
                        "θ" : "theta",
                        "Θ" : "theta",
                        "ι" : "iota",
                        "λ" : "lambda",
                        "Λ" : "lambda",
                        "μ" : "mu",
                        "ξ" : "xi",
                        "Ξ" : "xi",
                        "π" : "pi",
                        "Π" : "pi",
                        "ρ" : "rho",
                        "σ" : "sigma",
                        "Σ" : "sigma",
                        "τ" : "tau",
                        "υ" : "upsilon",
                        "ϒ" : "upsilon",
                        "ϕ" : "phi",
                        "Φ" : "phi",
                        "χ" : "chi",
                        "X" : "chi",
                        "ψ" : "psi",
                        "Ψ" : "psi",
                        "ω" : "omega",
                        "Ω" : "omega",
                       'ﷺ' : "",
                       "ר" : "resh",
                       "ח" : "heth",
                       '️' : "",
                       '\ufeff' : "",
                       '，' : ",",
                       '？' : "?",
                       '¿' : "",
                       '?' : "",
                       '"' : "",
                       ',' : ","
                      }

And ones we do do not need, and can remove entirely.

In [456]:
special_repl =  r'[!#\$%&\(\)\*\+,-\.\:;<>\?@\[\\\]\^_\{\|\}\~¢’‘…′–“”£\§©«\®¯°±²³º»½¿\/\=]';

In [164]:
filtered_train_text = filtered_train_text.apply(lambda text: re.sub(special_repl, "", text));

In [165]:
special_char_matches = find_special_chars(filtered_train_text)
pd.options.display.max_rows = 435;
pd.DataFrame([(char, ord(char), char.encode('unicode_escape')) for char in special_char_matches]).sort_values(0)[0].values

array(['\n', '"', "'", '`', '\x7f', '\xa0', '´', 'Á', 'Ã', 'Ç', 'Î', 'Ö',
       '×', 'ß', 'à', 'á', 'â', 'ã', 'ä', 'æ', 'ç', 'è', 'é', 'ë', 'í',
       'î', 'ð', 'ñ', 'ò', 'ó', 'ö', '÷', 'ú', 'ü', 'Ā', 'ā', 'ă', 'ć',
       'Č', 'č', 'ğ', 'ī', 'İ', 'ı', 'ō', 'ř', 'ş', 'š', 'ū', 'ž', 'ș',
       'ɖ', 'ɾ', 'ʒ', 'ʻ', '˚', '̃', 'Δ', 'Σ', 'Φ', 'ά', 'α', 'δ', 'θ',
       'ι', 'μ', 'ξ', 'ο', 'π', 'ρ', 'σ', 'χ', 'ω', 'ϵ', 'ח', 'ר', 'ﷺ',
       '️', '\ufeff', '，', '？'], dtype=object)

Function to replace instances of character intances from a map.

In [166]:
def replace_with_map(main_text, repl_map):
    for key in repl_map.keys():
        main_text = main_text.replace(key, repl_map[key])
    return main_text;

In [174]:
filtered_train_text = filtered_train_text.apply(lambda text: replace_with_map(text, replacing_chars_map))

In [175]:
special_char_matches = find_special_chars(filtered_train_text)
pd.options.display.max_rows = 435;
pd.DataFrame([(char, ord(char), char.encode('unicode_escape')) for char in special_char_matches]).sort_values(0)[0].values

array(["'", ',', 'Á', 'Ã', 'Ç', 'Î', 'Ö', '×', 'ß', 'à', 'á', 'â', 'ã',
       'ä', 'æ', 'ç', 'è', 'é', 'ë', 'í', 'î', 'ð', 'ñ', 'ò', 'ó', 'ö',
       '÷', 'ú', 'ü', 'Ā', 'ā', 'ă', 'ć', 'Č', 'č', 'ğ', 'ī', 'İ', 'ı',
       'ō', 'ř', 'ş', 'š', 'ū', 'ž', 'ɖ', 'ɾ', '̃'], dtype=object)

Finally, using the above steps, we can write a final method to replace all special charcters.

We will:
1. filter out all asian (east and southern) characters
2. filter out all cyryllic characters
3. remove unneccessary punctuation
4. replace additional characters, including greek letters and different styles of quotations

In [434]:
def filter_specialchars(texts):
    filtered_train_text = texts.apply(lambda text: re.sub(special_repl, "", text));
    filtered_train_text = filtered_train_text.apply(lambda text: replace_with_map(text, replacing_chars_map))

    filtered_train_text = filtered_train_text[~filtered_train_text.str.contains(eastasian_re_match)]
    filtered_train_text = filtered_train_text[~filtered_train_text.str.contains(southasian_re_match)]
    filtered_train_text = filtered_train_text[~filtered_train_text.str.contains(cyrrylic_re_match)]
            
    return filtered_train_text;

Update Preprocess step

In [247]:
def preprocess(texts):
    lower_texts = lower_case(texts);
    unpunctuated_texts = filter_specialchars(lower_texts)
    return unpunctuated_texts;

In [248]:
cleaned_text = preprocess(train_text)

Our Coverage went from 87% to 98% after fixing special characters.

In [251]:
%%time
_, _ = check_coverage(cleaned_text, embeddings_dict)

Total coverage of Vocabulary 0.11
Total coverage of Dataset 0.98
Wall time: 412 ms


# Step 3. Contractions

In the next step, we will replace all contractions with their expanded versions. We will use the following dictionary.

In [334]:
contractions = { 
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he had",
"he'd've": "he would have",
"he'll": "he will",
"he'll've": "he will have",
"he's": "he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how is",
"i'd": "i had",
"i'd've": "i would have",
"i'll": "i will",
"i'll've": "i will have",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it had",
"it'd've": "it would have",
"it'll": "it will",
"it'll've": "it will have",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she had",
"she'd've": "she would have",
"she'll": "she will",
"she'll've": "she will have",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so is",
"that'd": "that had",
"that'd've": "that would have",
"that's": "that is",
"there'd": "there had",
"there'd've": "there would have",
"there's": "there is",
"they'd": "they had",
"they'd've": "they would have",
"they'll": "they will",
"they'll've": "they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we had",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what'll've": "what will have",
"what're": "what are",
"what's": " what is",
"what've": "what have",
"when's": "when is",
"when've": "when have",
"where'd": "where did",
"where's": "where is",
"where've": "where have",
"who'll": "who will",
"who'll've": "will have",
"who's": "who is",
"who've": "who have",
"why's": "why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you had",
"you'd've": "you would have",
"you'll": "you will",
"you'll've": "you will have",
"you're": "you are",
"you've": "you have"
}

We can reuse the replace map method from above.

In [194]:
def correct_contractions(texts):
    corrected_text = texts.apply(lambda text: replace_with_map(text, contractions))
    return corrected_text;

Updated preprocess step to include contractions

In [252]:
def preprocess(texts):
    lower_texts = lower_case(texts);
    unpunctuated_texts = filter_specialchars(lower_texts)
    expanded_texts = correct_contractions(unpunctuated_texts)
    return expanded_texts;

Coverage stayed at 98%, which means that the difference is negligible.

In [253]:
cleaned_text = preprocess(train_text)
_, unknown_words = check_coverage(cleaned_text, embeddings_dict)

Total coverage of Vocabulary 0.11
Total coverage of Dataset 0.98


Let's look at a few of the unknown words to see what else can we fix.

In [255]:
unknown_words

{'infp': None,
 'cruency': None,
 "person's": None,
 'hisher': None,
 'handwtiting': None,
 '': None,
 "thiel's": None,
 'pferdestarke': None,
 'cotenant': None,
 '18yearold': None,
 '17yearold': None,
 'microperforate': None,
 "kejriwal's": None,
 'ccurse': None,
 "jobs'": None,
 'newdelhi': None,
 'photogenetic': None,
 'heshe': None,
 'x210': None,
 "bloomer's": None,
 'darkskinned': None,
 "i'm": None,
 'sciencentific': None,
 'directoriesto': None,
 'selfimprovement': None,
 'bhagavadgita': None,
 "'water": None,
 "level'": None,
 "karen's": None,
 "bangli's": None,
 "byju's": None,
 'ibps': None,
 'adwordskeyword': None,
 "man's": None,
 '16yearold': None,
 'babumoshai': None,
 '18cm': None,
 '13yearold': None,
 'perfectionismimposter': None,
 'doobay': None,
 'mrtrump': None,
 '÷7': None,
 'hedicine': None,
 'laravel': None,
 'unballanced': None,
 "today's": None,
 'vinnytsya': None,
 'y6': None,
 'powerfulskilled': None,
 'no46': None,
 'range4': None,
 '5090': None,
 'xmen': N

I see two errors we can fix here.

1. There are instances with mixed numericals and letters. For example, `780mgday` and `1year` or `tr85`.
2. There are contractions on nouns which aren't captured by our map, such as `israel's` and `personality's` or `glover's`.

We will write regular expressions for each of these, and write a method to split a word into it's proper parts:

In [362]:
alpha_re_match = r"([a-zA-Z]+)([0-9]+)";
numbr_re_match = r"([0-9]+)([a-zA-Z]+)";
apost_re_match = r"[a-zA-Z]+\'s";

In [367]:
def split_nums(text):
    new_text = [];
    for word in text.split(' '):
        alpha_match = re.match(alpha_re_match, word);
        numbr_match = re.match(numbr_re_match, word);
        apost_match = re.match(apost_re_match, word);    
        
        if alpha_match:
            new_text.append(' '.join(alpha_match.groups()));
        elif numbr_match:
            new_text.append(' '.join(numbr_match.groups()));
        elif apost_match:
            new_text.append(word.split("\'")[0] + " is");
        else:
            new_text.append(word);
    return ' '.join(new_text);

Update the preprocess method to include word splitting:

In [409]:
def preprocess(texts):
    lower_texts = lower_case(texts);
    unpunctuated_texts = filter_specialchars(lower_texts)
    expanded_texts = correct_contractions(unpunctuated_texts)
    split_text = expanded_texts.apply(lambda text: split_nums(text))
    
    return split_text;

In [331]:
%%time
cleaned_text = preprocess(train_text)
_, unknown_words = check_coverage(cleaned_text, embeddings_dict)

Total coverage of Vocabulary 0.11
Total coverage of Dataset 0.98
Wall time: 8.47 s


Coverage did not change from 98%, but it is still good to have these preprocessing steps on the text.

Let's run the method on the remainder of the dataset to see how well it adjusts:

In [457]:
%%time
cleaned_text = preprocess(quora_train_text['question_text'])
_, unknown_words = check_coverage(cleaned_text, embeddings_dict)

Total coverage of Vocabulary 0.27
Total coverage of Dataset 0.98
Wall time: 1min 37s


We still get 98% coverage of the dataset. It also shows that a larger portion of the vocabulary is also being used.

In [458]:
quora_train_text['cleaned_text'] = cleaned_text

In [463]:
quora_train_text = quora_train_text.dropna()

In [464]:
quora_train_text = quora_train_text.drop('question_text', axis=1)

In [465]:
quora_train_text.to_csv('data/cleaned_train.txt')