In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm
import matplotlib.pyplot as plt
%matplotlib inline

from nltk.tokenize import word_tokenize
import gensim.downloader
import re

This is an adaptation from [Notebook link](https://www.kaggle.com/christofhenkel/how-to-preprocessing-when-using-embeddings) applied to this dataset.

This aims to make sure that there are word embeddings that correspond to our corpus

The problem we are trying to fix is that a word from our corpus can be missing from the embedding vocabulary simply because it is written differently (casing, punctuation, spelling) from the form stored in the embeddings

Example:

If the embedding dictionary $x$ has the embedding for the word 'me' but not the word 'Me', we inspect that and apply the preprocessing we need to match these two words

In [None]:
# Load the competition training set.
train_df = pd.read_csv('/kaggle/input/tweet-sentiment-extraction/train.csv')
# Preview three random rows; the fixed seed keeps the preview identical on every re-run.
train_df.sample(n=3, random_state=42)

First we check for missing data

In [None]:
# Summary of train_df (columns, dtypes, non-null counts), used here to spot missing values.
train_df.info()

Since there was only 1 row missing information, we removed it as it had no text which made it hard to use

In [None]:
# Drop rows with any missing value. Rebinding the name instead of using
# inplace=True keeps the cell a plain "input -> output" step.
train_df = train_df.dropna()

**Helper functions to track our preprocessing progress** 

In [None]:
def gen_count_matrix(text) -> dict:
    """Return a dictionary that has the number of times each word appeared in text.

    Args:
        text: Iterable of strings that contains the sentences (tweets)
            that are in our corpus. Each sentence is split into tokens
            with ``word_tokenize``.

    Returns:
        count: Dictionary mapping each token to the number of times it was
            seen across all sentences.
        example:

        count = {
        'you': 123,
        'are': 320,
        'awesome': 100
        }

        count['you'] # 123
        means that 'you' was mentioned 123 times in our corpus.
    """
    count = {}
    for sentence in tqdm(text):
        for word in word_tokenize(sentence):
            # dict.get supplies 0 the first time a token is seen.
            count[word] = count.get(word, 0) + 1

    return count
            

def check_coverage(count, embedding):
    """Check how much text and words are covered by the embedding.

    Prints two percentages: the share of distinct words that have a vector
    ("Word %") and the share of all word occurrences that have a vector
    ("Text %").

    Args:
        count: A dict that is generated from the function gen_count_matrix
            similar to CountVectorizer in sklearn
        embedding: A gensim KeyedVector that has word embeddings

    Returns:
        oov: A dict that shows the words that are not in the embeddings(vocabulary)
            and how many times each one occurred.
            short for out of vocabulary.
        known: A dict that shows the words that are in the embeddings(vocabulary)
            and how many times each one occurred.
    """
    # key_to_index is the token -> row mapping, so membership is a hash lookup.
    # The previous check against the index_to_key list scanned the whole
    # vocabulary for every word in the corpus.
    vocab = embedding.key_to_index

    oov = {}
    known = {}
    for word, occurrences in tqdm(count.items()):
        if word in vocab:
            known[word] = occurrences
        else:
            oov[word] = occurrences

    known_occurrences = sum(known.values())
    total_words = len(known) + len(oov)
    total_occurrences = known_occurrences + sum(oov.values())

    # Report 0.0 for an empty corpus instead of raising ZeroDivisionError.
    word_pct = round(len(known) / total_words * 100, 2) if total_words else 0.0
    text_pct = round(known_occurrences / total_occurrences * 100, 2) if total_occurrences else 0.0

    print('Word % in embedding : {} %'.format(word_pct))
    print('Text % in embedding : {} %'.format(text_pct))
    return oov, known
            
def sort_dict_values(oov):
    """Return a new dict with the items of ``oov`` ordered by value, largest first.

    Items with equal values keep the relative order they had in ``oov``.
    """
    ranked_keys = sorted(oov, key=oov.get, reverse=True)
    return {key: oov[key] for key in ranked_keys}
    

# **Preprocessing**

## **Make sure corpus matches pretrained embeddings** 
**We make sure that our corpus matches with our pretrained word vectors**

In [None]:
# Raw corpus: every tweet of the training frame with surrounding whitespace stripped.
text = [sentence.strip() for sentence in train_df['text']]
# 'glove-twitter-100' vectors obtained through gensim's downloader; used by every coverage check below.
twitter_vectors = gensim.downloader.load('glove-twitter-100') # Loading word embeddings

**We can see that we have 37k words in our corpus and that**
- **40% of our words are covered** 
- **80% of text is covered in embeddings**

In [None]:
# Baseline: token counts of the unprocessed tweets and their coverage by the embedding vocabulary.
count = gen_count_matrix(text)
print('Number of unique words in our corpus ', len(count))
# oov / known are overwritten by each later coverage cell.
oov , known = check_coverage(count, twitter_vectors)

## **Lower case , numbers & links**
**From previous work with these vectors, I know that the glove-twitter embeddings contain only lowercase words, and you can see the effect of that in the oov dictionary**

This leads me to preprocessing my data by making all text lowercase and removing links and replacing numbers with ##

In [None]:
# Out-of-vocabulary tokens of the raw corpus, most frequent first.
sort_dict_values(oov)

**When training glove, the numbers were changed so that**
   - 20 -> ##, (2 digits -> ##)
   - 2015 -> ####, (4 digits -> ####)
   - 200 -> ###, (3 digits -> ###)
   - 315135135 -> ##### (5 or more digits -> #####)

I saw this from another kaggle notebook that I'll try to link

In [None]:
def text_preproc_1(tweets):
    '''Lowercase tweets, remove links and mask digit runs with '#' placeholders.

    Args:
        tweets: Iterable of strings.

    Returns:
        A new list with one cleaned string per input tweet:
          * the text is lowercased;
          * every http:// or https:// URL (up to the next whitespace) is
            removed. The previous pattern only matched t.co links followed by
            exactly ten characters, so any other link was kept;
          * a run of 2, 3 or 4 digits becomes the same number of '#'
            characters, and a run of 5 or more digits becomes '#####'.
            Single digits are left untouched.
    '''
    cleaned = []
    for tweet in tweets:
        tweet = re.sub(r"https?://\S+", "", tweet.lower())
        # One pass replaces the four separate {5,}/{4}/{3}/{2} substitutions.
        tweet = re.sub(r"[0-9]{2,}", lambda run: "#" * min(len(run.group()), 5), tweet)
        cleaned.append(tweet)
    return cleaned

**When we applied our preprocessing, The number of words decreased tremendously and we got more coverage**

**This happened as**
- Each group of numbers (2 numbers, 3 numbers ...etc) are now represented as one
  and are also covered in our embeddings
- Links are all removed from our corpus as they are just noise that increased the size of the corpus (one could argue that they should be replaced with a token \<LNK\> for example but we don't have enough text to retrain the embeddings)
- The words 'You' and 'you' were considered different words and the former wasn't counted towards the words that were in the embedding.


In [None]:
# Re-measure coverage after the first preprocessing pass.
# count, oov and known are overwritten with the values for the cleaned corpus.
cleaned_tweets = text_preproc_1(text)
count = gen_count_matrix(cleaned_tweets)
print('Number of unique words in our corpus : ', len(count))
oov , known = check_coverage(count, twitter_vectors)

In [None]:
# Remaining out-of-vocabulary tokens after text_preproc_1, most frequent first.
sort_dict_values(oov)

# **Contractions**
**A lot of the out of vocabulary items are actually contractions**
A **contraction** is a shortened form of one or more words, for example won\`t for "will not". Both forms are correct, but the word embedding doesn\`t know the contracted form

In [None]:
# Contractions / shorthand (keys, written with a backtick as in the tweets)
# mapped to their expanded form.
# Each key appears once (the repeated it`ll, we`ll and he`ll entries were dropped).
# 'w/o' is listed before its prefix 'w/' so that applying the entries one after
# another as substring replacements cannot turn "w/o" into "witho".
mappings = {
        'twas': 'it was', "it`s": 'it is',
        "could`ve": 'could have', "it`ll": 'it will',
        "they`ll": 'they will', "he`ll": 'he will',
        "we`ll": 'we will', "i`m": 'i am',
        "don`t": 'do not', "can`t": 'can not',
        'i`ll': 'i will', 'that`s': 'that is',
        'didn`t': 'did not', 'i`ve': 'i have',
        'won`t': 'will not', 'doesn`t': 'does not',
        'he`s': 'he is', 'isn`t': 'is not',
        'i`d': 'i would', 'haven`t': 'have not',
        'we`re': 'we are', 'wasn`t': 'was not',
        'she`s': 'she is', 'there`s': 'there is',
        'couldn`t': 'could not','they`re': 'they are',
        'what`s': 'what is',
        'w/o': 'without', 'w/': 'with',
        'you`ll': 'you will',
        'aren`t': 'are not', 'you`ve': 'you have',
        'wouldn`t': 'would not', 'let`s': 'let us',
        '2day': 'today',
        'how`s': 'how is', 'b4': 'before',
        'y`all': 'you all', '2nite': 'tonight',
        'you`d': 'you would', 'ya`ll': 'you all',
        'who`s': 'who is', 'hasn`t': 'has not',
        'where`s': 'where is', 'here`s': 'here is',
        'shouldn`t': 'should not', 'we`ve': 'we have',
        'weren`t': 'were not',
        '`cause': 'because', 'b/c': 'because',
        '2moro': 'tomorrow', 'hadn`t': 'had not',
        'we`d': 'we would',
        'they`ve': 'they have', 'gr8': 'great',
        'would`ve': 'would have', '2morrow': 'tomorrow',
    }

In [None]:
def text_preproc_2(tweets, mapping, map_contractions=False):
    """Lowercase tweets, remove links, mask digit runs and optionally expand contractions.

    Args:
        tweets: Iterable of strings.
        mapping: Dict of lowercase substring -> replacement (e.g. contractions
            and their expanded form). Only used when map_contractions is true.
        map_contractions: When true, every occurrence of a key of ``mapping``
            is replaced by its value.

    Returns:
        A new list with one cleaned string per input tweet:
          * the text is lowercased;
          * every http:// or https:// URL (up to the next whitespace) is
            removed. The previous pattern only matched t.co links followed by
            exactly ten characters;
          * a run of 2, 3 or 4 digits becomes the same number of '#'
            characters, and a run of 5 or more digits becomes '#####';
          * keys of ``mapping`` are replaced as substrings (so "he`ll" also
            matches inside "she`ll"), in a single pass that tries longer keys
            first. Applying the keys one by one in dict order let a short key
            such as 'w/' rewrite part of a longer one such as 'w/o'.
    """
    contraction_re = None
    if map_contractions:
        # Longest key first, so a key is never shadowed by one of its prefixes.
        keys = sorted((key for key in mapping if key), key=len, reverse=True)
        if keys:
            contraction_re = re.compile("|".join(re.escape(key) for key in keys))

    cleaned = []
    for tweet in tweets:
        tweet = re.sub(r"https?://\S+", "", tweet.lower())
        tweet = re.sub(r"[0-9]{2,}", lambda run: "#" * min(len(run.group()), 5), tweet)
        if contraction_re is not None:
            tweet = contraction_re.sub(lambda hit: mapping[hit.group()], tweet)
        cleaned.append(tweet)
    return cleaned

**We can see that the contractions improved our coverage by 3%**

However, if you look at the competition's rules and submissions, you'll find that submitting 'do not' instead of 'don't' would make you lose points. So the default is to not expand contractions; if we later need to improve performance, we can work around this by reverting the expanded contractions at prediction time.

In [None]:
# Coverage with contraction expansion: the third positional argument is map_contractions=True.
# count, oov and known are overwritten with the values for this variant.
cleaned_tweets = text_preproc_2(text, mappings,True)
count = gen_count_matrix(cleaned_tweets)
print('Number of words in our corpus : ', len(count))
oov , known = check_coverage(count, twitter_vectors)

In [None]:
def check_tweets_containing(search_word, tweets, num_tweets=7):
    """Print the first tweets that contain ``search_word`` as a token.

    Args:
        search_word: Token to look for; compared against the output of
            ``word_tokenize`` for each tweet.
        tweets: Iterable of strings to search. (The previous version ignored
            this argument and always read the global ``cleaned_tweets``.)
        num_tweets: Maximum number of tweets to print.

    Returns:
        None. Matching tweets are printed, each at most once.
    """
    num_printed = 0
    for sentence in tweets:
        if num_printed >= num_tweets:
            break
        # A single membership test prints a tweet once even when the token
        # occurs several times in it, so the limit cannot be overshot.
        if search_word in word_tokenize(sentence):
            print(sentence)
            num_printed += 1

In [None]:
# Remaining out-of-vocabulary tokens after contraction expansion, most frequent first.
sort_dict_values(oov)

In [None]:
# Look at example tweets in which '..' shows up as its own token.
check_tweets_containing('..', cleaned_tweets)

In [None]:
# Look at example tweets in which a quote stuck to a word produces the token "'you".
check_tweets_containing("'you", cleaned_tweets)

## **Dots & Quotations**
### **Dots**
**You can see that a lot of the words are written normally but have two or three dots before/after them**

### **Quotations**
Some quotations are right next to the word which causes the words to look like the text above


We can try removing these dots and see how that affects our coverage

In [None]:
def text_preproc_3(tweets, mapping, map_contractions=False):
    """Like text_preproc_2, but additionally removes dots and single quotes.

    Args:
        tweets: Iterable of strings.
        mapping: Dict of lowercase substring -> replacement (e.g. contractions
            and their expanded form). Only used when map_contractions is true.
        map_contractions: When true, every occurrence of a key of ``mapping``
            is replaced by its value.

    Returns:
        A new list with one cleaned string per input tweet. Steps, in order:
          1. lowercase the text;
          2. remove every http:// or https:// URL (up to the next whitespace).
             The previous pattern only matched t.co links followed by exactly
             ten characters;
          3. turn a run of 2, 3 or 4 digits into the same number of '#'
             characters and a run of 5 or more digits into '#####';
          4. delete every '.' character;
          5. optionally replace keys of ``mapping`` as substrings, in a single
             pass that tries longer keys first, so a short key such as 'w/'
             cannot rewrite part of a longer one such as 'w/o';
          6. delete every "'" character.
    """
    contraction_re = None
    if map_contractions:
        # Longest key first, so a key is never shadowed by one of its prefixes.
        keys = sorted((key for key in mapping if key), key=len, reverse=True)
        if keys:
            contraction_re = re.compile("|".join(re.escape(key) for key in keys))

    cleaned = []
    for tweet in tweets:
        tweet = re.sub(r"https?://\S+", "", tweet.lower())
        tweet = re.sub(r"[0-9]{2,}", lambda run: "#" * min(len(run.group()), 5), tweet)
        # Dots go before the contraction step and quotes after it, as in the original order.
        tweet = tweet.replace(".", "")
        if contraction_re is not None:
            tweet = contraction_re.sub(lambda hit: mapping[hit.group()], tweet)
        cleaned.append(tweet.replace("'", ""))
    return cleaned


Removing dots and quotation marks didn't significantly improve our coverage, so we will leave these steps out: unlike contractions, there is no way to reverse them, so they might ruin our submissions

In [None]:
# Coverage with dots and single quotes removed; map_contractions keeps its default (False).
# count, oov and known are overwritten with the values for this variant.
cleaned_tweets = text_preproc_3(text, mappings)
count = gen_count_matrix(cleaned_tweets)
print('Number of words in our corpus : ', len(count))
oov , known = check_coverage(count, twitter_vectors)

In [None]:
# Remaining out-of-vocabulary tokens after text_preproc_3, most frequent first.
sort_dict_values(oov)

# **Final text processing function**

This is the same as the function text_preproc_2

In [None]:
def text_preproc(tweets, mapping, map_contractions=False):
    """Final preprocessing: lowercase, remove links, mask digit runs, optionally expand contractions.

    Args:
        tweets: Iterable of strings.
        mapping: Dict of lowercase substring -> replacement (e.g. contractions
            and their expanded form). Only used when map_contractions is true.
        map_contractions: When true, every occurrence of a key of ``mapping``
            is replaced by its value. Off by default.

    Returns:
        A new list with one cleaned string per input tweet:
          * the text is lowercased;
          * every http:// or https:// URL (up to the next whitespace) is
            removed. The previous pattern only matched t.co links followed by
            exactly ten characters;
          * a run of 2, 3 or 4 digits becomes the same number of '#'
            characters, and a run of 5 or more digits becomes '#####';
          * keys of ``mapping`` are replaced as substrings, in a single pass
            that tries longer keys first. Applying the keys one by one in dict
            order let a short key such as 'w/' rewrite part of a longer one
            such as 'w/o'.
    """
    contraction_re = None
    if map_contractions:
        # Longest key first, so a key is never shadowed by one of its prefixes.
        keys = sorted((key for key in mapping if key), key=len, reverse=True)
        if keys:
            contraction_re = re.compile("|".join(re.escape(key) for key in keys))

    cleaned = []
    for tweet in tweets:
        tweet = re.sub(r"https?://\S+", "", tweet.lower())
        tweet = re.sub(r"[0-9]{2,}", lambda run: "#" * min(len(run.group()), 5), tweet)
        if contraction_re is not None:
            tweet = contraction_re.sub(lambda hit: mapping[hit.group()], tweet)
        cleaned.append(tweet)
    return cleaned

# Test Data

Here we will test the preprocessing that we made on the testset, for the first time to make sure we are not overfitting the train dataset.

In [None]:
# Load the test set from the same absolute input directory as the training set,
# so both reads behave the same regardless of the current working directory.
test_df = pd.read_csv('/kaggle/input/tweet-sentiment-extraction/test.csv')
# Raw test corpus: every tweet with surrounding whitespace stripped.
test_text = [sentence.strip() for sentence in test_df['text']]

**We applied the check coverage function 3 times**
1. **No preprocessing**
   - 55.3% of the unique words in the corpus are also in the embedding
   - 80.59% of the text (all word occurrences) in the corpus is covered by the embedding
2. **Preprocessing (no contractions)**
   - 79.37% of the unique words in the corpus are also in the embedding
   - 92.75% of the text (all word occurrences) in the corpus is covered by the embedding
3. **Preprocessing (with contractions)**
   - 79.99% of the unique words in the corpus are also in the embedding
   - 95.11% of the text (all word occurrences) in the corpus is covered by the embedding

We can see that these results are very similar to our training set which means we didn't overfit the train data.

However, This data came from twitter and due to the character limit people write a certain way so this may not work as well on text from other places.

In [None]:
# Test set, run 1: coverage of the unprocessed test tweets.
count = gen_count_matrix(test_text)
print('Number of unique words in our corpus : ', len(count))
oov , known = check_coverage(count, twitter_vectors)

In [None]:
# Test set, run 2: final preprocessing with map_contractions at its default (False).
test_cleaned_tweets = text_preproc(test_text,mappings)
count = gen_count_matrix(test_cleaned_tweets)
print('Number of unique words in our corpus : ', len(count))
oov , known = check_coverage(count, twitter_vectors)

In [None]:
# Test set, run 3: final preprocessing with contraction expansion switched on.
test_cleaned_tweets = text_preproc(test_text,mappings, map_contractions=True)
count = gen_count_matrix(test_cleaned_tweets)
print('Number of unique words in our corpus : ', len(count))
oov , known = check_coverage(count, twitter_vectors)