# Preprocessing
I'll take a look at text data and turn it into quantifiable measures.

From randomly sampled tweets, I decided to do at least the following: 

1. find any links and remove them (e.g. bit.ly/g03MZB)
2. `@mention`, `{link}`:
create features that count how many of each exist, but exclude them from tokenization.
3. `[\w+]` seems to be an emoji or a picture/video placeholder. Keep these as is (with brackets).
4. A word after `#` should be treated as a different word from the same word without `#`; also count how many hashtags there are.
5. `?&quot;`, `&amp;`, and `�` should be removed.
6. `!` and `?` might be important to keep; count how many there are.

In [2]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import pickle

## Import Data
---

In [40]:
# Load the train / validation / test splits; the first CSV column is the index.
X_train, X_val, X_test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('DATA/X_train.csv', 'DATA/X_val.csv', 'DATA/X_test.csv')
)

### Hyperlinks

In [41]:
# Link matcher: optional scheme ("http://", "https://"), optional "www."
# prefix, a host ending in one of a few known TLDs, and at most one path
# segment. The trailing \b after the TLD stops the pattern from firing in the
# middle of ordinary text such as "so.cool" (which would otherwise match
# "so.co" and leave "{link}ol" behind).
_LINK_PATTERN = re.compile(
    r'(?:http\w*://)?(?:www\.\w+)?\w+\.(?:com|co|ly|ch|org|net)+\b(?:/\w+)?'
)


def replace_links(str_):
    """Replace every link-like substring of `str_` with the placeholder '{link}'.

    Parameters
    ----------
    str_ : str
        Raw tweet text.

    Returns
    -------
    str
        The text with each matched link (e.g. 'bit.ly/g03MZB') replaced by
        the literal '{link}'.
    """
    return _LINK_PATTERN.sub('{link}', str_)

### Count mentions and links

In [12]:
def count_exp(str_, exp):
    """Count the non-overlapping matches of the regex `exp` in `str_`.

    Used to count occurrences of patterns such as @mention and {link}.
    """
    return sum(1 for _ in re.finditer(exp, str_))

### Remove HTML symbols

In [13]:
# An HTML-entity-like token: one character that is not an ASCII letter, digit
# or space, followed by word characters and a terminating semicolon
# (e.g. "&quot;", "&amp;"). Written as a raw string so that "\w" is not an
# invalid string escape.
_HTML_ENTITY_PATTERN = re.compile(r"[^A-Za-z0-9 ]\w+;")


def remove_html(series_):
    """Strip HTML-entity-like tokens from every string in `series_`.

    Parameters
    ----------
    series_ : pandas.Series
        Series of strings (tweet text).

    Returns
    -------
    pandas.Series
        A new Series with each matched token removed from each element.
    """
    return series_.map(lambda text: _HTML_ENTITY_PATTERN.sub('', text))

### Remove unnecessary punctuations
(Remove all punctuation except `!?#[]`. These are removed later, after counting the exclamation points and question marks, and only if they occur by themselves (not as part of a hashtag or emoji).)

In [14]:
# remove unnecessary punctuations 
import string
# Characters to strip from tweets: all ASCII punctuation except ! ? # [ ]
# (these are kept for the later counts and tokenization), plus the '�'
# replacement character.
table_ = str.maketrans(dict.fromkeys('!?#[]'))
punctuations = string.punctuation.translate(table_) + '�'

def remove_punctuations(str_, punctuations):
    """Return `str_` with every character that appears in `punctuations` deleted."""
    # Map each unwanted code point to None so str.translate drops it.
    drop_map = dict.fromkeys(map(ord, punctuations))
    return str_.translate(drop_map)

### Stopwords
Define stopwords. We don't want the overall distribution of customer satisfaction for each company to affect our analysis, so I'll also remove some words that are specific to popular brands.

In [15]:
# Stopword list used by `tokenize`: NLTK's English stopwords, plus brand and
# event names, plus every single ASCII punctuation character and digit.
#
# The corpus is reached through `nltk.corpus` rather than the bare imported
# name because this assignment rebinds `stopwords` to a plain list; looking
# it up via the package keeps the cell safe to re-run.
stopwords = nltk.corpus.stopwords.words('english')

# Brand / event names, so that brand-level sentiment does not leak into the
# token features.
stopwords += ['apple', 'android', 'ipad', 'iphone', 'sxsw', '#sxsw', 'intel',
              'atari', 'cisco', 'google', 'genentech', 'mac', 'pc']

# Each punctuation character and digit is added as its own one-character entry.
stopwords += list(string.punctuation)
stopwords += list(string.digits)

### Tokenize 


In [16]:
def tokenize(str_, stopwords):
    """Lower-case `str_`, split it with `word_tokenize`, and drop stopwords.

    Returns the list of remaining tokens, in their original order.
    """
    tokens = word_tokenize(str_.lower())
    return [token for token in tokens if token not in stopwords]

### Putting them together

In [43]:
def prepro_text(df0, punctuations, stopwords):
    """Preprocess the `tweet` column of `df0` and add count features.

    Parameters
    ----------
    df0 : pandas.DataFrame
        Input frame with a `tweet` column of strings. It is not modified.
    punctuations : str
        Characters to delete from the tweet text.
    stopwords : collection of str
        Tokens to drop after tokenization.

    Returns
    -------
    pandas.DataFrame
        A copy of `df0` in which `tweet` holds the list of tokens and the
        columns `mention_count`, `link_count`, `tag_count`, `exclam_count`
        and `quest_count` have been added.
    """
    df = df0.copy()
    df['tweet'] = df['tweet'].apply(replace_links)
    df['tweet'] = remove_html(df['tweet'])

    # Mentions and links must be counted BEFORE punctuation is stripped:
    # the punctuation set built in this file contains '@', '{' and '}', so
    # counting afterwards would always yield 0.
    df['mention_count'] = df['tweet'].apply(lambda x: count_exp(x, r'@\w+'))
    df['link_count'] = df['tweet'].apply(lambda x: count_exp(x, r'\{link\}'))

    df['tweet'] = df['tweet'].apply(lambda x: remove_punctuations(x, punctuations))

    # '#', '!' and '?' are counted after stripping, as before.
    df['tag_count'] = df['tweet'].apply(lambda x: count_exp(x, r'#\w+'))
    df['exclam_count'] = df['tweet'].apply(lambda x: count_exp(x, '!'))
    df['quest_count'] = df['tweet'].apply(lambda x: count_exp(x, r'\?'))

    # NOTE(review): once '@', '{' and '}' are stripped, the bare words
    # "mention" and "link" remain in the text and end up as tokens — confirm
    # whether they should be dropped before tokenizing.
    df['tweet'] = df['tweet'].apply(lambda x: tokenize(x, stopwords))
    return df

In [48]:
# Apply the same preprocessing to each of the three splits.
X_train_pp, X_val_pp, X_test_pp = (
    prepro_text(split, punctuations, stopwords)
    for split in (X_train, X_val, X_test)
)

## Export Pickles
Export preprocessed data out as pickles

In [18]:
#import os
#os.mkdir('PKL')

In [50]:
# Make sure the output directory exists before writing; `to_pickle` does not
# create missing directories.
import os
os.makedirs('PKL', exist_ok=True)

# Write each preprocessed split to its own pickle file.
X_train_pp.to_pickle('PKL/X_train_pp.pkl')
X_val_pp.to_pickle('PKL/X_val_pp.pkl')
X_test_pp.to_pickle('PKL/X_test_pp.pkl')