In [1]:
%matplotlib inline
import matplotlib.image as mpimg
import numpy as np
import matplotlib.pyplot as plt
import os,sys
import re
%load_ext autoreload
%autoreload 2

In [2]:
def write(list_,name):
    f = open(name,'w')
    for s in list_:
        f.write(str(s) + '\n')
    f.close()

def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string) 
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()


def load_data_and_labels(positive_data_file, negative_data_file):
    """
    Loads the training data, splits the data into words and generates labels.
    Returns split sentences and labels.

    IN : 
    positive_data_file :    path to the positive data file
    negative_data_file :    path to the negative data file
    """
    # Load data from files
    positive_examples = list(open(positive_data_file, "r").readlines())
    positive_examples = [s.strip() for s in positive_examples]

    negative_examples = list(open(negative_data_file, "r").readlines())
    negative_examples = [s.strip() for s in negative_examples]

    # Split by words
    x_text = positive_examples + negative_examples
    x_text = [clean_str(sent) for sent in x_text]

    # Generate labels
    positive_labels = [[0, 1] for _ in positive_examples]
    negative_labels = [[1, 0] for _ in negative_examples]
    y = np.concatenate([positive_labels, negative_labels], 0)
    return [x_text, y]

In [3]:
positive_data_file = "../twitter-datasets/train_pos.txt"
negative_data_file = "../twitter-datasets/train_neg.txt"

In [4]:
x_raw, y_test = load_data_and_labels(positive_data_file,negative_data_file)

In [5]:
print(x_raw[:10])

['user i dunno justin read my mention or not only justin and god knows about that , but i hope you will follow me believe 15', "because your logic is so dumb , i wo n't even crop out your name or your photo tsk url", 'user just put casper in a box ! looved the battle ! crakkbitch', "user user thanks sir do n't trip lil mama just keep doin ya thang !", 'visiting my brother tmr is the bestest birthday gift eveerrr ! ! !', 'user yay ! ! lifecompleted tweet facebook me to let me know please', 'user 1dnextalbumtitle feel for you rollercoaster of life song cocept life , yolo , becoming famous \\? 3 14 followmeplz ! 3 x15', "workin hard or hardly workin rt user at hardee 's with my future coworker user", "user i saw i 'll be replying in a bit", 'this is were i belong']


In [18]:
def delete_duplicate_lines(x_raw,y):
    """
    To delete the lines that are identical in the training data
    
    IN : 
    x_raw :  clean data (list of clean sentences)
    y : associated labels
    """
    print("Deleting duplicates in data...")
    seen = set()
    unique_x = []
    unique_y = []
    for i,line in enumerate(x_raw):
        if line not in seen:
            seen.add(line)
            unique_x.append(line)
            unique_y.append(y[i])
    perc_duplicate = (len(x_raw)-len(unique_x))/(len(x_raw))*100
    print("Found : {}% duplicates in the input".format(perc_duplicate,len(x_raw)))
    return unique_x,unique_y

In [19]:
unique_x_raw, unique_y = delete_duplicate_lines(x_raw,y_test)

Found : 9.39% duplicates in the input


In [20]:
full_positive_data_file = "../twitter-datasets/train_pos_full.txt"
full_negative_data_file = "../twitter-datasets/train_neg_full.txt"
x_raw_full, y_test_full = load_data_and_labels(full_positive_data_file,full_negative_data_file)

In [21]:
full_unique_x_raw, full_unique_y = delete_duplicate_lines(x_raw_full,y_test_full)

Found : 9.60736% duplicates in the input


In [22]:
emoticons_str = r"""(?:[:=;][oO\-]? [D\)\]\(\]/\\OpP])"""
 
regex_str = [
    emoticons_str,
    r'(?:@[\w_]+)', # @-mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", # hash-tags
 
    r'(?:(?:\d+,?)+(?:\.?\d+)?)', # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])", # words with - and '
    r'(?:[\w_]+)', # other words
    r'(?:\S)' # anything else
]
    
tokens_re = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE | re.IGNORECASE)
emoticon_re = re.compile(r'^'+emoticons_str+'$', re.VERBOSE | re.IGNORECASE)
 
def tokenize(s):
    return tokens_re.findall(s)
 
def preprocess(s, lowercase=False):
    tokens = tokenize(s)
    if lowercase:
        tokens = [token if emoticon_re.search(token) else token.lower() for token in tokens]
    return tokens
 
tweet = 'RT @marcobonzanini: just an example! :D http://example.com #NLP'
print(preprocess(tweet))
print(clean_str)

['RT', '@marcobonzanini', ':', 'just', 'an', 'example', '!', ':D', 'http://example.com', '#NLP']
rt marcobonzanini just an example ! d http example com nlp
