In [2]:
import json
import os
import pickle
import re

In [5]:
# Keywords looked up (as case-sensitive substrings) in a user's location
# string to associate it with Washington state.
washington_words = [
    'Seattle',
    'Washington',
    'WA',
    'Kirkland',
    'Seahawks',
]
# Same idea for Massachusetts.
massachusetts_words = [
    'MA',
    'Massachusetts',
    'Boston',
    'Greater',
    'Patriots',
]

In [15]:
def read_data(filename,n_rows=100000):
    """Collect the tweets of a line-delimited JSON file whose author location matches.

    The file is looked up under ``data/train``. Every line is parsed as JSON
    and its ``['tweet']`` object is kept when the author's location contains
    one of the module-level ``washington_words`` / ``massachusetts_words``
    keywords.

    Args:
        filename: name of the file inside ``data/train``.
        n_rows: accepted but not used by this function; every line of the
            file is read regardless of its value.

    Returns:
        List of the matching ``tweet`` objects, in file order.
    """
    source_path = os.path.join('data','train',filename)
    selected = []
    with open(source_path) as handle:
        for raw_line in handle:
            tweet = json.loads(raw_line)['tweet']
            user_location = tweet['user']['location']
            if location_condition(user_location,washington_words,massachusetts_words):
                selected.append(tweet)
    return selected

def get_tweets(data):
    """Return the ``'text'`` field of every tweet object in ``data``, in order."""
    return [tweet['text'] for tweet in data]

def get_locations(data):
    """Return the ``['user']['location']`` field of every tweet object in ``data``, in order."""
    return [tweet['user']['location'] for tweet in data]

def location_condition(location,loc1_words,loc2_words):
    """Tell whether ``location`` contains any keyword from either word list.

    Containment is checked with the ``in`` operator, so for a string
    ``location`` this is a case-sensitive substring test.

    Args:
        location: value searched for keywords (assumed to be a string --
            TODO confirm the data never carries a null location).
        loc1_words: first iterable of keywords.
        loc2_words: second iterable of keywords, consulted only when nothing
            in ``loc1_words`` matched.

    Returns:
        True if at least one keyword is found, otherwise False.
    """
    for keywords in (loc1_words, loc2_words):
        for keyword in keywords:
            if keyword in location:
                return True
    return False

def get_labeled_data(tweets,locations,loc1_words,loc2_words):
    """Label tweets by location keyword and tokenize the ones that got a label.

    ``locations[i]`` is taken to describe ``tweets[i]``. A tweet is labelled
    ``1`` when its location contains a word from ``loc1_words``, ``-1`` when
    it instead contains a word from ``loc2_words`` (the first list wins if
    both match), and is dropped when neither list matches.

    Args:
        tweets: indexable collection of tweet texts.
        locations: iterable of location values, parallel to ``tweets``.
        loc1_words: keywords producing label ``1``.
        loc2_words: keywords producing label ``-1``.

    Returns:
        A two-element list ``[tokenized_tweets, labels]`` where
        ``tokenized_tweets`` is the result of ``preprocess_tweets`` on the
        retained tweet texts and ``labels`` holds the matching 1 / -1 values.
    """
    kept_tweets = []
    kept_labels = []
    for position,place in enumerate(locations):
        if any(word in place for word in loc1_words):
            label = 1
        elif any(word in place for word in loc2_words):
            label = -1
        else:
            # Location matches neither keyword list: leave the tweet out.
            continue
        kept_tweets.append(tweets[position])
        kept_labels.append(label)
    return [preprocess_tweets(kept_tweets),kept_labels]

def preprocess_tweets(data):
    """Tokenize tweets and drop tokens that carry no textual content.

    Each tweet is split into tokens (emoticons, HTML tags, @-mentions,
    hashtags, URLs, numbers and words). Tokens that are, in their entirety,
    an emoticon, an HTML tag, a mention, a URL, a number or the retweet
    marker ``RT`` (any letter case) are discarded; the remaining tokens are
    lower-cased.

    Args:
        data: iterable of tweet texts (strings).

    Returns:
        A list with one entry per tweet, each entry being the list of
        retained, lower-cased tokens in their original order.
    """
    emoticons_str = r"""
        (?:
            [:=;] # Eyes
            [oO\-]? # Nose (optional)
            [D\)\]\(\]/\\OpP] # Mouth
        )"""
    html_str = r'<[^>]+>'
    mention_str = r'(?:@[\w_]+)'
    hash_tag_str = r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)"
    # The literal "&amp;" inside the character class looks like an HTML-escaping
    # artifact; it is harmless because '&' and ';' already fall in the "$-_"
    # range and the letters are covered by [a-z].
    url_str = r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&amp;+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+'
    numbers_str = r'(?:(?:\d+,?)+(?:\.?\d+)?)'

    # Order matters: earlier alternatives win when several could match at the
    # same position.
    regex_str = [
        emoticons_str, html_str, mention_str,
        hash_tag_str, url_str, numbers_str,
        r"(?:[a-z][a-z'\-_]+[a-z])", # words with - and '
        r'(?:[\w_]+)', # other words
    ]

    # Token kinds that are removed from the output.
    delete_str = [
        emoticons_str, html_str, mention_str,
        url_str, numbers_str, r'RT'
    ]

    tokens_re = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE | re.IGNORECASE)
    # Anchored at both ends so a token is dropped only when the WHOLE token is
    # one of the deletable kinds. An unanchored search would also discard
    # ordinary words that merely contain "rt" ("party", "quarterback") or any
    # token that contains a digit.
    delete_re = re.compile(r'\A(?:'+'|'.join(delete_str)+r')\Z', re.VERBOSE | re.IGNORECASE)

    def tokenize(s):
        """Split one tweet into raw tokens."""
        return tokens_re.findall(s)

    def preprocess(s, lowercase=True):
        """Tokenize ``s``, drop deletable tokens and optionally lower-case the rest."""
        tokens = [token for token in tokenize(s) if not delete_re.match(token)]
        if lowercase:
            tokens = [token.lower() for token in tokens]
        return tokens

    return [preprocess(tweet) for tweet in data]

In [16]:
# Toggle: True loads the cached, already-labelled data from the pickle file;
# False rebuilds it from the raw tweet file and refreshes the cache.
load_data = True
if not load_data:
    superbowl = read_data('tweets_#superbowl.txt')
    tweets = get_tweets(superbowl)
    locations = get_locations(superbowl)
    data,labels = get_labeled_data(tweets,locations,washington_words,massachusetts_words)
    with open('data/train/part2/part2-data.pkl','wb') as fp:
        pickle.dump([data,labels],fp,protocol=2)
else:
    # The cache has to be opened in binary READ mode: opening it with 'wb'
    # truncates the file and pickle.load cannot read from a write-only handle.
    # NOTE: pickle.load can execute arbitrary code -- only load a cache file
    # that was produced by this notebook / comes from a trusted source.
    with open('data/train/part2/part2-data.pkl','rb') as fp:
        data,labels = pickle.load(fp)


In [19]:
# Sanity check: number of tokenized tweets, then number of labels, one per line.
print('\n'.join(str(len(collection)) for collection in (data, labels)))

54407
54407
