In [1]:
import pyspark
sc = pyspark.SparkContext('local[*]')

# do something to prove it works
rdd = sc.parallelize(range(1000))
rdd.takeSample(False, 5)

[909, 137, 166, 245, 856]

In [15]:
tweet_rdd = sc.textFile('./data/train.csv'). \
    map(lambda line: (line.split(',')[0], line.split(',')[-1]))

### Tweet Processing Functions

Below are the definitions of the functions that are used to process the tweets before being sent on to feature extraction

In [139]:
import re
from nltk.stem.porter import PorterStemmer

In [72]:
with open('./data/stopwords.txt') as stopword_file:
    stopwords = {stopword.strip(): 1 for stopword in stopword_file.readlines()}
    
porter_stemmer = PorterStemmer()

In [80]:
def basic_cleaning(tweet):
    punctuation_hash_regex = '[\.\?\,\!#]'
    clean_tweet = tweet.lower()
    clean_tweet = re.sub(punctuation_hash_regex, '', clean_tweet)
    clean_tweet = clean_tweet.strip('"').strip()
    return clean_tweet

In [26]:
def remove_non_alpha_starting_words(tweet):
    non_alpha_start_words_regex = '\b[^a-zA-Z].+\s'
    clean_tweet = re.sub(non_alpha_start_words_regex, ' ', tweet)
    return clean_tweet

In [66]:
def remove_stopwords(tweet, stopwords):
    tweet_words = tweet.split()
    tweet_without_stopwords = []
    for word in tweet_words:
        cleaned_word = word.strip().strip('.').strip(',').strip('?').strip(':').strip(';')
        if cleaned_word.lower() not in stopwords and cleaned_word != '':
            tweet_without_stopwords.append(cleaned_word)
    return ' '.join(tweet_without_stopwords)

In [25]:
def replace_urls(tweet):
    url_regex = '((https?:\/\/(?:www\.|(?!www))|)[^\s\.]+\.[^\s]{2,}|www\.[^\s]+\.[^\s]{2,})'
    clean_tweet = re.sub(url_regex, 'URL', tweet)
    return clean_tweet

In [98]:
def replace_user_handles(tweet):
    tweet_str = ' '.join(tweet) if isinstance(tweet, list) else tweet
    user_handle_regex = '(?<=^|(?<=[^a-zA-Z0-9-_\.]))@([A-Za-z]+[A-Za-z0-9]+)'
    clean_tweet = re.sub(user_handle_regex, 'AT_USER', tweet_str)
    return clean_tweet.split()

In [42]:
def replace_repeated_characters(tweet):
    repeated_character_regex = '(\w)\\1{2,}'
    matches = re.finditer(repeated_character_regex, tweet)
    clean_tweet = tweet
    for match in matches:
        start = match.start(0)
        end = match.end(0)
        char = tweet[start]
        clean_tweet = re.sub(repeated_character_regex, char + char, clean_tweet[:end])
        clean_tweet += tweet[end:]
        
    return clean_tweet

In [34]:
def stem_tweet(tweet, stemmer):
    stemmed_tweet = [stemmer.stem(word) for word in tweet.split()]
    return stemmed_tweet

In [194]:
clean_tweet_sentiment_rdd = tweet_rdd.map(lambda record: (record[0], remove_stopwords(record[1], stopwords))) \
    .map(lambda record: (record[0], basic_cleaning(record[1]))) \
    .map(lambda record: (record[0], remove_non_alpha_starting_words(record[1]))) \
    .map(lambda record: (record[0], replace_repeated_characters(record[1]))) \
    .map(lambda record: (record[0], replace_urls(record[1]))) \
    .map(lambda record: (record[0], stem_tweet(record[1], porter_stemmer))) \
    .map(lambda record: (record[0], replace_user_handles(record[1]))) \
    .map(lambda record: (int(record[0].strip('"')), record[1]))


### Feature Extraction using HashingTF and IDF

This section consists of extracting features to be fed into our algorithms to generate the classifier

In [173]:
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.feature import IDF
from pyspark.mllib.regression import LabeledPoint

In [184]:
clean_tweet_rdd = tweet_rdd.map(lambda record: " ".join(record[1]))

In [179]:
hashingtf = HashingTF()

In [217]:
def incrementer(start):
    while True:
        yield start + 1

inc = incrementer(0)

In [234]:
def index_lines(inc, line):
    return (next(inc), line)

In [248]:
tf = hashingtf.transform(clean_tweet_rdd)

In [269]:
tf_sentiment = clean_tweet_sentiment_rdd.map(lambda record: record[0]).zip(tf) \
    .map(lambda record: LabeledPoint(record[0], record[1]))

In [270]:
tf_sentiment.take(5)

[LabeledPoint(0.0, (1048576,[35920,40200,105642,113415,151034,173013,173606,211440,238153,265159,275028,279832,296409,372567,388504,425102,437751,469732,702216,734443,777769,793623,840959,875351,891534,897367],[1.0,4.0,2.0,1.0,1.0,1.0,3.0,2.0,6.0,6.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0,5.0,2.0,3.0,1.0,87.0,4.0,3.0])),
 LabeledPoint(0.0, (1048576,[35920,173606,211440,238153,265159,279832,296409,335453,372567,388504,425102,524927,702216,702740,734443,777769,840959,875351,891534,897367,968035],[2.0,2.0,5.0,3.0,5.0,2.0,4.0,1.0,1.0,1.0,2.0,1.0,3.0,1.0,5.0,2.0,1.0,76.0,5.0,2.0,3.0])),
 LabeledPoint(0.0, (1048576,[35920,40200,105642,186435,211440,238153,265159,279832,296409,335453,372567,388504,425102,469732,517893,584844,702216,702740,734443,777769,840959,875351,891534,897367,968035],[3.0,5.0,6.0,1.0,2.0,5.0,9.0,3.0,6.0,1.0,2.0,5.0,2.0,1.0,1.0,2.0,3.0,2.0,11.0,2.0,1.0,112.0,3.0,3.0,2.0])),
 LabeledPoint(0.0, (1048576,[35920,173013,173606,211440,238153,243360,265159,279832,296409,372567,38850

In [None]:
training_rdd, test_rdd = clean_tweet_sentiment_rdd.randomSplit([0.6, 0.4])