In [1]:
import pyspark
sc = pyspark.SparkContext('local[*]')

# do something to prove it works
rdd = sc.parallelize(range(1000))
rdd.takeSample(False, 5)

[909, 137, 166, 245, 856]

In [15]:
tweet_rdd = sc.textFile('./data/train.csv'). \
    map(lambda line: (line.split(',')[0], line.split(',')[-1]))

### Tweet Processing Functions

Below are the definitions of the functions that are used to process the tweets before being sent on to feature extraction

In [20]:
import re
from nltk.stem.porter import PorterStemmer

In [33]:
with open('./data/stopwords.txt') as stopword_file:
    stopwords = {stopword.strip(): 1 for stopword in stopword_file.readlines()}
    
stemmer = PorterStemmer()

In [24]:
def basic_cleaning(tweet):
    punctuation_hash_regex = '[\.\?\,\!#]'
    clean_tweet = tweet.lower()
    clean_tweet = re.sub(punctuation_hash_regex, '', clean_tweet)
    return clean_tweet

In [26]:
def remove_non_alpha_starting_words(tweet):
    non_alpha_start_words_regex = '\b[^a-zA-Z].+\s'
    clean_tweet = re.sub(non_alpha_start_words_regex, ' ', tweet)
    return clean_tweet

In [27]:
def remove_stopwords(tweet, stopwords):
    tweet_words = tweet.split()
    tweet_without_stopwords = []
    for word in tweet_words:
        cleaned_word = word.strip().strip('.').strip(',').strip('?').strip(':').strip(';')
        if cleaned_word.lower() not in stopwords and cleaned_word != '':
            tweet_without_stopwords.append(cleaned_word)
    return tweet_without_stopwords

In [25]:
def replace_urls(tweet):
    url_regex = '((https?:\/\/(?:www\.|(?!www))|)[^\s\.]+\.[^\s]{2,}|www\.[^\s]+\.[^\s]{2,})'
    clean_tweet = re.sub(url_regex, 'URL', tweet)
    return clean_tweet

In [32]:
def replace_user_handles(tweet):
    user_handle_regex = '(?<=^|(?<=[^a-zA-Z0-9-_\.]))@([A-Za-z]+[A-Za-z0-9]+)'
    clean_tweet = re.sub(user_handle_regex, 'AT_USER', tweet)
    return clean_tweet

In [42]:
def replace_repeated_characters(tweet):
    repeated_character_regex = '(\w)\\1{2,}'
    matches = re.finditer(repeated_character_regex, tweet)
    clean_tweet = tweet
    for match in matches:
        start = match.start(0)
        end = match.end(0)
        char = tweet[start]
        clean_tweet = re.sub(repeated_character_regex, char + char, clean_tweet[:end])
        clean_tweet += tweet[end:]
        
    return clean_tweet

In [34]:
def stem_tweet(tweet, stemmer):
    stemmed_tweet = [stemmer.stem(word) for word in tweet.split()]
    return stemmed_tweet

In [41]:
dirty_str = 'Hellooo Howww arrreee you?'
x = replace_repeated_characters(dirty_str)
x

'Helloo'