In [1]:
import pyspark
sc = pyspark.SparkContext('local[*]')

# do something to prove it works
rdd = sc.parallelize(range(1000))
rdd.takeSample(False, 5)

[420, 449, 540, 105, 514]

In [2]:
tweet_rdd = sc.textFile('./data/train.csv') \
    .map(lambda line: (line.split(',')[0], line.split(',')[-1]))

### Tweet Processing Functions

Below are the definitions of the functions that are used to process the tweets before being sent on to feature extraction

In [4]:
!pip install nltk

Collecting nltk
  Downloading nltk-3.2.1.tar.gz (1.1MB)
[K    100% |████████████████████████████████| 1.1MB 402kB/s 
[?25hBuilding wheels for collected packages: nltk
  Running setup.py bdist_wheel for nltk ... [?25l- \ | / - \ | / - done
[?25h  Stored in directory: /home/jovyan/.cache/pip/wheels/55/0b/ce/960dcdaec7c9af5b1f81d471a90c8dae88374386efe6e54a50
Successfully built nltk
Installing collected packages: nltk
Successfully installed nltk-3.2.1


In [289]:
import re
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

In [290]:
with open('./data/stopwords.txt') as stopword_file:
    stopwords = {stopword.strip(): 1 for stopword in stopword_file.readlines()}
    
porter_stemmer = PorterStemmer()

In [387]:
def basic_cleaning(tweet):
    clean_tweet = tweet.lower().strip().strip('\'').strip('"')
    clean_tweet = clean_tweet.replace('.', '').replace('?', '').replace(',', '').replace('#', '') \
        .replace('!', '').replace('\'', '').replace('"', '').replace('..', '').replace('...', '')
    clean_tweet = re.sub('\d+', '', clean_tweet)
    clean_tweet = clean_tweet.strip('"').strip()
    return clean_tweet

In [371]:
def remove_non_alpha_starting_words(tweet):
    non_alpha_start_words_regex = '(^|\s)(:|;)(D|d|\/)(?=\s|[^[:alnum:]+-]|$)'
    non_alpha_start_words_regex = '(^|\s)[^a-zA-Z]\w*($|\s)'
    clean_tweet = re.sub(non_alpha_start_words_regex, ' ', tweet)
    return clean_tweet

In [333]:
def remove_stopwords(tweet):
    tweet_words = tweet.split()
    tweet_without_stopwords = [word for word in tweet_words if word not in stopwords]
    return ' '.join(tweet_without_stopwords)

In [334]:
def replace_urls(tweet):
    url_regex = '((https?:\/\/(?:www\.|(?!www))|)[^\s\.]+\.[^\s]{2,}|www\.[^\s]+\.[^\s]{2,})'
    clean_tweet = re.sub(url_regex, 'URL', tweet)
    return clean_tweet

In [335]:
def replace_user_handles(tweet):
    tweet_str = ' '.join(tweet) if isinstance(tweet, list) else tweet
    user_handle_regex = '(?<=^|(?<=[^a-zA-Z0-9-_\.]))@([A-Za-z]+[A-Za-z0-9]+)'
    clean_tweet = re.sub(user_handle_regex, 'AT_USER', tweet_str)

    return clean_tweet

In [336]:
def replace_repeated_characters(tweet):
    repeated_character_regex = '(\w)\\1{2,}'
    matches = re.finditer(repeated_character_regex, tweet)
    clean_tweet = tweet
    for match in matches:
        start = match.start(0)
        end = match.end(0)
        char = tweet[start]
        clean_tweet = re.sub(repeated_character_regex, char + char, clean_tweet[:end])
        clean_tweet += tweet[end:]
        
    return clean_tweet

In [337]:
def stem_tweet(tweet, stemmer):
    stemmed_tweet = [stemmer.stem(word) for word in tweet.split()]
    return ' '.join(stemmed_tweet)

In [388]:
clean_tweet_sentiment_rdd = tweet_rdd.map(lambda record: (record[0], basic_cleaning(record[1]))) \
    .map(lambda record: (record[0], replace_urls(record[1]))) \
    .map(lambda record: (record[0], replace_user_handles(record[1]))) \
    .map(lambda record: (record[0], remove_non_alpha_starting_words(record[1]))) \
    .map(lambda record: (record[0], remove_stopwords(record[1]))) \
    .map(lambda record: (int(record[0].strip('"')), record[1])) \
    .map(lambda record: (record[0], replace_repeated_characters(record[1]))) \
    .map(lambda record: (record[0], stem_tweet(record[1], porter_stemmer)))

In [389]:
clean_tweet_sentiment_rdd.take(5)

[(0, 'that bummer shoulda got david carr third day'),
 (0, 'AT_US day didnt get much done'),
 (0, 'realli dont feel like get today got studi tomorrow practic exam'),
 (0, 'commiss play cop blood sand'),
 (0, 'sleep soon hate say bye see tomorrow night')]

In [345]:
tweet_rdd.take(5)

[('"0"',
  ' that\'s a bummer.  You shoulda got David Carr of Third Day to do it. ;D"'),
 ('"0"', '"@alydesigns i was out most of the day so didn\'t get much done "'),
 ('"0"',
  '"really don\'t feel like getting up today... but got to study to for tomorrows practical exam... "'),
 ('"0"',
  ' it\'s out of commission  Wutcha playing? Have you copped \'Blood On The Sand\'?"'),
 ('"0"',
  '"sleep soon... i just hate saying bye and see you tomorrow for the night. "')]

### Feature Extraction using HashingTF and IDF

This section consists of extracting features to be fed into our algorithms to generate the classifier

In [390]:
from pyspark.mllib.feature import HashingTF, IDF
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import NaiveBayes, NaiveBayesModel

In [391]:
clean_tweet_rdd = clean_tweet_sentiment_rdd.map(lambda record: " ".join(record[1]))

In [392]:
hashingtf = HashingTF(100000)

In [400]:
tf = hashingtf.transform(clean_tweet_rdd)
tf.cache()
idf = IDF(minDocFreq=2).fit(tf)
tfidf = idf.transform(tf)

In [401]:
tf_idf_sentiment = clean_tweet_sentiment_rdd.map(lambda record: record[0]).zip(tfidf) \
    .map(lambda record: LabeledPoint(record[0], record[1]))

In [402]:
training, test = tf_idf_sentiment.randomSplit([0.9, 0.1])

### Naive Bayes

This section consists of the training of a Naive Bayes model and checking its accuracy value

In [410]:
model = NaiveBayes.train(training, 1.0)

In [412]:
predictionAndLabel = test.map(lambda p: (model.predict(p.features), p.label))

In [413]:
accuracy = predictionAndLabel.filter(lambda x: x[0] == x[1]).count() / test.count()
accuracy

0.5688696726374305

In [190]:
test_rdd = sc.textFile('./data/test.csv') \
    .map(lambda line: (line.split(',')[0], line.split(',')[-1]))

In [191]:
test_sentiment_rdd = test_rdd.map(lambda record: (record[0], remove_stopwords(record[1], stopwords))) \
    .map(lambda record: (record[0], basic_cleaning(record[1]))) \
    .map(lambda record: (record[0], remove_non_alpha_starting_words(record[1]))) \
    .map(lambda record: (record[0], replace_repeated_characters(record[1]))) \
    .map(lambda record: (record[0], replace_urls(record[1]))) \
    .map(lambda record: (record[0], stem_tweet(record[1], porter_stemmer))) \
    .map(lambda record: (record[0], replace_user_handles(record[1]))) \
    .map(lambda record: (int(record[0].strip('"')), record[1]))

In [192]:
clean_test_rdd = test_sentiment_rdd.map(lambda record: " ".join(record[1]))

In [193]:
test_tf = hashingtf.transform(clean_test_rdd)
test_tf.cache()
test_idf = IDF().fit(test_tf)
test_tfidf = idf.transform(test_tf)

In [194]:
test_tf_idf_sentiment = test_sentiment_rdd.map(lambda record: record[0]).zip(test_tfidf) \
    .map(lambda record: LabeledPoint(record[0], record[1]))

In [195]:
predictionAndLabel = test_tf_idf_sentiment.map(lambda p: (model.predict(p.features), p.label))

In [196]:
accuracy = predictionAndLabel.filter(lambda x: x[0] == x[1]).count() / test_tf_idf_sentiment.count()
accuracy

0.5543175487465181

### Logistic Regression

This following section describes the Logistic Regression steps

In [414]:
from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionModel

In [415]:
model = LogisticRegressionWithLBFGS.train(training)

In [416]:
labelsAndPreds = test.map(lambda p: (p.label, model.predict(p.features)))
trainErr = labelsAndPreds.filter(lambda record: record[0] != record[1]).count() / float(test.count())

In [417]:
trainErr

0.4170475602223595

In [411]:
def k_fold_cross_validation(k_value, model1, model2, data):
    avg_accuracy1 = get_avg_accuracy(k_value, model1, data)
    avg_accuracy2 = get_avg_accuracy(k_value, model2, data)
    best_model = None
    if avg_accuracy1 > avg_accuracy2:
        best_model = model1
    else:
        best_model = model2
    
    return best_model


def get_avg_accuracy(k_value, model_type, data):
    split_ratio = [0.1 for i in range(0, k_value)]
    data_split = data.randomSplit(split_ratio)
    test_index = 0
    avg_accuracy = 0
    best_model = None
    for test_index in range(0, k_value):
        training_list = [i for index, i in enumerate(data_split) if index != test_index]
        training_rdd = sc.emptyRDD()
        for training in training_list:
            training_rdd.union(training)
        model = model_type.train(training, 1.0)
        accuracy = test_with_model(model, data_split[test_index])
        avg_accuracy += accuracy
    
    avg_accuracy /= k_value
    
    return avg_accuracy


def test_with_model(model, test):
    predictionAndLabel = test.map(lambda p: (model.predict(p.features), p.label))
    accuracy = predictionAndLabel.filter(lambda x: x[0] == x[1]).count() / test.count()
    return accuracy

In [None]:
best_model = k_fold_cross_validation(10, NaiveBayes, LogisticRegressionWithLBFGS, tf_idf_sentiment)

In [None]:
best_model