In [1]:
from nltk.corpus import stopwords
from nltk import NaiveBayesClassifier
import json
import re

In [2]:
def read_file(filename):
    """Parse the JSON document stored at ``filename`` and return the decoded object.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file content (dict, list, ...).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # JSON interchange text is UTF-8; being explicit keeps the result
    # independent of the platform's default locale encoding.
    with open(filename, encoding="utf-8") as datafile:
        return json.load(datafile)

In [3]:
# Patterns used by clean_tweet, compiled once when the cell is executed.
_MENTION_RE = re.compile(r'(?:@[\w_]+)')
# NOTE: this pattern needs at least two word characters after the '#',
# so one-character hash-tags such as "#a" are left in place.
_HASHTAG_RE = re.compile(r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)")
_NUMBER_RE = re.compile(r'(?:(?:\d+,?)+(?:\.?\d+)?)')
# The character class uses a plain "&" instead of the HTML-escaped "&amp;".
# The matched character set is the same: a/m/p are covered by [a-z] and ";"
# already lies inside the "$-_" range.
_URL_RE = re.compile(r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+')

# Lazily filled cache so the stopword corpus is turned into a set only once.
_STOPSET_CACHE = {}


def _english_stopset():
    """Return the NLTK English stopwords as a frozenset, building it on first use."""
    if "english" not in _STOPSET_CACHE:
        _STOPSET_CACHE["english"] = frozenset(stopwords.words('english'))
    return _STOPSET_CACHE["english"]


def clean_tweet(tweet):
    """Strip noise from a tweet and return its remaining words as one string.

    The following are removed, in this order: @-mentions, hash-tags, numbers
    and ``http``/``https`` URLs. The remaining text is tokenized with
    ``tokenize_string`` (which lower-cases it), English stopwords are dropped
    and the surviving tokens are joined with single spaces.

    Args:
        tweet: Raw tweet text.

    Returns:
        The cleaned, lower-cased text; an empty string if nothing survives.
    """
    # Same removal order as the comments above: mentions, hash-tags,
    # numbers, then URLs.
    for pattern in (_MENTION_RE, _HASHTAG_RE, _NUMBER_RE, _URL_RE):
        tweet = pattern.sub("", tweet)

    stopset = _english_stopset()
    return " ".join(
        token for token in tokenize_string(tweet) if token not in stopset
    )

In [4]:
def tokenize_string(my_string):
    """Split text into lower-case tokens made of word characters and hyphens.

    Args:
        my_string: Text to tokenize.

    Returns:
        A list of the maximal runs of ``\\w`` characters and ``-`` found in the
        lower-cased input, in order of appearance (empty list if none).
    """
    # Raw string: "\w" and "\-" are regex escapes, not Python string escapes,
    # and are invalid escape sequences in a normal string literal.
    return re.findall(r'[\w\-]+', my_string.lower())

In [5]:
def word_features(tweet):
    """Build a bag-of-words feature dict for a piece of text.

    Every distinct token produced by ``tokenize_string(tweet)`` becomes a key
    mapped to ``True``.
    """
    tokens = tokenize_string(tweet)
    return dict.fromkeys(tokens, True)

In [6]:
def read_training_set(filename):
    """Load labelled training samples from a comma-separated text file.

    Each non-blank line is split on ``","``: the first field is passed through
    ``clean_tweet`` and the second field (with its newline removed) is used as
    the label. Any further fields are ignored.

    NOTE: no CSV quoting is interpreted, so tweet text that itself contains a
    comma is cut at the first comma and the label is taken from what follows.

    Args:
        filename: Path of the training file.

    Returns:
        A list of ``(cleaned_text, label)`` tuples in file order.

    Raises:
        OSError: If the file cannot be opened.
        IndexError: If a non-blank line contains no comma.
    """
    training = []
    # Explicit encoding so parsing does not depend on the platform locale.
    with open(filename, encoding="utf-8") as training_file:
        # Iterate lazily instead of loading every line into memory first.
        for line in training_file:
            # Blank lines (e.g. a trailing newline at end of file) hold no
            # sample; skip them instead of failing on the missing label.
            if not line.strip():
                continue
            fields = line.split(",")
            training.append((clean_tweet(fields[0]), fields[1].replace("\n", "")))
    return training

In [7]:
def get_naive_bayes_classifier(training_file="train.csv"):
    """Train an NLTK Naive Bayes classifier on a labelled tweet file.

    Args:
        training_file: Path of the training data, read with
            ``read_training_set``. Defaults to ``"train.csv"``.

    Returns:
        The object returned by ``NaiveBayesClassifier.train`` for the
        ``(word_features(text), label)`` pairs built from the training data.
    """
    train_features = [
        (word_features(text), label)
        for text, label in read_training_set(training_file)
    ]
    return NaiveBayesClassifier.train(train_features)

In [8]:
def classify_tweet(classifier, text):
    """Return the label ``classifier`` assigns to a single raw tweet.

    The text is cleaned with ``clean_tweet`` and converted to a feature dict
    with ``word_features`` before being handed to ``classifier.classify``.
    """
    features = word_features(clean_tweet(text))
    return classifier.classify(features)

In [9]:
def classify_tweets(tweets, classifier=None):
    """Label every tweet text found in a mapping of tweet collections.

    Args:
        tweets: Mapping whose values each provide an iterable of tweet texts
            under the ``"tweets"`` key. The mapping's keys are not used.
        classifier: Optional already-trained classifier exposing
            ``classify(features)``. When omitted, a new one is trained with
            ``get_naive_bayes_classifier()``.

    Returns:
        A list of ``(text, sentiment)`` tuples, with the original (uncleaned)
        text, in iteration order.

    Raises:
        KeyError: If one of the values has no ``"tweets"`` entry.
    """
    if classifier is None:
        classifier = get_naive_bayes_classifier()

    return [
        (text, classify_tweet(classifier, text))
        for entry in tweets.values()
        for text in entry["tweets"]
    ]

In [10]:
def save_data(data, filename):
    """Serialize ``data`` as JSON into ``filename``, replacing any existing file."""
    with open(filename, "w") as json_out:
        json.dump(data, json_out)

In [13]:
# Pipeline entry point: load the tweets, label each one, store the result.
if __name__ == "__main__":
    # Input: a JSON mapping whose values carry their texts under "tweets"
    # (that is the shape classify_tweets iterates over).
    tweets = read_file("tweets.json")
    # Produces a list of (text, sentiment) pairs.
    classification = classify_tweets(tweets)
    # Written as JSON to "classification.json".
    save_data(classification, "classification.json")