# Twitter Sentiment Analysis

In [1]:
import csv
import re
import nltk
import random
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation

### Utilities

In [2]:
class PreprocessTweets:
    def __init__(self):
        self._stopwords = set(stopwords.words('english') + list(punctuation) + ['AT_USER', 'URL'])

    def process_tweets(self, list_of_tweets):
        processed_tweets = []
        for tweet in list_of_tweets:
            if tweet["label"] is not None:
                if tweet["label"] == "positive" or tweet["label"] == "negative":
                    processed_tweets.append((self._process_tweet(tweet["text"]), tweet["label"]))
            else:
                processed_tweets.append((self._process_tweet(tweet["text"]), None))

        return processed_tweets

    def _process_tweet(self, tweet):
        tweet = tweet.lower()  # convert text to lower-case
        tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', 'URL', tweet)  # remove URLs
        tweet = re.sub('@[^\s]+', 'AT_USER', tweet)  # remove usernames
        tweet = re.sub(r'#([^\s]+)', r'\1', tweet)  # remove the # in #hashtag
        tweet = word_tokenize(tweet)  # remove repeated characters (helloooooooo into hello)

        words = []
        for word in tweet:
            if word not in self._stopwords:
                words.append(word)
        return words

In [3]:
def build_vocabulary(preprocessed_training_data):
    all_words = []

    for (words, sentiment) in preprocessed_training_data:
        all_words.extend(words)

    wordlist = nltk.FreqDist(all_words)
    word_features = wordlist.keys()

    return word_features

In [4]:
def extract_features(tweet):
    tweet_words = set(tweet)
    features = {}
    for word in training_data_features:
        is_feature_in_words = word in tweet_words
        features[word] = is_feature_in_words
    return features

### Load Tweet Data

In [5]:
tweet_data = []
with open("tweetDataFile.csv", 'rt', encoding='cp437') as csvfile:
    lineReader = csv.reader(csvfile, delimiter=',', quotechar="\"")
    for row in lineReader:
        tweet_data.append({"tweet_id": row[0], "text": row[1], "label": row[2], "topic": row[3]})

In [6]:
len(tweet_data)

2935

In [7]:
random.shuffle(tweet_data)

print("total tweet data = ", len(tweet_data))
training_data_set = tweet_data[:2435]
test_data_set = tweet_data[2435:]

total tweet data =  2935


# Sentiment Analyzer

### Training

In [8]:
tweet_processor = PreprocessTweets()
preprocessed_training_set = tweet_processor.process_tweets(training_data_set)

training_data_features = build_vocabulary(preprocessed_training_set)

training_features = nltk.classify.apply_features(extract_features, preprocessed_training_set)

NBayesClassifier = nltk.NaiveBayesClassifier.train(training_features)

### Testing

In [9]:
preprocessed_test_set = tweet_processor.process_tweets(test_data_set)

classified_result_labels = []
for tweet in preprocessed_test_set:
    classified_result_labels.append(NBayesClassifier.classify(extract_features(tweet[0])))

if classified_result_labels.count('positive') > classified_result_labels.count('negative'):
    print("Overall Positive Sentiment")
    print("Positive Sentiment Percentage = " + str(
        100 * classified_result_labels.count('positive') / len(classified_result_labels)) + "%")
else:
    print("Overall Negative Sentiment")
    print("Negative Sentiment Percentage = " + str(
        100 * classified_result_labels.count('negative') / len(classified_result_labels)) + "%")

Overall Negative Sentiment
Negative Sentiment Percentage = 56.64335664335665%
