# Tweet Sentiment Classification

Build and train models to classify tweet sentiment as positive or negative using `Logistic Regression` and `Naïve Bayes` classifiers

In [27]:
import re
from string import punctuation
import numpy as np
import nltk
from nltk.corpus import twitter_samples, stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

## Download and explore data

In [28]:
# Fetch the NLTK `twitter_samples` corpus into the local nltk_data directory
# (the recorded output below shows the archive being unzipped after download)
nltk.download('twitter_samples')

[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Unzipping corpora/twitter_samples.zip.


True

In [29]:
# Read the raw tweet texts of each sentiment class from the corpus JSON files
positive_tweets, negative_tweets = (
    twitter_samples.strings(corpus_file)
    for corpus_file in ('positive_tweets.json', 'negative_tweets.json')
)

In [30]:
# Number of tweets per class, as (positive, negative)
tuple(len(corpus) for corpus in (positive_tweets, negative_tweets))

(5000, 5000)

In [31]:
# Show the first five raw tweets of each class
for heading, corpus in (('Positive Tweets...', positive_tweets),
                        ('\nNegative Tweets...', negative_tweets)):
    print(heading)
    for tweet_no, tweet in enumerate(corpus[:5], start=1):
        print(f'Tweet {tweet_no}: ', tweet)

Positive Tweets...
Tweet 1:  #FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)
Tweet 2:  @Lamb2ja Hey James! How odd :/ Please call our Contact Centre on 02392441234 and we will be able to assist you :) Many thanks!
Tweet 3:  @DespiteOfficial we had a listen last night :) As You Bleed is an amazing track. When are you in Scotland?!
Tweet 4:  @97sides CONGRATS :)
Tweet 5:  yeaaaah yippppy!!!  my accnt verified rqst has succeed got a blue tick mark on my fb profile :) in 15 days

Negative Tweets...
Tweet 1:  hopeless for tmr :(
Tweet 2:  Everything in the kids section of IKEA is so cute. Shame I'm nearly 19 in 2 months :(
Tweet 3:  @Hegelbon That heart sliding into the waste basket. :(
Tweet 4:  “@ketchBurning: I hate Japanese call him "bani" :( :(”

Me too
Tweet 5:  Dang starting next week I have "work" :(


## Preprocess data
Tweets need to be cleaned and encoded to make them ready for modeling.

In [32]:
# NLTK resources needed below: the tokenizer model and the stop-word lists
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)

# Tokens to discard during cleaning: English stop words and the individual
# punctuation characters from `string.punctuation`
punctuations = list(punctuation)
stop_words = stopwords.words('english')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [35]:
def clean_and_tokenize_tweet(tweet, stop_words, punctuations):
    '''
    Returns a clean, tokenized and stemmed representation of a tweet.

    Steps, in order:
    1) Remove twitter handles, hash-tags and hyper-links
    2) Convert the tweet to lowercase
    3) Split the tweet into word tokens with NLTK's `word_tokenize`
    4) Drop tokens that are stop words or punctuation marks
    5) Apply Porter stemming to the remaining tokens

    Parameters
    ----------
    tweet : str
        Raw tweet text.
    stop_words : iterable of str
        Tokens to discard. Tokens are compared after lowercasing.
    punctuations : iterable of str
        Punctuation tokens to discard. Only a token that equals an entry
        exactly is removed.

    Returns
    -------
    list of str
        Stemmed tokens in their original order (empty if nothing is left).
    '''
    # Patterns are raw strings so that regex escapes such as `\(` are not
    # interpreted as (invalid) Python string escapes.
    # Remove handles
    clean_tweet = re.sub(r'@[a-zA-Z0-9_]+', '', tweet)
    # Remove hashtags (the whole tag, not just the '#')
    clean_tweet = re.sub(r'#[a-zA-Z0-9_]+', '', clean_tweet)
    # Remove hyperlinks
    clean_tweet = re.sub(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
        '', clean_tweet)
    # Convert to lowercase
    clean_tweet = clean_tweet.lower()

    # Build the exclusion set once so each token costs a single hash lookup
    # instead of a linear scan over two lists
    excluded_tokens = set(stop_words) | set(punctuations)

    # Tokenize, drop stop words / punctuation, then stem what is left
    stemmer = PorterStemmer()
    return [stemmer.stem(token)
            for token in word_tokenize(clean_tweet)
            if token not in excluded_tokens]

In [40]:
# Clean and tokenize every tweet of both classes. The full corpora are
# processed (not a 5-tweet sample) so that the feature-extraction and
# train/test split steps work on the complete dataset.
clean_positive_tweets = [clean_and_tokenize_tweet(tweet, stop_words, punctuations)
                         for tweet in positive_tweets]
clean_negative_tweets = [clean_and_tokenize_tweet(tweet, stop_words, punctuations)
                         for tweet in negative_tweets]

### Encode tweets and extract features
1) Build vocabulary (unique word list)
2) Build frequency dictionary for positive and negative tweets
3) Encode each tweet in corpus as [bias (always 1), sum-positive-freq, sum-negative-freq]
4) Build a matrix containing all encoded tweets and a list containing the corresponding labels

### Split data in training (80%) and testing (20%) sets

## Logistic Regression

## Naive Bayes