# PSUPR / PRMLS CA2

### Team
Kenneth Goh Chia Wei : A0198544N  
Tan Heng Han : A0198502B  
Raymond Ng Boon Cheong : A0198543R

### CA2 Test Script
Perform sentiment analysis on tweets from Apple event

In [71]:
import pandas as pd
import lxml
import re
import time

from bs4 import BeautifulSoup
from nltk.tokenize import WordPunctTokenizer

from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model

### Create a function for cleaning up tweets

In [72]:
def tweet_cleaner(text):
    # Init WordPunctTokenizer
    tok = WordPunctTokenizer()
    # create reg expression
    pat1 = r'@[A-Za-z0-9]+'
    pat2 = r'https?://[A-Za-z0-9./]+'
    combined_pat = r'|'.join((pat1, pat2))
    # Use beautiful soup to decode html to text
    soup = BeautifulSoup(text, 'lxml')
    souped = soup.get_text()
    # strip text using regular expression
    stripped = re.sub(combined_pat, '', souped)
    try:
        clean = stripped.decode("utf-8-sig").replace(u"\ufffd", "?")
    except:
        clean = stripped
    # strip to letters only
    letters_only = re.sub("[^a-zA-Z]", " ", clean)
    # remove retweets handle
    removed_rt = re.sub('RT ','',letters_only)
    # set to lower case for all characters
    lower_case = removed_rt.lower()
    # Use tokenize to remove whitespace created by letters_only above
    words = tok.tokenize(lower_case)
    return (" ".join(words)).strip()

### Load Apple event data from csv

In [73]:
file = '..\\dataset\\tweets_appleEvent.csv'
df = pd.read_csv(file)

In [74]:
df.shape

(1000, 14)

In [75]:
df.head()

Unnamed: 0,text,weekday,month,day,hour,has_hashtag,has_url,fav_count,is_reply,retweet_count,followers,following,user,screen_name
0,RT @timmaughan: Back in 2014 I visited Inner M...,Tue,Sep,10,16,0,0,0,0,4,3086,1288,Roderick McKie Esquire,rodmckie
1,RT @beebomco: iPhone 11 launches tonight. You ...,Tue,Sep,10,16,1,0,0,0,1716,14,90,God Kizaru🐝🇩🇴🍑,GodKizaruRD
2,RT @CNET: Tune into the #AppleEvent today at:\...,Tue,Sep,10,16,1,0,0,0,84,418,2700,on/off ™,EstherLamarr
3,RT @BraJinglez: Apple has added a new payment ...,Tue,Sep,10,16,1,0,0,0,5943,848,352,Yemi Stark,yemiscofield
4,RT @mauobenol: iPhone 11 launches tomorrow! An...,Tue,Sep,10,16,1,0,0,0,4094,48,69,𝗸𝗮𝗶𝗹𝗲𝗲。,_seoularr


In [76]:
# Keep only tweets and its label
text = df['text']
text.head()

0    RT @timmaughan: Back in 2014 I visited Inner M...
1    RT @beebomco: iPhone 11 launches tonight. You ...
2    RT @CNET: Tune into the #AppleEvent today at:\...
3    RT @BraJinglez: Apple has added a new payment ...
4    RT @mauobenol: iPhone 11 launches tomorrow! An...
Name: text, dtype: object

### Clean up tweets

In [77]:
%%time
cleaned_tweets = []
for tweet in text:
    cleaned_tweets.append(tweet_cleaner(tweet))
print(cleaned_tweets[:5])
print(f'Tweets total: {len(cleaned_tweets)}')

['back in i visited inner mongolia and this toxic run off lake that s a byproduct of rare earth refining want to help', 'iphone launches tonight you ready iphone iphonexi appleevent apple', 'tune into the appleevent today at am pdt pm cdt pm edt pm bst pm ist wednesday', 'apple has added a new payment option for iphone appleevent', 'iphone launches tomorrow and i still have an iphone s appleevent']
Tweets total: 1000
Wall time: 241 ms


### Rejoin all tweets and drop duplicate tweets

In [78]:
appleTweets = pd.DataFrame({'text': cleaned_tweets})
appleTweets.drop_duplicates(subset='text', inplace=True, keep='first')

In [79]:
appleTweets.shape

(328, 1)

In [80]:
appleTweets.head()

Unnamed: 0,text
0,back in i visited inner mongolia and this toxi...
1,iphone launches tonight you ready iphone iphon...
2,tune into the appleevent today at am pdt pm cd...
3,apple has added a new payment option for iphon...
4,iphone launches tomorrow and i still have an i...


### Load pre-train model

In [81]:
model = load_model('train_model')

In [82]:
labels = ['happy', 'sad', 'angry']

def label_sentiment(scores):
    maxScore = 0
    maxIdx = 0
    for i in range(len(scores)):
        if scores[i] > maxScore:
            maxScore = scores[i]
            maxIdx = i
    return maxIdx

def predict(text):
    # Tokenize text
    x_test = pad_sequences(tokenizer.texts_to_sequences([text]), maxlen=300)
    # Predict
    scores = model.predict([x_test])[0]
    # Decode sentiment
    lblIdx = label_sentiment(scores)
    return labels[lblIdx]  

### Perform sentiment analysis on tweets

In [None]:
%%time
for txt in finalDF.text:
    print(f'Tweet: {txt}')
    print(f'Predicted Sentiment: {predict(txt)}\n')

Tweet: back in i visited inner mongolia and this toxic run off lake that s a byproduct of rare earth refining want to help
Predicted Sentiment: angry

Tweet: iphone launches tonight you ready iphone iphonexi appleevent apple
Predicted Sentiment: sad

Tweet: tune into the appleevent today at am pdt pm cdt pm edt pm bst pm ist wednesday
Predicted Sentiment: angry

Tweet: apple has added a new payment option for iphone appleevent
Predicted Sentiment: happy

Tweet: iphone launches tomorrow and i still have an iphone s appleevent
Predicted Sentiment: angry

Tweet: rawkz i m confused did apple just launched a new gas cooker or is this the iphone appleevent
Predicted Sentiment: sad

Tweet: cupertino calling join us today at a m pdt to watch the appleevent at
Predicted Sentiment: sad

Tweet: i can t believe we re already at iphone like when did that happen appleevent
Predicted Sentiment: sad

Tweet: this trailer was exclusively shot with the iphone you can see a large improvement in the camera