In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import precision_score, recall_score, f1_score

import nltk 
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
nltk.download('vader_lexicon')

import re

import spacy
from spacy.lang.en.stop_words import STOP_WORDS

[nltk_data] Downloading package vader_lexicon to C:\Users\Quynh
[nltk_data]     Pham\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


# Import data

In [2]:
# Read the training split. header=0 discards the file's own header row in favour of
# `names`; the first column (named 'index' here) is not needed and is dropped.
train = (
    pd.read_csv('sentiment_train.csv', header=0, names=['index', 'text', 'sentiment'])
    .drop(columns='index')
)
train

Unnamed: 0,text,sentiment
0,⚠️ #TLUpdates - Due to a safety inspection of ...,negative
1,@TLRailUK On the announcement we were told tha...,negative
2,1418 Thameslink train to Horsham: On time - pl...,neutral
3,This morning's Thameslink train is covered in ...,neutral
4,"@TLRailUK I think a bit before Mill Hill, but ...",negative
...,...,...
11373,@TLRailUK She was the ticket lady at Arlesey t...,positive
11374,@TLRailUK perhaps you could have your train dr...,negative
11375,1323 Great Northern train to Moorgate: Delayed...,neutral
11376,Now stuck at harpenden. Not meant to stop here...,negative


In [3]:
# Read the validation split. header=0 discards the file's own header row in favour of
# `names`; the first column (named 'index' here) is not needed and is dropped.
validation = (
    pd.read_csv('sentiment_validation.csv', header=0, names=['index', 'text', 'sentiment'])
    .drop(columns='index')
)
validation

Unnamed: 0,text,sentiment
0,What dirt does @GatwickExpress have on @TLRail...,negative
1,@EmilyTrenouth @TLRailUK Can’t see a carriage ...,neutral
2,@TLRailUK The 18:57 Cambridge to KingsX was de...,negative
3,@SouthernRailUK @TLRailUK @GatwickExpress @MET...,neutral
4,@TLRailUK Thank you for coming back to me. It'...,negative
...,...,...
2003,@TLRailUK Thanks.......It said it arrived ? Ca...,negative
2004,@TLRailUK Can anyone there explain to me why I...,neutral
2005,2005 Thameslink train to Gatwick Airport: Dela...,neutral
2006,.@networkrail @TLRailUK which person designed ...,negative


In [4]:
# Read the test split. header=0 discards the file's own header row in favour of
# `names`; the first column (named 'index' here) is not needed and is dropped.
test = (
    pd.read_csv('sentiment_test.csv', header=0, names=['index', 'text', 'sentiment'])
    .drop(columns='index')
)
test

Unnamed: 0,text,sentiment
0,#TLUpdates - We have been advised by our colle...,neutral
1,@TLRailUK Aiming ... like chucking a hot dog u...,negative
2,Long time since I’ve had to get a @TLRailUK tr...,negative
3,@thomasbrake @TLRailUK Can I ask for your view...,neutral
4,@TLRailUK how is it even possible for you to c...,negative
...,...,...
2358,⚠️ #TLUpdates - Services are beginning to retu...,negative
2359,⚠️#TLUpdates - Train services running through ...,neutral
2360,@LiveCommute @tlupdates @WorkerRailway @SaaSyS...,negative
2361,"@TLRailUK Hey, how do I do the ‘return unused ...",neutral


# Functions for cleaning tweets (clean text & clean sentence)

In [5]:
# Load spaCy's `en_core_web_sm` English pipeline and customise its default stop-word set.
nlp = spacy.load('en_core_web_sm')

# Add the operator's names/handles (in the form they take once '#' and '@' are stripped)
# to the stop words so they are filtered out of the cleaned tweets.
nlp.Defaults.stop_words |= {'thameslink','tlupdates','gtrailuk','tlrailuk','govia', 'gtr'}

# Take 'not' out of the stop words (presumably so negation survives cleaning and can
# still influence the sentiment score).
# discard() instead of remove(): remove() raises KeyError when 'not' is already absent,
# which is what happens if this cell is executed a second time in the same kernel.
nlp.Defaults.stop_words.discard('not')



In [6]:
# Show how many stop words are configured, followed by the set itself
print(len(STOP_WORDS), STOP_WORDS, sep='\n')

331
{"'ll", 'anywhere', 'seems', 'many', 'seem', 'against', 'sometime', 'through', 'various', 'me', 'we', 'two', 'none', 'tlrailuk', 'and', 'now', 'up', 'thence', 'often', 'this', 'become', 'fifteen', 'be', 'sometimes', '’ve', 'would', 'latterly', 'had', 'than', 'his', 'which', 'thru', 'again', 'below', 'whereafter', 'anyone', 'though', 'four', 'using', 'your', 'five', 'from', 'else', 'amount', 'hundred', 'move', 'therefore', 'i', 'that', 'perhaps', 'thereby', 'always', 'nothing', 'either', 'sixty', 'should', 'being', 'hereby', 'per', 'all', "'d", 'meanwhile', 'please', 'three', 'mostly', 'anyhow', 'once', 'been', 'these', 'hers', 'what', 'first', 'seeming', 'bottom', 'became', 'due', 'do', 'even', '’m', 'elsewhere', '’re', 'least', 'those', 'yourselves', 'thereafter', 'herself', 'in', 'already', 'regarding', 'almost', 'gtr', 'make', 'is', 'between', 'yourself', 'serious', 'their', 'much', 'wherein', 'ca', 'anything', 'well', 'upon', 'off', 'because', 'before', 'go', 'thereupon', 'anot

In [7]:
# Create a function to clean tweet content
# The tweet is cleaned by the following steps, in this order:
# 1. Remove emoji
# 2. Remove #thameslink, #TLUpdates, @gtrailuk, @TLRailUK
# 3. Convert the text to lowercase
# 4. Remove Twitter @usernames
# 5. Remove hyperlinks
# 6. Remove standalone numbers
# 7. Remove punctuation
# Returns the cleaned text

def cleanText(text):
    """Normalise one raw tweet into plain lowercase text.

    Steps, in order: strip emoji, strip the operator's own hashtags/handles
    (any capitalisation), lowercase, strip remaining @usernames, strip
    hyperlinks, strip standalone numbers, strip punctuation.

    Parameters
    ----------
    text : str
        Raw tweet text. Any non-string value (e.g. the float NaN pandas uses
        for an empty CSV cell, or None) is treated as an empty tweet.

    Returns
    -------
    str
        The cleaned text. Whitespace is not collapsed, so removed tokens can
        leave leading/trailing or doubled spaces behind.
    """
    # re.sub raises TypeError on non-strings; treat missing tweets as empty instead
    if not isinstance(text, str):
        return ''

    # emoji code-point ranges to strip
    emoji_pattern = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                                   "]+", flags = re.UNICODE)
    text = emoji_pattern.sub('', text)

    # Operator hashtags/handles. Matched case-insensitively because users type them with
    # arbitrary capitalisation (#Thameslink, @tlrailuk, ...); a case-sensitive match would
    # leave the bare brand word in the text once '#' is stripped as punctuation.
    for brand_tag in ('#thameslink', '#TLUpdates', '@gtrailuk', '@TLRailUK'):
        text = re.sub(re.escape(brand_tag), '', text, flags=re.IGNORECASE)

    text = text.lower()
    # usernames consist of letters, digits and underscores
    text = re.sub(r'@[A-Za-z0-9\_]+', '', text)
    # hyperlinks: everything from http(s):// up to the next whitespace
    text = re.sub(r'https?:\/\/\S+', '', text)
    # standalone numbers only; digits glued to letters (e.g. "1m") are kept
    text = re.sub(r'\b\d+\b', '', text)
    # punctuation: anything that is neither a word character nor whitespace
    text = re.sub(r'[^\w\s]', '', text)
    return text

In [8]:
# Clean the tweet content
# Every tweet in the dataset goes through the following steps:
# 1. Basic clean (i.e. remove unnecessary emojis, patterns, punctuation...) using the cleanText function above
# 2. Tokenization: break the sentence into words
# 3. Remove stop words
# 4. Lemmatization: reduce each word to its dictionary form (lemma)
# 5. Drop single-character tokens and join the remaining words back into one string

def cleanSentence(main_df):
    """Return a copy of `main_df` whose 'text' column holds cleaned, lemmatized tweets.

    Each tweet is passed through cleanText, tokenized with nltk.word_tokenize,
    filtered against STOP_WORDS, lemmatized with WordNetLemmatizer, stripped of
    single-character tokens and re-joined with single spaces.

    Parameters
    ----------
    main_df : pandas.DataFrame
        Frame with a 'text' column of raw tweets. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of `main_df` with the 'text' column replaced; all other columns
        and the index are unchanged.
    """
    df = main_df.copy()
    lemmatizer = WordNetLemmatizer()

    def _clean_one(raw_text):
        # clean -> tokenize -> drop stop words -> lemmatize -> drop 1-char tokens
        tokens = nltk.word_tokenize(cleanText(raw_text))
        lemmas = [lemmatizer.lemmatize(tok) for tok in tokens if tok not in STOP_WORDS]
        return ' '.join(str(lemma) for lemma in lemmas if len(lemma) != 1)

    # Assign positionally in one go rather than writing row-by-row with df.at[label, ...]:
    # label-based writes hit every row sharing that label, so a frame with duplicate index
    # labels (e.g. concatenated splits) would end up with rows overwriting each other.
    df['text'] = [_clean_one(raw_text) for raw_text in df['text']]
    return df

# SENTIMENT ANALYSIS WITH VADER

In [9]:
# Build the VADER working frame from a cleaned copy of the test split
Vader_df = cleanSentence(test.copy())
Vader_df

Unnamed: 0,text,sentiment
0,advised colleague network rail point failure s...,neutral
1,aiming like chucking hot dog toilet roll tube ...,negative
2,long time ive train late filthy got seat look ...,negative
3,ask view mr brake 1m fine given railway death ...,neutral
4,possible compound today delay having train pla...,negative
...,...,...
2358,service beginning return normal running networ...,negative
2359,train service running station cancelled delaye...,neutral
2360,train leave minute late impunity time cant hel...,negative
2361,hey return unused ticket cancellation fee appl...,neutral


In [10]:
# Score every cleaned tweet with VADER
from itertools import islice  # the slice bounds below can be narrowed to score only part of the frame

sentiment = SentimentIntensityAnalyzer()

# Map each row label to the score dict returned by polarity_scores for that row's text
result = {
    label: sentiment.polarity_scores(record['text'])
    for label, record in islice(Vader_df.iterrows(), 0, len(Vader_df))
}

In [11]:
# Turn the {row label: score dict} mapping into a frame with one row per tweet
vader_scores = pd.DataFrame(result).T

# Attach the scores as new columns. Score columns left over from an earlier execution of
# this cell are dropped first: join() raises on overlapping column names, so without this
# the cell would fail when run twice in the same kernel.
Vader_df = Vader_df.drop(columns=vader_scores.columns, errors='ignore').join(vader_scores)
Vader_df

Unnamed: 0,text,sentiment,neg,neu,pos,compound
0,advised colleague network rail point failure s...,neutral,0.356,0.644,0.000,-0.7351
1,aiming like chucking hot dog toilet roll tube ...,negative,0.000,0.762,0.238,0.3612
2,long time ive train late filthy got seat look ...,negative,0.168,0.670,0.162,-0.0258
3,ask view mr brake 1m fine given railway death ...,neutral,0.157,0.726,0.117,-0.4588
4,possible compound today delay having train pla...,negative,0.103,0.897,0.000,-0.3182
...,...,...,...,...,...,...
2358,service beginning return normal running networ...,negative,0.187,0.691,0.122,-0.1531
2359,train service running station cancelled delaye...,neutral,0.245,0.566,0.189,0.0258
2360,train leave minute late impunity time cant hel...,negative,0.270,0.730,0.000,-0.4881
2361,hey return unused ticket cancellation fee appl...,neutral,0.228,0.772,0.000,-0.6369


In [12]:
# Map the compound score to a sentiment label by its sign.
# NOTE(review): the cut-off is exactly 0; VADER's own documentation suggests a +/-0.05
# neutral band -- confirm that the strict-zero split is intended.
conditions =[
    (Vader_df['compound'] < 0),
    (Vader_df['compound'] == 0),
    (Vader_df['compound'] > 0)
    ]

# sentiment label assigned for each condition, in the same order
values = ['negative', 'neutral', 'positive']

# np.select picks, per row, the value of the first condition that holds.
# An explicit string default is passed because np.select otherwise falls back to the
# integer 0, which cannot be combined with string choices on recent NumPy releases
# (TypeError) and on older ones would label unmatched rows '0'. Only rows matching none
# of the conditions (i.e. a missing compound score) receive the default.
Vader_df['VADER Analysis'] = np.select(conditions, values, default='neutral')

# display updated DataFrame
Vader_df.head()

Unnamed: 0,text,sentiment,neg,neu,pos,compound,VADER Analysis
0,advised colleague network rail point failure s...,neutral,0.356,0.644,0.0,-0.7351,negative
1,aiming like chucking hot dog toilet roll tube ...,negative,0.0,0.762,0.238,0.3612,positive
2,long time ive train late filthy got seat look ...,negative,0.168,0.67,0.162,-0.0258,negative
3,ask view mr brake 1m fine given railway death ...,neutral,0.157,0.726,0.117,-0.4588,negative
4,possible compound today delay having train pla...,negative,0.103,0.897,0.0,-0.3182,negative


In [13]:
# Confusion matrix on the test set: the labelled sentiment is passed as the true classes
# and the VADER label as the predictions; label order follows first appearance in the data.
sentiment_labels = Vader_df.sentiment.unique()
cm = confusion_matrix(Vader_df['sentiment'], Vader_df['VADER Analysis'], labels=sentiment_labels)
confusionMatrix_df = pd.DataFrame(cm, index=sentiment_labels, columns=sentiment_labels)

# Same matrix with every column divided by that column's total
confusionMatrix_percenteage_df = confusionMatrix_df / confusionMatrix_df.sum(axis=0)

divider = "-"*60
print(divider, "Confusion Matrix\n", confusionMatrix_df, sep='\n')
print(divider, "Confusion Matrix in percentage\n", confusionMatrix_percenteage_df, divider, sep='\n')

------------------------------------------------------------
Confusion Matrix

          neutral  negative  positive
neutral       126       461       278
negative      164       881       419
positive        3         3        28
------------------------------------------------------------
Confusion Matrix in percentage

           neutral  negative  positive
neutral   0.430034  0.342751  0.383448
negative  0.559727  0.655019  0.577931
positive  0.010239  0.002230  0.038621
------------------------------------------------------------
