In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import precision_score, recall_score, f1_score

import nltk 
from nltk.stem import WordNetLemmatizer

import re

import spacy
from spacy.lang.en.stop_words import STOP_WORDS

from textblob import TextBlob

# Import data

In [2]:
# Read the training split. The first CSV column is labelled 'index' on load and
# then discarded, leaving only the 'text' and 'sentiment' columns.
train = pd.read_csv(
    'sentiment_train.csv',
    names=['index', 'text', 'sentiment'],
    header=0,
).drop(columns='index')
train

Unnamed: 0,text,sentiment
0,⚠️ #TLUpdates - Due to a safety inspection of ...,negative
1,@TLRailUK On the announcement we were told tha...,negative
2,1418 Thameslink train to Horsham: On time - pl...,neutral
3,This morning's Thameslink train is covered in ...,neutral
4,"@TLRailUK I think a bit before Mill Hill, but ...",negative
...,...,...
11373,@TLRailUK She was the ticket lady at Arlesey t...,positive
11374,@TLRailUK perhaps you could have your train dr...,negative
11375,1323 Great Northern train to Moorgate: Delayed...,neutral
11376,Now stuck at harpenden. Not meant to stop here...,negative


In [3]:
# Read the validation split. The first CSV column is labelled 'index' on load and
# then discarded, leaving only the 'text' and 'sentiment' columns.
validation = pd.read_csv(
    'sentiment_validation.csv',
    names=['index', 'text', 'sentiment'],
    header=0,
).drop(columns='index')
validation

Unnamed: 0,text,sentiment
0,What dirt does @GatwickExpress have on @TLRail...,negative
1,@EmilyTrenouth @TLRailUK Can’t see a carriage ...,neutral
2,@TLRailUK The 18:57 Cambridge to KingsX was de...,negative
3,@SouthernRailUK @TLRailUK @GatwickExpress @MET...,neutral
4,@TLRailUK Thank you for coming back to me. It'...,negative
...,...,...
2003,@TLRailUK Thanks.......It said it arrived ? Ca...,negative
2004,@TLRailUK Can anyone there explain to me why I...,neutral
2005,2005 Thameslink train to Gatwick Airport: Dela...,neutral
2006,.@networkrail @TLRailUK which person designed ...,negative


In [4]:
# Read the test split. The first CSV column is labelled 'index' on load and
# then discarded, leaving only the 'text' and 'sentiment' columns.
test = pd.read_csv(
    'sentiment_test.csv',
    names=['index', 'text', 'sentiment'],
    header=0,
).drop(columns='index')
test

Unnamed: 0,text,sentiment
0,#TLUpdates - We have been advised by our colle...,neutral
1,@TLRailUK Aiming ... like chucking a hot dog u...,negative
2,Long time since I’ve had to get a @TLRailUK tr...,negative
3,@thomasbrake @TLRailUK Can I ask for your view...,neutral
4,@TLRailUK how is it even possible for you to c...,negative
...,...,...
2358,⚠️ #TLUpdates - Services are beginning to retu...,negative
2359,⚠️#TLUpdates - Train services running through ...,neutral
2360,@LiveCommute @tlupdates @WorkerRailway @SaaSyS...,negative
2361,"@TLRailUK Hey, how do I do the ‘return unused ...",neutral


# Functions for cleaning tweets (clean text & clean sentence)

In [5]:
# Load spaCy's small English pipeline and customise its stop-word set.
# NOTE(review): an earlier comment described this model as trained on "web and
# social media texts" - confirm against the en_core_web_sm model card.
nlp = spacy.load('en_core_web_sm')

# Add the rail operator's names and Twitter handles to the stop words.
# The update is done in place (|=) on purpose: the printed STOP_WORDS output in
# this notebook contains 'tlupdates', i.e. the imported STOP_WORDS name sees
# the changes made through nlp.Defaults.stop_words.
nlp.Defaults.stop_words |= {'thameslink','tlupdates','gtrailuk','tlrailuk','govia', 'gtr'}

# Keep 'not' as a regular token (it is taken out of the stop-word set).
# discard() rather than remove(): because the set is mutated in place, 'not' is
# already gone when this cell is executed a second time, and remove() would
# then raise KeyError. discard() makes the cell safe to re-run.
nlp.Defaults.stop_words.discard('not')



In [6]:
# Inspect the customised stop-word set: its size first, then its members.
# sorted() gives a stable alphabetical listing; printing the raw set shows the
# words in an arbitrary order that can differ from one kernel session to the next.
print(len(STOP_WORDS))
print(sorted(STOP_WORDS))

331
{'no', 'unless', 'before', 'being', 'been', 'is', 'they', 'whoever', 'whereafter', 'about', 'we', '‘d', 'between', 'nobody', "'re", 'back', 'whose', 'which', 'her', 'hence', 'someone', 'empty', 'though', 'ca', 'eight', 'than', 'several', 'almost', 'get', 'noone', 'sometime', 'his', 'too', 'me', "'m", 'those', 'always', 'otherwise', 'really', 'thereafter', 'would', 'an', 'after', 'herein', 'never', 'see', 'whom', 'on', 'again', 'everything', 'much', 'either', 'tlupdates', 'least', 'two', 'did', 'you', 'i', 'none', 'part', 'it', 'therefore', 'at', 'mine', 'beyond', 'side', 'seemed', 'onto', 'therein', 'towards', 'am', 'in', '‘re', 'themselves', 'any', 'them', 'nothing', '‘ll', 'three', 'hereupon', 'whole', 'up', 'afterwards', "n't", 'formerly', 'its', 'thence', 'us', 'due', 'here', 'everywhere', 'seem', 'however', 'hereby', 'above', 'next', 'once', 'other', 'anything', 'cannot', 'together', 'their', 'if', 'fifteen', "'s", 'further', 'third', 'namely', 'anyway', 'thru', 'top', 'could'

In [7]:
# Function to clean the raw content of a tweet.
# The tweet is cleaned by the following steps, in this order:
# 1. Remove emoji
# 2. Remove #thameslink, #TLUpdates (any capitalisation), @gtrailuk, @TLRailUK
# 3. Turn the text to lowercase
# 4. Remove Twitter @usernames
# 5. Remove hyperlinks
# 6. Remove standalone numbers
# 7. Remove punctuation
# return clean text

# Patterns are compiled once here instead of on every call to cleanText.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "]+",
    flags=re.UNICODE,
)

# Operator hashtags are matched case-insensitively: this step runs BEFORE the
# text is lowercased, so a case-sensitive match would miss '#Thameslink' or
# '#TLUPDATES' and leave the bare word behind once '#' is stripped as punctuation.
_OPERATOR_HASHTAG_PATTERN = re.compile(r'#thameslink|#tlupdates', flags=re.IGNORECASE)

# Operator handles in exactly this spelling; any other capitalisation is caught
# by the generic @username removal that runs after lowercasing.
_OPERATOR_MENTION_PATTERN = re.compile(r'@gtrailuk|@TLRailUK')


def cleanText(text):
    """Return a lowercased, de-noised version of a tweet.

    Parameters
    ----------
    text : str
        Raw tweet content.

    Returns
    -------
    str
        The tweet with emoji, operator hashtags/handles, @usernames, hyperlinks,
        standalone numbers and punctuation removed. Whitespace is not collapsed,
        so removed tokens can leave runs of spaces behind.
    """
    text = _EMOJI_PATTERN.sub('', text)              # remove emoji
    text = _OPERATOR_HASHTAG_PATTERN.sub('', text)   # remove #thameslink / #TLUpdates
    text = _OPERATOR_MENTION_PATTERN.sub('', text)   # remove @gtrailuk / @TLRailUK

    text = text.lower()                              # lowercase everything
    # Usernames are letters, digits and underscores; text is already lowercase here.
    text = re.sub(r'@[A-Za-z0-9\_]+', '', text)      # remove @usernames
    text = re.sub(r'https?:\/\/\S+', '', text)       # remove hyperlinks
    text = re.sub(r'\b\d+\b', '', text)              # remove standalone numbers
    text = re.sub(r'[^\w\s]', '', text)              # remove punctuation
    return text

In [8]:
# Clean the tweet content of a whole DataFrame.
# Every value in the 'text' column goes through these steps:
# 1. Basic clean (emoji, handles, links, numbers, punctuation...) using cleanText
# 2. Tokenization: break the sentence into words
# 3. Remove stop words
# 4. Lemmatization: reduce each word to its lemma
# 5. Drop single-character tokens and re-join the rest with spaces
# return a new DataFrame with the cleaned text

def cleanSentence(main_df):
    """Return a copy of ``main_df`` whose 'text' column has been cleaned.

    Parameters
    ----------
    main_df : pandas.DataFrame
        Must contain a 'text' column of strings. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``main_df`` with each 'text' value replaced by its cleaned,
        stop-word-free, lemmatized form (tokens joined by single spaces).
    """
    lemmatizer = WordNetLemmatizer()

    def _clean_tweet(raw_text):
        # Clean one tweet: basic clean -> tokenize -> drop stop words -> lemmatize.
        tokens = nltk.word_tokenize(cleanText(raw_text))
        lemmas = [lemmatizer.lemmatize(tok) for tok in tokens if tok not in STOP_WORDS]
        # Single-character leftovers are dropped after lemmatization.
        return ' '.join(lemma for lemma in lemmas if len(lemma) != 1)

    df = main_df.copy()
    # Column-wise apply works positionally. Writing row by row with
    # df.at[label, 'text'] would overwrite every row sharing that label when the
    # index is not unique, and iterrows() is much slower.
    df['text'] = df['text'].apply(_clean_tweet)
    return df

# SENTIMENT ANALYSIS WITH TEXTBLOB

In [9]:
# TextBlob is evaluated on the test set only.
# Work on a copy so the original `test` frame keeps its raw text.
TextBlob_df = test.copy()

TextBlob_df['text'] = TextBlob_df['text'].apply(cleanText)

In [10]:
# Map a polarity score to a sentiment label (negative / neutral / positive)
def SentimentAnalysis(score):
    """Return 'negative' for scores below 0, 'neutral' for exactly 0, else 'positive'."""
    if score == 0:
        return 'neutral'
    return 'negative' if score < 0 else 'positive'

In [11]:
# Polarity helper: scores how negative or positive a piece of text is.
# The score is a value in the range [-1, 1]:
# -1: negative
#  0: neutral
#  1: positive
def getPolarity(text):
    """Return the TextBlob sentiment polarity of ``text``."""
    blob = TextBlob(text)
    return blob.sentiment.polarity

In [12]:
# Score every cleaned tweet with TextBlob and store the result in a new 'polarity' column
polarity_scores = TextBlob_df['text'].apply(getPolarity)
TextBlob_df['polarity'] = polarity_scores
TextBlob_df

Unnamed: 0,text,sentiment,polarity
0,we have been advised by our colleagues at ne...,neutral,0.014583
1,aiming like chucking a hot dog up a toilet r...,negative,0.108333
2,long time since ive had to get a train late f...,negative,-0.275000
3,can i ask for your views please mr brake on ...,neutral,0.258333
4,how is it even possible for you to compound t...,negative,-0.100000
...,...,...,...
2358,services are beginning to return to normal ...,negative,0.127273
2359,train services running through these station...,neutral,0.350000
2360,so a train can leave minutes or more lat...,negative,0.066667
2361,hey how do i do the return unused ticket with...,neutral,0.166667


In [13]:
# Turn each polarity score into a sentiment label and store it in a new
# 'TextBlob Analysis' column
predicted_sentiment = TextBlob_df['polarity'].apply(SentimentAnalysis)
TextBlob_df['TextBlob Analysis'] = predicted_sentiment
TextBlob_df

Unnamed: 0,text,sentiment,polarity,TextBlob Analysis
0,we have been advised by our colleagues at ne...,neutral,0.014583,positive
1,aiming like chucking a hot dog up a toilet r...,negative,0.108333,positive
2,long time since ive had to get a train late f...,negative,-0.275000,negative
3,can i ask for your views please mr brake on ...,neutral,0.258333,positive
4,how is it even possible for you to compound t...,negative,-0.100000,negative
...,...,...,...,...
2358,services are beginning to return to normal ...,negative,0.127273,positive
2359,train services running through these station...,neutral,0.350000,positive
2360,so a train can leave minutes or more lat...,negative,0.066667,positive
2361,hey how do i do the return unused ticket with...,neutral,0.166667,positive


In [14]:
# Confusion matrix of the TextBlob predictions on the test set.
# Rows are the true labels ('sentiment'); columns are TextBlob's predictions.
cm_actual = TextBlob_df['sentiment']
cm_predicted = TextBlob_df['TextBlob Analysis']

# Build the label list once, in order of first appearance, from BOTH columns.
# Taking labels from the true column only would silently drop any class that is
# predicted but never occurs as a true label.
cm_labels = pd.unique(pd.concat([cm_actual, cm_predicted], ignore_index=True))

cm = confusion_matrix(cm_actual, cm_predicted, labels=cm_labels)
confusionMatrix_df = (
    pd.DataFrame(cm, index=cm_labels, columns=cm_labels)
    .rename_axis(index='actual', columns='predicted')  # make the orientation explicit
)
print ("-"*60)
print ("Confusion Matrix\n")
print (confusionMatrix_df)

# Column-normalised confusion matrix: every column is divided by its own total,
# so each column sums to 1 and a cell reads "of the tweets predicted as <column>,
# the fraction whose true label is <row>". Values are fractions, not percentages.
confusionMatrix_percenteage_df = confusionMatrix_df.div(
    confusionMatrix_df.sum(axis=0), axis='columns'
)
print ("-"*60)
print ("Confusion Matrix as a fraction of each predicted class (columns sum to 1)\n")
print (confusionMatrix_percenteage_df)
print ("-"*60)

------------------------------------------------------------
Confusion Matrix

          neutral  negative  positive
neutral       228       294       343
negative      305       647       512
positive        8         2        24
------------------------------------------------------------
Confusion Matrix in percentage

           neutral  negative  positive
neutral   0.421442  0.311771  0.390216
negative  0.563771  0.686108  0.582480
positive  0.014787  0.002121  0.027304
------------------------------------------------------------


In [15]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the tokenizer and the sequence-classification model from the same
# Hugging Face checkpoint. from_tf=True is passed to both calls.
# NOTE(review): the "robertuito" checkpoints are, as far as I know, trained on
# Spanish tweets - confirm this is the intended model for English-language data.
checkpoint = "pysentimiento/robertuito-sentiment-analysis"

tokenizer = AutoTokenizer.from_pretrained(checkpoint, from_tf=True)

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, from_tf=True)

All TF 2.0 model weights were used when initializing RobertaForSequenceClassification.

All the weights of RobertaForSequenceClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use RobertaForSequenceClassification for predictions without further training.
