In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score

import nltk 
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import re
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

from sklearn.feature_extraction.text import CountVectorizer

from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE, ADASYN, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import NearMiss, RandomUnderSampler

# Import data

In [2]:
train = pd.read_csv('sentiment_train.csv', names=['index','text','sentiment'], header=0)
train.drop('index', axis=1, inplace=True)
train

Unnamed: 0,text,sentiment
0,⚠️ #TLUpdates - Due to a safety inspection of ...,negative
1,@TLRailUK On the announcement we were told tha...,negative
2,1418 Thameslink train to Horsham: On time - pl...,neutral
3,This morning's Thameslink train is covered in ...,neutral
4,"@TLRailUK I think a bit before Mill Hill, but ...",negative
...,...,...
11373,@TLRailUK She was the ticket lady at Arlesey t...,positive
11374,@TLRailUK perhaps you could have your train dr...,negative
11375,1323 Great Northern train to Moorgate: Delayed...,neutral
11376,Now stuck at harpenden. Not meant to stop here...,negative


In [3]:
validation = pd.read_csv('sentiment_validation.csv', names=['index','text','sentiment'], header=0)
validation.drop('index', axis=1, inplace=True)
validation

Unnamed: 0,text,sentiment
0,What dirt does @GatwickExpress have on @TLRail...,negative
1,@EmilyTrenouth @TLRailUK Can’t see a carriage ...,neutral
2,@TLRailUK The 18:57 Cambridge to KingsX was de...,negative
3,@SouthernRailUK @TLRailUK @GatwickExpress @MET...,neutral
4,@TLRailUK Thank you for coming back to me. It'...,negative
...,...,...
2003,@TLRailUK Thanks.......It said it arrived ? Ca...,negative
2004,@TLRailUK Can anyone there explain to me why I...,neutral
2005,2005 Thameslink train to Gatwick Airport: Dela...,neutral
2006,.@networkrail @TLRailUK which person designed ...,negative


In [4]:
test = pd.read_csv('sentiment_test.csv', names=['index','text','sentiment'], header=0)
test.drop('index', axis=1, inplace=True)
test

Unnamed: 0,text,sentiment
0,#TLUpdates - We have been advised by our colle...,neutral
1,@TLRailUK Aiming ... like chucking a hot dog u...,negative
2,Long time since I’ve had to get a @TLRailUK tr...,negative
3,@thomasbrake @TLRailUK Can I ask for your view...,neutral
4,@TLRailUK how is it even possible for you to c...,negative
...,...,...
2358,⚠️ #TLUpdates - Services are beginning to retu...,negative
2359,⚠️#TLUpdates - Train services running through ...,neutral
2360,@LiveCommute @tlupdates @WorkerRailway @SaaSyS...,negative
2361,"@TLRailUK Hey, how do I do the ‘return unused ...",neutral


# Functions for cleaning Tweet (clean text & clean sentence)

In [5]:
# load spacy English language model trained based on web and social media texts
# add more stop words to the list of stop words list in spacy

nlp = spacy.load('en_core_web_sm')

nlp.Defaults.stop_words |= {'thameslink','tlupdates','gtrailuk','tlrailuk','govia', 'gtr'}
nlp.Defaults.stop_words.remove('not')

In [6]:
STOP_WORDS

print(len(STOP_WORDS))
print(STOP_WORDS)

331
{'ourselves', '’m', 'ten', 'gtr', '‘s', 'around', 'ever', 'whereas', 'are', 'eleven', 'further', 'n’t', 'moreover', 'seemed', 'go', 'whenever', 'see', 'amongst', 'gtrailuk', '’re', 'myself', 'throughout', 'else', 'though', 'against', 'using', 'beforehand', 'still', 'herein', 'been', 'even', 'always', 'very', 'me', 'his', 'via', 'five', 'say', 'thameslink', 'last', 'nor', 're', 'much', 'afterwards', 'seem', 'her', 'hereafter', 'to', 'by', 'therein', 'was', 'get', 'every', 'it', 'towards', 'within', "'ve", "'s", 'now', 'whom', "'d", '’ll', 'four', 'unless', 'whereafter', 'same', 'nothing', 'whereby', 'hundred', 'everything', 'please', 'so', 'across', 'he', 'up', 'keep', 'other', 'becomes', 'has', 'which', 'what', 'cannot', 'full', 'side', 'everyone', 'whose', 'while', 'latterly', "'re", 'or', 'latter', 'that', 'whoever', 'if', 'this', 'down', 'seeming', '‘re', 'less', 'more', 'among', 'amount', 'nevertheless', 'their', 'after', 'least', 'you', 'never', 'can', 'both', 'well', 'rather'

In [7]:
# Create a function to clean tweet content
# Tweet will be cleaned by the following steps:
# 1. Remove emoji
# 2. Remove #thameslink, #TLUpdates, @gtrailuk, @TLRailUK
# 3. Turn word to lowercase
# 4. Remove Twitter @usernames
# 5. Remove hyperlink
# 6. Remove punctuations
# return clean text

def cleanText(text):
    #create a list of emojis pattern
    emoji_pattern = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                                   "]+", flags = re.UNICODE)
    text = re.sub(emoji_pattern,'', text) #remove emoji    
    text = re.sub(r'#thameslink','',text) #remove #thameslink
    text = re.sub(r'#TLUpdates','',text) #remove #TLUpdates
    text = re.sub(r'@gtrailuk','',text) #remove @gtrailuk
    text = re.sub(r'@TLRailUK','',text) #remove @TLRailUK
    
    text = text.lower() #turn every capitalization to lowercase    
    text = re.sub(r'@[A-Za-z0-9\_]+','',text) #remove @usernames, format of a username: alphanumeric characters (letters A-Z, numbers 0-9) with the exception of underscores
    text = re.sub(r'https?:\/\/\S+','',text) #remove hyperlink
    text = re.sub(r'\b\d+\b','',text) #remove numbers
    text = re.sub(r'[^\w\s]','',text) #remove punctuations
    return text

In [8]:
# Clean the tweet content
# Use for loop to run through every tweets in the clean dataset
# Cleaning steps include:
# 1. Basic clean (i.e. remove unnecessary emojis, patterns, punctuation...) using the cleanText function above
# 2. Tokenization: break sentence into words
# 3. Remove stop words
# 4. Lemmatization: strip words down to its root/stem
# return a DataFrame with clean words

def cleanSentence(main_df):
    df = main_df.copy()
    lemmatizer = WordNetLemmatizer()

    for index, row in df.iterrows():
        filter_sentence = []
        sentence = cleanText(row['text']) # call our function above to clean user text
        words = nltk.word_tokenize(sentence) # tokenization
        words = [w for w in words if not w in STOP_WORDS] # stopwords removal
        for word in words:
            filter_sentence.append(lemmatizer.lemmatize(word)) # lemmatization
        clean_sentence = ' '.join(str(x) for x in filter_sentence if not len(x) == 1)
        df.at[index,'text'] = clean_sentence  
    return df

# SENTIMENT ANALYSIS WITH LOGISTIC REGRESSION

## Create dataset for Logistic Regression

In [9]:
# right now, the model perform much better on unclean text. Clean text can always been done by uncheck the hastags below

train_lr = train.copy()
train_lr = cleanSentence(train_lr)
train_lr

Unnamed: 0,text,sentiment
0,safety inspection track undertaken network rai...,negative
1,announcement told driver hadnt shown cancel tr...,negative
2,train horsham time plat train peterborough del...,neutral
3,morning train covered graffiti read come dosse...,neutral
4,think bit mill hill definitely got worse peopl...,negative
...,...,...
11373,ticket lady arlesey ended dealing literally tr...,positive
11374,train driver passenger informed running lbg tb...,negative
11375,great northern train moorgate delayed plat tra...,neutral
11376,stuck harpenden not meant stop door didnt open...,negative


In [10]:
test_lr = test.copy()
test_lr = cleanSentence(test_lr)
test_lr

Unnamed: 0,text,sentiment
0,advised colleague network rail point failure s...,neutral
1,aiming like chucking hot dog toilet roll tube ...,negative
2,long time ive train late filthy got seat look ...,negative
3,ask view mr brake 1m fine given railway death ...,neutral
4,possible compound today delay having train pla...,negative
...,...,...
2358,service beginning return normal running networ...,negative
2359,train service running station cancelled delaye...,neutral
2360,train leave minute late impunity time cant hel...,negative
2361,hey return unused ticket cancellation fee appl...,neutral


In [11]:
validation_lr = validation.copy()
validation_lr = cleanSentence(validation_lr)
validation_lr

Unnamed: 0,text,sentiment
0,dirt let dick tired commutingsucks,negative
1,cant carriage number say carriage screen,neutral
2,cambridge kingsx delayed 10mins announcement g...,negative
3,train running bridge directly littlehampton st...,neutral
4,thank coming daft reason late stop order south...,negative
...,...,...
2003,thanksit said arrived not delayed screen,negative
2004,explain dig travel card enter ticket number di...,neutral
2005,train gatwick airport delayed plat train bridg...,neutral
2006,person designed seat spike comfortable,negative


In [12]:
X_train_lr = train_lr['text']
y_train_lr = train_lr['sentiment']

X_test_lr = test_lr['text']
y_test_lr = test_lr['sentiment'] 

X_val_lr = validation_lr['text']
y_val_lr = validation_lr['sentiment'] 

## Data transformation

In [13]:
vectorizer = CountVectorizer(max_features = 1420)
vectorizer.fit(X_train_lr)

encoded_X_train = vectorizer.transform(X_train_lr)
encoded_X_test = vectorizer.transform(X_test_lr)

## Balancing dataset

In [14]:
ros = RandomOverSampler(random_state=777)
X_ROS, y_ROS = ros.fit_sample(encoded_X_train, y_train_lr)

smote = SMOTE(random_state=777)
X_smote, y_smote = smote.fit_sample(encoded_X_train, y_train_lr)

smoteenn = SMOTEENN(random_state=777)
X_smoteenn, y_smoteenn = smoteenn.fit_sample(encoded_X_train, y_train_lr)

smotetomek = SMOTETomek(random_state=777)
X_smotetomek, y_smotetomek = smotetomek.fit_sample(encoded_X_train, y_train_lr)

adasyn = ADASYN(random_state=777)
X_adasyn, y_adasyn = adasyn.fit_sample(encoded_X_train, y_train_lr)

rus = RandomUnderSampler(random_state=777)
X_RUS, y_RUS = rus.fit_sample(encoded_X_train, y_train_lr)

## Modelling & Evaluating

In [15]:
# Models after balancing
model1 = LogisticRegression(max_iter=1000)
model1.fit(X_ROS, y_ROS)
accuracy1 = model1.score(encoded_X_test, y_test_lr)
print('Accuracy with ROS: ',accuracy1)

model2 = LogisticRegression(max_iter=1000)
model2.fit(X_smote, y_smote)
accuracy2 = model2.score(encoded_X_test, y_test_lr)
print('Accuracy with SMOTE: ',accuracy2)

model3 = LogisticRegression(max_iter=1000)
model3.fit(X_smoteenn, y_smoteenn)
print('Accuracy with SMOTEENN: ',model3.score(encoded_X_test, y_test_lr))

model4 = LogisticRegression(max_iter=1000)
model4.fit(X_smotetomek, y_smotetomek)
print('Accuracy with SMOTETOMEK: ',model4.score(encoded_X_test, y_test_lr))

model5 = LogisticRegression(max_iter=1000)
model5.fit(X_adasyn, y_adasyn)
accuracy5 = model5.score(encoded_X_test, y_test_lr)
print('Accuracy with ADASYN: ',accuracy5)

model6 = LogisticRegression(max_iter=1000)
model6.fit(X_RUS, y_RUS)
print('Accuracy with RUS: ',model6.score(encoded_X_test, y_test_lr))

Accuracy with ROS:  0.6555226407109607
Accuracy with SMOTE:  0.6546762589928058
Accuracy with SMOTEENN:  0.39060516292848074
Accuracy with SMOTETOMEK:  0.6563690224291155
Accuracy with ADASYN:  0.6550994498518832
Accuracy with RUS:  0.5078290308929327


In [26]:
# Model with ROS 
y_pred1 = model1.predict(encoded_X_test)
print('Accuracy with ROS: ',accuracy1)
#precision and recall
print('Precision: %.3f' % precision_score(y_test_lr, y_pred1,average='weighted'))
print('Recall: %.3f' % recall_score(y_test_lr, y_pred1,average='weighted'))

#confusion matrix on test set
cm = confusion_matrix(y_test_lr, y_pred1, labels=test.sentiment.unique())
confusionMatrix_df = pd.DataFrame(cm, index=test.sentiment.unique(), columns=test.sentiment.unique())
print ("-"*60)
print ("Confusion Matrix\n")
print (confusionMatrix_df)


#confusion matrix in percentage %
confusionMatrix_percenteage_df = confusionMatrix_df.copy()
for i in confusionMatrix_percenteage_df:
    confusionMatrix_percenteage_df[i]/=confusionMatrix_percenteage_df[i].sum()
print ("-"*60)
print ("Confusion Matrix in percentage\n")
print (confusionMatrix_percenteage_df)
print ("-"*60)


Accuracy with ROS:  0.6555226407109607
Precision: 0.684
Recall: 0.656
------------------------------------------------------------
Confusion Matrix

          neutral  negative  positive
neutral       572       268        25
negative      470       958        36
positive        6         9        19
------------------------------------------------------------
Confusion Matrix in percentage

           neutral  negative  positive
neutral   0.545802  0.217004    0.3125
negative  0.448473  0.775709    0.4500
positive  0.005725  0.007287    0.2375
------------------------------------------------------------


In [27]:
# Model with SMOTE 
y_pred2 = model2.predict(encoded_X_test)
print('Accuracy with SMOTE: ',accuracy2)

#precision and recall
print('Precision with SMOTE: %.3f' % precision_score(y_test_lr, y_pred2,average='weighted'))
print('Recall with SMOTE: %.3f' % recall_score(y_test_lr, y_pred2,average='weighted'))

#confusion matrix on test set
cm = confusion_matrix(y_test_lr, y_pred2, labels=test.sentiment.unique())
confusionMatrix_df = pd.DataFrame(cm, index=test.sentiment.unique(), columns=test.sentiment.unique())
print ("-"*60)
print ("Confusion Matrix\n")
print (confusionMatrix_df)

#confusion matrix in percentage %
confusionMatrix_percenteage_df = confusionMatrix_df.copy()
for i in confusionMatrix_percenteage_df:
    confusionMatrix_percenteage_df[i]/=confusionMatrix_percenteage_df[i].sum()
print ("-"*60)
print ("Confusion Matrix in percentage\n")
print (confusionMatrix_percenteage_df)
print ("-"*60)

Accuracy with SMOTE:  0.6546762589928058
Precision with SMOTE: 0.682
Recall with SMOTE: 0.655
------------------------------------------------------------
Confusion Matrix

          neutral  negative  positive
neutral       538       283        44
negative      428       994        42
positive        7        12        15
------------------------------------------------------------
Confusion Matrix in percentage

           neutral  negative  positive
neutral   0.552929   0.21955  0.435644
negative  0.439877   0.77114  0.415842
positive  0.007194   0.00931  0.148515
------------------------------------------------------------


In [28]:
# Model with ADASYN 
y_pred5 = model5.predict(encoded_X_test)
print('Accuracy with ADSYN: ',accuracy5)

#precision and recall
print('Precision with ADASYN: %.3f' % precision_score(y_test_lr, y_pred2,average='weighted'))
print('Recall with ADASYN: %.3f' % recall_score(y_test_lr, y_pred2,average='weighted'))

#confusion matrix on test set
cm = confusion_matrix(y_test_lr, y_pred5, labels=test.sentiment.unique())
confusionMatrix_df = pd.DataFrame(cm, index=test.sentiment.unique(), columns=test.sentiment.unique())
print ("-"*60)
print ("Confusion Matrix\n")
print (confusionMatrix_df)

#confusion matrix in percentage %
confusionMatrix_percenteage_df = confusionMatrix_df.copy()
for i in confusionMatrix_percenteage_df:
    confusionMatrix_percenteage_df[i]/=confusionMatrix_percenteage_df[i].sum()
print ("-"*60)
print ("Confusion Matrix in percentage\n")
print (confusionMatrix_percenteage_df)
print ("-"*60)

Accuracy with ADSYN:  0.6550994498518832
Precision with ADASYN: 0.682
Recall with ADASYN: 0.655
------------------------------------------------------------
Confusion Matrix

          neutral  negative  positive
neutral       551       272        42
negative      443       982        39
positive        8        11        15
------------------------------------------------------------
Confusion Matrix in percentage

           neutral  negative  positive
neutral   0.549900  0.215020   0.43750
negative  0.442116  0.776285   0.40625
positive  0.007984  0.008696   0.15625
------------------------------------------------------------


In [29]:
# Model without class weight balance
model10 = LogisticRegression(max_iter=1000, class_weight='balanced')
model10.fit(encoded_X_train, y_train_lr)
y_pred10 = model10.predict(encoded_X_test)
print('Accuracy with class weight: ',model10.score(encoded_X_test, y_test_lr))

#precision and recall
print('Precision with class weight: %.3f' % precision_score(y_test_lr, y_pred10,average='weighted'))
print('Recall with class weight: %.3f' % recall_score(y_test_lr, y_pred10,average='weighted'))

#confusion matrix on test set
cm = confusion_matrix(y_test_lr, y_pred10, labels=test.sentiment.unique())
confusionMatrix_df = pd.DataFrame(cm, index=test.sentiment.unique(), columns=test.sentiment.unique())
print ("-"*60)
print ("Confusion Matrix\n")
print (confusionMatrix_df)

#confusion matrix in percentage %
confusionMatrix_percenteage_df = confusionMatrix_df.copy()
for i in confusionMatrix_percenteage_df:
    confusionMatrix_percenteage_df[i]/=confusionMatrix_percenteage_df[i].sum()
print ("-"*60)
print ("Confusion Matrix in percentage\n")
print (confusionMatrix_percenteage_df)
print ("-"*60)

Accuracy with class weight:  0.6618705035971223
Precision with class weight: 0.695
Recall with class weight: 0.662
------------------------------------------------------------
Confusion Matrix

          neutral  negative  positive
neutral       590       249        26
negative      466       954        44
positive        7         7        20
------------------------------------------------------------
Confusion Matrix in percentage

           neutral  negative  positive
neutral   0.555033  0.205785  0.288889
negative  0.438382  0.788430  0.488889
positive  0.006585  0.005785  0.222222
------------------------------------------------------------


In [30]:
# Model without balancing
model11 = LogisticRegression(max_iter=1000)
model11.fit(encoded_X_train, y_train_lr)
y_pred11 = model11.predict(encoded_X_test)
print('Accuracy without balancing: ',model11.score(encoded_X_test, y_test_lr))
#precision and recall
print('Precision without balancing: %.3f' % precision_score(y_test_lr, y_pred11,average='weighted'))
print('Recall without balancing: %.3f' % recall_score(y_test_lr, y_pred11,average='weighted'))

#confusion matrix on test set
cm = confusion_matrix(y_test_lr, y_pred11, labels=test.sentiment.unique())
confusionMatrix_df = pd.DataFrame(cm, index=test.sentiment.unique(), columns=test.sentiment.unique())
print ("-"*60)
print ("Confusion Matrix\n")
print (confusionMatrix_df)

#confusion matrix in percentage %
confusionMatrix_percenteage_df = confusionMatrix_df.copy()
for i in confusionMatrix_percenteage_df:
    confusionMatrix_percenteage_df[i]/=confusionMatrix_percenteage_df[i].sum()
print ("-"*60)
print ("Confusion Matrix in percentage\n")
print (confusionMatrix_percenteage_df)
print ("-"*60)

Accuracy without balancing:  0.7046127803639441
Precision without balancing: 0.699
Recall without balancing: 0.705
------------------------------------------------------------
Confusion Matrix

          neutral  negative  positive
neutral       482       381         2
negative      287      1174         3
positive       12        13         9
------------------------------------------------------------
Confusion Matrix in percentage

           neutral  negative  positive
neutral   0.617157  0.242985  0.142857
negative  0.367478  0.748724  0.214286
positive  0.015365  0.008291  0.642857
------------------------------------------------------------
