In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

import nltk 
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import re

import spacy
from spacy.lang.en.stop_words import STOP_WORDS

# IMPORT THE DATASET

In [2]:
data = pd.read_csv('C:/Users/Quynh Pham/Desktop/PM/Dataset/tweets.csv')
data.head()

Unnamed: 0,source_created_at,author_id,text,source,language,longitude,latitude,id,source_id,tweet_id,user_id,relevant,topic,ground_truth,sentiment
0,2020-09-18 21:56:20.798000,593731316,@DSisourath The Thameslink core between London...,sprinklr,en,-0.12574,51.50853,acd7673f-e621-5f1a-d662-df278964a6ea,,acd7673f-e621-5f1a-d662-df278964a6ea,Z003XDCS,True,service,True,negative
1,2020-10-13 07:31:53.122000,745583289520496640,@DulwichHistory Loving the complaint about peo...,sprinklr,en,-0.12574,51.50853,5b92aba8-4b05-6c63-8485-e9c870742137,,5b92aba8-4b05-6c63-8485-e9c870742137,Z003XDCS,True,delays,True,negative
2,2020-10-26 19:27:24.695000,303134761,@SW_Help .And yet you have no toilets on some ...,sprinklr,en,-0.12574,51.50853,0a799c07-8b76-17ba-b840-e538d51e832d,,0a799c07-8b76-17ba-b840-e538d51e832d,Z003XDCS,True,toilets,True,negative
3,2020-10-26 19:28:49.281000,303134761,@SW_Help you have no toilets on some of your t...,sprinklr,en,-0.12574,51.50853,8b4d2a34-c4f0-0e19-4055-dfe4af5f0e14,,8b4d2a34-c4f0-0e19-4055-dfe4af5f0e14,Z003XDCS,True,toilets,True,negative
4,2020-09-28 11:59:41.212000,56427671,@SpeedySticks007 @MrNeilJH @TLRailUK @christia...,sprinklr,en,-1.09125,50.79899,1fd08862-d8c7-0682-6b11-2603fba22d94,,1fd08862-d8c7-0682-6b11-2603fba22d94,Z003XDCS,True,seats,True,neutral


# REMOVE UNNECESSARY COLUMNS EXCEPT 'SOURCE_CREATED_AT', 'TEXT' & 'SENTIMENT'

In [3]:
# drop other columns except Timestamp, Tweet Content and Sentiment

data.drop(data.columns.difference(['source_created_at','text','sentiment']), axis=1, inplace=True)
data

Unnamed: 0,source_created_at,text,sentiment
0,2020-09-18 21:56:20.798000,@DSisourath The Thameslink core between London...,negative
1,2020-10-13 07:31:53.122000,@DulwichHistory Loving the complaint about peo...,negative
2,2020-10-26 19:27:24.695000,@SW_Help .And yet you have no toilets on some ...,negative
3,2020-10-26 19:28:49.281000,@SW_Help you have no toilets on some of your t...,negative
4,2020-09-28 11:59:41.212000,@SpeedySticks007 @MrNeilJH @TLRailUK @christia...,neutral
...,...,...,...
16944,2019-07-11 07:34:35,Haha oh man the audio corruption on @TLRailUK ...,neutral
16945,2020-08-10 11:19:10.181000,@TLRailUK SweetIs there a plug to charge my ph...,neutral
16946,2020-08-29 09:51:10.833000,@TLRailUK now there are far fewer commuters ha...,neutral
16947,2020-11-02 12:06:06.967000,@geofftech I am voting for Thameslink. 1. in a...,neutral


# SPLIT DATA INTO TRAINING, TEST & VALIDATION SETS

In [4]:
# Split the dataset into training, test & validation sets with ratio 70-15-15
# First, split dataset into traing & test (ratio 85-15) then split training sets into training & validation (70-15)
# Target variable is sentiment
# Return: X_train, X_test, X_val, y_train, y_test, y_val
# Parameters:
#    random_state: make sure to get the same 3 subsets everytime. Set to be any int values
#    stratify = sentiment (since there was an imbalance in neg, neu and positive sentiments and we have to set this in order to keep the same ratio in our subsets)

X = data.drop(columns='sentiment') #the entire dataframe without sentiment column
y = data['sentiment'] # our target variable

X_main, X_test, y_main, y_test = train_test_split(X,y,test_size = 0.15, random_state=2, stratify = y) #split datasset into main & test set
X_train, X_val, y_train, y_val = train_test_split(X_main, y_main, test_size=0.15, random_state=2, stratify = y_main) #split main into train & validation sets

In [5]:
print("Length of training set:\t\t",len(X_train)," ",len(y_train))
print("Length of test set:\t\t",len(X_test)," ",len(y_test))
print("Length of validation set:\t",len(X_val)," ",len(y_val))

Length of training set:		 12245   12245
Length of test set:		 2543   2543
Length of validation set:	 2161   2161


# CREATE SENTIMENT DATASET & SORT DATASET BY TIMESTAMP

In [6]:
# create sentiment dataset

sentiment_data = pd.DataFrame(X_train)
sentiment_data['sentiment'] = y_train
sentiment_data

Unnamed: 0,source_created_at,text,sentiment
3952,2019-11-20 10:49:20,@SW_Help @hiba_arch @GC_Rail @GNRailUK @TLRail...,neutral
531,2020-07-21 13:47:37.664000,#TLUpdates - We've been advised of a major los...,negative
14298,2019-07-11 06:07:45,Why no wifi on 6:37 from #brighton @TLRailUK N...,negative
13419,2020-08-21 16:10:14.816000,TRAVEL: NORMAL SERVICE RESUMED: 10 minute dela...,negative
3647,2020-03-28 05:33:20,@NetworkRailSE @Se_Railway @SouthernRailUK @TL...,neutral
...,...,...,...
5660,2020-08-24 15:12:02.748000,@TLRailUK Delays on the services are there?,negative
16356,2019-07-17 12:37:22,BBC News - Govia Thameslink fined £1m over Gat...,neutral
2686,2020-02-24 14:30:57,@BaniAnvari @CTS_UCL @FraunhoferIML @AchmeaInn...,neutral
14395,2020-02-21 08:06:13,great idea terminating north bound train at Bl...,neutral


In [7]:
# sort the dataset by timestamp so that later on, all the duplicates are in order next to each other 

sentiment_data.sort_values(by='source_created_at', inplace=True)
sentiment_data

Unnamed: 0,source_created_at,text,sentiment
10484,2019-01-16 10:41:46,@catherinerusse2 @TLRailUK The definitely know...,negative
13991,2019-01-16 11:09:15,"This morning, on a busy commuter train, a woma...",neutral
11081,2019-01-16 11:58:45,@myubi @UlyssesGuybrush @delayrepayagent @TLRa...,neutral
5269,2019-01-16 12:13:46,@TLRailUK @thebiggm Why do you @TLRailUK keep ...,negative
8656,2019-01-16 12:23:36,"@TLRailUK hi all, are you able to help with de...",neutral
...,...,...,...
15083,2020-11-29 21:08:48.310000,@TLRailUK train presentation team working hard...,positive
15082,2020-11-30 07:28:51.195000,@TLRailUK Thanks. The 0701 departed at 0704. W...,neutral
16881,2020-11-30 16:23:15.962000,With the student travel window opening this Th...,neutral
15081,2020-11-30 18:19:48.957000,@kemenzerem @TLRailUK @StPancrasInt There are ...,neutral


In [8]:
# remove timestamp in the sentiment dataset

sentiment_data.drop('source_created_at', axis=1, inplace=True)
sentiment_data

Unnamed: 0,text,sentiment
10484,@catherinerusse2 @TLRailUK The definitely know...,negative
13991,"This morning, on a busy commuter train, a woma...",neutral
11081,@myubi @UlyssesGuybrush @delayrepayagent @TLRa...,neutral
5269,@TLRailUK @thebiggm Why do you @TLRailUK keep ...,negative
8656,"@TLRailUK hi all, are you able to help with de...",neutral
...,...,...
15083,@TLRailUK train presentation team working hard...,positive
15082,@TLRailUK Thanks. The 0701 departed at 0704. W...,neutral
16881,With the student travel window opening this Th...,neutral
15081,@kemenzerem @TLRailUK @StPancrasInt There are ...,neutral


# REMOVE DUPLICATE ROWS (I.E. SAME TEXT & SAME SENTIMENT)

In [9]:
# Show total number of duplicate rows
# Keep 1 of the duplicates in the dataframe, anything else is flagged as duplicates 
# Some have 2 duplicates and some has 3 duplicates   

print('Total duplicate rows (same text, same sentiment): ', sentiment_data.duplicated().sum())

Total duplicate rows (same text, same sentiment):  640


In [10]:
# list all duplicate rows in the entire dataset
# duplicated() parameters:
#    By default, keep = 'first': 1st of duplicated rows to be kept in the dataframe, the rest to be flagged as duplicates and show down here
#    keep = 'last': last of duplicated rows to be kept in the dataframe, the rest to be flagged as duplicates and show down here
#    keep = False: flag and show all duplicated rows (not keeping any rows in dataframe)

sentiment_data.loc[sentiment_data.duplicated(keep = False),:]

Unnamed: 0,text,sentiment
888,(Thameslink Update) 06:30 Luton to Orpington d...,neutral
887,(Thameslink Update) 06:30 Luton to Orpington d...,neutral
6476,@TLRailUK Is there honestly no way to sort the...,negative
6477,@TLRailUK Is there honestly no way to sort the...,negative
5346,@TLRailUK All services - TL and Southern - tho...,negative
...,...,...
16896,@IRideTrains1 @PaulCodd @gnrupdates @SteveWhit...,negative
16892,"@iancwebb @tlupdates @TLRailUK Mate, I’m DREAD...",negative
16893,"@iancwebb @tlupdates @TLRailUK Mate, I’m DREAD...",negative
15087,@tfl @TLRailUK @SWRRLY If u r looking for a ...,neutral


In [11]:
# Drop rows that already flagged as duplicates (keep 1 row in the dataframe)
# inplace=True: make changes to the origial DataFrame

sentiment_data.drop_duplicates(inplace=True)

In [12]:
sentiment_data.duplicated().sum()

0

# LIST DUPLICATE TWEET WITH DIFFERENT SENTIMENT

In [13]:
# how many duplicate Tweets with different sentiment remain in the dataset

print('Total duplicate Tweet with different sentiment: ',sentiment_data['text'].duplicated().sum())

Total duplicate Tweet with different sentiment:  17


In [14]:
# show first 20 rows that has duplicate Tweets with different sentiment
# by default, only show the 1st row of duplicate and drop the last row of duplicate
# if want to keep the last row instead, set the parameter to duplicate(keep = 'last')
# if want to show both the duplicates, set the parameter to duplicate(False)

sentiment_data[sentiment_data['text'].duplicated(keep=False)].head(20)

Unnamed: 0,text,sentiment
5295,@TLRailUK @tlupdates 😳😮 Hope I’m not sitting i...,neutral
15569,@TLRailUK @tlupdates 😳😮 Hope I’m not sitting i...,negative
2779,@CentralBeds @TLRailUK Lucky they got a seat i...,neutral
15266,@CentralBeds @TLRailUK Lucky they got a seat i...,negative
15260,"@BugsieGiven @GNRailUK Mind you, given the cho...",neutral
2741,"@BugsieGiven @GNRailUK Mind you, given the cho...",negative
3432,@LambrettaGT205 @TLRailUK @Haleypest No tables...,negative
15346,@LambrettaGT205 @TLRailUK @Haleypest No tables...,neutral
15327,@JDrvr @TLRailUK Those announcements always me...,neutral
3285,@JDrvr @TLRailUK Those announcements always me...,negative


In [15]:
# The duplicate Tweets with different sentiment is because they have different topics.
# However, knowing they belong to different topics doesn't help with our model building because some of them are just sarcasm, wrong classification 
# Hence, it's best to remove these 17 Tweets from our dataset

# Remove duplicate Tweet with different sentiment
sentiment_data.drop_duplicates(['text'], inplace=True)

In [16]:
sentiment_data['text'].duplicated().sum()

0

In [17]:
print('Dataset for sentiment is ',sentiment_data.shape)

Dataset for sentiment is  (11588, 2)


# CHECK MISSING VALUES IN THE DATASET

In [18]:
sentiment_data.isnull().any()

text         False
sentiment    False
dtype: bool

# CLEANING TWEET CONTENT

In [19]:
# load spacy English language model trained based on web and social media texts
# add more stop words to the list of stop words list in spacy

nlp = spacy.load('en_core_web_sm')
nlp.Defaults.stop_words |= {'thameslink','tlupdates','gtrailuk','tlrailuk','govia', 'gtr'}

In [20]:
STOP_WORDS

print(len(STOP_WORDS))
print(STOP_WORDS)

332
{'either', 'tlupdates', 'without', 'nor', 'were', 'amongst', 'five', 'unless', 'yours', 'nobody', 'empty', 'and', 'fifty', "'re", 'made', 'am', 'ten', 'behind', 'show', 'perhaps', 'thameslink', 'anything', 'moreover', 'such', 'cannot', '‘d', '‘m', 'together', 'two', 'with', 'afterwards', 'tlrailuk', 'whenever', 'by', 'most', 'everything', 'forty', 'say', 'yourself', 'since', 'ever', 'what', 'me', 'beside', 'this', '’ll', 'should', 'former', 'used', 'upon', 'govia', 'can', 're', 'sometimes', 'hereupon', 'off', 'before', 'more', 'hers', 'who', 'four', 'thus', 'becoming', 'over', 'him', 'are', 'back', 'nothing', 'toward', 'see', 'already', 'not', 'somewhere', 'becomes', 'do', 'onto', 'hence', 'on', 'everyone', 'any', 'is', 'hereafter', 'them', 'towards', 'all', 'well', 'sometime', 'you', 'alone', 'top', '’m', 'until', 'n‘t', 'side', 'against', 'we', 'latterly', 'otherwise', 'she', 'has', 'fifteen', 'various', 'myself', 'full', 'however', 'its', 'whoever', 'same', 'to', 'herein', 'get'

In [21]:
# Create a function to clean tweet content
# Tweet will be cleaned by the following steps:
# 1. Remove emoji
# 2. Remove #TLUpdates
# 3. Turn word to lowercase
# 4. Remove Twitter @usernames
# 5. Remove hyperlink
# 6. Remove punctuations
# Use function re.sub() to subsitute particular sub-string with another sub-string 
    # arguments: (condition [or pattern], substitute with, passing string).
    # Patterns: 
    # [ character block start
    # ^ not these characters (letters, numbers)
    # \w word characters
    # \s space characters
    # ] character block end.

def cleanText(text):
    #create a list of emojis pattern
    emoji_pattern = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                                   "]+", flags = re.UNICODE)
    text = re.sub(emoji_pattern,'', text) #remove emoji    
    text = re.sub(r'#TLUpdates','',text) #remove the hashtag #TLUpdates    
    text = text.lower() #turn every capitalization to lowercase    
    text = re.sub(r'@[A-Za-z0-9\_]+','',text) #remove @usernames, format of a username: alphanumeric characters (letters A-Z, numbers 0-9) with the exception of underscores
    text = re.sub(r'https?:\/\/\S+','',text) #remove hyperlink
    text = re.sub(r'\b\d+\b','',text) #remove numbers
    text = re.sub(r'[^\w\s]','',text) #remove punctuations
    return text

In [22]:
# Clean the tweet content
# Use for loop to run through every tweets in the clean dataset
# Cleaning steps include:
# 1. Basic clean (i.e. remove unnecessary emojis, patterns, punctuation...) using the cleanText function above
# 2. Tokenization: break sentence into words
# 3. Remove stop words
# 4. Lemmatization: strip words down to its root/stem

lemmatizer = WordNetLemmatizer()

for index, row in sentiment_data.iterrows():
    print("Original Text:\t\t", row['text'])
    filter_sentence = []
    sentence = cleanText(row['text']) # call our function above to clean user text
    words = nltk.word_tokenize(sentence) # tokenization
    words = [w for w in words if not w in STOP_WORDS] # stopwords removal
    for word in words:
        filter_sentence.append(lemmatizer.lemmatize(word)) # lemmatization
    print("\nAfter lemmatizing Text:\t", filter_sentence,"\n")
    sentiment_data.at[index,'text'] = filter_sentence  

Original Text:		 @catherinerusse2 @TLRailUK The definitely know how to delay because their really good at it 😂😃

After lemmatizing Text:	 ['definitely', 'know', 'delay', 'good'] 

Original Text:		 This morning, on a busy commuter train, a woman got up 15 MINUTES before her stop, giving her seat up in the process, weaved her way through those standing and tried to get closer to the door. It brings new meaning to #london #commuter #trainfail #thameslink #trains

After lemmatizing Text:	 ['morning', 'busy', 'commuter', 'train', 'woman', 'got', 'minute', 'stop', 'giving', 'seat', 'process', 'weaved', 'way', 'standing', 'tried', 'closer', 'door', 'brings', 'new', 'meaning', 'london', 'commuter', 'trainfail', 'train'] 

Original Text:		 @myubi @UlyssesGuybrush @delayrepayagent @TLRailUK It has been timed at 0839 since the timetable change, and the service itself for this period hasn’t been changed since September

After lemmatizing Text:	 ['timed', 'timetable', 'change', 'service', 'period',

In [23]:
sentiment_data.to_csv('Clean sentiment datset.csv')