In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

import nltk 
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import re

import spacy
from spacy.lang.en.stop_words import STOP_WORDS

# IMPORT THE DATASET

In [2]:
# Read the raw tweet export into a DataFrame and preview the first rows
tweets_csv_path = 'C:/Users/Quynh Pham/Desktop/PM/Dataset/tweets.csv'
data = pd.read_csv(tweets_csv_path)
data.head()

Unnamed: 0,source_created_at,author_id,text,source,language,longitude,latitude,id,source_id,tweet_id,user_id,relevant,topic,ground_truth,sentiment
0,2020-09-18 21:56:20.798000,593731316,@DSisourath The Thameslink core between London...,sprinklr,en,-0.12574,51.50853,acd7673f-e621-5f1a-d662-df278964a6ea,,acd7673f-e621-5f1a-d662-df278964a6ea,Z003XDCS,True,service,True,negative
1,2020-10-13 07:31:53.122000,745583289520496640,@DulwichHistory Loving the complaint about peo...,sprinklr,en,-0.12574,51.50853,5b92aba8-4b05-6c63-8485-e9c870742137,,5b92aba8-4b05-6c63-8485-e9c870742137,Z003XDCS,True,delays,True,negative
2,2020-10-26 19:27:24.695000,303134761,@SW_Help .And yet you have no toilets on some ...,sprinklr,en,-0.12574,51.50853,0a799c07-8b76-17ba-b840-e538d51e832d,,0a799c07-8b76-17ba-b840-e538d51e832d,Z003XDCS,True,toilets,True,negative
3,2020-10-26 19:28:49.281000,303134761,@SW_Help you have no toilets on some of your t...,sprinklr,en,-0.12574,51.50853,8b4d2a34-c4f0-0e19-4055-dfe4af5f0e14,,8b4d2a34-c4f0-0e19-4055-dfe4af5f0e14,Z003XDCS,True,toilets,True,negative
4,2020-09-28 11:59:41.212000,56427671,@SpeedySticks007 @MrNeilJH @TLRailUK @christia...,sprinklr,en,-1.09125,50.79899,1fd08862-d8c7-0682-6b11-2603fba22d94,,1fd08862-d8c7-0682-6b11-2603fba22d94,Z003XDCS,True,seats,True,neutral


# REMOVE UNNECESSARY COLUMNS EXCEPT 'SOURCE_CREATED_AT', 'TEXT' & 'TOPIC'

In [3]:
# Keep only the tweet timestamp, the tweet content and the topic label;
# every column not named in columns_to_keep is discarded

columns_to_keep = ['source_created_at', 'text', 'topic']
data = data.drop(columns=data.columns.difference(columns_to_keep))
data

Unnamed: 0,source_created_at,text,topic
0,2020-09-18 21:56:20.798000,@DSisourath The Thameslink core between London...,service
1,2020-10-13 07:31:53.122000,@DulwichHistory Loving the complaint about peo...,delays
2,2020-10-26 19:27:24.695000,@SW_Help .And yet you have no toilets on some ...,toilets
3,2020-10-26 19:28:49.281000,@SW_Help you have no toilets on some of your t...,toilets
4,2020-09-28 11:59:41.212000,@SpeedySticks007 @MrNeilJH @TLRailUK @christia...,seats
...,...,...,...
16944,2019-07-11 07:34:35,Haha oh man the audio corruption on @TLRailUK ...,announcements
16945,2020-08-10 11:19:10.181000,@TLRailUK SweetIs there a plug to charge my ph...,plugs
16946,2020-08-29 09:51:10.833000,@TLRailUK now there are far fewer commuters ha...,tables
16947,2020-11-02 12:06:06.967000,@geofftech I am voting for Thameslink. 1. in a...,toilets


# SPLIT DATA INTO TRAINING, TEST & VALIDATION SETS

In [4]:
# Split the dataset into training, test & validation sets.
# The target variable is the tweet topic; the features are every remaining column.
# The split is done in two stratified stages:
#    1. hold out 15% of all rows as the test set (85% stays in the "main" set)
#    2. hold out 15% of the main set as the validation set
# This yields roughly 72.25% train / 12.75% validation / 15% test.
# NOTE(review): an exact 70-15-15 split would need test_size = 0.15 / 0.85 in stage 2 - confirm which ratio is intended.
# Returns: X_train, X_test, X_val, y_train, y_test, y_val
# Parameters:
#    random_state: fixed so the same 3 subsets are produced on every run
#    stratify: the topic labels, so each subset keeps the same topic proportions as its parent set

split_seed = 2
holdout_fraction = 0.15

y = data['topic']                # target variable
X = data.drop(columns='topic')   # the entire dataframe without the topic column

# stage 1: dataset -> main + test
X_main, X_test, y_main, y_test = train_test_split(
    X, y, test_size=holdout_fraction, random_state=split_seed, stratify=y)

# stage 2: main -> train + validation
X_train, X_val, y_train, y_val = train_test_split(
    X_main, y_main, test_size=holdout_fraction, random_state=split_seed, stratify=y_main)

In [5]:
# Print the number of feature rows and target rows in each subset (the two counts should match)
for label, features, target in (
    ("Length of training set:\t\t", X_train, y_train),
    ("Length of test set:\t\t", X_test, y_test),
    ("Length of validation set:\t", X_val, y_val),
):
    print(label, len(features), " ", len(target))

Length of training set:		 12245   12245
Length of test set:		 2543   2543
Length of validation set:	 2161   2161


In [6]:
# Create the topic dataset: the training features with their topic label attached.
# assign() returns a new DataFrame (labels are aligned to the rows by index), so the
# in-place edits applied to topic_data in later cells work on an independent frame.
# pd.DataFrame(X_train) does not copy a DataFrame input by default, which left
# topic_data aliasing X_train.

topic_data = X_train.assign(topic=y_train)
topic_data

Unnamed: 0,source_created_at,text,topic
4788,2020-01-27 08:42:42,"@TLRailUK @GNRailUK so 2 trains arrive, one is...",delays
8447,2019-11-28 07:51:50,@TLRailUK currently sitting on the 7.46 (41106...,delays
6051,2019-04-30 06:47:40,@TLRailUK Hiya - 7.45 from Shortlands towards ...,hvac
13172,2019-05-20 18:57:13,Service announcement: Anyone with a standard c...,none
3922,2019-07-21 09:41:15,"@RichardWellings How do you work that out, Ric...",none
...,...,...,...
13264,2020-06-26 12:44:36.401000,Social Distancing #stpancrasinternational #tha...,covid
4117,2019-02-19 08:29:50,@Se_Railway please can you explain why your de...,tickets/seat_reservations
5589,2020-02-17 22:55:11,@TLRailUK Can’t wait for my £2 compensation Th...,delays
14026,2019-01-29 08:44:46,"Train delayed, no information, Thameslink path...",delays


In [7]:
# Order the rows chronologically by their timestamp column before the duplicate handling below

topic_data = topic_data.sort_values(by='source_created_at')
topic_data

Unnamed: 0,source_created_at,text,topic
10484,2019-01-16 10:41:46,@catherinerusse2 @TLRailUK The definitely know...,delays
11081,2019-01-16 11:58:45,@myubi @UlyssesGuybrush @delayrepayagent @TLRa...,delays
5269,2019-01-16 12:13:46,@TLRailUK @thebiggm Why do you @TLRailUK keep ...,vandalism
1068,2019-01-16 12:44:05,(Thameslink Update) 11:28 Rainham Kt to Luton ...,delays
1091,2019-01-16 12:44:07,(Thameslink Update) 12:28 Rainham Kt to Luton ...,delays
...,...,...,...
15084,2020-11-28 09:53:05.402000,@SouthernRailUK @TLRailUK Deeply grateful for...,service
16882,2020-11-28 19:41:56.244000,@TLRailUK I would say the roof gives it away.....,none
15082,2020-11-30 07:28:51.195000,@TLRailUK Thanks. The 0701 departed at 0704. W...,none
16881,2020-11-30 16:23:15.962000,With the student travel window opening this Th...,service


In [8]:
# The timestamp was only needed for ordering; keep just the tweet text and its topic

topic_data = topic_data.drop(columns='source_created_at')
topic_data

Unnamed: 0,text,topic
10484,@catherinerusse2 @TLRailUK The definitely know...,delays
11081,@myubi @UlyssesGuybrush @delayrepayagent @TLRa...,delays
5269,@TLRailUK @thebiggm Why do you @TLRailUK keep ...,vandalism
1068,(Thameslink Update) 11:28 Rainham Kt to Luton ...,delays
1091,(Thameslink Update) 12:28 Rainham Kt to Luton ...,delays
...,...,...
15084,@SouthernRailUK @TLRailUK Deeply grateful for...,service
16882,@TLRailUK I would say the roof gives it away.....,none
15082,@TLRailUK Thanks. The 0701 departed at 0704. W...,none
16881,With the student travel window opening this Th...,service


# REMOVE DUPLICATE ROWS (I.E. SAME TEXT & SAME TOPIC)

In [9]:
# Count the rows that exactly repeat an earlier row (same text and same topic).
# duplicated() defaults to keep='first': the first row of each duplicate group is not flagged,
# every later repeat is, so the sum is the number of surplus rows.

duplicate_row_count = topic_data.duplicated().sum()
print('Total duplicate rows (same text, same topic): ', duplicate_row_count)

Total duplicate rows (same text, same topic):  131


In [10]:
# Display every row that belongs to a group of exact duplicates.
# duplicated() `keep` parameter:
#    'first' (default): the 1st row of each group is not flagged, the rest are
#    'last': the last row of each group is not flagged, the rest are
#    False: every row of each group is flagged (used here so all copies are shown)

is_exact_duplicate = topic_data.duplicated(keep=False)
topic_data[is_exact_duplicate]

Unnamed: 0,text,topic
888,(Thameslink Update) 06:30 Luton to Orpington d...,delays
887,(Thameslink Update) 06:30 Luton to Orpington d...,delays
891,(Thameslink Update) 06:30 Rainham Kt to Luton ...,delays
892,(Thameslink Update) 06:30 Rainham Kt to Luton ...,delays
13363,TRAVEL: 20 minute delays @GatwickExpress @Sout...,delays
...,...,...
13453,TRAVEL: UPDATE: 10 minute delays @TLRailUK Nor...,delays
803,(Thameslink Update) 19:22 St Albans City to S...,delays
804,(Thameslink Update) 19:22 St Albans City to S...,delays
7671,@TLRailUK Trains delayed are they?,delays


In [11]:
# Remove the rows flagged as exact duplicates, keeping one row from each duplicate group

topic_data = topic_data.drop_duplicates()

In [12]:
# Sanity check: number of rows that still exactly repeat an earlier row (same text and topic)
topic_data.duplicated().sum()

0

# LIST DUPLICATE TWEET WITH DIFFERENT TOPICS (FOR VIEW, NOT TREATED)

In [13]:
# Count the rows whose tweet text repeats an earlier row's text.
# Rows identical in both text and topic were already dropped, so these repeats carry a different topic.

repeated_text_count = topic_data['text'].duplicated().sum()
print('Total numbers of duplicate Tweet with different topics: ', repeated_text_count)

Total numbers of duplicate Tweet with different topics:  513


In [14]:
# Show the first 20 rows whose tweet text also appears on another row (with a different topic).
# keep=False flags every row of each group, so all copies of a repeated tweet are shown.
# keep='first' (the default) or keep='last' would leave one row of each group unflagged instead.

has_repeated_text = topic_data['text'].duplicated(keep=False)
topic_data.loc[has_repeated_text].head(20)

Unnamed: 0,text,topic
8383,@TLRailUK can you tell the driver of the delay...,delays
8382,@TLRailUK can you tell the driver of the delay...,service
6476,@TLRailUK Is there honestly no way to sort the...,air conditioning
6477,@TLRailUK Is there honestly no way to sort the...,hvac
11408,@thameslink 13.28 to Sutton arrived at West Ha...,delays
11409,@thameslink 13.28 to Sutton arrived at West Ha...,station
14120,Warning train rant!!Since timetable of May 18 ...,plugs
14119,Warning train rant!!Since timetable of May 18 ...,tables
12490,I pay a shit load of money to sit on crap seat...,wifi
12489,I pay a shit load of money to sit on crap seat...,tables


In [15]:
# Report (rows, columns) of the topic dataset after removing exact duplicates
topic_shape = topic_data.shape
print('Dataset for Topic is ', topic_shape)

Dataset for Topic is  (12114, 2)


# CHECK MISSING VALUES IN THE DATASET

In [16]:
# For each column, report whether it contains at least one missing value
topic_data.isnull().any()

text     False
topic    False
dtype: bool

# CLEANING TWEET CONTENT

In [17]:
# Load spaCy's small English pipeline (en_core_web_sm)
# Add project-specific words to the pipeline's default stop word set (in-place set union)
# NOTE(review): the added words are presumably rail operator names / Twitter handles - confirm
# NOTE(review): later cells read the imported STOP_WORDS rather than nlp.Defaults.stop_words;
#   the printed STOP_WORDS contains these added words, so both names appear to refer to the
#   same set object - confirm this holds for the installed spaCy version

nlp = spacy.load('en_core_web_sm')
nlp.Defaults.stop_words |= {'thameslink','tlupdates','gtrailuk','tlrailuk','govia', 'gtr'}

In [18]:
# Print how many stop words are in the set, followed by the set itself

stop_word_count = len(STOP_WORDS)
print(stop_word_count)
print(STOP_WORDS)

332
{'beside', 'bottom', 'gtrailuk', 'herself', 'they', 'up', 'we', 'quite', 'neither', 'every', 'besides', 'themselves', 'anything', 'move', 'most', 'whereas', 'already', 'an', 'becomes', '’ll', 'keep', 'be', 'been', 'everyone', 'our', 'again', 'empty', 'full', 'but', 'seem', 'ours', 'less', 'to', 'itself', 'nobody', 'anyhow', 'yourself', 'only', 'per', 'anywhere', 'same', 'meanwhile', 'made', 'seemed', 'regarding', 'toward', '’ve', 'cannot', 'with', 'never', 'take', '’s', 'hereupon', 'not', 'nor', 'thameslink', 'twelve', 'may', 'whoever', 'she', 'would', 'above', 'thereupon', 'whenever', 'where', 'then', 'against', 'done', 'n‘t', 'whom', 'someone', 'what', 'two', 'back', 'now', 'three', 'latterly', 'within', 'tlupdates', 'get', 'least', 'for', 'out', 'him', 'became', 'although', 'none', 'front', 'a', 'on', 'between', 'might', 'indeed', 'doing', 'last', 'something', 'than', 'this', '‘ve', 'everywhere', 'about', 'my', 'nevertheless', 'below', 'thence', 'beyond', 'after', 'fifty', 'behi

In [19]:
# Function to clean tweet content with regular expressions.
# re.sub(pattern, replacement, string) substitutes every match of the pattern in the string.
# Pattern notes:
#   [ ... ]  character class
#   ^        (first inside a class) negates the class
#   \w       word characters (letters, digits, underscore)
#   \s       whitespace characters

def cleanText(text):
    """Return a normalised copy of a tweet.

    The steps are applied in this order:
      1. remove emoji in the listed Unicode ranges
      2. remove the literal hashtag #TLUpdates (matched before lowercasing, so case-sensitive)
      3. convert to lowercase
      4. remove Twitter @usernames
      5. remove http/https hyperlinks
      6. remove standalone numbers
      7. remove punctuation (anything that is neither a word character nor whitespace)

    Whitespace is left untouched, so removed pieces can leave runs of spaces behind.
    """
    emoji_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "]+",
        flags=re.UNICODE,
    )

    # steps 1-3: emoji, the #TLUpdates hashtag, then lowercase
    without_emoji = emoji_pattern.sub('', text)
    cleaned = re.sub(r'#TLUpdates', '', without_emoji).lower()

    # steps 4-7: each pattern is stripped in turn, in this exact order
    removal_patterns = (
        r'@[A-Za-z0-9\_]+',   # @usernames: letters, digits and underscores
        r'https?:\/\/\S+',    # hyperlinks
        r'\b\d+\b',           # standalone numbers
        r'[^\w\s]',           # punctuation
    )
    for removal_pattern in removal_patterns:
        cleaned = re.sub(removal_pattern, '', cleaned)
    return cleaned

In [20]:
# Clean the tweet content
# Every tweet in the topic dataset goes through these steps:
# 1. Basic clean (i.e. remove unnecessary emojis, patterns, punctuation...) using the cleanText function above
# 2. Tokenization: break sentence into words
# 3. Remove stop words
# 4. Lemmatization: reduce each word to its lemma
#
# The 'text' column is replaced by the resulting list of tokens.
# The column is transformed in one pass with Series.apply instead of writing into the
# frame row by row while iterating over it, and only a short preview is printed rather
# than two blocks of output for every tweet.

lemmatizer = WordNetLemmatizer()
PREVIEW_ROWS = 5  # number of before/after examples to print


def preprocess_tweet(text):
    """Clean, tokenize, stop-word filter and lemmatize a single tweet.

    Parameters:
        text: the raw tweet content (str).

    Returns:
        A list of lemmatized tokens. A value that is not a str is returned
        unchanged, so re-running this cell on an already processed column
        does not raise.
    """
    if not isinstance(text, str):
        return text
    tokens = nltk.word_tokenize(cleanText(text))  # basic clean + tokenization
    return [lemmatizer.lemmatize(token) for token in tokens if token not in STOP_WORDS]


processed_text = topic_data['text'].apply(preprocess_tweet)

# Print a few before/after pairs to check the cleaning
for original, processed in zip(topic_data['text'].head(PREVIEW_ROWS), processed_text.head(PREVIEW_ROWS)):
    print("Original Text:\t\t", original)
    print("\nAfter lemmatizing Text:\t", processed, "\n")

topic_data['text'] = processed_text

Original Text:		 @catherinerusse2 @TLRailUK The definitely know how to delay because their really good at it 😂😃

After lemmatizing Text:	 ['definitely', 'know', 'delay', 'good'] 

Original Text:		 @myubi @UlyssesGuybrush @delayrepayagent @TLRailUK It has been timed at 0839 since the timetable change, and the service itself for this period hasn’t been changed since September

After lemmatizing Text:	 ['timed', 'timetable', 'change', 'service', 'period', 'hasnt', 'changed', 'september'] 

Original Text:		 @TLRailUK @thebiggm Why do you @TLRailUK keep asking passengers what service had a graffitied train?? They are your trains, stored in your facilities! You MUST know ...... if you have eyes!

After lemmatizing Text:	 ['asking', 'passenger', 'service', 'graffitied', 'train', 'train', 'stored', 'facility', 'know', 'eye'] 

Original Text:		 (Thameslink Update) 11:28 Rainham Kt to Luton due 14:01 - 11:28 Rainham Kt to Luton due 14:01 has been delayed at Dartford and is now 5 minutes late.

A

In [21]:
# Write the cleaned topic dataset to a CSV file in the current working directory
# NOTE(review): the 'text' column holds Python lists of tokens at this point; confirm that
#   whatever reads this CSV parses them back into lists
topic_data.to_csv("Clean topic dataset.csv")