In [1]:

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

import nltk 
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import re

import spacy
from spacy.lang.en.stop_words import STOP_WORDS

# Import data

In [2]:
data = pd.read_csv('C:/Users/Quynh Pham/Desktop/PM/Dataset/tweets.csv')
data.head()

Unnamed: 0,source_created_at,author_id,text,source,language,longitude,latitude,id,source_id,tweet_id,user_id,relevant,topic,ground_truth,sentiment
0,2020-09-18 21:56:20.798000,593731316,@DSisourath The Thameslink core between London...,sprinklr,en,-0.12574,51.50853,acd7673f-e621-5f1a-d662-df278964a6ea,,acd7673f-e621-5f1a-d662-df278964a6ea,Z003XDCS,True,service,True,negative
1,2020-10-13 07:31:53.122000,745583289520496640,@DulwichHistory Loving the complaint about peo...,sprinklr,en,-0.12574,51.50853,5b92aba8-4b05-6c63-8485-e9c870742137,,5b92aba8-4b05-6c63-8485-e9c870742137,Z003XDCS,True,delays,True,negative
2,2020-10-26 19:27:24.695000,303134761,@SW_Help .And yet you have no toilets on some ...,sprinklr,en,-0.12574,51.50853,0a799c07-8b76-17ba-b840-e538d51e832d,,0a799c07-8b76-17ba-b840-e538d51e832d,Z003XDCS,True,toilets,True,negative
3,2020-10-26 19:28:49.281000,303134761,@SW_Help you have no toilets on some of your t...,sprinklr,en,-0.12574,51.50853,8b4d2a34-c4f0-0e19-4055-dfe4af5f0e14,,8b4d2a34-c4f0-0e19-4055-dfe4af5f0e14,Z003XDCS,True,toilets,True,negative
4,2020-09-28 11:59:41.212000,56427671,@SpeedySticks007 @MrNeilJH @TLRailUK @christia...,sprinklr,en,-1.09125,50.79899,1fd08862-d8c7-0682-6b11-2603fba22d94,,1fd08862-d8c7-0682-6b11-2603fba22d94,Z003XDCS,True,seats,True,neutral


# Remove unnecessary columns except 'text' & 'topic'

In [3]:
# Drop every column except 'text' (tweet content) and 'topic', then sort the rows by text

data.drop(data.columns.difference(['text','topic']), axis=1, inplace=True)
data.sort_values(by='text', inplace=True)
data

Unnamed: 0,text,topic
15196,""" Govia Thameslink fined £1m over passenger ki...",none
25,"""Congestion"" is NOT a reason for delay, it's a...",delays
26,"""Govia Thameslink Railway (GTR) is facing a £5...",delays
27,"""Minor delays"" #thameslink https://t.co/G46OnR...",delays
28,"""My apologies passengers, we appear to be havi...",doors
...,...,...
15076,🚄 The new Thameslink trains use regenerative b...,none
15077,🚄 •Overground: minor delays Hackney Wick to St...,delays
15078,🚆 A fault on a train earlier today at City Tha...,delays
15079,🚆 An operational incident between Finsbury Par...,delays


# Remove duplicate rows (i.e. same text & same topic)

In [4]:
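# Show total number of duplicate rows (same text AND same topic)
# duplicated() defaults to keep='first': the 1st row of each duplicate group is kept, only the later copies are flagged as duplicates
# Hence this count (238 in the run recorded below) is smaller than the number of rows listed with keep=False in the next cell, which shows every member of each group (some tweets appear 2 times, some 3 times)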

print('Total duplicate rows (same text, same topic): ', data.duplicated().sum())

Total duplicate rows (same text, same topic):  238


In [5]:
# list all duplicate rows in the entire dataset
# duplicated() parameters:
#    By default, keep = 'first': 1st of duplicated rows to be kept in the dataframe, the rest to be flagged as duplicates and show down here
#    keep = 'last': last of duplicated rows to be kept in the dataframe, the rest to be flagged as duplicates and show down here
#    keep = False: flag and show all duplicated rows (not keeping any rows in dataframe)

data.loc[data.duplicated(keep = False),:]

Unnamed: 0,text,topic
77,#Hernehill on the class 700 to #LondonVictoria...,air conditioning
15202,#Hernehill on the class 700 to #LondonVictoria...,air conditioning
284,#TLUpdates - Following a road vehicle collidin...,delays
283,#TLUpdates - Following a road vehicle collidin...,delays
523,#TLUpdates - We thank you for your patience du...,delays
...,...,...
16781,Train operator Govia Thameslink Railway has be...,none
15192,"🏡 Cricklewood Lane, Cricklewood, London, NW2Br...",none
15065,"🏡 Cricklewood Lane, Cricklewood, London, NW2Br...",none
15072,"📣 #NewInstruction!🏡 #ForSale: Millway, #NW7A w...",none


In [6]:
# Drop rows that already flagged as duplicates (keep 1 row in the dataframe)
# inplace=True: make changes to the original DataFrame

data.drop_duplicates(inplace=True)

In [7]:
data.duplicated().sum()

0

# List duplicate tweet with different topics (for view, not treated)

In [8]:
# how many duplicate Tweet remains in the dataset

print('Total numbers of duplicate Tweet with different topics: ',data['text'].duplicated().sum())

Total numbers of duplicate Tweet with different topics:  962


In [9]:
# Show the first 20 rows whose tweet text occurs more than once (same text, different topics)
# duplicated() defaults to keep='first': the 1st occurrence is kept and only the later copies are flagged
# keep='last' keeps the last occurrence instead and flags the earlier ones
# keep=False (used here) flags every occurrence, so all copies of each tweet are listed together

data[data['text'].duplicated(keep=False)].head(20)

Unnamed: 0,text,topic
67,#EastCroydon #LondonBridge #Victoria use https...,tickets/seat_reservations
66,#EastCroydon #LondonBridge #Victoria use https...,delays
309,#TLUpdates - If you are travelling on services...,delays
310,#TLUpdates - If you are travelling on services...,tickets/seat_reservations
357,#TLUpdates - Please allow extra time for your ...,tickets/seat_reservations
356,#TLUpdates - Please allow extra time for your ...,delays
370,#TLUpdates - Please listen carefully to statio...,station
371,#TLUpdates - Please listen carefully to statio...,service
382,#TLUpdates - Services are now able to travel o...,tickets/seat_reservations
381,#TLUpdates - Services are now able to travel o...,delays


# Check missing values in the dataset

In [10]:
data.isnull().any()

text     False
topic    False
dtype: bool

# Export dataset for topic to csv file

In [11]:
print('Dataset for Topic is ',data.shape)

Dataset for Topic is  (16711, 2)


In [12]:
#data.to_csv('topic_data.csv')

# Split data into training, test & validation sets

In [13]:
# Split the dataset into training, validation & test sets.
# Two stratified splits are chained:
#    1. hold out 15% of all rows as the test set (the remaining 85% is "main")
#    2. hold out 15% of "main" as the validation set (0.15 * 0.85 = 12.75% of all rows)
# The resulting ratio is therefore roughly 72-13-15 (train-validation-test), not 70-15-15.
# NOTE(review): if an exact 70-15-15 split is intended, the second test_size would need to be 0.15/0.85 -- confirm intent.
# Target variable is topic
# Produces: X_train, X_test, X_val, y_train, y_test, y_val
# Parameters:
#    random_state: fixed int so the same 3 subsets are produced on every run
#    stratify: the topic labels, so each subset keeps the same topic proportions as the set it was split from
# NOTE(review): tweets whose text repeats under a different topic were only listed earlier, not removed,
#    so identical text may land in more than one subset -- confirm this is acceptable.

X = data['text'] # feature: the tweet text column
y = data['topic'] # our target variable

X_main, X_test, y_main, y_test = train_test_split(X,y,test_size = 0.15, random_state=2, stratify = y) # split dataset into main & test sets
X_train, X_val, y_train, y_val = train_test_split(X_main, y_main, test_size=0.15, random_state=2, stratify = y_main) # split main into train & validation sets

In [14]:
print("Length of training set:\t\t",len(X_train)," ",len(y_train))
print("Length of test set:\t\t",len(X_test)," ",len(y_test))
print("Length of validation set:\t",len(X_val)," ",len(y_val))

Length of training set:		 12073   12073
Length of test set:		 2507   2507
Length of validation set:	 2131   2131


# Export training, validation & test sets to csv file

In [15]:
train = pd.DataFrame([X_train, y_train]).T
train

Unnamed: 0,text,topic
6646,@TLRailUK I’m trying to process a refund for t...,delays
11692,Another damning indictment on @TLRailUK on the...,toilets
4981,@TLRailUK @SouthernRailUK By severely reduced ...,delays
8307,@TLRailUK been waiting outside Dartford on the...,delays
1323,(Thameslink Update) 18:37 St Albans City to Su...,delays
...,...,...
15621,@TLRailUK Good morning! Just want to notify yo...,vandalism
5348,@TLRailUK All the displays screens in the carr...,none
13029,Please can you turn the heating off @TLRailUK ...,hvac
8308,@TLRailUK beyond pitiful. Told to go to platfo...,station


In [16]:
test = pd.DataFrame([X_test, y_test]).T
test

Unnamed: 0,text,topic
15444,@SouthernRailUK @southern @TLRailUK There you ...,tickets/seat_reservations
8504,@TLRailUK does your air system bring in fresh ...,air conditioning
3447,@LaraLipsey @tlupdates @TLRailUK They’ve obvio...,hvac
797,(Thameslink Update) 18:52 St Albans City to S...,delays
5272,@TLRailUK @thetrainline Thank you for letting ...,delays
...,...,...
7597,@TLRailUK There isn’t any seats left on this t...,none
16585,In case those affected by this had not seen th...,train_general
3310,@JillySueD @TLRailUK Couldn't spot any numbers...,delays
2071,1648 Grand Central train to Sunderland: On tim...,delays


In [17]:
validation = pd.DataFrame([X_val, y_val]).T
validation

Unnamed: 0,text,topic
2166,1820 Great Northern train to Moorgate: expecte...,delays
9559,@TLRailUK waiting for a train to go into the s...,delays
3511,@LutonTown @TLRailUK Hi Luton town any chance ...,none
4342,@SteveWhiteRail @HPDCommuters @TLRailUK @grant...,covid
8391,@TLRailUK cancelled four stops on the train we...,delays
...,...,...
12197,Govia Thameslink Railway has announced that Lu...,none
2337,2020 LNER train to Kings Cross: Delayed - plat...,delays
5211,@TLRailUK @nationalrailenq Bedford Station - t...,delays
5092,@TLRailUK @TfL @NetworkRailSE I would love it ...,delays


In [18]:
#test.to_csv('topic_test.csv') 
#train.to_csv('topic_train.csv')
#validation.to_csv('topic_validation.csv')

# Cleaning Tweet Content

In [19]:
# Load the spaCy English pipeline 'en_core_web_sm' and customise its stop-word list:
#   - add project-specific words (operator names / handles) that carry no topic information
#   - take 'not' out of the list so that negation is kept in the cleaned text
# The stop-word set is modified in place, so this cell has to be safe to run more than once.

nlp = spacy.load('en_core_web_sm')
nlp.Defaults.stop_words |= {'thameslink','tlupdates','gtrailuk','tlrailuk','govia', 'gtr'}
# discard() rather than remove(): remove() raises KeyError when 'not' has already
# been taken out by a previous run of this cell, discard() is a no-op in that case.
nlp.Defaults.stop_words.discard('not')



In [20]:
# Print the list of stop words
STOP_WORDS

print(len(STOP_WORDS))
print(STOP_WORDS)

331
{'does', 'keep', 'together', 'further', '’ll', 'already', 'this', 'the', '‘ve', 'while', 'itself', 'below', 'seemed', 'became', 'mostly', 'though', 'between', 'which', 'tlupdates', 'whether', 'wherever', 'would', 'never', 'in', 'too', 'hereupon', 'thereafter', 'because', 'top', 'everywhere', 'somewhere', 'any', 'go', 'third', 'very', 'both', 'might', 'or', 'always', 'anyway', 'yet', 'own', 'there', 'elsewhere', 'ca', 'behind', 'same', 'cannot', 'latterly', 'she', "n't", 'towards', '’m', 'using', 'nevertheless', 'such', 'get', 'part', 'wherein', 'whatever', 'well', 'how', 'becoming', "'s", 'although', 'done', 'less', 'do', 'will', 'all', 'otherwise', 'empty', '’s', 'again', 'anyone', 'hereafter', 'ours', 'are', 'alone', 'many', 'please', 'seems', 'themselves', 'were', 'indeed', 'across', 'am', 'with', 'several', 'still', 'next', 'hereby', 'so', 'whole', 'either', '‘s', 'myself', 'really', 'whose', 'what', '‘m', 'anything', 'almost', 'whereas', 'everyone', 'above', 'when', 'nor', 'no

In [21]:
# Create a function to clean tweet content
# Tweet will be cleaned by the following steps:
# 1. Remove emoji
# 2. Remove #thameslink, #TLUpdates, @gtrailuk, @TLRailUK
# 3. Turn words to lowercase
# 4. Remove Twitter @usernames
# 5. Remove hyperlinks
# 6. Remove standalone numbers
# 7. Remove punctuation
# Return the cleaned text

# Patterns are compiled once, when the cell is run, instead of on every call.
_EMOJI_RE = re.compile(
    "["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "]+",
    flags=re.UNICODE,
)
# Operator hashtags / handles. Matched case-insensitively because this step runs
# before lowercasing: a case-sensitive match lets "#Thameslink" or "#tlupdates"
# through, and the later punctuation step then turns them into ordinary words.
_BRAND_TAG_RE = re.compile(r'#thameslink|#tlupdates|@gtrailuk|@tlrailuk', flags=re.IGNORECASE)


def cleanText(text):
    """Return a normalised copy of a single tweet.

    Steps, in order:
      1. remove emoji in the ranges covered by ``_EMOJI_RE``
      2. remove #thameslink, #TLUpdates, @gtrailuk, @TLRailUK (any casing)
      3. lowercase
      4. remove remaining @usernames (letters, digits, underscore)
      5. remove http/https hyperlinks
      6. remove standalone numbers
      7. remove punctuation (anything that is neither a word character nor whitespace)

    Whitespace is not collapsed, so removed items can leave runs of spaces.

    Parameters:
        text (str): raw tweet text.

    Returns:
        str: the cleaned text.
    """
    text = _EMOJI_RE.sub('', text)  # remove emoji
    text = _BRAND_TAG_RE.sub('', text)  # remove operator hashtags / handles

    text = text.lower()  # turn every capitalization to lowercase
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)  # remove @usernames
    text = re.sub(r'https?://\S+', '', text)  # remove hyperlinks
    text = re.sub(r'\b\d+\b', '', text)  # remove standalone numbers ("1m" is kept)
    text = re.sub(r'[^\w\s]', '', text)  # remove punctuation
    return text

In [22]:
# Clean the tweet content
# Run through every tweet in the dataset that is passed in
# Cleaning steps include:
# 1. Basic clean (i.e. remove unnecessary emojis, patterns, punctuation...) using the cleanText function above
# 2. Tokenization: break sentence into words
# 3. Remove stop words
# 4. Lemmatization: reduce each word to its dictionary form (lemma)
# 5. Drop single-character tokens and join the remaining words back into one string

def cleanSentence(main_df):
    """Return a copy of ``main_df`` whose 'text' column holds cleaned tweets.

    Each tweet goes through: cleanText() -> nltk word tokenization ->
    stop-word removal (STOP_WORDS) -> WordNet lemmatization -> removal of
    single-character tokens -> re-joining with single spaces.

    Parameters:
        main_df (pandas.DataFrame): frame with a 'text' column of strings.
            It is not modified.

    Returns:
        pandas.DataFrame: copy of ``main_df`` with 'text' replaced by the
        cleaned text; index and all other columns are unchanged.
    """
    df = main_df.copy()
    lemmatizer = WordNetLemmatizer()

    def _clean_one(raw_text):
        # basic clean + tokenization
        tokens = nltk.word_tokenize(cleanText(raw_text))
        # stop-word removal, then lemmatization
        lemmas = [lemmatizer.lemmatize(tok) for tok in tokens if tok not in STOP_WORDS]
        # single characters left after lemmatization are dropped
        return ' '.join(lemma for lemma in lemmas if len(lemma) != 1)

    # Assign the whole column at once, by position. Writing back row by row with
    # df.at[label, 'text'] is slow and, when index labels repeat, addresses every
    # row sharing that label instead of just the current one.
    df['text'] = [_clean_one(raw_text) for raw_text in df['text']]
    return df