In [1]:
import re 
import spacy 
import string 
import numpy as np 
import pandas as pd 

In [2]:
# Read the tweets dataset from 'Tweets.csv' (path relative to the working directory)
# and preview the first rows.
df = pd.read_csv('Tweets.csv')
df.head()

Unnamed: 0,airline_sentiment,text
0,neutral,@USAirways Is there a phone line to call into ...
1,positive,@united Bag was finally delivered and intact. ...
2,positive,@usairways Thanks to Kevin and team at F38ish ...
3,negative,"@AmericanAir Yes, talked to them. FLL says is ..."
4,negative,@VirginAmerica and it's a really big bad thing...


# Data Preprocessing

In [3]:
# Class distribution of the sentiment labels
# The column is selected by name, not by position: `df` is reassigned without this
# column further down, and a positional lookup would then silently count a different column.

df['airline_sentiment'].value_counts()
# Note that there are three imbalanced sentiment classes - with the negative class having the highest frequency

negative    9178
neutral     3099
positive    2363
Name: airline_sentiment, dtype: int64

In [4]:
# Convert the string sentiment classes into integer class ids
from sklearn.preprocessing import LabelEncoder

# Select the label column by name so the cell does not depend on column order.
labels = df['airline_sentiment']

label_encoder = LabelEncoder()

label_encoder.fit(labels)
print(f"The relevant classes are: {label_encoder.classes_}")

# The integer id of a class is its position in `label_encoder.classes_`.
df['target_class'] = label_encoder.transform(labels)
df.head()

The relevant classes are: ['negative' 'neutral' 'positive']


Unnamed: 0,airline_sentiment,text,target_class
0,neutral,@USAirways Is there a phone line to call into ...,1
1,positive,@united Bag was finally delivered and intact. ...,2
2,positive,@usairways Thanks to Kevin and team at F38ish ...,2
3,negative,"@AmericanAir Yes, talked to them. FLL says is ...",0
4,negative,@VirginAmerica and it's a really big bad thing...,0


In [5]:
# Retain the relevant columns (text + encoded target).
# errors='ignore' keeps the cell safe to re-run: once the column has been dropped,
# executing it again is a no-op instead of raising KeyError.
df = df.drop(columns=['airline_sentiment'], errors='ignore')
df.head()

Unnamed: 0,text,target_class
0,@USAirways Is there a phone line to call into ...,1
1,@united Bag was finally delivered and intact. ...,2
2,@usairways Thanks to Kevin and team at F38ish ...,2
3,"@AmericanAir Yes, talked to them. FLL says is ...",0
4,@VirginAmerica and it's a really big bad thing...,0


In [6]:
# Split the data into train and test sets while preserving the ratio of classes
from sklearn.model_selection import train_test_split


def ratios(x):
    """Return the percentage share of each distinct value in `x`."""
    return (x.value_counts() / len(x)) * 100


print(ratios(df.target_class))

X, y = df.text, df.target_class

# stratify=y keeps the class proportions the same in both partitions;
# random_state pins the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

0    62.691257
1    21.168033
2    16.140710
Name: target_class, dtype: float64


In [7]:
# Fraction of rows in each partition (expected: an 80-20 split)
n_total = len(X)
print(len(X_train) / n_total, len(X_test) / n_total)

# The class ratios should be preserved after the split as well
train_ratios = ratios(y_train)
test_ratios = ratios(y_test)
print(f"Train Data:\n{train_ratios},\nTest data:\n{test_ratios}")

0.8 0.2
Train Data:
0    62.696380
1    21.166325
2    16.137295
Name: target_class, dtype: float64,
Test data:
0    62.670765
1    21.174863
2    16.154372
Name: target_class, dtype: float64


In [8]:
# Load spaCy's small English pipeline; `tokenizer` uses it for tokens, POS tags and lemmas
nlp = spacy.load("en_core_web_sm")

# English stop words.
# Imported explicitly: the attribute chain `spacy.lang.en.stop_words` presumably only
# resolves once that submodule has been imported elsewhere (e.g. as a side effect of
# spacy.load), which makes it dependent on statement order.
from spacy.lang.en.stop_words import STOP_WORDS as stop_words

# Punctuation characters plus a three-dot ellipsis.
# Kept as one string: `tokenizer` checks tokens against it with `in`, i.e. a substring test.
punctuations = string.punctuation + '...'

In [9]:
# Remove URLs and @ mentions
def remove_urls(text):
    """Return `text` with http(s) links and @-mentions deleted.

    Any run of non-space characters containing "http:" or "https:" is removed
    in full; an "@" is removed together with the non-space characters after it.
    """
    return re.sub(r"\S*https?:\S*|@\S*", "", text, flags=re.MULTILINE)

# remove emojis
# Compiled once at definition time instead of on every call.
#
# The previous character class contained the range U+24C2-U+1F251, which spans
# almost the whole Basic Multilingual Plane above U+24C2 and therefore also deleted
# ordinary text such as CJK ideographs, Hangul and full-width forms. Only the code
# points that were meant are listed here.
_EMOJI_PATTERN = re.compile(
    "["
    "\u200D"                 # zero-width joiner (glues emoji sequences together)
    "\u231A"                 # watch
    "\u23CF"                 # eject symbol
    "\u23E9"                 # fast-forward symbol
    "\u24C2"                 # circled latin capital letter M
    "\u2500-\u2BEF"          # box drawing .. misc symbols and arrows (covers dingbats, U+2600-U+2B55)
    "\u3030"                 # wavy dash
    "\u303D"                 # part alternation mark
    "\u3297"                 # circled ideograph "congratulation"
    "\u3299"                 # circled ideograph "secret"
    "\uFE0F"                 # variation selector-16 (emoji presentation)
    "\U00010000-\U0010FFFF"  # all supplementary planes: emoticons, pictographs, transport, flags, ...
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(string):
    """Return `string` with emoji and pictographic symbols removed.

    Parameters
    ----------
    string : str
        Text to clean. The parameter name shadows the `string` module inside
        this function only; it is kept unchanged for existing callers.

    Returns
    -------
    str
        The input with every run of characters matched by `_EMOJI_PATTERN`
        deleted. Text in non-Latin scripts of the Basic Multilingual Plane
        (e.g. CJK, Hangul) is left intact.
    """
    return _EMOJI_PATTERN.sub(r'', string)

In [10]:
# tokenizer 
def tokenizer(text):
    """Turn a raw tweet into a list of cleaned lemmas.

    Steps: strip URLs/@-mentions and emoji, run the spaCy pipeline, take each
    token's stripped lemma (lower-cased unless the token is tagged PROPN), and
    drop lemmas that are stop words or that occur as a substring of
    `punctuations`. An empty lemma is dropped as well, because the empty
    string is a substring of every string.
    """
    doc = nlp(remove_emoji(remove_urls(text)))

    kept = []
    for token in doc:
        lemma = token.lemma_.strip()

        # Proper nouns keep their case; everything else is lower-cased
        if token.pos_ != "PROPN":
            lemma = lemma.lower()

        # Skip stop words and punctuation
        if lemma in stop_words or lemma in punctuations:
            continue

        kept.append(lemma)

    return kept

In [11]:
# Show one tweet before and after it goes through the tokenizer

tweet = df['text'].iloc[0]

print(tweet)             # raw text, before preprocessing
print(tokenizer(tweet))  # cleaned tokens, after preprocessing

@USAirways Is there a phone line to call into that isn't being affected by "bad weather"? Trying to book a vacation, here... 👀
['phone', 'line', 'affect', 'bad', 'weather', 'try', 'book', 'vacation']


In [12]:
# Tokenize every training tweet; each entry becomes a list of cleaned lemmas
cleaned_words = X_train.map(tokenizer)

In [13]:
# Distribution

# Check the 10 most common words after cleaning.
# Each tweet contributes a word at most once (set), so the counts are the
# number of tweets containing the word.
from collections import Counter

tweet_counts = Counter()
for tokens in cleaned_words:
    tweet_counts.update(set(tokens))

tweet_counts.most_common(10)

[('flight', 2879),
 ('thank', 1288),
 ('hour', 844),
 ('cancel', 787),
 ('service', 754),
 ('delay', 738),
 ('time', 728),
 ('help', 711),
 ('customer', 700),
 ('fly', 585)]

# Bag of Words Model 

To create a numeric feature representation of these tweets, we employ the bag of words model using the custom tokenizer function defined above.

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

# Unigram bag-of-words built on the custom tokenizer.
# token_pattern=None: the default regex pattern is not used when a tokenizer callable
# is supplied, and leaving it set makes scikit-learn warn about the unused parameter.
# NOTE(review): CountVectorizer defaults to lowercase=True, which lower-cases each
# document before it reaches `tokenizer`; the PROPN case handling in `tokenizer`
# presumably never sees capitals here - confirm whether lowercase=False is intended.
bow_vector = CountVectorizer(tokenizer=tokenizer, token_pattern=None, ngram_range=(1,1))

In [15]:
# Small demonstration of the vectorizer on two sentences.
# The vectorizer is fitted once and the matrix reused; the original ran
# fit_transform twice and threw the first result away.
sample_tweets = ['The flight was good', 'The flight was on time']
sample_bow = bow_vector.fit_transform(sample_tweets).toarray()

print(bow_vector.get_feature_names_out())
print(sample_bow)


['flight' 'good' 'time']
[[1 1 0]
 [1 0 1]]


# Classification pipeline

We can combine our preprocessing tasks and machine learning classification into a single pipeline using scikit-learn's `Pipeline` class.

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Multinomial logistic regression fed by the bag-of-words counts
logreg = LogisticRegression(multi_class="multinomial", max_iter=500)

# Two named steps: vectorize the raw tweets, then classify
pipeline = Pipeline(
    steps=[
        ('bow_vector', bow_vector),
        ('classifier', logreg),
    ]
)

In [17]:
# Fit both pipeline steps (vectorizer, then classifier) on the training split
pipeline.fit(X_train, y_train)

Pipeline(steps=[('bow_vector',
                 CountVectorizer(tokenizer=<function tokenizer at 0x0000019360F9A9D0>)),
                ('classifier',
                 LogisticRegression(max_iter=500, multi_class='multinomial'))])

In [18]:
# Predicted class ids for the held-out test tweets
y_hat = pipeline.predict(X_test)

In [19]:
# Accuracy = share of test tweets whose predicted class equals the true class
n_correct = (y_hat == y_test).sum()
accuracy = n_correct / len(y_test)
print(f"Model Accuracy: {accuracy}")

Model Accuracy: 0.7797131147540983


In [20]:
from sklearn import metrics

# Per-class precision / recall / F1 on the test split.
# The display names come from the fitted encoder rather than a hard-coded list, so
# they cannot drift from the integer ids: id i corresponds to `label_encoder.classes_[i]`.
class_names = label_encoder.classes_.tolist()
print(metrics.classification_report(y_test, y_hat, digits=3, target_names=class_names))

              precision    recall  f1-score   support

    negative      0.844     0.877     0.860      1835
     neutral      0.600     0.573     0.586       620
    positive      0.743     0.672     0.706       473

    accuracy                          0.780      2928
   macro avg      0.729     0.707     0.717      2928
weighted avg      0.776     0.780     0.777      2928

