In [1]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

!pip install contractions -q
import contractions

!pip install textacy -q
from textacy import preprocessing

import re
import string

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/345.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m337.9/345.1 kB[0m [31m10.2 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m345.1/345.1 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.9/114.9 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m210.7/210.7 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.9/2.9 MB[0m [31m41.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m321.6/321.6 kB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m360.5/360.5 kB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

## Settings

In [2]:
# Mount Google Drive (for Colab)
from google.colab import drive
drive.mount('/content/drive')

# Dataset folder from Drive
dataset_dir = '/content/drive/My Drive/Colab Notebooks/NLP/toast-roast-dataset'

roast_files = [
    'Train__Roast.xlsx',
    'Val__Roast.xlsx',
    'Test__Roast.xlsx',
]
toast_files = [
    'Train__Toast.xlsx',
    'Val__Toast.xlsx',
    'Test__Toast.xlsx',
]


Mounted at /content/drive


## Read dataset

In [3]:
# Read roast/toast files and concatenate into a single dataset
roast_frames = [pd.read_excel(f"{dataset_dir}/{name}", index_col=None) for name in roast_files]
toast_frames = [pd.read_excel(f"{dataset_dir}/{name}", index_col=None) for name in toast_files]

df_roast = pd.concat(roast_frames, ignore_index=True).assign(label=0)
df_toast = pd.concat(toast_frames, ignore_index=True).assign(label=1)
df = pd.concat([df_roast, df_toast], ignore_index=True)

text_all = df["text"].astype(str).to_list()
labels = df["label"].to_numpy()

print(len(text_all), labels.shape, np.unique(labels), sum(labels==0), sum(labels==1))


119261 (119261,) [0 1] 68159 51102


## Preprocessing

In [4]:
# Define function for punctuation removal
def f_punctuation_removal(text):
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    return text

# Define function for whitespace normalization
def f_whitespace_normalization(text):
  text = re.sub('[\s]+', ' ', text).strip()
  return text

# Remove contractions
text_all = [contractions.fix( text_current, slang=False) for text_current in text_all]

# Other steps... (feel free to add/remove as per your informed choice)
text_data = []

for text_curr in text_all:

  # - Tokenize
  tokens = word_tokenize(text_curr)

  # - Lemmatize
  lemmatized_tokens = [lemmatizer.lemmatize(token, pos='v') for token in tokens]

  # - Remove stopwords
  filtered_tokens = [token for token in lemmatized_tokens if token not in stop_words]

  # - Convert from tokens to sentence, lower case, punctuation removal,and append to list
  sent = " ".join( filtered_tokens ).lower()
  sent = f_punctuation_removal( sent )

  # - Remove hashtags, user handles, emojis, urls, quotation marks, brackets, numbers
  sent = preprocessing.replace.hashtags( sent, repl=" ")
  sent = preprocessing.replace.user_handles( sent, repl=" ")
  sent = preprocessing.replace.emojis( sent, repl=" ")
  sent = preprocessing.replace.urls( sent, repl=" ")

  sent = preprocessing.normalize.quotation_marks( sent )
  sent = preprocessing.remove.html_tags( sent )
  sent = preprocessing.remove.brackets( sent )
  sent = re.sub('"', '', sent)
  sent = re.sub("'", '', sent)
  sent = preprocessing.replace.numbers( sent, repl=" " )

  # - Normalize whitespace
  sent = f_whitespace_normalization( sent )

  # - Append to list
  text_data.append( sent )



  text = re.sub('[\s]+', ' ', text).strip()


## Text Classification pipeline

In [5]:
# Split the dataset into training and testing subsets (80% train, 20% test)
text_Train, text_Test, Labs_Train, Labs_Test = train_test_split(text_data, labels, test_size=0.2, stratify=labels, random_state=0)

print( len(text_Train), sum(Labs_Train)/len(Labs_Train), len(text_Test), sum(Labs_Test)/len(Labs_Test) )

95408 0.42848608083179607 23853 0.4284995598037983


In [6]:
# Set custom hyperparameters for TfidfVectorizer
vectorizer = TfidfVectorizer( ngram_range=(1, 2),  # Consider unigrams and bigrams
                              max_features=5000  # Limit the number of features to 5000
                              )

In [7]:
# Apply TfidfVectorizer with custom hyperparameters to transform the text data into numerical features
Feats_Train_tfidf = vectorizer.fit_transform( text_Train ).toarray()
Feats_Test_tfidf = vectorizer.transform( text_Test ).toarray()

print( Feats_Train_tfidf.shape, Feats_Test_tfidf.shape )

(95408, 5000) (23853, 5000)


In [8]:
print( np.mean(Feats_Train_tfidf[:,0]), np.std(Feats_Train_tfidf[:,0]) )

0.00014110529570648448 0.008082978711863135


In [9]:
# Apply standard scaling
scaler = StandardScaler()
Feats_Train_tfidf = scaler.fit_transform( Feats_Train_tfidf )
Feats_Test_tfidf = scaler.transform( Feats_Test_tfidf )

print( Feats_Train_tfidf.shape, Feats_Test_tfidf.shape )

(95408, 5000) (23853, 5000)


In [10]:
print( np.mean(Feats_Train_tfidf[:,0]), np.std(Feats_Train_tfidf[:,0]) )

-2.9789650166028016e-19 1.0000000000006601


In [None]:
# Train an SVM classifier with the 'linear' kernel
clf = LinearSVC( class_weight="balanced", max_iter=1000, random_state=0)
clf.fit( Feats_Train_tfidf, Labs_Train)

# Predict the classes of the testing data
predLabs_Test = clf.predict( Feats_Test_tfidf )

# Evaluate the performance of the model using classification metrics
print(classification_report( Labs_Test, predLabs_Test ))

# Calculate the confusion matrix
cm = confusion_matrix( Labs_Test, predLabs_Test)

# Plot the confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.title('Confusion Matrix - TFIDF Vectorizer')
plt.show()
