In [None]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

!pip install contractions -q
import contractions

!pip install textacy -q
from textacy import preprocessing

import re
import string

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.9/289.9 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.7/110.7 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m210.7/210.7 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m320.4/320.4 kB[0m [31m31.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m46.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m74.2 MB/s[0m eta [36m0:00:00[0m
[?25h

## Mount Google Drive

In [None]:
# Mount Google Drive at /content/drive so that files stored there can be opened by path.
# The dataset directory configured in the Settings cell lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

## Settings

In [None]:
import os
from pathlib import Path

# Default dataset location inside the mounted Google Drive.
_DEFAULT_DATASET_DIR = "/content/drive/My Drive/Colab Notebooks/NLP/toast-roast-dataset"

# Directory holding the Train/Val/Test Excel files. Setting the environment
# variable TOAST_ROAST_DATASET_DIR points the notebook at another copy of the
# dataset without editing this cell; when it is unset the Drive default is used.
dataset_path = Path(os.environ.get("TOAST_ROAST_DATASET_DIR", _DEFAULT_DATASET_DIR))

## Read and Concatenate Dataset

In [None]:
# Class ids used for the two categories: Roast = 0, Toast = 1
LABEL_ROAST = 0
LABEL_TOAST = 1


def _load_labelled_split(filename, label):
    """Read one Excel file from ``dataset_path`` and tag every row with ``label``.

    Parameters
    ----------
    filename : str
        File name relative to ``dataset_path``.
    label : int
        Class id written to the ``label`` column of every row.

    Returns
    -------
    pandas.DataFrame
        The file's rows plus a ``label`` column.
    """
    frame = pd.read_excel(dataset_path / filename, index_col=None)
    return frame.assign(label=label)


# Load roast and toast files for each split, labelling them on the way in
df_train_roast = _load_labelled_split("Train__Roast.xlsx", LABEL_ROAST)
df_train_toast = _load_labelled_split("Train__Toast.xlsx", LABEL_TOAST)
df_val_roast   = _load_labelled_split("Val__Roast.xlsx", LABEL_ROAST)
df_val_toast   = _load_labelled_split("Val__Toast.xlsx", LABEL_TOAST)
df_test_roast  = _load_labelled_split("Test__Roast.xlsx", LABEL_ROAST)
df_test_toast  = _load_labelled_split("Test__Toast.xlsx", LABEL_TOAST)

# Concatenate all splits into one dataframe
df = pd.concat([df_train_roast, df_train_toast,
                df_val_roast, df_val_toast,
                df_test_roast, df_test_toast], ignore_index=True)

# A missing text cell would be turned into the literal document "nan" by
# astype(str) below, so such rows are dropped (and reported) first.
n_missing_text = int(df["text"].isna().sum())
if n_missing_text:
    print(f"Dropping {n_missing_text} rows with missing text")
    df = df.dropna(subset=["text"]).reset_index(drop=True)

text_all = df["text"].astype(str).to_list()
labels = df["label"].to_numpy()

print(f"Total samples: {len(text_all)}, Labels shape: {labels.shape}")
print(f"Unique labels: {np.unique(labels)}, Roast (0): {np.sum(labels == LABEL_ROAST)}, Toast (1): {np.sum(labels == LABEL_TOAST)}")

## Preprocessing

In [None]:
# Punctuation removal helper
def f_punctuation_removal(text):
    """Return ``text`` with every character listed in ``string.punctuation`` deleted."""
    # A translation table that maps each ASCII punctuation character to "delete"
    drop_punctuation = str.maketrans('', '', string.punctuation)
    return text.translate(drop_punctuation)

# Whitespace normalization helper
def f_whitespace_normalization(text):
  """Collapse every run of whitespace in ``text`` to one space and trim both ends."""
  # split() with no argument cuts on whitespace runs and discards leading/trailing ones
  return ' '.join(text.split())

# Expand contractions before any other processing
text_all = [contractions.fix( text_current, slang=False) for text_current in text_all]


def f_clean_text(text):
  """Turn one contraction-expanded document into a normalised, space-separated string.

  The order of the steps matters:

  1. HTML tags, URLs, user handles, hashtags, emojis and numbers are handled on
     the raw text, while the characters that identify them ('<', '://', '@',
     '#', ...) are still present. Stripping punctuation first would leave
     these steps with nothing to match.
  2. The text is lower-cased before tokenizing so that the stop-word lookup
     (NLTK's English list is lower-case) also catches capitalised words such
     as "The" or "I".
  3. Punctuation is stripped per token before the stop-word lookup, so that
     fragments like "'s" are filtered as "s" instead of surviving as residue.

  Bracket characters are removed by the punctuation step, which keeps the
  enclosed words. textacy's ``remove.brackets`` is intentionally not applied to
  the raw text because it would delete the enclosed text as well.
  NOTE(review): re-add it in step 1 if dropping parenthesised text is wanted.

  Parameters
  ----------
  text : str
      One document.

  Returns
  -------
  str
      Lower-cased, lemmatized tokens with stop words and punctuation removed,
      joined by single spaces (may be empty).
  """
  # - Remove html tags, urls, user handles, hashtags, emojis, numbers; unify quotation marks
  text = preprocessing.remove.html_tags( text )
  text = preprocessing.replace.urls( text, repl=" ")
  text = preprocessing.replace.user_handles( text, repl=" ")
  text = preprocessing.replace.hashtags( text, repl=" ")
  text = preprocessing.replace.emojis( text, repl=" ")
  text = preprocessing.normalize.quotation_marks( text )
  text = preprocessing.replace.numbers( text, repl=" " )

  # - Lower case, then tokenize
  tokens = word_tokenize( text.lower() )

  # - Strip punctuation inside each token and drop tokens that become empty
  tokens = [f_punctuation_removal( token ) for token in tokens]
  tokens = [token for token in tokens if token]

  # - Lemmatize (as verbs)
  lemmatized_tokens = [lemmatizer.lemmatize(token, pos='v') for token in tokens]

  # - Remove stopwords
  filtered_tokens = [token for token in lemmatized_tokens if token not in stop_words]

  # - Convert from tokens to sentence and normalize whitespace
  return f_whitespace_normalization( " ".join( filtered_tokens ) )


# Other steps... (feel free to add/remove as per your informed choice)
text_data = [f_clean_text( text_curr ) for text_curr in text_all]

## Text Classification pipeline

In [5]:
# Hold out 20% of the samples for testing; stratify on the labels so both
# subsets keep the same class proportions, and fix the seed for repeatability
text_Train, text_Test, Labs_Train, Labs_Test = train_test_split(
    text_data,
    labels,
    test_size=0.2,
    stratify=labels,
    random_state=0,
)

# Subset sizes together with the mean label of each subset
train_label_mean = sum(Labs_Train) / len(Labs_Train)
test_label_mean = sum(Labs_Test) / len(Labs_Test)
print( len(text_Train), train_label_mean, len(text_Test), test_label_mean )

800 0.5 200 0.5


In [6]:
# CountVectorizer hyperparameters, collected in one place
countvec_params = dict(
    ngram_range=(1, 2),   # unigrams and bigrams
    max_features=5000,    # cap the number of features at 5000
)
vectorizer = CountVectorizer( **countvec_params )

In [7]:
# Apply CountVectorizer with custom hyperparameters to transform the text data into numerical features
Feats_Train_countvec = vectorizer.fit_transform( text_Train ).toarray()
Feats_Test_countvec = vectorizer.transform( text_Test ).toarray()

print( Feats_Train_countvec.shape, Feats_Test_countvec.shape )

(800, 5000) (200, 5000)


In [8]:
# Mean and standard deviation of the first feature column of the training matrix
first_feature_train = Feats_Train_countvec[:, 0]
print( np.mean(first_feature_train), np.std(first_feature_train) )

0.0025 0.04993746088859545


In [9]:
# Apply standard scaling
scaler = StandardScaler()
Feats_Train_countvec = scaler.fit_transform( Feats_Train_countvec )
Feats_Test_countvec = scaler.transform( Feats_Test_countvec )

print( Feats_Train_countvec.shape, Feats_Test_countvec.shape )

(800, 5000) (200, 5000)


In [10]:
# Mean and standard deviation of the first feature column of the training matrix
first_feature_column = Feats_Train_countvec[:, 0]
print( np.mean(first_feature_column), np.std(first_feature_column) )

1.3322676295501878e-17 1.0000000000000016


In [None]:
# Class ids and their display names, kept side by side. Passing the ids
# explicitly to the metrics fixes the row/column order, so the names below
# always line up with the right class, even if one class were absent from the
# true or predicted labels.
class_ids = [0, 1]
class_names = ["Roast", "Toast"]

# Train a linear SVM classifier
clf = LinearSVC( class_weight="balanced", max_iter=1000, random_state=0)
clf.fit( Feats_Train_countvec, Labs_Train)

# Predict the classes of the testing data
predLabs_Test = clf.predict( Feats_Test_countvec )

# Evaluate the performance of the model using classification metrics
print(classification_report( Labs_Test, predLabs_Test, labels=class_ids, target_names=class_names ))

# Calculate the confusion matrix (rows: true class, columns: predicted class)
cm = confusion_matrix( Labs_Test, predLabs_Test, labels=class_ids )

# Plot the confusion matrix on an explicit Axes
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')
ax.set_title('Confusion Matrix - Count Vectorizer (Roast vs Toast)')
plt.show()