Dataset: https://www.kaggle.com/datasets/sbhatti/financial-sentiment-analysis

In [1]:
import pandas as pd

def read_csv(fname: str) -> pd.DataFrame:
    return pd.read_csv(fname)

In [2]:
df = read_csv('data.csv')

df.head()

Unnamed: 0,Sentence,Sentiment
0,The GeoSolutions technology will leverage Bene...,positive
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative
2,"For the last quarter of 2010 , Componenta 's n...",positive
3,According to the Finnish-Russian Chamber of Co...,neutral
4,The Swedish buyout firm has sold its remaining...,neutral


In [3]:
df.describe()

Unnamed: 0,Sentence,Sentiment
count,5842,5842
unique,5322,3
top,"Operating loss totalled EUR 0.9 mn , down from...",neutral
freq,2,3130


In [4]:
def read_data(fname: str) -> tuple[pd.DataFrame, pd.DataFrame]:
    data = read_csv(fname)
    return data.Sentence, data.Sentiment

In [5]:
X, y = read_data('data.csv')
(y.value_counts() / y.size)

Sentiment
neutral     0.535775
positive    0.317015
negative    0.147210
Name: count, dtype: float64

Traditional ML models can handle string output, but deep learning models require numeric output. Some loss functions (f1) actually require one-hot encoded, but we'll just use some other loss function.

In [6]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

In [7]:
label_encoder.inverse_transform([0, 1, 2])

array(['negative', 'neutral', 'positive'], dtype=object)

We can use class weights (= inverse of the distribution) to train our models should we wish to.

In [8]:
DISTRIBUTION = { i: (y == i).sum() / y.size for i in [0, 1, 2] }
CLASS_WEIGHTS = { i: 1 / DISTRIBUTION[i] for i in [0, 1, 2] }
CLASS_WEIGHTS

{0: np.float64(6.793023255813953),
 1: np.float64(1.8664536741214057),
 2: np.float64(3.1544276457883367)}

In [9]:
X = X.to_numpy()

Let's extract a test dataset and put it away.

In [10]:
from sklearn.model_selection import train_test_split

TEST_SPLIT = 0.1
VALIDATION_SPLIT = 0.15

X, X_test, y, y_test = train_test_split(X, y, random_state=0, stratify=y, test_size=TEST_SPLIT)

### Evalution - Scoring Methods

Choice of metric depends on how we plan to use this model. Note that this is imbalanced classification, so plain accuracy might not be the best metric. Since there's no specific goal in mind, I'm looking at a bunch of metrics to get an overall evaluation of the model.

In [11]:
from enum import Enum

class ScoringMethod(Enum):
    CLASSIFICATION_REPORT = 0
    CONFUSION_MATRIX = 1
    CROSS_VAL_SCORE = 2
    SUMMARY = 3

In [12]:
import numpy as np

from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    cohen_kappa_score,
)

def custom_summary(y_val: np.ndarray, y_pred: np.ndarray) -> dict[str, float]:
    return {
        "accuracy": accuracy_score(y_val, y_pred),
        "balanced_accuracy": float(balanced_accuracy_score(y_val, y_pred)),
        "f1_macro": float(f1_score(y_val, y_pred, average="macro")),
        "f1_weighted": float(f1_score(y_val, y_pred, average="weighted")),
        "precision_weighted": float(precision_score(y_val, y_pred, average="weighted")),
        "cohen_kappa": float(cohen_kappa_score(y_val, y_pred, weights="linear")),
    }

In [13]:
from sklearn.model_selection import train_test_split, cross_validate, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix

METRICS = (
    "accuracy",
    "balanced_accuracy",
    "f1_macro",
    "f1_weighted",
    "precision_weighted",
)

def score(model,
          X: np.ndarray,
          y: np.ndarray,
          method: ScoringMethod = ScoringMethod.CROSS_VAL_SCORE,
          scoring: str = "balanced_accuracy"):

    if method == ScoringMethod.CROSS_VAL_SCORE:
        results = cross_validate(model, X, y, scoring=METRICS, cv=3, n_jobs=-1)
        return {
            f"cv_{metric}": results[f"test_{metric}"].mean()
            for metric in METRICS
        }

    X_train, X_val, y_train, y_val = train_test_split(X,
                                                      y,
                                                      random_state=0,
                                                      stratify=y,
                                                      test_size=VALIDATION_SPLIT)

    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)

    return {
        ScoringMethod.CLASSIFICATION_REPORT: classification_report,
        ScoringMethod.CONFUSION_MATRIX: confusion_matrix,
        ScoringMethod.SUMMARY: custom_summary,
    }.get(method)(y_val, y_pred)

In [14]:
from sklearn.metrics import ConfusionMatrixDisplay

def plot_confusion_matrix(confusion_matrix: np.ndarray):
    ConfusionMatrixDisplay(confusion_matrix, display_labels=model.classes_).plot() 

### Extremely Basic Models

In [15]:
from sklearn.base import BaseEstimator

class RandomModel(BaseEstimator):
    def fit(self, X, y=None):
        if y is None:
            self.distribution = { 0: 1/3, 1: 1/3, 2: 1/3 }
        else:
            n = y.size
            self.distribution = {
                i: (y == i).sum() / n
                for i in [0, 1, 2]
            }

    def predict(self, X):
        return pd.Series(np.random.choice(
            list(self.distribution.keys()),
            size=X.size,
            p=list(self.distribution.values())
        ), name = "Sentiment")

In [16]:
score(RandomModel(), X, y, ScoringMethod.CONFUSION_MATRIX)

array([[ 12,  62,  42],
       [ 69, 225, 129],
       [ 40, 138,  72]])

In [17]:
score(RandomModel(), X, y, ScoringMethod.SUMMARY)

{'accuracy': 0.39923954372623577,
 'balanced_accuracy': 0.30889008450857314,
 'f1_macro': 0.3083160697178291,
 'f1_weighted': 0.3970393223392711,
 'precision_weighted': 0.39492872102437837,
 'cohen_kappa': -0.04282117662133644}

In [18]:
class SingleSentimentModel(BaseEstimator):
    def __init__(self, sentiment: int = 0):
        self.sentiment = sentiment

    def fit(self, X, y=None):
        pass

    def predict(self, X):
        return [self.sentiment] * X.size

In [19]:
score(SingleSentimentModel(sentiment=1), X, y, ScoringMethod.SUMMARY)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'accuracy': 0.5361216730038023,
 'balanced_accuracy': 0.3333333333333333,
 'f1_macro': 0.23267326732673266,
 'f1_weighted': 0.37422354402740654,
 'precision_weighted': 0.2874264482643959,
 'cohen_kappa': 0.0}

As noted earlier, accuracy might not be the best metric as this is an imbalanced classification. We see that picking neutral for everything gives us a better accuracy than picking a random sentiment, but is worse according to every other metric.

## Text processing

In [20]:
import nltk

nltk.download("punkt")  # stemmer
nltk.download("punkt_tab")  # word_tokenize
nltk.download("stopwords")
nltk.download("wordnet")  # lemmatizer
nltk.download('averaged_perceptron_tagger_eng') # pos_tag
nltk.download('universal_tagset')  # pos_tag

[nltk_data] Downloading package punkt to /home/recurze/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/recurze/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/recurze/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/recurze/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/recurze/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /home/recurze/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


True

#### Preprocessor

Clean up code by removing some unwanted elements and lowercasing the text

In [21]:
import re

def preprocessor(text: str) -> str:
    regex_url = r"(http|www)\S+"
    regex_twitter_mentions = r"@\w+"
    regex_ticker = r"\$[A-Za-z]{1,5}(\.[A-Za-z]{1,3})?"
    regex_punctuation = r"[^\w\s']+"

    replacements = [
        (regex_url, ""),
        (regex_twitter_mentions, ""),
        #(regex_ticker, ""),
        #(regex_punctuation, ""),
    ]

    text = text.lower()
    for pattern, replacement in replacements:
        text = re.sub(pattern, replacement, text)

    return text

In [22]:
X[1000], preprocessor(X[1000])

('$STX long play, another consistent profit zone.. check it out https://t.co/DQfuzOzYlh https://t.co/ggne4ayzqn',
 '$stx long play, another consistent profit zone.. check it out  ')

In [23]:
X[3147], preprocessor(X[3147])

('Finnish fibers and plastic products maker Suominen Corporation said its net loss from continuing operations narrowed to 1.8 mln euro ( $ 2.3 mln ) in 2006 from 3.7 mln euro ( $ 4.8 mln ) in 2005 .',
 'finnish fibers and plastic products maker suominen corporation said its net loss from continuing operations narrowed to 1.8 mln euro ( $ 2.3 mln ) in 2006 from 3.7 mln euro ( $ 4.8 mln ) in 2005 .')

#### StemTokenizer

See https://en.wikipedia.org/wiki/Stemming

In [24]:
from nltk.stem.porter import PorterStemmer

class StemTokenizer:
    # https://scikit-learn.org/stable/modules/feature_extraction.html#customizing-the-vectorizer-classes

    def __init__(self):
        self.stemmer = PorterStemmer()

    def __call__(self, text: str) -> list[str]:
        return [self.stemmer.stem(token) for token in nltk.word_tokenize(text)]

X[1], StemTokenizer()(preprocessor(X[1]))

('For 2009 , net profit was EUR 3 million and the company paid a dividend of EUR 1.30 apiece .',
 ['for',
  '2009',
  ',',
  'net',
  'profit',
  'wa',
  'eur',
  '3',
  'million',
  'and',
  'the',
  'compani',
  'paid',
  'a',
  'dividend',
  'of',
  'eur',
  '1.30',
  'apiec',
  '.'])

#### LemmaTokenizer

See https://en.wikipedia.org/wiki/Lemmatization

In [25]:
from nltk.stem.wordnet import WordNetLemmatizer

class LemmaTokenizer:
    # https://scikit-learn.org/stable/modules/feature_extraction.html#customizing-the-vectorizer-classes

    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()

    def __call__(self, text: str) -> list[str]:
        # https://github.com/slavpetrov/universal-pos-tags
        # https://www.nltk.org/api/nltk.stem.wordnet.html#nltk.stem.wordnet.WordNetLemmatizer.lemmatize

        tagged_tokens = nltk.pos_tag(nltk.word_tokenize(text), tagset="universal")
        pos_tag_map = { "VERB": 'v', "ADJ": 'a', "ADV": 'r' }
        return [
            self.lemmatizer.lemmatize(token, pos=pos_tag_map.get(pos, 'n'))
            for token, pos in tagged_tokens
        ]

X[1], LemmaTokenizer()(preprocessor(X[1]))

('For 2009 , net profit was EUR 3 million and the company paid a dividend of EUR 1.30 apiece .',
 ['for',
  '2009',
  ',',
  'net',
  'profit',
  'be',
  'eur',
  '3',
  'million',
  'and',
  'the',
  'company',
  'pay',
  'a',
  'dividend',
  'of',
  'eur',
  '1.30',
  'apiece',
  '.'])

#### Stop Words

Common words like "a" and "the" are present in most text, but do not provide any information about the sentiment of the text. Such words are called stop words. We may as well remove them. Note, however, that "not" is also in the list of stop words. So blindly removing stop words might not be a good idea (take this sentence for example: removing not negates the intended meaning).

In [26]:
from nltk.corpus import stopwords

STOPWORDS = stopwords.words('english')

'not' in STOPWORDS

True

In [27]:
print(STOPWORDS)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

We want to remove stop words but not negation words

In [28]:
negation_words = [
    "aren't", "couldn't", "didn't", "doesn't", "don't", "hadn't", "hasn't", "haven't", "isn't", "mightn't",
    "mustn't", "needn't", "shan't", "shouldn't", "wasn't", "weren't", "won't", "wouldn't", 'above', 'after',
    'against', 'ain', 'all', 'any', 'aren', 'before', 'below', 'but', 'couldn', 'didn',
    'doesn', 'don', 'down', 'few', 'from', 'further', 'hadn', 'hasn', 'haven', 'in',
    'isn', 'ma', 'mightn', 'more', 'most', 'mustn', 'needn', 'no', 'nor', 'not',
    'off', 'on', 'only', 'out', 'over', 'shan', 'shouldn', 'some', 'such', 't',
    'to', 'too', 'under', 'up', 'very', 'wasn', 'weren', 'won', 'wouldn',
]

# The additions are because stop_words need to be compatible with the tokenizers used
LemmaSTOPWORDS = [word for word in STOPWORDS if word not in negation_words] +  ["'d", "'ll", "'re", "'s", "'ve"]
StemSTOPWORDS = LemmaSTOPWORDS +  ['becau', 'becaus', 'doe', 'dure', 'ha', 'hi', 'onc', 'ourselv', 'themselv', 'thi', 'wa', 'whi', 'yourselv']

## Basic Vectorizers

Input is text but machines like numbers. Let's convert the sentences into words into numbers.

`CountVectorizer` gives us a matrix of token counts. It does not retain the order of the words. Each sentence is simply considered a bag-of-words (BOW) and nothing more.

`TfidfVectorizer` gives us the Term Frequency (TF: how many times a word appears in a document) - Inverse Document Frequency (IDF: how many documents a particular word appears in).

In [29]:
from itertools import product
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline


classifiers = [(LogisticRegression(max_iter=200, n_jobs=-1), "logisitic")]
vectorizers = [
    (CountVectorizer(), "cv"),
    (CountVectorizer(preprocessor=preprocessor), "cv+pre"),
    (CountVectorizer(tokenizer=StemTokenizer(), token_pattern=None), "cv+stem"),
    (CountVectorizer(preprocessor=preprocessor, tokenizer=StemTokenizer(), token_pattern=None), "cv+pre+stem"),
    (CountVectorizer(tokenizer=LemmaTokenizer(), token_pattern=None), "cv+lemma"),
    (CountVectorizer(preprocessor=preprocessor, tokenizer=LemmaTokenizer(), token_pattern=None), "cv+pre+lemma"),
    
    (CountVectorizer(preprocessor=preprocessor, ngram_range=(2, 2)), "cv+pre+bigram"),
    (CountVectorizer(preprocessor=preprocessor, tokenizer=LemmaTokenizer(), token_pattern=None, ngram_range=(2, 2)), "cv+pre+lemma+bigram"),

    (TfidfVectorizer(), "tfidf"),
    (TfidfVectorizer(preprocessor=preprocessor), "tfidf+pre"),
    (TfidfVectorizer(tokenizer=StemTokenizer(), token_pattern=None), "tfidf+stem"),
    (TfidfVectorizer(preprocessor=preprocessor, tokenizer=StemTokenizer(), token_pattern=None), "tfidf+pre+stem"),
    (TfidfVectorizer(tokenizer=LemmaTokenizer(), token_pattern=None), "tfidf+lemma"),
    (TfidfVectorizer(preprocessor=preprocessor, tokenizer=LemmaTokenizer(), token_pattern=None), "tfidf+pre+lemma"),
    
    (TfidfVectorizer(preprocessor=preprocessor, ngram_range=(2, 2)), "tfidf+pre+bigram"),
    (TfidfVectorizer(preprocessor=preprocessor, tokenizer=LemmaTokenizer(), token_pattern=None, ngram_range=(2, 2)), "tfidf+pre+lemma+bigram"),
]

restab = {}
for (vectorizer, vec_name), (classifier, clf_name) in product(vectorizers, classifiers):
    model = Pipeline([
        ("vectorizer", vectorizer),
        ("classifier", classifier),
    ])
    restab[f"{vec_name} / {clf_name}"] = score(model, X, y)

pd.DataFrame(restab).T.style.highlight_max(color="green", axis=0).highlight_min(color="red", axis=0)

Unnamed: 0,cv_accuracy,cv_balanced_accuracy,cv_f1_macro,cv_f1_weighted,cv_precision_weighted
cv / logisitic,0.680047,0.577015,0.58286,0.671563,0.667795
cv+pre / logisitic,0.679287,0.577971,0.583448,0.670873,0.666957
cv+stem / logisitic,0.699451,0.598691,0.604167,0.692112,0.688358
cv+pre+stem / logisitic,0.699832,0.601739,0.607477,0.692994,0.689727
cv+lemma / logisitic,0.698501,0.599306,0.605259,0.69168,0.688557
cv+pre+lemma / logisitic,0.700593,0.601723,0.60782,0.693634,0.690449
cv+pre+bigram / logisitic,0.615751,0.471664,0.472975,0.586592,0.590977
cv+pre+lemma+bigram / logisitic,0.643715,0.506603,0.508633,0.61984,0.615127
tfidf / logisitic,0.705728,0.561955,0.566866,0.675064,0.682156
tfidf+pre / logisitic,0.700783,0.559476,0.565082,0.671131,0.67816


Turns out our custom preprocessor makes the models slightly slightly worse. We are removing more than we should. But do urls and usernames really contain valuable information? Lemmatization is better than Stemming but it's more time consuming. Bigrams are a let down though. But lemmatization helps bigrams.

## Basic Classifiers

In [30]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import RidgeClassifier, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.svm import LinearSVC

classifiers = [
    # trees
    (RandomForestClassifier(n_jobs=-1), "rf"),
    (GradientBoostingClassifier(), "gbc"),
    # linear
    (RidgeClassifier(solver="sparse_cg"), "ridge_sparse"),
    (LogisticRegression(n_jobs=-1), "logistic"),
    # SVM
    (LinearSVC(), "linear_svc"),
    # Nearest neighbors
    (KNeighborsClassifier(n_jobs=-1), "kNN"),
    (NearestCentroid(), "rocchio"),
    # Bayes
    (MultinomialNB(), "multinomial_nb"),
]

vectorizers = [
    (TfidfVectorizer(tokenizer=StemTokenizer(), token_pattern=None), "tf+stem"),
    (TfidfVectorizer(tokenizer=LemmaTokenizer(), token_pattern=None), "tf+lemma"),
]

restab = {}
for (classifier, clf_name), (vectorizer, vec_name) in product(classifiers, vectorizers):
    model = Pipeline([
        ("vectorizer", vectorizer),
        ("classifier", classifier),
    ])
    restab[f"{vec_name} / {clf_name}"] = score(model, X, y)

pd.DataFrame(restab).T.style.highlight_max(color="green", axis=0).highlight_min(color="red", axis=0)

Unnamed: 0,cv_accuracy,cv_balanced_accuracy,cv_f1_macro,cv_f1_weighted,cv_precision_weighted
tf+stem / rf,0.662167,0.516634,0.516029,0.634756,0.627652
tf+lemma / rf,0.662928,0.516119,0.514607,0.634515,0.626854
tf+stem / gbc,0.709913,0.578278,0.587098,0.686592,0.687472
tf+lemma / gbc,0.711624,0.581723,0.591279,0.688979,0.690511
tf+stem / ridge_sparse,0.705538,0.590454,0.596942,0.691499,0.686424
tf+lemma / ridge_sparse,0.705729,0.589687,0.596744,0.691827,0.687117
tf+stem / logistic,0.717902,0.577215,0.583077,0.689414,0.693883
tf+lemma / logistic,0.719615,0.5806,0.588259,0.692115,0.697842
tf+stem / linear_svc,0.702684,0.598054,0.603595,0.693049,0.688135
tf+lemma / linear_svc,0.700403,0.597542,0.603185,0.69186,0.687369


Logistic regression and SVMs are promising. Naive Bayes seems to have surprisingly good weighted precision compared to the best performing models while being terrible at every other metric.

## Stop word removal

In [31]:
classifiers = [
    (LinearSVC(), "linear svc"),
    (LogisticRegression(n_jobs=-1), "logistic"),
    (RidgeClassifier(solver="sparse_cg"), "ridge_sparse"),
    (MultinomialNB(), "multinomial_nb"),
]

vectorizers = [
    (TfidfVectorizer(tokenizer=StemTokenizer(), token_pattern=None), "tf+stem"),
    (TfidfVectorizer(tokenizer=StemTokenizer(), token_pattern=None, stop_words=StemSTOPWORDS), "tf+stem+stop"),
    (TfidfVectorizer(tokenizer=LemmaTokenizer(), token_pattern=None), "tf+lemma"),
    (TfidfVectorizer(tokenizer=LemmaTokenizer(), token_pattern=None, stop_words=LemmaSTOPWORDS), "tf+lemma+stop"),
]


restab = {}
for (classifier, clf_name), (vectorizer, vec_name) in product(classifiers, vectorizers):
    model = Pipeline([
        ("vectorizer", vectorizer),
        ("classifier", classifier),
    ])
    restab[f"{vec_name} / {clf_name}"] = score(model, X, y)

pd.DataFrame(restab).T.style.highlight_max(color="green", axis=0).highlight_min(color="red", axis=0)

Unnamed: 0,cv_accuracy,cv_balanced_accuracy,cv_f1_macro,cv_f1_weighted,cv_precision_weighted
tf+stem / linear svc,0.702684,0.598054,0.603595,0.693049,0.688135
tf+stem+stop / linear svc,0.696787,0.592793,0.598419,0.687213,0.682086
tf+lemma / linear svc,0.700403,0.597542,0.603185,0.69186,0.687369
tf+lemma+stop / linear svc,0.694885,0.594965,0.600319,0.686717,0.682194
tf+stem / logistic,0.717902,0.577215,0.583077,0.689414,0.693883
tf+stem+stop / logistic,0.709533,0.567364,0.57477,0.680507,0.687407
tf+lemma / logistic,0.719615,0.5806,0.588259,0.692115,0.697842
tf+lemma+stop / logistic,0.709343,0.567246,0.57471,0.680251,0.687437
tf+stem / ridge_sparse,0.705538,0.590454,0.596942,0.691499,0.686424
tf+stem+stop / ridge_sparse,0.69964,0.586383,0.594094,0.686144,0.681981


Only Naive Bayes benefits from stop word removal except wrt weighted precision. Other models seem to dislike the lack of stop words.

## Hyperparameter tuning: RandomizedSearch

I'd like to do a full grid search but it's very expensive and there are a lot of parameters.

In [32]:
from typing import Any
from scipy.stats import uniform
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

def tune(classifier, classifier_parameters: dict[str, Any], n_iter: int = 50):
    model = Pipeline([
        ("vectorizer", TfidfVectorizer()),
        ("classifier", classifier),
    ])

    parameters = {
        "vectorizer__token_pattern": [None],
        "vectorizer__tokenizer": [StemTokenizer(), LemmaTokenizer()],
        "vectorizer__max_features": [1024, 2048, 4096, 8192, None],
    } | classifier_parameters

    clf = RandomizedSearchCV(model, parameters, n_iter=n_iter, cv=3, n_jobs=-1, scoring="accuracy", verbose=1)
    clf.fit(X, y)
    return clf.best_estimator_, clf.best_score_

In [33]:
classifier = LogisticRegression()
classifier_parameters = {
    "classifier__max_iter": [200],
    "classifier__n_jobs": [-1],
    "classifier__C": uniform(0, 10),
    "classifier__solver": ["lbfgs", "newton-cg"],
}
tune(classifier, classifier_parameters)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


(Pipeline(steps=[('vectorizer',
                  TfidfVectorizer(max_features=2048, token_pattern=None,
                                  tokenizer=<__main__.StemTokenizer object at 0x789764b8b440>)),
                 ('classifier',
                  LogisticRegression(C=np.float64(1.5367043404467962),
                                     max_iter=200, n_jobs=-1))]),
 np.float64(0.7243692265748388))

In [34]:
classifier = LogisticRegression(n_jobs=-1, max_iter=200, solver="newton-cg")
vectorizer = TfidfVectorizer(max_features=2048, tokenizer=LemmaTokenizer(), token_pattern=None)

model = Pipeline([
    ("vectorizer", vectorizer),
    ("classifier", classifier),
])

score(model, X, y, method=ScoringMethod.SUMMARY)

{'accuracy': 0.7338403041825095,
 'balanced_accuracy': 0.6070089399744572,
 'f1_macro': 0.6187656984194256,
 'f1_weighted': 0.7127198980449911,
 'precision_weighted': 0.715232009152425,
 'cohen_kappa': 0.504715838497183}

In [35]:
score(model, X, y, method=ScoringMethod.CONFUSION_MATRIX)

array([[ 26,  67,  23],
       [ 18, 376,  29],
       [  8,  65, 177]])

In [36]:
classifier = MultinomialNB()
classifier_parameters = {
    "classifier__alpha": uniform(0, 1)
}
tune(classifier, classifier_parameters)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


(Pipeline(steps=[('vectorizer',
                  TfidfVectorizer(max_features=1024, token_pattern=None,
                                  tokenizer=<__main__.LemmaTokenizer object at 0x789764bdb7a0>)),
                 ('classifier',
                  MultinomialNB(alpha=np.float64(0.4841938550112149)))]),
 np.float64(0.7000202305940415))

I feel like looking at the best accuracy is a mistake. But we can easily re-run while optimizing for a different metric. Interestingly, Naive Bayes can be tuned to make comparable, accuracy-wise.

## Deep Learning

References:
* https://www.tensorflow.org/tutorials/keras/text_classification
* https://www.tensorflow.org/text/tutorials/text_classification_rnn
* https://www.tensorflow.org/text/guide/word_embeddings

In [37]:
from matplotlib import pyplot as plt

# https://www.tensorflow.org/text/tutorials/text_classification_rnn#setup
def plot_training(history, metric: str, title: str = ""):
    plt.plot(history.history[metric])
    plt.plot(history.history['val_'+metric], '')
    plt.xlabel("Epochs")
    plt.ylabel(metric)
    plt.legend([metric, 'val_'+metric])
    plt.title(title)
    plt.show()

Nice little function, but I don't see much use. Simply looking at the training output gives us a sufficient idea.

In [38]:
from typing import Callable
from tensorflow.keras import layers, Sequential

VOCAB_SIZE = 2048

def lstm(units: int = 64,
         vocab_size: int = VOCAB_SIZE,
         standardize: str | Callable = "lower_and_strip_punctuation"):
    return Sequential([
        layers.TextVectorization(max_tokens=vocab_size, standardize=standardize),
        layers.Embedding(
            input_dim=vocab_size,
            output_dim=units,
            mask_zero=True,
        ),
        layers.Bidirectional(layers.LSTM(units)),
        layers.Dense(units, activation="relu"),
        layers.Dense(3),
    ])

2024-11-04 23:30:39.585273: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-04 23:30:39.585680: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-11-04 23:30:39.588076: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-11-04 23:30:39.595399: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1730734239.609577   58659 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1730734239.61

In [39]:
from tensorflow.math import softmax
from tensorflow.keras import losses, callbacks

def compile(model,
            loss=losses.SparseCategoricalCrossentropy(from_logits=True),
            optimizer="adam",
            metrics=["sparse_categorical_crossentropy", "accuracy"]):
    model.compile(loss=loss,
                  optimizer=optimizer,
                  metrics=metrics,
                  weighted_metrics=metrics)

def train(model,
          X_train: np.ndarray,
          y_train: np.ndarray,
          X_val: np.ndarray,
          y_val: np.ndarray,
          epochs: int,
          batch_size: int | None,
          early_stopping: bool):

    model.fit(X_train,
              y_train,
              validation_data=(X_val, y_val),
              epochs=epochs,
              batch_size=batch_size,
              #class_weight=CLASS_WEIGHTS,
              callbacks=[
                  callbacks.EarlyStopping(patience=5, restore_best_weights=True)
              ] if early_stopping else None)

def predict(model, X):
    return np.argmax(softmax(model.predict(X)), axis=1)

To combat overfitting, we use a validation dataset and early stopping. The other option is to add dropout layers.

In [40]:
def adapt_and_compile(model, X_train: np.ndarray):
    vectorizer = model.layers[0]
    if vectorizer.get_vocabulary() == ['', '[UNK]']:
        vectorizer.adapt(X_train)
    compile(model)
    return model

Write `score_dl` and `cross_validate_dl` specifically for deep learning models: they require additional setup. I attempted creating a class to be similar to `sklearn`, but it's simpler to write ad-hoc functions. Morever, we need to also tune the model. Classes do not make it easier.

In [41]:
from collections import defaultdict

def cross_validate_dl(build_model: Callable,
                      X: np.ndarray,
                      y: np.ndarray,
                      epochs: int,
                      batch_size: int | None,
                      early_stopping: bool) -> dict[str, float]:
    results = defaultdict(list)

    skf = StratifiedKFold(n_splits=3)
    for i, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        X_train, y_train = X[train_idx], y[train_idx]
        X_val, y_val = X[val_idx], y[val_idx]

        print(f"Fold {i}")
        model = adapt_and_compile(build_model(), X_train)
        history = train(model,
                        X_train,
                        y_train,
                        X_val,
                        y_val,
                        epochs=epochs,
                        batch_size=batch_size,
                        early_stopping=early_stopping)
        y_pred = predict(model, X_val)

        summary = custom_summary(y_val, y_pred)
        for metric in summary:
            results[metric].append(summary[metric])

    return {
        f"cv_{metric}": sum(results[metric]) / len(results[metric])
        for metric in results
    }

In [42]:
def score_dl(build_model: Callable,
             X: np.ndarray,
             y: np.ndarray,
             method: ScoringMethod = ScoringMethod.SUMMARY,
             scoring: str = "balanced_accuracy",
             epochs: int = 1,
             batch_size: int | None = None,
             early_stopping: bool = False) -> str | np.ndarray | dict[str, float]:

    if method == ScoringMethod.CROSS_VAL_SCORE:
        return cross_validate_dl(build_model,
                                 X,
                                 y,
                                 epochs=epochs,
                                 batch_size=batch_size,
                                 early_stopping=early_stopping)

    X_train, X_val, y_train, y_val = train_test_split(X,
                                                      y,
                                                      random_state=0,
                                                      stratify=y,
                                                      test_size=VALIDATION_SPLIT)

    model = adapt_and_compile(build_model(), X_train)
    history = train(model,
                    X_train,
                    y_train,
                    X_val,
                    y_val,
                    epochs=epochs,
                    batch_size=batch_size,
                    early_stopping=early_stopping)
    y_pred = predict(model, X_val)

    return {
        ScoringMethod.CLASSIFICATION_REPORT: classification_report,
        ScoringMethod.CONFUSION_MATRIX: confusion_matrix,
        ScoringMethod.SUMMARY: custom_summary,
    }.get(method)(y_val, y_pred)

In [43]:
score_dl(lstm, X, y, epochs=20, batch_size=100, early_stopping=True)

Epoch 1/20


2024-11-04 23:30:40.806548: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


[1m 4/45[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m0s[0m 24ms/step - accuracy: 0.3781 - loss: 1.0965 - sparse_categorical_crossentropy: 4.2113 - weighted_accuracy: 0.3781 - weighted_sparse_categorical_crossentropy: 4.2113

2024-11-04 23:30:43.998911: E tensorflow/core/util/util.cc:131] oneDNN supports DT_BOOL only on platforms with AVX-512. Falling back to the default Eigen-based implementation if present.


[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 34ms/step - accuracy: 0.5040 - loss: 1.0239 - sparse_categorical_crossentropy: 3.2662 - weighted_accuracy: 0.5040 - weighted_sparse_categorical_crossentropy: 3.2662 - val_accuracy: 0.6122 - val_loss: 0.8772 - val_sparse_categorical_crossentropy: 3.6881 - val_weighted_accuracy: 0.6122 - val_weighted_sparse_categorical_crossentropy: 3.6881
Epoch 2/20
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 25ms/step - accuracy: 0.6572 - loss: 0.8045 - sparse_categorical_crossentropy: 3.4364 - weighted_accuracy: 0.6572 - weighted_sparse_categorical_crossentropy: 3.4364 - val_accuracy: 0.6755 - val_loss: 0.7421 - val_sparse_categorical_crossentropy: 3.8094 - val_weighted_accuracy: 0.6755 - val_weighted_sparse_categorical_crossentropy: 3.8094
Epoch 3/20
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 25ms/step - accuracy: 0.7713 - loss: 0.5770 - sparse_categorical_crossentropy: 2.4290 - weighted_accuracy:

{'accuracy': 0.714828897338403,
 'balanced_accuracy': 0.6199279910871988,
 'f1_macro': 0.6323837759302812,
 'f1_weighted': 0.706612084425848,
 'precision_weighted': 0.7066984472116347,
 'cohen_kappa': 0.49044606634267496}

Comparable to traditional models. Disappointing? Does seem to have better balanced accuracy and macro-averaged f1 score.

In [44]:
score_dl(lstm, X, y, epochs=20, batch_size=100, early_stopping=True, method=ScoringMethod.CONFUSION_MATRIX)

Epoch 1/20
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 34ms/step - accuracy: 0.5106 - loss: 1.0250 - sparse_categorical_crossentropy: 5.1629 - weighted_accuracy: 0.5106 - weighted_sparse_categorical_crossentropy: 5.1629 - val_accuracy: 0.6248 - val_loss: 0.8725 - val_sparse_categorical_crossentropy: 4.5797 - val_weighted_accuracy: 0.6248 - val_weighted_sparse_categorical_crossentropy: 4.5797
Epoch 2/20
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 27ms/step - accuracy: 0.6392 - loss: 0.8285 - sparse_categorical_crossentropy: 4.0718 - weighted_accuracy: 0.6392 - weighted_sparse_categorical_crossentropy: 4.0718 - val_accuracy: 0.6806 - val_loss: 0.7420 - val_sparse_categorical_crossentropy: 4.0781 - val_weighted_accuracy: 0.6806 - val_weighted_sparse_categorical_crossentropy: 4.0781
Epoch 3/20
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 27ms/step - accuracy: 0.7564 - loss: 0.5890 - sparse_categorical_crossentropy: 3.0413 - weighte

array([[ 37,  53,  26],
       [ 34, 346,  43],
       [ 20,  58, 172]])

Need to check confusion matrices regularly so that we know the models are actually trying to learn all the classes and not just the majority ones.

In [45]:
def lstm2(units: int = 32, vocab_size: int = VOCAB_SIZE):
    return Sequential([
        layers.TextVectorization(max_tokens=vocab_size),
        layers.Embedding(
            input_dim=vocab_size,
            output_dim=units,
            mask_zero=True,
        ),
        layers.Bidirectional(layers.LSTM(units, return_sequences=True)),
        layers.Bidirectional(layers.LSTM(units)),
        layers.Dense(units, activation="relu"),
        layers.Dense(3),
    ])

In [46]:
def simple_embed(units: int = 64, vocab_size: int = VOCAB_SIZE):
    model = Sequential([
        layers.TextVectorization(max_tokens=vocab_size),
        layers.Embedding(input_dim=vocab_size, 
                         output_dim=units,
                         mask_zero=True),
        layers.GlobalAveragePooling1D(),
        layers.Dropout(0.2),
        layers.Dense(3),
    ])

    return model

In [47]:
restab = {}
for name, model in [
    ("lstm", lstm),
    ("lstm2", lstm2),
    ("simple_embed", simple_embed),
]:
    restab[name] = score_dl(model,
                            X,
                            y,
                            method=ScoringMethod.CROSS_VAL_SCORE,
                            epochs=50,
                            batch_size=100,
                            early_stopping=True,)


pd.DataFrame(restab).T.style.highlight_max(color="green", axis=0).highlight_min(color="red", axis=0)

Fold 0
Epoch 1/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 48ms/step - accuracy: 0.5211 - loss: 1.0328 - sparse_categorical_crossentropy: 4.4071 - weighted_accuracy: 0.5211 - weighted_sparse_categorical_crossentropy: 4.4071 - val_accuracy: 0.5859 - val_loss: 0.9114 - val_sparse_categorical_crossentropy: 4.0287 - val_weighted_accuracy: 0.5859 - val_weighted_sparse_categorical_crossentropy: 4.0287
Epoch 2/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 34ms/step - accuracy: 0.6349 - loss: 0.8418 - sparse_categorical_crossentropy: 3.8435 - weighted_accuracy: 0.6349 - weighted_sparse_categorical_crossentropy: 3.8435 - val_accuracy: 0.6423 - val_loss: 0.8160 - val_sparse_categorical_crossentropy: 4.1357 - val_weighted_accuracy: 0.6423 - val_weighted_sparse_categorical_crossentropy: 4.1357
Epoch 3/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 35ms/step - accuracy: 0.7208 - loss: 0.6920 - sparse_categorical_crossentropy: 3.4668 - 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 56ms/step - accuracy: 0.4740 - loss: 1.0578 - sparse_categorical_crossentropy: 6.6885 - weighted_accuracy: 0.4740 - weighted_sparse_categorical_crossentropy: 6.6885 - val_accuracy: 0.5360 - val_loss: 0.9360 - val_sparse_categorical_crossentropy: 6.4834 - val_weighted_accuracy: 0.5360 - val_weighted_sparse_categorical_crossentropy: 6.4834
Epoch 2/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 40ms/step - accuracy: 0.5696 - loss: 0.9154 - sparse_categorical_crossentropy: 5.3022 - weighted_accuracy: 0.5696 - weighted_sparse_categorical_crossentropy: 5.3022 - val_accuracy: 0.6159 - val_loss: 0.8695 - val_sparse_categorical_crossentropy: 4.1650 - val_weighted_accuracy: 0.6159 - val_weighted_sparse_categorical_crossentropy: 4.1650
Epoch 3/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 41ms/step - accuracy: 0.6792 - loss: 0.7678 - sparse_categorical_crossentropy: 4.0693 - weighted_accuracy:

Unnamed: 0,cv_accuracy,cv_balanced_accuracy,cv_f1_macro,cv_f1_weighted,cv_precision_weighted,cv_cohen_kappa
lstm,0.69831,0.60385,0.607821,0.689325,0.685393,0.464702
lstm2,0.685564,0.568468,0.54948,0.658259,0.643336,0.410696
simple_embed,0.698881,0.579058,0.589419,0.67989,0.679604,0.443532


Everytime I run the above cell, I get different results. But generally, these deep learning models are worse than traditional models in both performance and time.

In [48]:
import tensorflow as tf

def standardize(text):
    regex_url = re.escape(r"(http|www)\S+")
    regex_twitter_mentions = re.escape(r"@\w+")
    regex_ticker = re.escape(r"\$[A-Za-z]{1,5}(\.[A-Za-z]{1,3})?")
    regex_punctuation = re.escape(r"[^\w\s']+")

    replacements = [
        (regex_url, ""),
        (regex_twitter_mentions, ""),
        #(regex_ticker, ""),
        (regex_punctuation, ""),
    ] 

    text = tf.strings.lower(text)
    for regex, replacement in replacements:
        text = tf.strings.regex_replace(text, regex, replacement)

    return text

Need to re-write `preprocessor` function to take tensors as input. I would like to add type-hints to all my functions, but dealing with tf/keras is already a pain.

In [49]:
from functools import partial

restab = {}
for name, model in [
    ("lstm", lstm),
    ("lstm+standardize", partial(lstm, standardize=standardize)),
]:
    restab[name] = score_dl(model,
                            X,
                            y,
                            method=ScoringMethod.CROSS_VAL_SCORE,
                            epochs=50,
                            batch_size=100,
                            early_stopping=True,)


pd.DataFrame(restab).T.style.highlight_max(color="green", axis=0).highlight_min(color="red", axis=0)

Fold 0
Epoch 1/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 44ms/step - accuracy: 0.5270 - loss: 1.0096 - sparse_categorical_crossentropy: 5.4875 - weighted_accuracy: 0.5270 - weighted_sparse_categorical_crossentropy: 5.4875 - val_accuracy: 0.6030 - val_loss: 0.8990 - val_sparse_categorical_crossentropy: 4.6497 - val_weighted_accuracy: 0.6030 - val_weighted_sparse_categorical_crossentropy: 4.6497
Epoch 2/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 32ms/step - accuracy: 0.6493 - loss: 0.8304 - sparse_categorical_crossentropy: 4.0744 - weighted_accuracy: 0.6493 - weighted_sparse_categorical_crossentropy: 4.0744 - val_accuracy: 0.6349 - val_loss: 0.8555 - val_sparse_categorical_crossentropy: 4.8735 - val_weighted_accuracy: 0.6349 - val_weighted_sparse_categorical_crossentropy: 4.8735
Epoch 3/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 29ms/step - accuracy: 0.7132 - loss: 0.6918 - sparse_categorical_crossentropy: 3.9170 - 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 40ms/step - accuracy: 0.5222 - loss: 1.0310 - sparse_categorical_crossentropy: 5.2996 - weighted_accuracy: 0.5222 - weighted_sparse_categorical_crossentropy: 5.2996 - val_accuracy: 0.5765 - val_loss: 0.8947 - val_sparse_categorical_crossentropy: 4.4034 - val_weighted_accuracy: 0.5765 - val_weighted_sparse_categorical_crossentropy: 4.4034
Epoch 2/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 29ms/step - accuracy: 0.6185 - loss: 0.8651 - sparse_categorical_crossentropy: 4.2236 - weighted_accuracy: 0.6185 - weighted_sparse_categorical_crossentropy: 4.2236 - val_accuracy: 0.6518 - val_loss: 0.7984 - val_sparse_categorical_crossentropy: 4.3086 - val_weighted_accuracy: 0.6518 - val_weighted_sparse_categorical_crossentropy: 4.3086
Epoch 3/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 29ms/step - accuracy: 0.6909 - loss: 0.7045 - sparse_categorical_crossentropy: 4.1593 - weighted_accuracy:

Unnamed: 0,cv_accuracy,cv_balanced_accuracy,cv_f1_macro,cv_f1_weighted,cv_precision_weighted,cv_cohen_kappa
lstm,0.683473,0.600084,0.579975,0.66922,0.661498,0.433781
lstm+standardize,0.712005,0.574467,0.569813,0.680743,0.701341,0.43945


Once again, the above cell gives different results everytime.

## Keras Tuner

In [50]:
import keras_tuner
import random
import shutil

def choice_lstm2(hp):
    model = Sequential([
        layers.Embedding(
            VOCAB_SIZE,
            hp.Choice("output_dim", [8, 16, 32, 64, 128]),
            mask_zero=True,
        ),
        layers.Bidirectional(layers.LSTM(
            hp.Choice("units_1", [8, 16, 32, 64, 128]),
            return_sequences=True,
        )),
        layers.Bidirectional(layers.LSTM(
            hp.Choice("units_2", [8, 16, 32, 64, 128]),
        )),
        layers.Dense(
            hp.Choice("units_3", [8, 16, 32, 64, 128]),
            activation="relu"
        ),
        layers.Dense(3),
    ])
    compile(model)
    return model

def tuning_lstm(choice_model: Callable,
                X: np.ndarray,
                y: np.ndarray):
    shutil.rmtree("untitled_project")
    X_train, X_val, y_train, y_val = train_test_split(X,
                                                      y,
                                                      random_state=0,
                                                      stratify=y,
                                                      test_size=VALIDATION_SPLIT)

    vectorizer = layers.TextVectorization(max_tokens=VOCAB_SIZE,
                                          standardize=standardize)
    vectorizer.adapt(X_train)
    X_train, X_val = vectorizer(X_train), vectorizer(X_val)

    tuner = keras_tuner.BayesianOptimization(
        choice_model,
        objective="val_loss",
    )
    tuner.search(X_train,
                 y_train,
                 epochs=50,
                 batch_size=100,
                 validation_data=(X_val, y_val),
                 callbacks=[
                     callbacks.EarlyStopping(patience=5, restore_best_weights=True)
                 ])

    return tuner.get_best_models()[0]

model = tuning_lstm(choice_lstm2, X, y)
model.summary()

Trial 10 Complete [00h 00m 34s]
val_loss: 0.678244948387146

Best val_loss So Far: 0.6639225482940674
Total elapsed time: 00h 06m 38s


  saveable.load_own_variables(weights_store.get(inner_path))


Once again, different runs give different results. Is there a `random_state` I can set somewhere!?

In [51]:
def best():
    return Sequential([
        layers.TextVectorization(max_tokens=VOCAB_SIZE, standardize=standardize),
        layers.Embedding(
            input_dim=VOCAB_SIZE,
            output_dim=64,
            mask_zero=True,
        ),
        layers.Bidirectional(layers.LSTM(32, return_sequences=True)),
        layers.Bidirectional(layers.LSTM(16)),
        layers.Dense(64, activation="relu"),
        layers.Dense(3),
    ])
    
score_dl(best, X, y, epochs=50, batch_size=100, early_stopping=True, method=ScoringMethod.CROSS_VAL_SCORE)

Fold 0
Epoch 1/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 105ms/step - accuracy: 0.5137 - loss: 1.0335 - sparse_categorical_crossentropy: 2.9957 - weighted_accuracy: 0.5137 - weighted_sparse_categorical_crossentropy: 2.9957 - val_accuracy: 0.5722 - val_loss: 0.9095 - val_sparse_categorical_crossentropy: 3.1265 - val_weighted_accuracy: 0.5722 - val_weighted_sparse_categorical_crossentropy: 3.1265
Epoch 2/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 74ms/step - accuracy: 0.6320 - loss: 0.8601 - sparse_categorical_crossentropy: 3.2723 - weighted_accuracy: 0.6320 - weighted_sparse_categorical_crossentropy: 3.2723 - val_accuracy: 0.6241 - val_loss: 0.8261 - val_sparse_categorical_crossentropy: 4.1753 - val_weighted_accuracy: 0.6241 - val_weighted_sparse_categorical_crossentropy: 4.1753
Epoch 3/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 75ms/step - accuracy: 0.7196 - loss: 0.6908 - sparse_categorical_crossentropy: 3.5340 

{'cv_accuracy': 0.7036327591491341,
 'cv_balanced_accuracy': 0.5904466601216283,
 'cv_f1_macro': 0.5848323384746731,
 'cv_f1_weighted': 0.6829437337372389,
 'cv_precision_weighted': 0.694807263059649,
 'cv_cohen_kappa': 0.43779264625472347}

Definitely some improvements in macro-averaged f1 and balanced accuracy, so I'll take it.

In [52]:
score_dl(best, X, y, epochs=50, batch_size=100, early_stopping=True, method=ScoringMethod.CONFUSION_MATRIX)

Epoch 1/50
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 60ms/step - accuracy: 0.5309 - loss: 1.0236 - sparse_categorical_crossentropy: 6.0196 - weighted_accuracy: 0.5309 - weighted_sparse_categorical_crossentropy: 6.0196 - val_accuracy: 0.6122 - val_loss: 0.8801 - val_sparse_categorical_crossentropy: 5.3382 - val_weighted_accuracy: 0.6122 - val_weighted_sparse_categorical_crossentropy: 5.3382
Epoch 2/50
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 45ms/step - accuracy: 0.6550 - loss: 0.8160 - sparse_categorical_crossentropy: 4.8317 - weighted_accuracy: 0.6550 - weighted_sparse_categorical_crossentropy: 4.8317 - val_accuracy: 0.6793 - val_loss: 0.7566 - val_sparse_categorical_crossentropy: 4.0915 - val_weighted_accuracy: 0.6793 - val_weighted_sparse_categorical_crossentropy: 4.0915
Epoch 3/50
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 45ms/step - accuracy: 0.7315 - loss: 0.6502 - sparse_categorical_crossentropy: 3.4130 - weighte

array([[ 41,  55,  20],
       [ 29, 368,  26],
       [ 27,  65, 158]])

In [53]:
score_dl(partial(lstm, standardize=standardize),
         X,
         y,
         method=ScoringMethod.CONFUSION_MATRIX,
         epochs=50,
         batch_size=100,
         early_stopping=True,)

Epoch 1/50
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 46ms/step - accuracy: 0.5351 - loss: 1.0083 - sparse_categorical_crossentropy: 5.5748 - weighted_accuracy: 0.5351 - weighted_sparse_categorical_crossentropy: 5.5748 - val_accuracy: 0.6388 - val_loss: 0.8381 - val_sparse_categorical_crossentropy: 4.9881 - val_weighted_accuracy: 0.6388 - val_weighted_sparse_categorical_crossentropy: 4.9881
Epoch 2/50
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 32ms/step - accuracy: 0.6720 - loss: 0.7751 - sparse_categorical_crossentropy: 4.4008 - weighted_accuracy: 0.6720 - weighted_sparse_categorical_crossentropy: 4.4008 - val_accuracy: 0.6984 - val_loss: 0.7260 - val_sparse_categorical_crossentropy: 3.7418 - val_weighted_accuracy: 0.6984 - val_weighted_sparse_categorical_crossentropy: 3.7418
Epoch 3/50
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 33ms/step - accuracy: 0.7550 - loss: 0.5947 - sparse_categorical_crossentropy: 2.9020 - weighte

array([[ 28,  61,  27],
       [ 20, 361,  42],
       [ 20,  60, 170]])

## Pre-trained Embeddings

Embeddings make all the difference right? https://en.wikipedia.org/wiki/Word_embedding

References:
* https://keras.io/examples/nlp/text_classification_from_scratch/
* https://keras.io/examples/nlp/tweet-classification-using-tfdf/
* https://keras.io/examples/nlp/pretrained_word_embeddings/

In [54]:
import numpy as np

EMBEDDING_DIM = 300

def get_embedding_index(embedding_dim: int = EMBEDDING_DIM):
    glove_file = f"glove.6B.{embedding_dim}d.txt"
    with open(glove_file) as f:
        def word_coeff():
            for line in f:
                line = line.split()
                yield line[0], np.array(line[1:])
        return {word: coeff for word, coeff in word_coeff()}

In [55]:
def build_embedding_weights(vocabulary: list[str],
                            embedding_index: dict[str, np.ndarray]):
    vocab_size = len(vocabulary)
    embedding_dim = len(embedding_index["the"])
    embedding_matrix = np.zeros((vocab_size + 2, embedding_dim))

    known_words_count = 0
    for index, word in enumerate(vocabulary):
        if word in embedding_index:
            embedding_matrix[index] = embedding_index[word]
            known_words_count += 1

    print(f"Hits/Misses: {known_words_count}/{vocab_size - known_words_count}")
    return embedding_matrix

In [56]:
VOCAB_SIZE = 20000
OUTPUT_SEQUENCE_LENGTH = 200

vectorization_layer = layers.TextVectorization(
    max_tokens=VOCAB_SIZE,
    output_sequence_length=OUTPUT_SEQUENCE_LENGTH,
    standardize=standardize
)
vectorization_layer.adapt(X)

In [57]:
vlayer = layers.TextVectorization(
    max_tokens=VOCAB_SIZE,
    output_sequence_length=OUTPUT_SEQUENCE_LENGTH,
    standardize=standardize
)
vlayer.get_vocabulary()

['', '[UNK]']

In [58]:
embedding_index = get_embedding_index(EMBEDDING_DIM)
embedding_weights = build_embedding_weights(vectorization_layer.get_vocabulary(),
                                            embedding_index)

Hits/Misses: 9204/2840


In [59]:
embedding_layer = layers.Embedding(
    input_dim=len(vectorization_layer.get_vocabulary()) + 2,
    output_dim=EMBEDDING_DIM,
    trainable=False,
)
embedding_layer.build((1, ))
embedding_layer.set_weights([embedding_weights])

In [60]:
def pretrained_embed_lstm():
    return Sequential([
        vectorization_layer,
        embedding_layer,
        layers.Bidirectional(layers.LSTM(32, return_sequences=True)),
        layers.Bidirectional(layers.LSTM(32)),
        layers.Dense(16, activation="relu"),
        layers.Dense(3),
    ])

In [61]:
model = pretrained_embed_lstm()
compile(model)
model.summary()

In [62]:
score_dl(pretrained_embed_lstm, X, y, epochs=50, batch_size=100, early_stopping=True)

Epoch 1/50
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 139ms/step - accuracy: 0.4879 - loss: 1.0163 - sparse_categorical_crossentropy: 3.7704 - weighted_accuracy: 0.4879 - weighted_sparse_categorical_crossentropy: 3.7704 - val_accuracy: 0.6464 - val_loss: 0.8287 - val_sparse_categorical_crossentropy: 1.2858 - val_weighted_accuracy: 0.6464 - val_weighted_sparse_categorical_crossentropy: 1.2858
Epoch 2/50
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 130ms/step - accuracy: 0.6481 - loss: 0.8148 - sparse_categorical_crossentropy: 1.7472 - weighted_accuracy: 0.6481 - weighted_sparse_categorical_crossentropy: 1.7472 - val_accuracy: 0.6831 - val_loss: 0.7517 - val_sparse_categorical_crossentropy: 1.3835 - val_weighted_accuracy: 0.6831 - val_weighted_sparse_categorical_crossentropy: 1.3835
Epoch 3/50
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 134ms/step - accuracy: 0.6649 - loss: 0.7647 - sparse_categorical_crossentropy: 1.4304 - wei

{'accuracy': 0.7427122940430925,
 'balanced_accuracy': 0.6443824352599113,
 'f1_macro': 0.6475984589461687,
 'f1_weighted': 0.732140216769749,
 'precision_weighted': 0.727849753330259,
 'cohen_kappa': 0.5276024046535399}

Definitely better than the self-trained one! Let's tune further

In [63]:
import keras_tuner
import random

def choice_lstm3(hp):
    model = Sequential([
        embedding_layer,
        layers.Bidirectional(layers.LSTM(
            hp.Choice("units_1", [8, 16, 32, 64, 128]),
            return_sequences=True,
        )),
        layers.Bidirectional(layers.LSTM(
            hp.Choice("units_2", [8, 16, 32, 64, 128]),
        )),
        layers.Dense(
            hp.Choice("units_3", [8, 16, 32, 64, 128]),
            activation="relu"
        ),
        layers.Dense(3),
    ])
    compile(model)
    return model

def tuning_lstm_pretrained_embeds(choice_model: Callable,
                                  X: np.ndarray,
                                  y: np.ndarray):
    shutil.rmtree("untitled_project")
    X_train, X_val, y_train, y_val = train_test_split(X,
                                                      y,
                                                      random_state=0,
                                                      stratify=y,
                                                      test_size=VALIDATION_SPLIT)

    X_train, X_val = vectorization_layer(X_train), vectorization_layer(X_val)

    tuner = keras_tuner.BayesianOptimization(
        choice_model,
        objective="val_loss",
    )
    tuner.search(X_train,
                 y_train,
                 epochs=50,
                 batch_size=100,
                 validation_data=(X_val, y_val),
                 callbacks=[
                     callbacks.EarlyStopping(patience=5, restore_best_weights=True)
                 ])

    return tuner.get_best_models()[0]

model = tuning_lstm_pretrained_embeds(choice_lstm3, X, y)
model.summary()

Trial 10 Complete [00h 01m 30s]
val_loss: 0.6102324724197388

Best val_loss So Far: 0.5960491299629211
Total elapsed time: 00h 19m 57s


  saveable.load_own_variables(weights_store.get(inner_path))


In [64]:
def best_pretrained_embed():
    return Sequential([
        vectorization_layer,
        embedding_layer,
        layers.Bidirectional(layers.LSTM(64, return_sequences=True)),
        layers.Bidirectional(layers.LSTM(256)),
        layers.Dense(64, activation="relu"),
        layers.Dense(3),
    ])

In [65]:
score_dl(best_pretrained_embed, X, y, epochs=50, batch_size=100, early_stopping=True, method=ScoringMethod.SUMMARY)

Epoch 1/50
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 714ms/step - accuracy: 0.5689 - loss: 0.9349 - sparse_categorical_crossentropy: 3.6247 - weighted_accuracy: 0.5689 - weighted_sparse_categorical_crossentropy: 3.6247 - val_accuracy: 0.6603 - val_loss: 0.7663 - val_sparse_categorical_crossentropy: 2.7869 - val_weighted_accuracy: 0.6603 - val_weighted_sparse_categorical_crossentropy: 2.7869
Epoch 2/50
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 828ms/step - accuracy: 0.6692 - loss: 0.7369 - sparse_categorical_crossentropy: 2.9085 - weighted_accuracy: 0.6692 - weighted_sparse_categorical_crossentropy: 2.9085 - val_accuracy: 0.7275 - val_loss: 0.6567 - val_sparse_categorical_crossentropy: 1.9953 - val_weighted_accuracy: 0.7275 - val_weighted_sparse_categorical_crossentropy: 1.9953
Epoch 3/50
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 848ms/step - accuracy: 0.7406 - loss: 0.6101 - sparse_categorical_crossentropy: 1.9217 - w

{'accuracy': 0.7376425855513308,
 'balanced_accuracy': 0.6267940545093883,
 'f1_macro': 0.6369024629835923,
 'f1_weighted': 0.7223103536859881,
 'precision_weighted': 0.7205334311572524,
 'cohen_kappa': 0.5123793908053226}

Seems great! Finally, let's try our best models on the test dataset.

## Test dataset

In [66]:
restab = {}

In [67]:
model = best_pretrained_embed()
compile(model)
model.fit(X,
          y,
          validation_split=TEST_SPLIT,
          epochs=50,
          batch_size=100,
          callbacks=[
              callbacks.EarlyStopping(patience=5, restore_best_weights=True)
          ])

y_pred = predict(model, X_test)

restab["pretrained_embed_lstm"] = custom_summary(y_test, y_pred)

Epoch 1/50
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 833ms/step - accuracy: 0.5547 - loss: 0.9281 - sparse_categorical_crossentropy: 4.7888 - weighted_accuracy: 0.5547 - weighted_sparse_categorical_crossentropy: 4.7888 - val_accuracy: 0.6369 - val_loss: 0.8183 - val_sparse_categorical_crossentropy: 3.2209 - val_weighted_accuracy: 0.6369 - val_weighted_sparse_categorical_crossentropy: 3.2209
Epoch 2/50
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 850ms/step - accuracy: 0.6707 - loss: 0.7180 - sparse_categorical_crossentropy: 2.8622 - weighted_accuracy: 0.6707 - weighted_sparse_categorical_crossentropy: 2.8622 - val_accuracy: 0.7072 - val_loss: 0.7073 - val_sparse_categorical_crossentropy: 2.4213 - val_weighted_accuracy: 0.7072 - val_weighted_sparse_categorical_crossentropy: 2.4213
Epoch 3/50
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 849ms/step - accuracy: 0.7466 - loss: 0.5959 - sparse_categorical_crossentropy: 1.8440 - w

In [68]:
model = best()
adapt_and_compile(model, X)
model.fit(X,
          y,
          validation_split=TEST_SPLIT,
          epochs=50,
          batch_size=100,
          callbacks=[
              callbacks.EarlyStopping(patience=5, restore_best_weights=True)
          ])

y_pred = predict(model, X_test)
restab["lstm2"] = custom_summary(y_test, y_pred)

Epoch 1/50
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 73ms/step - accuracy: 0.5158 - loss: 1.0226 - sparse_categorical_crossentropy: 2.7890 - weighted_accuracy: 0.5158 - weighted_sparse_categorical_crossentropy: 2.7890 - val_accuracy: 0.6008 - val_loss: 0.8687 - val_sparse_categorical_crossentropy: 2.7941 - val_weighted_accuracy: 0.6008 - val_weighted_sparse_categorical_crossentropy: 2.7941
Epoch 2/50
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 57ms/step - accuracy: 0.6526 - loss: 0.8000 - sparse_categorical_crossentropy: 2.8663 - weighted_accuracy: 0.6526 - weighted_sparse_categorical_crossentropy: 2.8663 - val_accuracy: 0.6787 - val_loss: 0.7812 - val_sparse_categorical_crossentropy: 3.7264 - val_weighted_accuracy: 0.6787 - val_weighted_sparse_categorical_crossentropy: 3.7264
Epoch 3/50
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 57ms/step - accuracy: 0.8312 - loss: 0.4432 - sparse_categorical_crossentropy: 1.6605 - weight

In [69]:
classifier = LogisticRegression(n_jobs=-1, max_iter=200, solver="newton-cg")
vectorizer = TfidfVectorizer(max_features=2048, tokenizer=LemmaTokenizer(), token_pattern=None)

model = Pipeline([
    ("vectorizer", vectorizer),
    ("classifier", classifier),
])

model.fit(X, y)
y_pred = model.predict(X_test)

restab["logistic"] = custom_summary(y_test, y_pred)

In [70]:
model = Pipeline([
    ('vectorizer', vectorizer),
    ('classifier', MultinomialNB(alpha=0.5))
])

model.fit(X, y)
y_pred = model.predict(X_test)

restab["multi_nb"] = custom_summary(y_test, y_pred)

In [71]:
pd.DataFrame(restab).T.style.highlight_max(color="green", axis=0).highlight_min(color="red", axis=0)

Unnamed: 0,accuracy,balanced_accuracy,f1_macro,f1_weighted,precision_weighted,cohen_kappa
pretrained_embed_lstm,0.705983,0.660225,0.656909,0.707352,0.708962,0.515659
lstm2,0.690598,0.614824,0.615526,0.690833,0.691145,0.472087
logistic,0.707692,0.595085,0.605717,0.695234,0.693474,0.478399
multi_nb,0.683761,0.564276,0.57607,0.666134,0.667053,0.408838


Where are the medals?