In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from nltk.tokenize import TweetTokenizer
nltk.download('punkt')
nltk.download('wordnet')
from xgboost import XGBClassifier
import wandb
import string
import os

In [None]:
# Seed passed as `random_state` to the estimators created further down.
random_state = 8

In [None]:
def get_df(dataset_name, dropna=True, fillna_value=''):
    """Load `data/<dataset_name>.csv` and prepare it for modelling.

    Args:
        dataset_name: File stem of the CSV inside the `data/` directory.
        dropna: When True, rows containing any missing value are dropped;
            otherwise missing values are replaced by `fillna_value`.
        fillna_value: Replacement for missing values when `dropna` is False.

    Returns:
        A DataFrame without the 'ID' column and with 'Rating' shifted down by
        one so that the labels start at 0.
    """
    raw = pd.read_csv(f"data/{dataset_name}.csv")
    without_gaps = raw.dropna() if dropna else raw.fillna(fillna_value)

    return (
        without_gaps
        .drop(columns=['ID'])  # remove useless data
        .assign(Rating=lambda frame: frame['Rating'] - 1)  # make y 0-indexed
    )

In [None]:
# Training rows with any missing value are dropped; for the validation and test
# splits missing values are filled with '' (get_df's default) instead of
# dropping the rows.
df_train = get_df('train', dropna=True)
df_valid = get_df('valid', dropna=False)
df_test = get_df('test', dropna=False)

In [None]:
df_train.head(3)

In [None]:
df_train.info()

In [None]:
df_train['Rating'].hist()

In [None]:
# stopwords.words() needs the 'stopwords' corpus; fetch it here so the notebook
# also runs on a machine where it has not been downloaded yet.
nltk.download('stopwords')

tokenizer = TweetTokenizer()

# These words stay in the text even though NLTK lists them as stop words
# (presumably because negations/directions matter for sentiment).
keep_stop_words = ['not', 'no', 'against', 'above', 'below']
stop_words = set(stopwords.words("english")) - set(keep_stop_words)

# Every ASCII punctuation character except '!' is stripped from the text.
punctuation_to_remove = set(string.punctuation) - {'!'}

lemmatizer = WordNetLemmatizer()

def tokenize_text(text: str):
    """Normalise one piece of review text into a space-joined token string.

    Steps, in order: lower-case, expand "n't" to " not", strip punctuation
    (except '!'), strip digits, tokenize with the module-level `tokenizer`,
    drop tokens contained in `stop_words`, lemmatize each remaining token.

    Args:
        text: The raw text.

    Returns:
        The processed tokens joined by single spaces.

    Raises:
        ValueError: If `text` is not a `str` (e.g. a NaN from pandas).
    """
    if not isinstance(text, str):
        raise ValueError(f"The text {text} isn't a string!")

    # Lower-case first so that upper-case contractions such as "DON'T" are
    # expanded as well instead of collapsing to "dont" and losing the negation.
    text = text.lower()
    # Expand the contraction suffix "n't" into " not" (e.g. "don't" -> "do not")
    text = re.sub(r"n\'t", " not", text)
    # Remove punctuation except '!'
    text = ''.join(char for char in text if char not in punctuation_to_remove)
    # Remove numbers
    text = re.sub(r"\d", "", text)

    word_tokens = tokenizer.tokenize(text)
    kept_tokens = [w for w in word_tokens if w not in stop_words]

    return ' '.join(lemmatizer.lemmatize(w) for w in kept_tokens)

In [None]:
def get_tokenized_X(X):
    """Run `tokenize_text` over every document in `X` and return the results as a list."""
    return list(map(tokenize_text, X))

In [None]:
def get_tfidf_vectorizer(X_train, ngram_range=(1, 1), max_features=None):
    """Fit a `TfidfVectorizer` on the tokenized training documents.

    Args:
        X_train: Iterable of training documents (strings).
        ngram_range: Passed through to `TfidfVectorizer`.
        max_features: Passed through to `TfidfVectorizer`.

    Returns:
        The fitted vectorizer.
    """
    # settings inspired by https://www.linkedin.com/pulse/another-twitter-sentiment-analysis-python-part-5-tfidf-ricky-kim/
    # TODO consider options: max_df=0.9, min_df=2
    training_corpus = get_tokenized_X(X_train)
    return TfidfVectorizer(ngram_range=ngram_range, max_features=max_features).fit(training_corpus)

In [None]:
def get_count_vectorizer(X_train, ngram_range=(1, 1)):
    """Fit a `CountVectorizer` with the given n-gram range on the tokenized training documents and return it."""
    training_corpus = get_tokenized_X(X_train)
    return CountVectorizer(ngram_range=ngram_range).fit(training_corpus)

In [None]:
def get_vectorized_X(X, vectorizer):
    """Tokenize the documents in `X`, transform them with the fitted `vectorizer` and return the result as a dense array."""
    return vectorizer.transform(get_tokenized_X(X)).toarray()

In [None]:
def get_df_tokenizer_applied(df, columns):
    """Return a copy of `df` in which each of `columns` has been passed through `tokenize_text`.

    The input frame is left untouched; a progress line is printed per column.
    """
    tokenized_frame = df.copy()

    for column in columns:
        print(f"Applying tokenize_text() to column {column}")
        processed_column = tokenized_frame[column].apply(tokenize_text)
        tokenized_frame[column] = processed_column

    return tokenized_frame

In [None]:
# Tokenize the two free-text columns of every split.
df_train_pp = get_df_tokenizer_applied(df_train, ['Title', 'Review Text'])
df_valid_pp = get_df_tokenizer_applied(df_valid, ['Title', 'Review Text'])
df_test_pp = get_df_tokenizer_applied(df_test, ['Title', 'Review Text'])

In [None]:
df_train_pp.head(3)

---

In [None]:
def get_ohe_for_df_column(df_column):
    """Fit and return a dense-output `OneHotEncoder` for a single DataFrame column.

    The encoder is created with `handle_unknown='ignore'`.
    """
    # The encoder expects a 2-D input, hence the reshape into a single column.
    column_as_2d = df_column.to_numpy().reshape(-1, 1)
    return OneHotEncoder(handle_unknown='ignore', sparse_output=False).fit(column_as_2d)

In [None]:
# One encoder per categorical column, each fitted on the training split only.
ohe_divisionname = get_ohe_for_df_column(df_train_pp['Division Name'])
ohe_departmentname = get_ohe_for_df_column(df_train_pp['Department Name'])
ohe_classname = get_ohe_for_df_column(df_train_pp['Class Name'])

In [None]:
def get_df_with_ohe_column(df, ohe_column_name, ohe):
    """Replace one categorical column of `df` by its one-hot encoded columns.

    Args:
        df: Input frame; it is not modified.
        ohe_column_name: Name of the column to encode.
        ohe: Fitted encoder providing `transform` and `get_feature_names_out`.

    Returns:
        A new frame with a fresh 0..n-1 index, without `ohe_column_name`, and
        with the integer indicator columns appended at the end.
    """
    raw_values = df[ohe_column_name].to_numpy().reshape(-1, 1)
    indicator_names = ohe.get_feature_names_out(input_features=[ohe_column_name])
    indicators = pd.DataFrame(ohe.transform(raw_values).tolist(), columns=indicator_names, dtype=int)

    # reset_index() is necessary here: https://stackoverflow.com/questions/50368145/pandas-concat-increases-number-of-rows
    # drop=True as well: https://stackoverflow.com/questions/12203901/pandas-crashes-on-repeated-dataframe-reset-index
    realigned = df.reset_index(drop=True)
    combined = pd.concat([realigned, indicators], axis=1)

    return combined.drop([ohe_column_name], axis=1)

In [None]:
def get_df_with_ohe_columns(df):
    """One-hot encode 'Division Name', 'Department Name' and 'Class Name' using the module-level fitted encoders."""
    encoder_by_column = (
        ('Division Name', ohe_divisionname),
        ('Department Name', ohe_departmentname),
        ('Class Name', ohe_classname),
    )

    for column_name, encoder in encoder_by_column:
        df = get_df_with_ohe_column(df, column_name, encoder)

    return df

In [None]:
# Toggle: True replaces the three categorical columns by one-hot columns,
# False removes them entirely.
do_categorical_value_ohe = True

# Note: despite the `_wo_cat` suffix, the frames still carry the categorical
# information (as one-hot columns) when the flag above is True.
if (do_categorical_value_ohe):
    df_train_wo_cat = get_df_with_ohe_columns(df_train_pp)
    df_valid_wo_cat = get_df_with_ohe_columns(df_valid_pp)
    df_test_wo_cat = get_df_with_ohe_columns(df_test_pp)
else:
    df_train_wo_cat = df_train_pp.drop(['Division Name', 'Department Name', 'Class Name'], axis=1)
    df_valid_wo_cat = df_valid_pp.drop(['Division Name', 'Department Name', 'Class Name'], axis=1)
    df_test_wo_cat = df_test_pp.drop(['Division Name', 'Department Name', 'Class Name'], axis=1)

df_train_wo_cat.head(3)

---

In [None]:
# title_vectorizer = get_tfidf_vectorizer(df_train_wo_cat['Title'], (1, 3)) # 0.618
# title_vectorizer = get_tfidf_vectorizer(df_train_wo_cat['Title'], (1, 3), 10000) # 0.622
# title_vectorizer = get_tfidf_vectorizer(df_train_wo_cat['Title'], (1, 3), 7000) # 0.619
# title_vectorizer = get_tfidf_vectorizer(df_train_wo_cat['Title'], (1, 3), 4000) # 0.617
# title_vectorizer = get_tfidf_vectorizer(df_train_wo_cat['Title'], (1, 3), 1000) # 0.615
# title_vectorizer = get_tfidf_vectorizer(df_train_wo_cat['Title'], (1, 3), 15000) # 0.617
# title_vectorizer = get_tfidf_vectorizer(df_train_wo_cat['Title'], (1, 3), 20000) # 0.617
# title_vectorizer = get_tfidf_vectorizer(df_train_wo_cat['Title'], (1, 3), 8500) # 0.623
# title_vectorizer = get_tfidf_vectorizer(df_train_wo_cat['Title'], (1, 3), 9000) # 0.622
# title_vectorizer = get_tfidf_vectorizer(df_train_wo_cat['Title'], (1, 3), 8000) # 0.623
# title_vectorizer = get_tfidf_vectorizer(df_train_wo_cat['Title'], (1, 3), 7500) # 0.622
# title_vectorizer = get_tfidf_vectorizer(df_train_wo_cat['Title'], (1, 3), 8250) # 0.622
# Chosen setting: 1- to 3-gram TF-IDF capped at 8000 features (0.623 in the runs logged above).
title_vectorizer = get_tfidf_vectorizer(df_train_wo_cat['Title'], (1, 3), 8000)

# reviewtext_vectorizer = get_tfidf_vectorizer(df_train_wo_cat['Review Text'], (1, 2), 8000) # 0.636
# reviewtext_vectorizer = get_tfidf_vectorizer(df_train_wo_cat['Review Text'], (1, 2), 5000) # 0.632
# reviewtext_vectorizer = get_tfidf_vectorizer(df_train_wo_cat['Review Text'], (1, 2), 3000) # 0.635
# reviewtext_vectorizer = get_tfidf_vectorizer(df_train_wo_cat['Review Text'], (1, 2), 1000) # 0.635
# reviewtext_vectorizer = get_tfidf_vectorizer(df_train_wo_cat['Review Text'], (1, 2), 500) # 0.627
# reviewtext_vectorizer = get_tfidf_vectorizer(df_train_wo_cat['Review Text'], (1, 2), 10000) # 0.639
# reviewtext_vectorizer = get_tfidf_vectorizer(df_train_wo_cat['Review Text'], (1, 2), 15000) # 0.640
# reviewtext_vectorizer = get_tfidf_vectorizer(df_train_wo_cat['Review Text'], (1, 2), 20000) # 0.639
# reviewtext_vectorizer = get_tfidf_vectorizer(df_train_wo_cat['Review Text'], (1, 2), 25000) # 0.639
# reviewtext_vectorizer = get_tfidf_vectorizer(df_train_wo_cat['Review Text'], (1, 2), 17500) # 0.640
# reviewtext_vectorizer = get_tfidf_vectorizer(df_train_wo_cat['Review Text'], (1, 2), 12500) # 0.641
# reviewtext_vectorizer = get_tfidf_vectorizer(df_train_wo_cat['Review Text'], (1, 2), 14000) # 0.642
# reviewtext_vectorizer = get_tfidf_vectorizer(df_train_wo_cat['Review Text'], (1, 2), 14500) # 0.640
# reviewtext_vectorizer = get_tfidf_vectorizer(df_train_wo_cat['Review Text'], (1, 2), 13000) # 0.643
# reviewtext_vectorizer = get_tfidf_vectorizer(df_train_wo_cat['Review Text'], (1, 2), 13500) # 0.640
# Chosen setting: 1- to 2-gram TF-IDF capped at 13000 features (0.643 in the runs logged above).
reviewtext_vectorizer = get_tfidf_vectorizer(df_train_wo_cat['Review Text'], (1, 2), 13000)

In [None]:
# NOTE(review): these columns were already run through tokenize_text by
# get_df_tokenizer_applied, and get_vectorized_X tokenizes them a second time.
# preprocess_df later transforms the same columns without that second pass --
# confirm both paths produce identical features.
X_train_title = get_vectorized_X(df_train_wo_cat['Title'], title_vectorizer)
X_valid_title = get_vectorized_X(df_valid_wo_cat['Title'], title_vectorizer)

X_train_reviewtext = get_vectorized_X(df_train_wo_cat['Review Text'], reviewtext_vectorizer)
X_valid_reviewtext = get_vectorized_X(df_valid_wo_cat['Review Text'], reviewtext_vectorizer)

In [None]:
# Both text models use the same Rating labels; the separate names only mirror
# the two feature matrices.
y_train_title = df_train_wo_cat['Rating'].tolist()
y_valid_title = df_valid_wo_cat['Rating'].tolist()

y_train_reviewtext = df_train_wo_cat['Rating'].tolist()
y_valid_reviewtext = df_valid_wo_cat['Rating'].tolist()

In [None]:
print(len(X_train_title), len(y_train_title))
print(len(X_valid_title), len(y_valid_title))

In [None]:
# nb_title = MultinomialNB() # Accuracy:  0.5936967632027257
# nb_title.fit(X_train_title, y_train_title)
# y_pred_title = nb_title.predict(X_valid_title)
# accuracy = accuracy_score(y_valid_title, y_pred_title)
# print("Naive Bayes 'Title Sentiment' Accuracy: ", accuracy)

In [None]:
# Logistic regression on the title TF-IDF features, scored on the validation split.
logreg_title = LogisticRegression(random_state=random_state, solver="saga", max_iter=200) # Accuracy:  0.623
logreg_title.fit(X_train_title, y_train_title)
y_pred_title = logreg_title.predict(X_valid_title)
accuracy = accuracy_score(y_valid_title, y_pred_title)
print("Logistic Regression 'Title Sentiment' Accuracy: ", accuracy)

In [None]:
# nb_reviewtext = MultinomialNB() # Accuracy:  0.608
# nb_reviewtext.fit(X_train_reviewtext, y_train_reviewtext)
# y_pred_reviewtext = nb_reviewtext.predict(X_valid_reviewtext)
# accuracy = accuracy_score(y_valid_reviewtext, y_pred_reviewtext)
# print("Naive Bayes 'Review Text' Accuracy: ", accuracy)

In [None]:
# xgb_reviewtext = XGBClassifier(random_state=random_state, objective='multi:softmax') # Accuracy:  0.615 (3.5m runtime) with 8000 features
# xgb_reviewtext.fit(X_train_reviewtext, y_train_reviewtext)
# y_pred_reviewtext = xgb_reviewtext.predict(X_valid_reviewtext)
# accuracy = accuracy_score(y_valid_reviewtext, y_pred_reviewtext)
# print("XGBoost 'Review Text Sentiment' Accuracy: ", accuracy)

In [None]:
# Logistic regression on the review-text TF-IDF features, scored on the validation split.
logreg_reviewtext = LogisticRegression(random_state=random_state, solver="saga", max_iter=200) # Accuracy:  0.643
logreg_reviewtext.fit(X_train_reviewtext, y_train_reviewtext)
y_pred_reviewtext = logreg_reviewtext.predict(X_valid_reviewtext)
accuracy = accuracy_score(y_valid_reviewtext, y_pred_reviewtext)
print("Logistic Regression 'Review Text Sentiment' Accuracy: ", accuracy)

In [None]:
# NOTE(review): the shape recorded below looks stale -- reviewtext_vectorizer is
# now created with max_features=13000, so the second dimension should not exceed that.
X_valid_reviewtext.shape # (2348, 212416)

---

In [None]:
def preprocess_df(df: pd.DataFrame):
    """Replace the two text columns by the sentiment class predicted for each.

    'Title' and 'Review Text' are vectorized with the module-level
    `title_vectorizer` / `reviewtext_vectorizer`, classified with
    `logreg_title` / `logreg_reviewtext`, and the predictions are stored in the
    new columns 'Title Sentiment' and 'Review Text Sentiment'.

    The function works on a copy, so the frame passed in is left unchanged and
    the calling cell can be re-run safely.

    NOTE(review): for the training split the predictions come from models that
    were fitted on these same rows, so they are likely more accurate there than
    on unseen data.

    Args:
        df: Frame containing at least the 'Title' and 'Review Text' columns.

    Returns:
        A new DataFrame with the two sentiment columns added and the two text
        columns removed.
    """
    df = df.copy()

    vectorized_title = title_vectorizer.transform(df['Title']).toarray()
    df['Title Sentiment'] = logreg_title.predict(vectorized_title)

    vectorized_reviewtext = reviewtext_vectorizer.transform(df['Review Text']).toarray()
    df['Review Text Sentiment'] = logreg_reviewtext.predict(vectorized_reviewtext)

    return df.drop(['Title', 'Review Text'], axis=1)

In [None]:
# Swap the text columns of each split for the predicted sentiment columns.
print('preprocessing train data')
df_train_final = preprocess_df(df_train_wo_cat)
print('preprocessing valid data')
df_valid_final = preprocess_df(df_valid_wo_cat)
print('preprocessing test data')
df_test_final = preprocess_df(df_test_wo_cat)

In [None]:
df_train_final.head(3)

In [None]:
# Targets: the 0-indexed Rating column of each split.
y_train = df_train_final['Rating'].values
y_valid = df_valid_final['Rating'].values
y_test = df_test_final['Rating'].values

In [None]:
# Feature matrices: every column except the label. The drop is not done in
# place so that this cell (and the target extraction) can be re-run without a
# KeyError on the already-removed 'Rating' column.
X_train = df_train_final.drop(['Rating'], axis=1).values
X_valid = df_valid_final.drop(['Rating'], axis=1).values
X_test = df_test_final.drop(['Rating'], axis=1).values

In [None]:
# rf_tab = RandomForestClassifier(random_state=random_state, n_estimators=1000)
# rf_tab.fit(X_train, y_train)
# y_preds = rf_tab.predict(X_valid)
# rf_tab_accuracy = accuracy_score(y_valid, y_preds)
# print(f"Random Forest tabular accuracy: {rf_tab_accuracy}")

In [None]:
# gbt_tab = GradientBoostingClassifier(random_state=random_state, n_estimators=1000)
# gbt_tab.fit(X_train, y_train)
# y_preds = gbt_tab.predict(X_valid)
# gbt_tab_accuracy = accuracy_score(y_valid, y_preds)
# print(f"Gradient Boosting tabular accuracy: {gbt_tab_accuracy}")

In [None]:
# # logreg_tab = LogisticRegression(random_state=random_state, solver="saga", max_iter=20000) # accuracy: 0.662
# logreg_tab = LogisticRegression(random_state=random_state, solver="lbfgs", max_iter=20000) # accuracy: 0.676
# logreg_tab.fit(X_train, y_train)
# y_preds = logreg_tab.predict(X_valid)
# logreg_tab_accuracy = accuracy_score(y_valid, y_preds)
# print(f"Logistic Regression tabular accuracy: {logreg_tab_accuracy}")

In [None]:
# # bst_tab = XGBClassifier(random_state=random_state, n_estimators=9, max_depth=2, learning_rate=.9, objective='multi:softmax') # 0.664
# bst_tab = XGBClassifier(random_state=random_state, objective='multi:softmax') # 0.662
# bst_tab.fit(X_train, y_train)
# y_preds = bst_tab.predict(X_valid)
# bst_tab_accuracy = accuracy_score(y_valid, y_preds)
# print(f"XGBoost tabular accuracy before hyperparameter tuning: {bst_tab_accuracy}")

---

In [None]:
# # helpful article on XGBoost hyperparameter optimization using hyperopt: https://www.kaggle.com/code/prashant111/a-guide-on-xgboost-hyperparameters-tuning/notebook

# import wandb
# from hyperopt import STATUS_OK, fmin, hp, tpe, Trials
# import copy

# """
# hp.choice(label, options) — Returns one of the options, which should be a list or tuple.
# hp.randint(label, upper) — Returns a random integer between the range [0, upper).
# hp.uniform(label, low, high) — Returns a value uniformly between low and high.
# hp.quniform(label, low, high, q) — Returns a value round(uniform(low, high) / q) * q, i.e it rounds the decimal values and returns an integer.
# hp.normal(label, mean, std) — Returns a real value that's normally-distributed with mean and standard deviation sigma.

# use uniform for a range from to (float)
# use uniform for a range from to (float but just 1 decimal 0)
# use choice hack (hp.choice('n_estimators', np.arange(1, 500, dtype=int))) for range from to with no decimal (int)
# """

# def fix_xgb_args(obj):
#     keys_to_remove = ['early_stopping_rounds'] # only relevant during train time, not inference time
#     keys_requiring_ints = ['max_depth']
#     new_obj = copy.deepcopy(obj)
#     for key, value in obj.items():
#         if key in keys_to_remove:
#             new_obj.pop(key)
#         if key in keys_requiring_ints:
#             new_obj[key] = int(value)
#     return new_obj

In [None]:
# def objective(hyperparams):
#     clf = XGBClassifier(
#         **fix_xgb_args(hyperparams),
#         objective = 'multi:softmax',
#         random_state = random_state
#     )
    
#     clf.fit(
#         X_train, y_train,
#         eval_set = [(X_valid, y_valid)], 
#         verbose = False
#     )
    
#     pred = clf.predict(X_valid)
#     accuracy = accuracy_score(y_valid, pred)
#     loss = 1 - accuracy

#     wandb.log({"accuracy": accuracy, "loss": loss})

#     return {'loss': loss, 'status': STATUS_OK }

In [None]:
# wandb.init(project="trees-vs-nns")

In [None]:
# range_2_20 = np.arange(2, 20, dtype=int)
# range_2_100 = np.arange(2, 100, dtype=int)

# hyperparams = {
#     'max_depth': hp.choice('max_depth', range_2_20),
#     'gamma': hp.quniform('gamma', 0, 1, 0.01),
#     'colsample_bytree' : hp.quniform('colsample_bytree', 0.1, 1.0, 0.01),
#     'min_child_weight' : hp.quniform('min_child_weight', 1, 20, 1),
#     'subsample': hp.quniform('subsample', 0.1, 1, 0.01),
#     'eta': hp.quniform('eta', 0.025, 0.5, 0.025),
#     'learning_rate': hp.quniform('learning_rate', 0.01, 0.99, 0.01),
#     'n_estimators': hp.choice('n_estimators', range_2_100),
# }

# trials = Trials()

# best_hyperparams = fmin(
#     fn = objective,
#     space = hyperparams,
#     algo = tpe.suggest,
#     max_evals = 500,
#     trials = trials,
# )

In [None]:
# wandb.finish()

In [None]:
# # train and fit on best hyperparameters
# bst_tab_hat = XGBClassifier(random_state=random_state, objective='multi:softmax', **fix_xgb_args(best_hyperparams))
# bst_tab_hat.fit(X_train, y_train)

# # test on valid dataset
# y_preds = bst_tab_hat.predict(X_valid)
# bst_tab_hat_accuracy = accuracy_score(y_valid, y_preds)
# print(f"XGBoost tabular accuracy: {bst_tab_hat_accuracy}")
# print("hyperparameters used:", fix_xgb_args(best_hyperparams))

---

In [None]:
# # test on test dataset
# y_preds_test = bst_tab_hat.predict(X_test)
# bst_tab_hat_accuracy_test = accuracy_score(y_test, y_preds_test)
# print(f"XGBoost tabular accuracy: {bst_tab_hat_accuracy_test}")

# lr_y_preds_test = logreg_tab.predict(X_test)
# lr_logreg_tab_accuracy = accuracy_score(y_test, lr_y_preds_test)
# print(f"Logistic Regression tabular accuracy: {lr_logreg_tab_accuracy}")

---

"Illegal" optimizations after using the last unseen data. Goal: use wandb for the hyperparameter sweep instead of hyperopt and create one of these fancy parallel coordinates diagram: https://docs.wandb.ai/assets/images/intro_what_it_is-8462e8215e06544eaa40dfdfe656d03d.png

In [None]:
# wandb project the runs are logged to, and the display name given to the sweep.
project_name = 'trees-vs-nns'
sweep_name = f"{project_name}-sweep"

In [None]:
# wandb sweep configuration keys https://docs.wandb.ai/guides/sweeps/define-sweep-configuration#configuration-keys

# Bayesian search that maximises the "accuracy" value logged by each run.
# Parameters given as {"values": [...]} are picked from the list; parameters
# given as {"min", "max"} are sampled from that range.
sweep_config = {
    "name": sweep_name,
    "metric": { "name": "accuracy", "goal": "maximize" },
    "method": "bayes",
    "parameters": {
        'max_depth': { "values": np.arange(2, 20).tolist() },
        'gamma': {'min': 0.0, 'max': 1.0},
        'colsample_bytree' : {'min': 0.1, 'max': 1.0},
        'min_child_weight' : { "values": np.arange(1, 20).tolist() },
        'subsample': {'min': 0.1, 'max': 1.0},
        # 'eta' is XGBoost's alias for 'learning_rate'; sweeping both searched
        # one redundant dimension, so only 'learning_rate' is kept.
        'learning_rate': {'min': 0.01, 'max': 0.99},
        'n_estimators': { "values": np.arange(2, 100).tolist() },
    }
}

In [None]:
def objective(hyperparams):
    """Train an XGBoost classifier with `hyperparams` and return its validation accuracy.

    Args:
        hyperparams: Mapping of keyword arguments forwarded to `XGBClassifier`.

    Returns:
        Accuracy of the fitted model on (`X_valid`, `y_valid`); the value is
        also printed.
    """
    model = XGBClassifier(
        objective='multi:softmax',
        random_state=random_state,
        **hyperparams,
    )
    model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=False)

    accuracy = accuracy_score(y_valid, model.predict(X_valid))
    print(f"accuracy: {accuracy}")

    return accuracy

In [None]:
def main():
    """Run one sweep trial: start a wandb run, evaluate `objective` on its config and log the accuracy."""
    wandb.init(project=project_name)
    wandb.log({'accuracy': objective(wandb.config)})

In [None]:
# Register the sweep, then let an agent in this process execute main() for 50 runs.
os.environ['WANDB_SILENT']="true" # avoids a lengthy log per sweep
sweep_id = wandb.sweep(sweep=sweep_config, project=project_name)
wandb.agent(sweep_id, function=main, count=50)

In [None]:
# Look up the finished sweep and read the configuration of its best run.
api = wandb.Api()
# Build the path from project_name so it cannot drift from the project the
# sweep was created in.
sweep = api.sweep(f"{project_name}/{sweep_id}")
best_run = sweep.best_run()
best_run_hyperparams = best_run.config
best_run_hyperparams

In [None]:
# train and fit on best hyperparameters
bst_tab_hat = XGBClassifier(random_state=random_state, objective='multi:softmax', **best_run_hyperparams)
bst_tab_hat.fit(X_train, y_train)

# test on valid dataset
# NOTE(review): the sweep picked these hyperparameters by accuracy on this same
# validation split, so the number printed here is probably optimistic.
y_preds = bst_tab_hat.predict(X_valid)
bst_tab_hat_accuracy = accuracy_score(y_valid, y_preds)
print(f"XGBoost tabular accuracy: {bst_tab_hat_accuracy}")

In [None]:
# test on test dataset
# (the printed label is identical to the validation cell's; this value is the test accuracy)
y_preds_test = bst_tab_hat.predict(X_test)
bst_tab_hat_accuracy_test = accuracy_score(y_test, y_preds_test)
print(f"XGBoost tabular accuracy: {bst_tab_hat_accuracy_test}")