In [None]:
%%time
# Download spaCy model with word embeddings
# (the `!` prefix runs the command in a shell instead of the Python kernel)
!python -m spacy download en_core_web_lg

# Data Preparation

Clone the HappyDB GitHub repository to Colab storage.

In [None]:
# Fetch the HappyDB repository; later cells read files under HappyDB/happydb/data
!git clone https://github.com/megagonlabs/HappyDB.git

In [None]:
# List the current working directory
!ls

In [None]:
# List the contents of the repository's data directory
!ls HappyDB/happydb/data

# Utility functions

In [None]:
import numpy as np

from sklearn.base import clone
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.model_selection import KFold, GridSearchCV, train_test_split
from sklearn.preprocessing import LabelEncoder

import warnings
warnings.filterwarnings('ignore')

def run_cv(X, y, clf, num_classes, n_splits=5):
  """Evaluates a classifier with K-fold cross-validation.

  A fresh clone of `clf` is fitted on every fold, so the estimator that is
  passed in is never fitted or modified.

  Args:
    X: Feature matrix whose rows can be selected with an integer index array
      (e.g., a NumPy array or a SciPy CSR matrix).
    y: Label array aligned with the rows of X, indexable the same way.
    clf: Unfitted scikit-learn classifier used as a template for each fold.
    num_classes: Number of distinct classes in y; determines the shape of the
      accumulated confusion matrix.
    n_splits: Number of cross-validation folds.

  Returns:
    A tuple (f1_scores, cm). f1_scores is an array holding the macro-averaged
    F1 score of each fold; cm is the (num_classes, num_classes) confusion
    matrix summed over all folds.
  """
  # shuffle=True is required for random_state to have any effect; recent
  # scikit-learn versions raise a ValueError when random_state is given
  # without shuffling.
  kf = KFold(n_splits=n_splits, shuffle=True, random_state=1)
  # Fix the label set up front so that every fold yields a confusion matrix of
  # the same shape, even when a class is absent from that fold's test split.
  # np.unique returns the labels sorted, which is also the default ordering
  # used by confusion_matrix.
  labels = np.unique(y)
  cm = np.zeros([num_classes, num_classes],
                dtype="int")  # Initialize confusion matrix with 0
  f1_list = []
  for i, (train_index, test_index) in enumerate(kf.split(X)):
    print("Fold {}".format(i + 1))
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    cur_clf = clone(clf)
    cur_clf.fit(X_train, y_train)
    y_pred = cur_clf.predict(X_test)
    cm += confusion_matrix(y_test, y_pred, labels=labels)
    f1_list.append(f1_score(y_test, y_pred, average="macro"))
  f1_scores = np.array(f1_list)
  return (f1_scores, cm)

## Loading CSV file as DataFrame

Use the `pd.read_csv()` function to load a CSV file into a DataFrame.

In [None]:
import pandas as pd

In [None]:
# Load the cleaned_hm.csv table from the cloned repository
hm_df = pd.read_csv("HappyDB/happydb/data/cleaned_hm.csv")
# Preview the first rows
hm_df.head()

In [None]:
# Keep only the samples that have a ground truth label
#   and consist of at most 3 sentences
filtered_hm_df = hm_df.loc[
    hm_df["ground_truth_category"].notnull() & (hm_df["num_sentence"] <= 3)
]

# Report how many samples survive the filter
print("Original # of HM: {}".format(len(hm_df)))
print("Filtered # of HM: {}".format(len(filtered_hm_df)))

# Label vector & Feature matrix creation

Let's create a label vector and a feature matrix from the DataFrame.

In [None]:
# Label Encoder
# Label Encoder: turns the ground-truth category values into integer class ids
le = LabelEncoder()
y = le.fit_transform(filtered_hm_df["ground_truth_category"])
# Display the encoded label vector
y

In [None]:
# Categories learned by the encoder; the position of a category is its encoded label
le.classes_

In [None]:
# Bag-of-words count features built from the "cleaned_hm" text column
Xcount = CountVectorizer().fit_transform(filtered_hm_df["cleaned_hm"])

# Word embeddings

In [None]:
## You need to restart the runtime to use spaCy in the usual style.
# import spacy
# nlp = spacy.load("en_core_web_lg")  # "en_core_web_lg" does not provide embeddings
# NOTE(review): the remark above looks stale -- the download cell describes
#   this model as one "with word embeddings"; confirm and reword.
## Use the following style instead.
import en_core_web_lg
nlp = en_core_web_lg.load()

In [None]:
# Sample code: tabulate the attributes spaCy assigns to each token
doc = nlp("Apple is looking at buying U.K. startup for $1 billion. jkdsjaflksj")
info_list = [
    [tok.text, tok.lemma_, tok.pos_, tok.tag_, tok.dep_,
     tok.shape_, tok.is_alpha, tok.is_stop,
     tok.vector_norm, tok.is_oov]
    for tok in doc
]
pd.DataFrame(
    info_list,
    columns=["TEXT", "LEMMA", "POS", "TAG", "DEP", "SHAPE", "ALPHA", "STOP",
             "VECTOR_NORM", "OOV"])

## Visualize word embeddings

In [None]:
from sklearn.decomposition import TruncatedSVD
import seaborn as sns

## TRY! Change the following line and see what the word embeddings look like
words = ["carmine", "red", "purple", "orange", "green", "white", "cat", "dog"]
## ============================================================================

# One embedding per word (rows), projected down to 2 dimensions for plotting.
# A fixed random_state keeps the randomized SVD projection reproducible.
wvecs = np.array([nlp(w).vector for w in words])
wvecs_2d = TruncatedSVD(n_components=2, random_state=1).fit_transform(wvecs)

# Visualize plots
# x/y are passed as keywords: newer seaborn releases no longer accept the
# data vectors as positional arguments.
ax = sns.scatterplot(x=wvecs_2d[:, 0], y=wvecs_2d[:, 1])
# Annotate every point with its word, slightly offset from the marker
for i, w in enumerate(words):
  ax.text(wvecs_2d[i, 0] + 0.1, wvecs_2d[i, 1] + 0.1, w)

## Cosine similarity

In [None]:
# Word embeddings
from scipy.spatial.distance import cosine
def cossim(x, y):
  """Returns the cosine similarity of x and y (1 minus their cosine distance)."""
  distance = cosine(x, y)
  return 1.0 - distance

# Compare "carmine" against each of the other words in turn
for w1, w2 in [("carmine", other)
               for other in ("red", "purple", "orange", "green", "white", "cat")]:
  sim = cossim(nlp(w1).vector, nlp(w2).vector)
  print("cossim(\"{}\",\"{}\")={:.4f}".format(w1, w2, sim))



# Use sentence embeddings as features

In [None]:
def sent2vec(nlp, s):
  """Converts a sentence into a vector representation.

  The sentence vector is the element-wise mean of the word vectors of all
  in-vocabulary tokens of the sentence.

  Args:
    nlp: spaCy language pipeline; called on `s` to tokenize it, and its
      `vocab.vectors_length` gives the width of the word vectors.
    s: Sentence text.

  Returns:
    A 1-D array holding the mean word vector. When the sentence contains no
    in-vocabulary token (e.g., empty text or only unknown words), a zero
    vector of the same width is returned so that every sentence maps to a
    vector of identical shape.
  """
  # Skip OOV words (= zero vector)
  wvec_list = [token.vector for token in nlp(s) if not token.is_oov]
  if not wvec_list:
    # Averaging an empty array would produce a scalar NaN (plus a warning),
    # which cannot be stacked with the other rows into a 2-D feature matrix.
    return np.zeros(nlp.vocab.vectors_length, dtype="float32")
  return np.array(wvec_list).mean(axis=0)

In [None]:
# Takes about 2 minutes
%%time
Xsentvec = np.array(
        filtered_hm_df["cleaned_hm"].apply(lambda x: sent2vec(nlp, x)).tolist())

# Try other feature extraction methods

In [None]:
%%time
# Cross-validate logistic regression on both feature sets (run_cv uses 5 folds
# by default); only the per-fold F1 scores are kept, the confusion matrix is discarded.
f1_scores_count_lr, _ = run_cv(Xcount, y, LogisticRegression(), len(le.classes_))
f1_scores_sentvec_lr, _ = run_cv(Xsentvec, y, LogisticRegression(), len(le.classes_))

## [Optional] Uncomment below
#f1_scores_count_gbt, _ = run_cv(Xcount, y, GradientBoostingClassifier(), len(le.classes_))
#f1_scores_sentvec_gbt, _ = run_cv(Xsentvec, y, GradientBoostingClassifier(), len(le.classes_))

In [None]:
# Per-fold macro-F1 scores: one row per fold, one column per model/feature combination
eval_df = pd.DataFrame({"LR+CountVec": f1_scores_count_lr,
                        "LR+Sent2vec": f1_scores_sentvec_lr})

## [Optional] Use the code below if you also run GBT
# (the alternative is kept inside a string literal, so it is not executed as written)
"""
eval_df = pd.DataFrame({"LR+CountVec": f1_scores_count_lr,
                        "LR+Sent2vec": f1_scores_sentvec_lr,
                        "GBT+CountVec": f1_scores_count_gbt,
                        "GBT+Sent2vec": f1_scores_sentvec_gbt})
"""
eval_df

In [None]:
# Average the per-fold scores of each column
eval_df.mean(axis=0)

# [Advanced] Fine-tuning BERT for happiness category classification

Fine-tune a BERT model for the same task. The `transformers` library by Hugging Face is the most common and easy-to-use Python library for this purpose.

https://github.com/huggingface/transformers
