# Data Preparation

Clone the HappyDB GitHub repository into Colab storage.

In [None]:
!git clone https://github.com/megagonlabs/HappyDB.git

In [None]:
# List the current working directory; the cloned HappyDB folder should appear.
!ls

In [None]:
# List the files in the repository's data directory.
!ls HappyDB/happydb/data

# Utility functions

In [None]:
import numpy as np

from sklearn.base import clone
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, GridSearchCV, train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, f1_score

import warnings
warnings.filterwarnings('ignore')

def run_cv(X, y, clf, num_classes):
  """Evaluate `clf` with shuffled 5-fold cross-validation.

  A fresh clone of `clf` is fitted on every training fold and scored on the
  corresponding held-out fold, so `clf` itself is never fitted.

  Args:
    X: Feature matrix that supports row selection with an integer index
      array (e.g. a NumPy array or a SciPy sparse matrix).
    y: Label array, indexable the same way as `X`.
    clf: scikit-learn estimator used as the template for each fold.
    num_classes: Number of distinct classes; must equal the number of
      distinct values in `y`.

  Returns:
    A tuple `(f1_scores, cm)` where `f1_scores` is an array with one
    macro-averaged F1 score per fold and `cm` is the
    `num_classes x num_classes` confusion matrix summed over all folds.

  Raises:
    ValueError: If `num_classes` does not match the number of distinct
      labels in `y`.
  """
  # Sorted distinct labels define a fixed row/column order for every fold.
  labels = np.unique(y)
  if len(labels) != num_classes:
    raise ValueError(
        "num_classes={} but y contains {} distinct labels".format(
            num_classes, len(labels)))

  # random_state only has an effect when shuffle=True; recent scikit-learn
  # versions raise a ValueError if it is set without shuffling.
  kf = KFold(n_splits=5, shuffle=True, random_state=1)
  cm = np.zeros((num_classes, num_classes), dtype="int")
  f1_list = []
  for i, (train_index, test_index) in enumerate(kf.split(X)):
    print("Fold {}".format(i + 1))
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    cur_clf = clone(clf)
    cur_clf.fit(X_train, y_train)
    y_pred = cur_clf.predict(X_test)
    # Passing `labels` keeps the per-fold matrix at full size even when a
    # class is absent from this fold, so the accumulation never mismatches.
    cm += confusion_matrix(y_test, y_pred, labels=labels)
    f1_list.append(f1_score(y_test, y_pred, average="macro"))
  f1_scores = np.array(f1_list)
  return (f1_scores, cm)

## Loading CSV file as DataFrame

Use the `pd.read_csv()` function to load a CSV file into a DataFrame.

In [None]:
import pandas as pd

In [None]:
# Read cleaned_hm.csv from the cloned repository and preview the first rows.
HM_CSV_PATH = "HappyDB/happydb/data/cleaned_hm.csv"
hm_df = pd.read_csv(HM_CSV_PATH)
hm_df.head()

In [None]:
# Keep only the samples that have a ground-truth label and at most 3 sentences.
has_label = hm_df["ground_truth_category"].notnull()
is_short = hm_df["num_sentence"] <= 3
# .copy() makes the result an independent DataFrame, so columns can be added
# to it later without writing into a slice of hm_df (SettingWithCopyWarning).
filtered_hm_df = hm_df[is_short & has_label].copy()

print("Original # of HM: {}".format(len(hm_df)))
print("Filtered # of HM: {}".format(len(filtered_hm_df)))

# Label vector & Feature matrix creation

Let's create a label vector and a feature matrix from the DataFrame.

In [None]:
# Label Encoder
le = LabelEncoder()
y = le.fit_transform(filtered_hm_df["ground_truth_category"])
y

In [None]:
# Category names learned by the encoder; position i is the name of class id i.
le.classes_

In [None]:
# Bag-of-words term counts: one row per filtered sample, one column per
# vocabulary term (returned as a sparse matrix).
Xcount = CountVectorizer().fit_transform(filtered_hm_df["cleaned_hm"])

# Try other feature extraction methods

In [None]:
%%time
# Creates feature vectors
Xtfidf = TfidfVectorizer().fit_transform(filtered_hm_df["cleaned_hm"])
Xlda = LatentDirichletAllocation().fit_transform(
        CountVectorizer().fit_transform(filtered_hm_df["cleaned_hm"]))

In [None]:
# Stack the bag-of-words counts and the LDA topic features column-wise.
# toarray() returns a plain ndarray; todense() would return np.matrix, which
# recent scikit-learn versions refuse as estimator input.
Xcount_lda = np.concatenate([Xcount.toarray(), Xlda], axis=1)

In [None]:
# Cross-validate a default LogisticRegression on each feature set and keep
# only the per-fold F1 scores (element 0 of run_cv's result).
n_classes = len(le.classes_)
f1_scores_count = run_cv(Xcount, y, LogisticRegression(), n_classes)[0]
f1_scores_tfidf = run_cv(Xtfidf, y, LogisticRegression(), n_classes)[0]
f1_scores_lda = run_cv(Xlda, y, LogisticRegression(), n_classes)[0]
f1_scores_count_lda = run_cv(Xcount_lda, y, LogisticRegression(), n_classes)[0]

In [None]:
# One column per feature set, one row per cross-validation fold.
f1_by_feature_set = {
    "CountVec": f1_scores_count,
    "TfidfVec": f1_scores_tfidf,
    "LDA": f1_scores_lda,
    "Count+LDA": f1_scores_count_lda,
}
eval_df = pd.DataFrame(f1_by_feature_set)
eval_df

Try!
- Try different configurations of `CountVectorizer()`, `TfidfVectorizer()`, and `LatentDirichletAllocation()`.
- Replace `LogisticRegression()` with other algorithms.
- Replace `LogisticRegression()` with `GridSearchCV(LogisticRegression(), ...)`.

In [None]:
import spacy
nlp = spacy.load("en_core_web_sm")

# Sample code from spaCy: show the per-token annotations for one sentence.
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
token_columns = ["TEXT", "LEMMA", "POS", "TAG", "DEP", "SHAPE", "ALPHA", "STOP"]
info_list = [
    [token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
     token.shape_, token.is_alpha, token.is_stop]
    for token in doc
]
pd.DataFrame(info_list, columns=token_columns)

# Feature Engineering

Use the following ideas as preprocessing steps:
- Remove stop words
- Keep only adjectives, nouns (including proper nouns), and verbs

In [None]:
# Reduce every text to the lemmas of its content words: stop words are dropped
# and only adjectives, (proper) nouns, and verbs are kept.
pos_set = {"ADJ", "PROPN", "NOUN", "VERB"}
proc_hm_list = []
# nlp.pipe() processes the texts in batches, which is considerably faster than
# calling nlp() once per text.
for doc in nlp.pipe(filtered_hm_df["cleaned_hm"].tolist()):
  filtered_tokens = [token.lemma_ for token in doc
                     if not token.is_stop and token.pos_ in pos_set]
  proc_hm_list.append(" ".join(filtered_tokens))
# assign() returns a new DataFrame instead of writing into a filtered slice of
# hm_df, so no SettingWithCopyWarning is raised and the cell is safe to re-run.
filtered_hm_df = filtered_hm_df.assign(proc_hm=proc_hm_list)

In [None]:
# Inspect the preprocessed text column.
filtered_hm_df["proc_hm"]

In [None]:
# Bag-of-words counts on the preprocessed text, evaluated with the same
# cross-validation routine; only the per-fold F1 scores are kept.
proc_vectorizer = CountVectorizer()
Xcount_proc = proc_vectorizer.fit_transform(filtered_hm_df["proc_hm"])
f1_scores_count_proc = run_cv(
    Xcount_proc, y, LogisticRegression(), len(le.classes_))[0]

In [None]:
# Rebuild the comparison table with the preprocessed-text scores added.
f1_by_feature_set = {
    "CountVec": f1_scores_count,
    "TfidfVec": f1_scores_tfidf,
    "LDA": f1_scores_lda,
    "Count+LDA": f1_scores_count_lda,
    "Proc+CountVec": f1_scores_count_proc,
}
eval_df = pd.DataFrame(f1_by_feature_set)
# Average each feature set's F1 over the folds.
eval_df.mean(axis=0)