# Data Preparation

Clone the HappyDB GitHub repository into Colab storage.

In [None]:
# Download the HappyDB repository into the current working directory.
# Re-running this cell after a successful clone only prints git's
# "destination path already exists" message.
!git clone https://github.com/megagonlabs/HappyDB.git

In [None]:
# List the working directory; a `HappyDB` folder should be present after the clone.
!ls

In [None]:
# List the data files in the repository (cleaned_hm.csv is the one loaded below).
!ls HappyDB/happydb/data

## Loading CSV file as DataFrame

Use the `pd.read_csv()` function to load a CSV file.

In [None]:
import pandas as pd

In [None]:
# Load the cleaned_hm.csv table into a DataFrame and preview its first five rows.
hm_df = pd.read_csv(filepath_or_buffer="HappyDB/happydb/data/cleaned_hm.csv")
hm_df.head(n=5)

In [None]:
# Inspect which columns the loaded table provides.
hm_df.columns

In [None]:
# Show basic summary statistics (by default pandas' describe() reports
# on the numeric columns only).
hm_df.describe()

In [None]:
# Take a look at the label distribution.
# Note: value_counts() leaves out missing (NaN) labels by default, so the
# counts cover only rows that actually have a ground-truth category.
hm_df["ground_truth_category"].value_counts()

In [None]:
# Take a look at the distribution of `num_sentence`.
# sort_index() orders the result by the sentence count itself rather than
# by frequency.
hm_df["num_sentence"].value_counts().sort_index()

In [None]:
# Keep only rows that have a ground-truth category AND at most 3 sentences;
# rows without a label or with more than 3 sentences are dropped.
filtered_hm_df = hm_df.loc[
    hm_df["ground_truth_category"].notnull() & (hm_df["num_sentence"] <= 3)
]

# Report how many rows survive the filter.
print("Original # of HM: {}".format(hm_df.shape[0]))
print("Filtered # of HM: {}".format(filtered_hm_df.shape[0]))

# Label vector & Feature matrix creation

Let's create a label vector and a feature matrix from the DataFrame.

In [None]:
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, GridSearchCV, train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, f1_score

In [None]:
# Encode the category strings as integer class ids.
# `le` is kept around so ids can be mapped back to names via `le.classes_`.
le = LabelEncoder()
le.fit(filtered_hm_df["ground_truth_category"])
y = le.transform(filtered_hm_df["ground_truth_category"])
y

In [None]:
# Category names in encoded order: the name at position i is the class whose id in `y` is i.
le.classes_

In [None]:
# Build a bag-of-words (BoW) representation: CountVectorizer learns the
# vocabulary from the `cleaned_hm` texts and counts token occurrences,
# producing one row of `X` per text.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(raw_documents=filtered_hm_df["cleaned_hm"])
X

# Train-test split

In [None]:
# Hold out 33% of the samples as a test set; the fixed random_state makes
# the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=1
)


In [None]:
# Baseline: logistic regression with default hyper-parameters.
# fit() returns the estimator itself, so construction and training chain.
clf = LogisticRegression().fit(X_train, y_train)
# A large gap between the two accuracies indicates over-fitting.
print("Training Accuracy: {:.4f}".format(clf.score(X=X_train, y=y_train)))
print("Test Accuracy: {:.4f}".format(clf.score(X=X_test, y=y_test)))

### Try! Change hyper-parameter `C` and run the same script.

In [None]:
# In scikit-learn, C is the inverse regularization strength:
# a larger C means weaker regularization.
clf = LogisticRegression(C=10.0).fit(X_train, y_train) # <= Change the value and re-run the code block
print("Training Accuracy: {:.4f}".format(clf.score(X=X_train, y=y_train)))
print("Test Accuracy: {:.4f}".format(clf.score(X=X_test, y=y_test)))

## (Optional) Draw train/test accuracy curve vs C 

This curve helps us understand the "trend": how training and test accuracy change as the value of `C` varies.

### Try! Test other supervised learning algorithms

See https://scikit-learn.org/ and try a different classifier to see whether it performs better than Logistic Regression.

In [None]:
# ===============================================
# Exercise: assign any scikit-learn classifier to `clf`, for example
#   from sklearn.naive_bayes import MultinomialNB
#   clf = MultinomialNB()
clf = None # <== Try other classifier
if clf is None:
  # Keep "Run All" working while the placeholder is unfilled: calling
  # .fit on None would raise AttributeError and stop the notebook here.
  print("Set `clf` to a scikit-learn classifier above, then re-run this cell.")
else:
  clf.fit(X_train, y_train)
  # Use the same 4-decimal format as the other accuracy cells
  # (the previous "{:2f}" spec set a field width, not a precision).
  print("Training Accuracy: {:.4f}".format(clf.score(X_train, y_train)))
  print("Test Accuracy: {:.4f}".format(clf.score(X_test, y_test)))
# ===============================================

# (Advanced) Cross validation

Use cross validation instead of a one-shot train-test split for evaluation. You can also try learning algorithms other than Logistic Regression for further analysis.


In [None]:
import warnings

# 5-fold cross validation. shuffle=True is required for random_state to
# have any effect; recent scikit-learn versions raise ValueError when
# random_state is set on an unshuffled KFold.
kf = KFold(n_splits=5, shuffle=True, random_state=1)

n_classes = len(le.classes_)
class_ids = np.arange(n_classes)  # LabelEncoder ids run from 0 to n_classes - 1
cm = np.zeros([n_classes, n_classes],
              dtype="int") # Initialize confusion matrix with 0
f1_list = []  # macro-F1 of each fold

# Silence warnings only while the folds are running instead of switching
# them off for the rest of the notebook.
with warnings.catch_warnings():
  warnings.simplefilter("ignore")
  for train_index, test_index in kf.split(X):
    # Fold-specific names so the hold-out split created earlier
    # (X_train / X_test / y_train / y_test) is not overwritten.
    X_fold_train, X_fold_test = X[train_index], X[test_index]
    y_fold_train, y_fold_test = y[train_index], y[test_index]
    # Inner grid search selects C using the training fold only;
    # the best model is refit and used by predict().
    clf = GridSearchCV(LogisticRegression(),
                       param_grid={"C": [0.01, 0.1, 1.0]})
    clf.fit(X_fold_train, y_fold_train)
    y_pred = clf.predict(X_fold_test)
    # Pass the label set explicitly so every fold yields an
    # n_classes x n_classes matrix even if a class is missing from it.
    cm += confusion_matrix(y_fold_test, y_pred, labels=class_ids)
    f1_list.append(f1_score(y_fold_test, y_pred, average="macro"))

f1_scores = np.array(f1_list)

In [None]:
# Macro-averaged F1 score of each cross-validation fold (one entry per fold).
f1_scores

In [None]:
# Confusion matrix accumulated over all folds
# (scikit-learn convention: rows = true class, columns = predicted class).
cm

In [None]:
# Class names in id order, i.e. the row/column order of the confusion matrix above.
le.classes_

# Next Steps

Tomorrow, we will explore more sophisticated feature engineering and data analysis including visualization.
