### Format the notebook

In [None]:
# Add vertical spacing below every code cell so the rendered notebook is
# easier to read.  The rule is emitted as a plain <style> element: output
# published as text/html is parsed as markup, so JavaScript source placed in
# it directly (without a <script> tag) is never executed and only shows up as
# stray text around the style rule.
import IPython.display as di  # public home of the display helpers

di.display_html(
    "<style>.code_cell { margin-bottom: 80px !important;}</style>",
    raw=True,  # the string is already HTML, not an object to be formatted
)

### Imports

In [None]:
# standard imports
import os
import pickle
import sys

# external imports
import pandas as pd
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer

# local imports (the parent directory must be on sys.path before YouReader is imported)
sys.path.append("../")
from YouReader.Reader import Reader


# pandas settings
# Truncate long text cells at 50 characters and lift the cap on displayed rows.
# Only fully qualified option names are used: a bare pattern such as
# 'max_rows' is resolved by searching all registered option names, and pandas
# raises OptionError when it matches more than one of them (e.g. on versions
# that also define 'styler.render.max_rows').
pd.set_option('display.max_colwidth', 50)
pd.set_option('display.max_rows', None)

# Constants
# Token regex handed to the vectorizer: every maximal run of non-whitespace
# characters is one token, so punctuation attached to a word stays with it.
TOKEN_PATTERN = r"[^\s]+"

### Create Stemmer TF-IDF Vectorizer

In [None]:
# SnowballStemmer Override for TfidfVectorizer
# referenced from https://stackoverflow.com/questions/36182502/add-stemming-support-to-countvectorizer-sklearn

# One shared stemmer instance, reused for every token of every document.
english_stemmer = SnowballStemmer("english")


class StemmedTfidfVectorizer(TfidfVectorizer):
    """TfidfVectorizer whose analyzer output is passed through the Snowball stemmer."""

    def build_analyzer(self):
        """Return a callable mapping a document to its list of stemmed terms.

        The parent class's analyzer runs first; each term it yields is then
        stemmed individually with ``english_stemmer``.
        """
        base_analyzer = super().build_analyzer()

        def stemmed_analyzer(doc):
            return [english_stemmer.stem(term) for term in base_analyzer(doc)]

        return stemmed_analyzer


# Unigram vectorizer configured with the notebook's whitespace-delimited token
# pattern, English stop words, and document-frequency bounds.
stem_vectorizer = StemmedTfidfVectorizer(
    min_df=10,
    max_df=.75,
    stop_words="english",
    ngram_range=(1, 1),
    token_pattern=TOKEN_PATTERN,
)

### Read in the data

In [None]:
# Load the caption dataset through the project's Reader and expose it as a
# DataFrame; the cells below read its "clean" and "subject" columns.
reader = Reader()
# NOTE(review): `count` is not used in any visible cell; presumably the number
# of captions loaded -- confirm against Reader.load_captions.
count = reader.load_captions("../data/dataset.json")
df = reader.to_dataframe()
# Bare expression: renders the frame as this cell's output.  The settings cell
# sets display.max_rows to None, so every row is shown.
df

### Fit the vectorizer: a template for generating the matrix (tokenizes terms)
* `fit()` collects the list of unique terms in the data
* that list becomes the vectorizer's vocabulary

In [None]:
# generate vectorizer template (skip to below if already generated)
# Fit on the "clean" text column; the fitted vectorizer is kept under its own
# name because the transform and save cells further down use `fit_vectorizer`.
fit_vectorizer = stem_vectorizer.fit(df["clean"])
# vocabulary_ holds one entry per term that was kept, so its length is the
# number of unique terms.
print(f"There are {len(fit_vectorizer.vocabulary_)} unique terms")

In [None]:
# only run if you've saved the model
# Restores the fitted vectorizer written by the save cell, replacing any
# `fit_vectorizer` already in memory.  Unpickling executes code from the file,
# so only load pickles this notebook produced itself.
with open("model/idf_fit_vectorizer.pkl", "rb") as saved_vectorizer:
    fit_vectorizer = pickle.load(saved_vectorizer)

print(f"There are {len(fit_vectorizer.vocabulary_)} unique terms")

### Generate training and testing sets
* we won't go into too much detail here
* here we use stratified sampling (the data is split subject by subject) to generate the test and training datasets
* 1/10 for testing, 9/10 for training
* k-fold cross-validation or leave-one-out validation would be a better choice

In [None]:
# generate training and testing sets (skip if already saved to pickle)
# The split is done per subject so that every subject is divided in the same
# proportion.  It is deterministic: no shuffling takes place, the leading rows
# of each subject (in DataFrame order) become the test set.
all_keys = list(df.index.values)  # not needed by the split itself; kept in case other cells refer to it
subjects = df["subject"].unique()

test_data = []
test_labels = []
train_data = []
train_labels = []
fold = 10  # one row out of every `fold` goes to the test set

# partition testing dataset to be 1/10, and training dataset to be 9/10 of original
for subject in subjects:
    # A single boolean mask selects the subject's rows in their original
    # order; this avoids one df.loc lookup per row per subject and also works
    # when index labels are not unique.
    subject_rows = df.loc[df["subject"] == subject]
    subject_clean = subject_rows["clean"].tolist()
    subject_subject = subject_rows["subject"].tolist()

    # Size the test slice from this subject's own row count rather than
    # assuming 200 rows per subject.  A subject with fewer than `fold` rows
    # contributes nothing to the test set.
    test_size = len(subject_rows) // fold
    test_data.extend(subject_clean[:test_size])
    test_labels.extend(subject_subject[:test_size])
    train_data.extend(subject_clean[test_size:])
    train_labels.extend(subject_subject[test_size:])

### Transform the training and testing data into sparse matrices

In [None]:
# generate sparse matrix (skip if already saved as pickle)
# Both sets go through the same fitted vectorizer, so they share one
# vocabulary and therefore one set of columns.
train_transform = fit_vectorizer.transform(train_data)
test_transform = fit_vectorizer.transform(test_data)

# Report the shape of the training matrix, then of the testing matrix.
for matrix in (train_transform, test_transform):
    print(matrix.shape)

In [None]:
# only run if you've saved the model
# Restores everything the save cell wrote for the train/test split.  The
# labels are loaded together with the matrices so the two always come from the
# same saved split; without them, skipping the split cell leaves
# `train_labels` / `test_labels` undefined.
# Unpickling executes code from the file, so only load pickles this notebook
# produced itself.
with open("model/train_transform.pkl", "rb") as train_transform_file:
    train_transform = pickle.load(train_transform_file)
with open("model/train_labels.pkl", "rb") as train_labels_file:
    train_labels = pickle.load(train_labels_file)

with open("model/test_transform.pkl", "rb") as test_transform_file:
    test_transform = pickle.load(test_transform_file)
with open("model/test_labels.pkl", "rb") as test_labels_file:
    test_labels = pickle.load(test_labels_file)

print(train_transform.shape)
print(test_transform.shape)

### Save the data to pickle files (so it does not need to be regenerated)

In [None]:
# Persist the fitted vectorizer plus the transformed train/test matrices and
# their labels, so the expensive cells above can be skipped next time.
# Create the output directory first: open(..., "wb") does not create missing
# parent directories and would otherwise fail after all the work is done.
os.makedirs("model", exist_ok=True)

# Target path -> object to pickle; file names are unchanged so the load cells
# keep working.
artifacts = {
    "model/idf_fit_vectorizer.pkl": fit_vectorizer,
    "model/train_transform.pkl": train_transform,
    "model/train_labels.pkl": train_labels,
    "model/test_transform.pkl": test_transform,
    "model/test_labels.pkl": test_labels,
}

for artifact_path, artifact in artifacts.items():
    with open(artifact_path, "wb") as artifact_file:
        pickle.dump(artifact, artifact_file)
