In [57]:
# imports
import sys
import pandas as pd
sys.path.append("../")
from YouReader.Reader import Reader
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer,TfidfTransformer
from sklearn.neighbors import KNeighborsClassifier
from nltk.stem.snowball import SnowballStemmer
from pprint import pprint
import time

# pandas settings
pd.set_option('display.max_colwidth', 50)
pd.set_option('display.max_rows', None)
pd.set_option('max_rows', None)

# Constants
TOKEN_PATTERN = r"[^\s]+"

In [22]:
# SnowballStemmer Override for  TfidfVectorizer
# referenced from https://stackoverflow.com/questions/36182502/add-stemming-support-to-countvectorizer-sklearn

english_stemmer = SnowballStemmer("english")
class StemmedTfidfVectorizer(TfidfVectorizer):
    def build_analyzer(self):
        analyzer = super().build_analyzer()
        return lambda doc: ([english_stemmer.stem(w) for w in analyzer(doc)])

<h2>Loading Data from Save</h2>

In [35]:
start_time = time.time()


reader = Reader()
count = reader.load_captions("../data/save.json")
df = reader.to_dataframe()
subject_totals = df.groupby("subject")["link"].count()


print("Loaded", count, "captions from save.json\n\n")
print(subject_totals, "\n\n")
print("This took", "{0:.2f}".format(time.time() - start_time), "seconds")

Loaded 1151 captions from save.json


subject
BIOL      82
BUS      135
CS       239
ENGL      65
MATH     269
POLI     137
PSYCH     70
Name: link, dtype: int64 


This took 1.30 seconds


<h2>Vectorize and stem the corpus</h2>

In [47]:
# generate the custom stemming vectorizer
stem_vectorizer = StemmedTfidfVectorizer(min_df=.000001, max_df=1, stop_words="english", token_pattern=TOKEN_PATTERN)

In [49]:
start_time = time.time()


# fit data to all transcripts to generate vocabulary / features
stem_fit_vectorizer = stem_vectorizer.fit(df["clean"])


print("This took", "{0:.2f}".format(time.time() - start_time), "seconds")

This took 62.68 seconds


<h2>Transform Training and Test Datasets</h2>

In [62]:
start_time = time.time()


# generate training and testing sets
all_keys = list(df.index.values)
all_subjects = df["subject"].unique()

test_data = []
test_labels = []
train_data = []
train_labels = []

# partition testing dataset to be 1/5, and training dataset to be 4/5 of original
for subject, total in zip(subjects, subject_totals):
    subject_keys = [key for key in all_keys if df.loc[key]["subject"] == subject]
    subject_clean = [df.loc[key]["clean"] for key in subject_keys]
    subject_subject = [df.loc[key]["subject"] for key in subject_keys]
    
    test_size = total // 10
    test_data.extend(subject_clean[:test_size])
    test_labels.extend(subject_subject[:test_size])
    train_data.extend(subject_clean[test_size:])
    train_labels.extend(subject_subject[test_size:])

    
print("Size of testing dataset is: ", len(test_data))
print("Size of training dataset is: ", len(train_data))    
print("This took", "{0:.2f}".format(time.time() - start_time), "seconds")

Size of testing dataset is:  96
Size of training dataset is:  901
This took 0.92 seconds


In [63]:
start_time = time.time()


# transforms datasets into feature vectors for ML analysis
train_transform = stem_fit_vectorizer.transform(train_data)
test_transform = stem_fit_vectorizer.transform(test_data)

print("Training dataset has the shape: ", train_transform.shape)
print("Testing dataset has the shape: ", test_transform.shape)
print("This took", "{0:.2f}".format(time.time() - start_time), "seconds")

Training dataset has the shape:  (901, 30271)
Testing dataset has the shape:  (96, 30271)
This took 65.70 seconds


<h2>KNN Fitting and Prediction</h2>

In [80]:
# generates and fits classifier to training dataset
classifier = KNeighborsClassifier(n_neighbors=3)
classifier.fit(train_transform, train_labels)

KNeighborsClassifier(n_neighbors=3)

In [81]:
prediction = classifier.predict(test_transform)
prediction

array(['CS', 'CS', 'CS', 'CS', 'CS', 'CS', 'CS', 'CS', 'CS', 'CS', 'CS',
       'CS', 'CS', 'CS', 'CS', 'CS', 'CS', 'CS', 'CS', 'CS', 'CS', 'CS',
       'CS', 'CS', 'CS', 'CS', 'CS', 'CS', 'CS', 'CS', 'CS', 'CS', 'CS',
       'CS', 'CS', 'CS', 'CS', 'CS', 'CS', 'CS', 'CS', 'CS', 'CS', 'CS',
       'CS', 'CS', 'CS', 'CS', 'CS', 'CS', 'CS', 'CS', 'CS', 'CS', 'CS',
       'CS', 'CS', 'CS', 'CS', 'CS', 'CS', 'CS', 'CS', 'CS', 'CS', 'CS',
       'CS', 'CS', 'CS', 'CS', 'CS', 'CS', 'CS', 'CS', 'CS', 'CS', 'CS',
       'CS', 'CS', 'CS', 'CS', 'CS', 'CS', 'CS', 'CS', 'CS', 'CS', 'CS',
       'CS', 'CS', 'CS', 'CS', 'CS', 'CS', 'CS', 'CS'], dtype='<U5')

In [82]:
sample_text = ["political"]
sample_text_transform = stem_fit_vectorizer.transform(sample_text)
classifier.predict(sample_text_transform)

array(['CS'], dtype='<U5')