In [9]:
# standard imports
import pandas as pd
import pickle
import sys
import time

# external imports
sys.path.append("../")
from YouReader.Reader import Reader
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer,TfidfTransformer
from sklearn.neighbors import KNeighborsClassifier
from nltk.stem.snowball import SnowballStemmer
from pprint import pprint


# pandas display settings
pd.set_option('display.max_colwidth', 50)
# Show every row when a DataFrame/Series is displayed. Only the fully
# qualified key is used: a bare 'max_rows' pattern is resolved by regex and
# can match more than one option in recent pandas releases
# (e.g. 'styler.render.max_rows'), which makes set_option raise OptionError.
pd.set_option('display.max_rows', None)

# Constants
# Token regex handed to the vectorizer: any run of non-whitespace characters.
TOKEN_PATTERN = r"[^\s]+"

In [4]:
# SnowballStemmer Override for TfidfVectorizer
# referenced from https://stackoverflow.com/questions/36182502/add-stemming-support-to-countvectorizer-sklearn

english_stemmer = SnowballStemmer("english")


class StemmedTfidfVectorizer(TfidfVectorizer):
    """TfidfVectorizer whose analyzer stems every token it emits.

    The parent class's analyzer is built as usual; each token it yields is
    then passed through the module-level ``english_stemmer``.
    """

    def build_analyzer(self):
        """Return a callable mapping a document to its list of stemmed tokens."""
        base_analyzer = super().build_analyzer()

        def stemmed_analyzer(doc):
            # Stemming happens after the parent analyzer has produced tokens.
            return [english_stemmer.stem(token) for token in base_analyzer(doc)]

        return stemmed_analyzer

<h2>Loading Data from Save</h2>

In [16]:
start_time = time.time()

# Read the saved captions and expose them as a DataFrame.
reader = Reader()
count = reader.load_captions("../data/dataset.json")
df = reader.to_dataframe()

# Per-subject count of non-null "link" values.
subject_totals = df.groupby("subject")["link"].count()

print("Loaded", count, "captions from dataset.json\n\n")
print(subject_totals, "\n\n")
elapsed = time.time() - start_time
print("This took", "{0:.2f}".format(elapsed), "seconds")

Loaded 2600 captions from dataset.json


subject
BIOL    200
BUS     200
CHE     200
CHEM    200
CS      200
ECON    200
ENGL    200
HIST    200
MATH    200
PHIL    200
PHYS    200
POSC    200
PSYC    200
Name: link, dtype: int64 


This took 6.32 seconds


<h1>Load models if you've generated them before</h1>

<h3>Load Fitted Stem Vectorizer Model</h3>

In [13]:
start_time = time.time()

# Restore the fitted stemming vectorizer from disk.
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
# NOTE(review): this reads from ../models/model1 while the save cells in this
# notebook write to ../models/model2 — confirm that is intended.
stemvec_path = "../models/model1/stemidf_vectorizer.pkl"
with open(stemvec_path, "rb") as stemvec_file:
    stem_fit_vectorizer = pickle.load(stemvec_file)

elapsed = time.time() - start_time
print("This took", "{0:.2f}".format(elapsed), "seconds")

This took 0.03 seconds


<h3>Load Transformed Training Data and Labels</h3>

In [24]:
start_time = time.time()

# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
train_transform_path = "../models/model1/stemidf_train_transform.pkl"
train_labels_path = "../models/model1/stemidf_train_labels.pkl"

# Restore the transformed training data
with open(train_transform_path, "rb") as train_transform_file:
    train_transform = pickle.load(train_transform_file)

# Restore the matching training labels
with open(train_labels_path, "rb") as train_labels_file:
    train_labels = pickle.load(train_labels_file)

elapsed = time.time() - start_time
print("This took", "{0:.2f}".format(elapsed), "seconds")

This took 0.00 seconds


<h3>Load Transformed Testing Data and Labels</h3>

In [None]:
start_time = time.time()

# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
test_transform_path = "../models/model1/stemidf_test_transform.pkl"
test_labels_path = "../models/model1/stemidf_test_labels.pkl"

# Restore the transformed testing data
with open(test_transform_path, "rb") as test_transform_file:
    test_transform = pickle.load(test_transform_file)

# Restore the matching testing labels
with open(test_labels_path, "rb") as test_labels_file:
    test_labels = pickle.load(test_labels_file)

elapsed = time.time() - start_time
print("This took", "{0:.2f}".format(elapsed), "seconds")

<h1>Skip down to "KNN Fitting and Prediction" if you've already loaded all data!!!</h1>

<h2>Vectorize and stem the corpus</h2>

In [104]:
# Build the custom stemming vectorizer (unfitted at this point).
stem_vectorizer = StemmedTfidfVectorizer(
    min_df=10,
    max_df=.7,
    stop_words="english",
    token_pattern=TOKEN_PATTERN,
)

In [105]:
start_time = time.time()

# Fit on every cleaned transcript to build the vocabulary / features.
# NOTE(review): the corpus passed here is the whole DataFrame, so the later
# test split is also seen during fitting — confirm this is acceptable.
corpus = df["clean"]
stem_fit_vectorizer = stem_vectorizer.fit(corpus)

elapsed = time.time() - start_time
print("This took", "{0:.2f}".format(elapsed), "seconds")

This took 116.59 seconds


<h3>Save Fitted Stem Vectorizer Model</h3>

In [106]:
start_time = time.time()

# Persist the fitted stemming vectorizer.
# NOTE(review): this writes to ../models/model2 while the load cells in this
# notebook read from ../models/model1 — confirm that is intended.
stemvec_path = "../models/model2/stemidf_vectorizer.pkl"
with open(stemvec_path, "wb") as stemvec_file:
    pickle.dump(stem_fit_vectorizer, stemvec_file)

elapsed = time.time() - start_time
print("This took", "{0:.2f}".format(elapsed), "seconds")

This took 0.04 seconds


<h2>Transform Training and Test Datasets</h2>

In [170]:
start_time = time.time()

# generate training and testing sets
all_keys = list(df.index.values)
subjects = df["subject"].unique()

test_data = []
test_labels = []
train_data = []
train_labels = []
fold = 10

# Partition each subject's transcripts: the first 1/fold go to the testing
# dataset and the remainder to the training dataset.
# sort=False visits groups in first-appearance order, i.e. the order of `subjects`.
for subject, subject_rows in df.groupby("subject", sort=False):
    subject_clean = subject_rows["clean"].tolist()
    subject_subject = subject_rows["subject"].tolist()

    # Look the total up by label. `subject_totals` comes from a sorted groupby,
    # whereas `subjects` is in first-appearance order, so pairing the two by
    # position (zip) can attach the wrong total to a subject.
    test_size = int(subject_totals.loc[subject]) // fold

    test_data.extend(subject_clean[:test_size])
    test_labels.extend(subject_subject[:test_size])
    train_data.extend(subject_clean[test_size:])
    train_labels.extend(subject_subject[test_size:])


print("Size of testing dataset is: ", len(test_data))
print("Size of training dataset is: ", len(train_data))
print("This took", "{0:.2f}".format(time.time() - start_time), "seconds")

Size of testing dataset is:  260
Size of training dataset is:  2340
This took 4.24 seconds


In [108]:
start_time = time.time()

# Turn both splits into feature vectors using the already-fitted vectorizer.
train_transform, test_transform = (
    stem_fit_vectorizer.transform(split) for split in (train_data, test_data)
)

print("Training dataset has the shape: ", train_transform.shape)
print("Testing dataset has the shape: ", test_transform.shape)
elapsed = time.time() - start_time
print("This took", "{0:.2f}".format(elapsed), "seconds")

Training dataset has the shape:  (2340, 13691)
Testing dataset has the shape:  (260, 13691)
This took 115.29 seconds


<h3>Save Transformed Training Data</h3>

In [109]:
start_time = time.time()

# Persist the transformed training data, then its labels, as separate pickles.
train_outputs = (
    ("../models/model2/stemidf_train_transform.pkl", train_transform),
    ("../models/model2/stemidf_train_labels.pkl", train_labels),
)
for pickle_path, payload in train_outputs:
    with open(pickle_path, "wb") as pickle_file:
        pickle.dump(payload, pickle_file)

elapsed = time.time() - start_time
print("This took", "{0:.2f}".format(elapsed), "seconds")

This took 0.05 seconds


<h3>Save Transformed Testing Data</h3>

In [110]:
start_time = time.time()

# Persist the transformed testing data, then its labels, as separate pickles.
test_outputs = (
    ("../models/model2/stemidf_test_transform.pkl", test_transform),
    ("../models/model2/stemidf_test_labels.pkl", test_labels),
)
for pickle_path, payload in test_outputs:
    with open(pickle_path, "wb") as pickle_file:
        pickle.dump(payload, pickle_file)

elapsed = time.time() - start_time
print("This took", "{0:.2f}".format(elapsed), "seconds")

This took 0.01 seconds


<h2>KNN Fitting and Prediction</h2>

In [160]:
# Create a 75-neighbour KNN classifier and fit it to the training dataset.
# fit() returns the estimator itself, so the chained result is the fitted model.
classifier = KNeighborsClassifier(n_neighbors=75).fit(train_transform, train_labels)
classifier

KNeighborsClassifier(n_neighbors=75)

In [173]:
# Compute the predictions here so this cell does not depend on a cell that
# appears further down the notebook (fails on Restart & Run All otherwise).
prediction = classifier.predict(test_transform)

# Group predicted subjects by true label: one row per true subject, one
# column per test sample of that subject.
# Keys come from the labels themselves, so the cell also works when the
# datasets were loaded from pickles and `subjects` was never defined.
results = {}
for label, pred in zip(test_labels, prediction):
    results.setdefault(label, []).append(pred)

results_df = pd.DataFrame.from_dict(results, orient='index')
results_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
BIOL,BIOL,BIOL,BIOL,BIOL,BIOL,BIOL,BIOL,BIOL,BIOL,BIOL,BIOL,BIOL,BIOL,BIOL,BIOL,BIOL,BIOL,BIOL,BIOL,BIOL
BUS,BUS,BUS,BUS,ECON,BUS,PHIL,BUS,BUS,BUS,BUS,BUS,BUS,BUS,BUS,BUS,BUS,BUS,BUS,BUS,BUS
CHEM,CHEM,PHYS,CHEM,CHEM,CHEM,PHYS,CHEM,CHEM,CHEM,PHYS,PHIL,CHEM,CHEM,CHEM,CHEM,CHEM,CHEM,CHEM,CHEM,CHEM
CS,CS,CS,CS,CS,CS,CS,CS,CS,CS,CS,CS,CS,CS,CS,CS,BUS,CS,CS,CS,CS
ECON,ECON,BUS,ECON,ECON,ECON,ECON,ECON,ECON,ECON,ECON,ECON,ECON,BUS,ECON,ECON,ECON,ECON,ECON,ECON,ECON
ENGL,POSC,PSYC,PSYC,ENGL,PSYC,ENGL,ENGL,POSC,PSYC,PSYC,PSYC,PSYC,ENGL,ENGL,BUS,ENGL,ENGL,POSC,ENGL,ENGL
HIST,HIST,HIST,HIST,HIST,HIST,HIST,HIST,HIST,HIST,PHIL,PHIL,HIST,HIST,HIST,HIST,HIST,HIST,HIST,HIST,HIST
MATH,MATH,MATH,MATH,MATH,MATH,MATH,MATH,MATH,MATH,MATH,MATH,MATH,MATH,MATH,MATH,MATH,MATH,MATH,MATH,MATH
PHIL,PHIL,PHIL,PHIL,PSYC,PHIL,PHIL,PHIL,PHIL,PHIL,PSYC,PHIL,PSYC,PHIL,PHIL,PHIL,PHIL,PHIL,PHIL,PHIL,PHIL
PHYS,PHYS,PHYS,PHYS,PHYS,PHYS,PHYS,PHYS,MATH,PHYS,PHYS,PHYS,PHYS,PHYS,PHYS,PHYS,PHYS,CHE,MATH,PHYS,CHE


In [None]:
def calculate_score(prediction, test_labels):
    """Return the fraction of predictions that equal their true label.

    Args:
        prediction: sequence of predicted labels.
        test_labels: sequence of true labels, aligned with ``prediction``.

    Returns:
        A float in [0, 1]; ``0.0`` when there are no predictions.

    Raises:
        ValueError: if the two sequences have different lengths.
    """
    total = len(prediction)
    # zip() would silently drop the unmatched tail and skew the score.
    if total != len(test_labels):
        raise ValueError(
            "prediction and test_labels must have the same length "
            "({} != {})".format(total, len(test_labels))
        )
    # Avoid ZeroDivisionError on empty input.
    if total == 0:
        return 0.0
    correct = sum(
        1 for predicted, actual in zip(prediction, test_labels) if predicted == actual
    )
    return correct / total

def df_compare_accuracy():
    """Unfinished placeholder.

    The original definition had no colon and no body, which made the whole
    cell a SyntaxError. TODO: implement the intended behaviour.
    """
    raise NotImplementedError("df_compare_accuracy is not implemented yet")

In [161]:
# Predict a subject label for every row of the transformed testing data
# using the fitted classifier, then display the predictions.
prediction = classifier.predict(test_transform)
prediction

array(['BIOL', 'BIOL', 'BIOL', 'BIOL', 'BIOL', 'BIOL', 'BIOL', 'BIOL',
       'BIOL', 'BIOL', 'BIOL', 'BIOL', 'BIOL', 'BIOL', 'BIOL', 'BIOL',
       'BIOL', 'BIOL', 'BIOL', 'BIOL', 'BUS', 'BUS', 'BUS', 'ECON', 'BUS',
       'PHIL', 'BUS', 'BUS', 'BUS', 'BUS', 'BUS', 'BUS', 'BUS', 'BUS',
       'BUS', 'BUS', 'BUS', 'BUS', 'BUS', 'BUS', 'CHEM', 'PHYS', 'CHEM',
       'CHEM', 'CHEM', 'PHYS', 'CHEM', 'CHEM', 'CHEM', 'PHYS', 'PHIL',
       'CHEM', 'CHEM', 'CHEM', 'CHEM', 'CHEM', 'CHEM', 'CHEM', 'CHEM',
       'CHEM', 'CS', 'CS', 'CS', 'CS', 'CS', 'CS', 'CS', 'CS', 'CS', 'CS',
       'CS', 'CS', 'CS', 'CS', 'CS', 'BUS', 'CS', 'CS', 'CS', 'CS',
       'ECON', 'BUS', 'ECON', 'ECON', 'ECON', 'ECON', 'ECON', 'ECON',
       'ECON', 'ECON', 'ECON', 'ECON', 'BUS', 'ECON', 'ECON', 'ECON',
       'ECON', 'ECON', 'ECON', 'ECON', 'POSC', 'PSYC', 'PSYC', 'ENGL',
       'PSYC', 'ENGL', 'ENGL', 'POSC', 'PSYC', 'PSYC', 'PSYC', 'PSYC',
       'ENGL', 'ENGL', 'BUS', 'ENGL', 'ENGL', 'POSC', 'ENGL', 'ENGL',
  

In [162]:
# Accuracy: number of predictions equal to their true label, divided by the
# number of predictions.
score = sum(
    1 for predicted, actual in zip(prediction, test_labels) if predicted == actual
) / len(prediction)
score

0.8269230769230769

In [163]:
# Smoke test: classify one short hand-written string.
sample_text = ["computer science c python finance code code"]
# Transform with the already-fitted vectorizer, as was done for the train/test data.
sample_text_transform = stem_fit_vectorizer.transform(sample_text)
# Last expression of the cell, so the predicted label array is displayed.
classifier.predict(sample_text_transform)

array(['CS'], dtype='<U4')