# Baseline Model for Question 1 (data from 2013, 2018, 2020)
## Multi-Label Classification using Classifier Chains model


In [210]:
#load dependencies
import warnings

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score, precision_score
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from skmultilearn.problem_transform import ClassifierChain


**This is after importing preprocessing comments**


In [None]:
#### This code is from preprocessing to be used before 
#### running code below to create clean texts for fitting 
#----------------------------------------------------------------

# #Create class object
# c_pp = comment_preprocessor()

# #clean X training set
# clean_doc_train, vocab_train = c_pp.preprocess_text(list(X_train_Q1['Comment']))
# #clean X valid
# clean_doc_valid, vocab_valid = c_pp.preprocess_text(list(X_valid_Q1['Comment']))

# ## For baseline, convert list of lists into list of sentences using the following code

# clean_text = []
# for docs in clean_doc_train:
#     clean_text.append(' '.join(docs)) 

# #save clean text 
# X_train_Q1['clean_text'] = clean_text
# X_train_Q1.to_csv('data/X_train_Q1_clean.csv', index=False)

# #validation set preprocess
# clean_text_val = []
# for docs in clean_doc_valid:
#     clean_text_val.append(' '.join(docs))

# #save valid clean text 
# X_valid_Q1['clean_text'] = clean_text_val
# X_valid_Q1.to_csv('data/X_valid_Q1_clean.csv', index=False)

In [289]:
#Read the preprocessed comments and their label matrices from disk
#(adjust the relative 'data/' paths to your own checkout)

#training split
X_train_Q1 = pd.read_csv('data/X_train_pp.csv')
y_train_Q1 = pd.read_csv('data/y_train_Q1.csv')

#validation split
X_valid_Q1 = pd.read_csv('data/X_valid_pp.csv')
y_valid_Q1 = pd.read_csv('data/y_valid_Q1.csv')

## TF-IDF Vectorizer Representation
First we'll use TF-IDF as the vectorizer


In [296]:
#Tfid Vectorizer Representation

def tfid_vectorizer(train, valid):
    """
    Build TF-IDF feature matrices for the training and validation texts.

    The vocabulary and IDF weights are learned from `train` only; `valid`
    is transformed with that already-fitted vectorizer.

    Returns a tuple (train_matrix, valid_matrix).
    """
    vectorizer = TfidfVectorizer()
    train_matrix = vectorizer.fit_transform(train)
    return train_matrix, vectorizer.transform(valid)


#To view the representation
#bow = pd.DataFrame(X_train_tfid, columns=sorted(tfid.vocabulary_), index=final_comments)
#bow.head()

In [299]:
#Build TF-IDF features from the comment text and keep the full label matrix as an array

#the comment columns are cast to unicode strings before vectorizing (had to convert type)
train_comments = X_train_Q1['Comment'].values.astype('U')
valid_comments = X_valid_Q1['Comment'].values.astype('U')
X_train, X_valid = tfid_vectorizer(train_comments, valid_comments)

#labels for all themes and subthemes together
Y_train = np.array(y_train_Q1)

In [300]:
#Split the label frames into theme-level and subtheme-level targets
THEME_COLUMNS = ['CPD','CB', 'EWC', 'Exec', 'FEW', 'SP', 'RE', 'Sup', 'SW', 'TEPE',
                 'VMG', 'OTH', 'Unrelated']
FIRST_SUBTHEME = 'CPD_Improve_new_employee_orientation'
LAST_SUBTHEME = 'OTH_Covid'

#training labels
subthemes_ytrain = y_train_Q1.loc[:, FIRST_SUBTHEME:LAST_SUBTHEME]
themes_ytrain = y_train_Q1[THEME_COLUMNS]

#validation labels
subthemes_yvalid = y_valid_Q1.loc[:, FIRST_SUBTHEME:LAST_SUBTHEME]
themes_yvalid = y_valid_Q1[THEME_COLUMNS]

#sanity check on the number of label columns: 13 themes and 62 subthemes
print('Theme columns:',themes_ytrain.shape[1])
print('Subtheme columns:', subthemes_ytrain.shape[1])

Theme columns: 13
Subtheme columns: 62


In [301]:
np.shape(subthemes_yvalid)

(2594, 62)

In [302]:
np.shape(themes_yvalid)

(2594, 13)

# Classifier Chain
## Subthemes only
### trying different models to choose best baseline classifier

**Starting with Subthemes Y only**

In [303]:
#Parts of code adapted from DSCI 573 lab 4
#Dictionary of base models to wrap in a ClassifierChain, keyed by display name

models = {
    'LinearSVC': LinearSVC(),
    'MultinomialNB' : MultinomialNB(),
    'GaussianNB' : GaussianNB()#,
    #'Random Forest' : RandomForestClassifier(), too slow in this loop; run separately through the Classifier_Chain function
    #'KNeighborsClassifier': KNeighborsClassifier(),
    #'Neural Net' : MLPClassifier()
}


In [304]:
#For subthemes only
#Note takes about 15 min to run
#Fits one ClassifierChain per base model in `models` and records train/validation
#accuracy plus micro-averaged validation recall and precision.

results_dict = []

#convert the label frames once instead of on every iteration
subthemes_ytrain_arr = np.array(subthemes_ytrain)
subthemes_yvalid_arr = np.array(subthemes_yvalid)

for model_name, base_model in models.items():

    chain = ClassifierChain(base_model).fit(X_train, subthemes_ytrain)

    #predict the validation set once and reuse it for every validation metric;
    #accuracy_score(y, predict(X)) is what the estimator's .score() computes
    y_pred = chain.predict(X_valid)

    results_dict.append({
        'Model': model_name,
        'Train Accuracy': chain.score(X_train, subthemes_ytrain_arr),
        'Validation Accuracy': accuracy_score(subthemes_yvalid_arr, y_pred),
        'Recall Score': recall_score(subthemes_yvalid_arr, y_pred, average= 'micro'),
        'Precision Score': precision_score(subthemes_yvalid_arr, y_pred, average= 'micro'),
    })

In [305]:
#inital look at df
pd.DataFrame(results_dict)

Unnamed: 0,Model,Train Accuracy,Validation Accuracy,Recall Score,Precision Score
0,LinearSVC,0.770625,0.319584,0.39901,0.707638
1,MultinomialNB,0.052525,0.04202,0.034653,0.952381
2,GaussianNB,0.423188,0.026214,0.314356,0.074143


In [306]:
#list-dictionary of basemodels
results_dict2 = []

In [307]:
def Classifier_Chain(Ytrain, Yvalid, base_model, Xtrain=None, Xvalid=None):
    """
    Fits a Classifier Chain model with the base classifier of choice on the
    given label set (themes or subthemes) and records its scores.

    Parameters
    ----------
    Ytrain : label matrix used to fit the chain and to score training accuracy.
    Yvalid : label matrix used for the validation accuracy, recall and precision.
    base_model : unfitted estimator wrapped by the ClassifierChain.
    Xtrain, Xvalid : optional feature matrices. When omitted, the notebook-level
        `X_train` / `X_valid` (TF-IDF features) are used.

    Appends a dict with training accuracy, validation accuracy and
    micro-averaged validation recall and precision to `results_dict2`.
    Returns None.
    """
    #fall back to the notebook-level feature matrices
    if Xtrain is None:
        Xtrain = X_train
    if Xvalid is None:
        Xvalid = X_valid

    #use the labels that were passed in rather than the global subtheme frames
    y_train_arr = np.array(Ytrain)
    y_valid_arr = np.array(Yvalid)

    model = ClassifierChain(base_model).fit(Xtrain, Ytrain)
    train = model.score(Xtrain, y_train_arr)

    #predict the validation set once; accuracy_score(y, predict(X)) is what
    #the estimator's .score() computes
    y_pred = model.predict(Xvalid)
    valid = accuracy_score(y_valid_arr, y_pred)
    recall = recall_score(y_valid_arr, y_pred, average= 'micro')
    precision = precision_score(y_valid_arr, y_pred, average= 'micro')

    case= {'Model': base_model,
           'Train Accuracy': train,
           'Validation Accuracy': valid,
           'Recall Score': recall,
           'Precision Score': precision}

    results_dict2.append(case)

In [308]:
#Random Forest' : RandomForestClassifier()
#Note takes quite some time ~15 min on my computer
Classifier_Chain(subthemes_ytrain, subthemes_yvalid, RandomForestClassifier())
results_dict2

[{'Model': RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                         criterion='gini', max_depth=None, max_features='auto',
                         max_leaf_nodes=None, max_samples=None,
                         min_impurity_decrease=0.0, min_impurity_split=None,
                         min_samples_leaf=1, min_samples_split=2,
                         min_weight_fraction_leaf=0.0, n_estimators=100,
                         n_jobs=None, oob_score=False, random_state=None,
                         verbose=0, warm_start=False),
  'Train Accuracy': 0.9973978411719352,
  'Validation Accuracy': 0.19236700077101002,
  'Recall Score': 0.19876237623762377,
  'Precision Score': 0.8606645230439443}]

In [309]:
results_dict2[0]['Model'] = 'RandomForest' #rename

In [311]:
#Decision Tree' : DecisionTreeClassifier()
#Note takes quite some time ~15 min on my computer
Classifier_Chain(subthemes_ytrain, subthemes_yvalid, DecisionTreeClassifier())
results_dict2

[{'Model': 'RandomForest',
  'Train Accuracy': 0.9973978411719352,
  'Validation Accuracy': 0.19236700077101002,
  'Recall Score': 0.19876237623762377,
  'Precision Score': 0.8606645230439443},
 {'Model': DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                         max_depth=None, max_features=None, max_leaf_nodes=None,
                         min_impurity_decrease=0.0, min_impurity_split=None,
                         min_samples_leaf=1, min_samples_split=2,
                         min_weight_fraction_leaf=0.0, presort='deprecated',
                         random_state=None, splitter='best'),
  'Train Accuracy': 0.9982652274479569,
  'Validation Accuracy': 0.21703932151117963,
  'Recall Score': 0.43143564356435643,
  'Precision Score': 0.4634405743153417}]

In [312]:
results_dict2[1]['Model'] = 'DecisionTree' #rename

In [313]:
#LinearSVC(multi_class= "crammer_singer")
#"crammer_singer" optimizes a joint objective over all classes.
Classifier_Chain(subthemes_ytrain, subthemes_yvalid, LinearSVC(multi_class= "crammer_singer"))




In [314]:
results_dict2[2]['Model'] = 'LinearSVC(multi_class= "crammer_singer")' #rename

In [315]:
#Trying last year's base classifier LinearSVC(C=0.5, tol=0.2)

#results_dict2 = []
Classifier_Chain(subthemes_ytrain, subthemes_yvalid, LinearSVC(C=0.5, tol=0.2))

In [316]:
results_dict2[3]['Model'] = 'LinearSVC(C=0.5, tol=0.2) 2019 capstone base' #rename

In [318]:
results_dict.extend(results_dict2)

In [319]:
my_results =pd.DataFrame(results_dict)
my_results

Unnamed: 0,Model,Train Accuracy,Validation Accuracy,Recall Score,Precision Score
0,LinearSVC,0.770625,0.319584,0.39901,0.707638
1,MultinomialNB,0.052525,0.04202,0.034653,0.952381
2,GaussianNB,0.423188,0.026214,0.314356,0.074143
3,RandomForest,0.997398,0.192367,0.198762,0.860665
4,DecisionTree,0.998265,0.217039,0.431436,0.463441
5,"LinearSVC(multi_class= ""crammer_singer"")",0.681766,0.338088,0.425,0.654594
6,"LinearSVC(C=0.5, tol=0.2) 2019 capstone base",0.543562,0.312645,0.373762,0.754246


***All models overfitting especially the Tree models. Best one overall is LinearSVC with default multi-class='ovr'. Still overfitting though, try tune regularization (less regularization is larger C)***

In [320]:
#Sweep the LinearSVC regularization parameter C over 10.0**np.arange(-4,4)
#and collect the subtheme validation accuracy for each value, in sweep order

scores = [
    ClassifierChain(LinearSVC(C = c_value, max_iter = 100000))
    .fit(X_train, subthemes_ytrain)
    .score(X_valid, np.array(subthemes_yvalid))
    for c_value in 10.0**np.arange(-4,4)
]

In [321]:
10.0**np.arange(-4,4)

array([1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03])

In [322]:
scores

[0.0007710100231303007,
 0.0007710100231303007,
 0.05127216653816499,
 0.24518118735543562,
 0.31958365458750965,
 0.2779491133384734,
 0.24518118735543562,
 0.23014649190439476]

> Regularization doesn't help here. The best validation score is given by the default C hyperparameter = 1.0. Let's try this on LinearSVC(multi-class='crammer_singer')

In [101]:
#Same C sweep as for the default LinearSVC, but with multi_class="crammer_singer";
#collects the subtheme validation accuracy for each C, in sweep order

scores = [
    ClassifierChain(LinearSVC(multi_class= "crammer_singer", C = c_value, max_iter = 100000))
    .fit(X_train, subthemes_ytrain)
    .score(X_valid, np.array(subthemes_yvalid))
    for c_value in 10.0**np.arange(-4,4)
]

In [102]:
scores

[0.0007710100231303007,
 0.0007710100231303007,
 0.009252120277563608,
 0.19275250578257516,
 0.3430994602929838,
 0.2679259830377795,
 0.25212027756360833,
 0.24710871241326138]

> Same pattern across C values for crammer_singer. It seems that LinearSVC with its default regularization (C = 1.0) gives the highest validation accuracy.

> Due to a lack of resources, we could not find a proper way to run GridSearch with a multi-label classifier. https://stackoverflow.com/questions/26018543/gridsearch-for-multi-label-classification-in-scikit-learn

In [112]:
models

{'LinearSVC': LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
           intercept_scaling=1, loss='squared_hinge', max_iter=1000,
           multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
           verbose=0),
 'MultinomialNB': MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
 'GaussianNB': GaussianNB(priors=None, var_smoothing=1e-09)}

## For themes Y 

## working the same steps on Themes to find base classifier

In [323]:
#For themes only
#Note takes about ~15 min to run
#Fits one ClassifierChain per base model in `models` and records train/validation
#accuracy plus micro-averaged validation recall and precision.
results_dict_themes = []

#convert the label frames once instead of on every iteration
themes_ytrain_arr = np.array(themes_ytrain)
themes_yvalid_arr = np.array(themes_yvalid)

for model_name, base_model in models.items():

    chain = ClassifierChain(base_model).fit(X_train, themes_ytrain)

    #predict the validation set once and reuse it for every validation metric;
    #accuracy_score(y, predict(X)) is what the estimator's .score() computes
    y_pred = chain.predict(X_valid)

    results_dict_themes.append({
        'Model': model_name,
        'Train Accuracy': chain.score(X_train, themes_ytrain_arr),
        'Validation Accuracy': accuracy_score(themes_yvalid_arr, y_pred),
        'Recall Score': recall_score(themes_yvalid_arr, y_pred, average= 'micro'),
        'Precision Score': precision_score(themes_yvalid_arr, y_pred, average= 'micro'),
    })

In [324]:
pd.DataFrame(results_dict_themes)

Unnamed: 0,Model,Train Accuracy,Validation Accuracy,Recall Score,Precision Score
0,LinearSVC,0.791345,0.476099,0.608347,0.749915
1,MultinomialNB,0.246338,0.212028,0.181039,0.942446
2,GaussianNB,0.336546,0.047417,0.622996,0.14409


In [325]:
#empty out list
results_dict3 = []

In [326]:
def Classifier_Chain(Ytrain, Yvalid, base_model, Xtrain=None, Xvalid=None):
    """
    Fits a Classifier Chain model with the base classifier of choice on the
    given label set (themes or subthemes) and records its scores.

    Parameters
    ----------
    Ytrain : label matrix used to fit the chain and to score training accuracy.
    Yvalid : label matrix used for the validation accuracy, recall and precision.
    base_model : unfitted estimator wrapped by the ClassifierChain.
    Xtrain, Xvalid : optional feature matrices. When omitted, the notebook-level
        `X_train` / `X_valid` (TF-IDF features) are used.

    Appends a dict with training accuracy, validation accuracy and
    micro-averaged validation recall and precision to `results_dict3`.
    Returns None.
    """
    #fall back to the notebook-level feature matrices
    if Xtrain is None:
        Xtrain = X_train
    if Xvalid is None:
        Xvalid = X_valid

    #use the labels that were passed in rather than the global theme frames
    y_train_arr = np.array(Ytrain)
    y_valid_arr = np.array(Yvalid)

    model = ClassifierChain(base_model).fit(Xtrain, Ytrain)
    train = model.score(Xtrain, y_train_arr)

    #predict the validation set once; accuracy_score(y, predict(X)) is what
    #the estimator's .score() computes
    y_pred = model.predict(Xvalid)
    valid = accuracy_score(y_valid_arr, y_pred)
    recall = recall_score(y_valid_arr, y_pred, average= 'micro')
    precision = precision_score(y_valid_arr, y_pred, average= 'micro')

    case= {'Model': base_model,
           'Train Accuracy': train,
           'Validation Accuracy': valid,
           'Recall Score': recall,
           'Precision Score': precision}

    results_dict3.append(case)

In [327]:
#Random Forest' : RandomForestClassifier()
#Note takes quite some time ~15 min on my computer
Classifier_Chain(themes_ytrain, themes_yvalid, RandomForestClassifier())

In [328]:
results_dict3[0]['Model'] = 'RandomForest' #rename

In [329]:
#Decision Tree' : DecisionTreeClassifier()
#Note takes quite some time ~15 min on my computer
Classifier_Chain(themes_ytrain, themes_yvalid, DecisionTreeClassifier())

In [330]:
results_dict3[1]['Model'] = 'DecisionTree' #rename

In [331]:
#LinearSVC(multi_class= "crammer_singer")
#"crammer_singer" optimizes a joint objective over all classes.
Classifier_Chain(themes_ytrain, themes_yvalid, LinearSVC(multi_class= "crammer_singer"))




In [332]:
results_dict3[2]['Model'] = 'LinearSVC(multi_class= "crammer_singer")' #rename

In [333]:
results_dict_themes.extend(results_dict3)

In [334]:
pd.DataFrame(results_dict_themes)

Unnamed: 0,Model,Train Accuracy,Validation Accuracy,Recall Score,Precision Score
0,LinearSVC,0.791345,0.476099,0.608347,0.749915
1,MultinomialNB,0.246338,0.212028,0.181039,0.942446
2,GaussianNB,0.336546,0.047417,0.622996,0.14409
3,RandomForest,0.998458,0.417116,0.453566,0.857367
4,DecisionTree,0.998651,0.35081,0.58817,0.590128
5,"LinearSVC(multi_class= ""crammer_singer"")",0.736507,0.499229,0.628524,0.720989


> For themes, also looks like LinearSVC() is the best. Hypothesizing that the best regularization parameter C for this model is their default values. Let's see if regularizing Decision Tree will help with overfitting and get better validation results

In [174]:
#Sweep max_depth (10, 20, ..., 90) for a DecisionTreeClassifier base model
#and collect the theme validation accuracy for each depth, in sweep order

scores = [
    ClassifierChain(DecisionTreeClassifier(max_depth = depth))
    .fit(X_train, themes_ytrain)
    .score(X_valid, np.array(themes_yvalid))
    for depth in range(10,100,10)
]

In [175]:
scores

[0.3797224363916731,
 0.4055512721665382,
 0.4109483423284503,
 0.40670778720123363,
 0.3982266769468003,
 0.40400925212027755,
 0.39552814186584423,
 0.39167309175019277,
 0.38781804163454126]

> Looks like none of these max depths improves on the validation accuracy of 0.42 obtained with `max_depth=None`.

In [338]:
#reset list
results_dict3 = []

In [339]:
#Trying last year's base classifier LinearSVC(C=0.5, tol=0.2)
Classifier_Chain(themes_ytrain, themes_yvalid, LinearSVC(C=0.5, tol=0.2))

In [340]:
results_dict3[0]['Model'] = 'LinearSVC(C=0.5, tol=0.2) 2019 capstone base' #rename

In [341]:
results_dict_themes.extend(results_dict3)

In [342]:
themes_results = pd.DataFrame(results_dict_themes)

# Summary of results:
## Subthemes

In [336]:
#SUBTHEMES RESULTS 
print("SUBTHEMES RESULTS:")
my_results

SUBTHEMES RESULTS:


Unnamed: 0,Model,Train Accuracy,Validation Accuracy,Recall Score,Precision Score
0,LinearSVC,0.770625,0.319584,0.39901,0.707638
1,MultinomialNB,0.052525,0.04202,0.034653,0.952381
2,GaussianNB,0.423188,0.026214,0.314356,0.074143
3,RandomForest,0.997398,0.192367,0.198762,0.860665
4,DecisionTree,0.998265,0.217039,0.431436,0.463441
5,"LinearSVC(multi_class= ""crammer_singer"")",0.681766,0.338088,0.425,0.654594
6,"LinearSVC(C=0.5, tol=0.2) 2019 capstone base",0.543562,0.312645,0.373762,0.754246


> LinearSVC() worked the best with their default C regularization parameter. 

    * train score: 0.77
    * validation score: 0.32
    * recall score: 0.40
    * precision score: 0.71

## Themes

In [344]:
#THEMES RESULTS 
print("THEMES RESULTS:")

themes_results

THEMES RESULTS:


Unnamed: 0,Model,Train Accuracy,Validation Accuracy,Recall Score,Precision Score
0,LinearSVC,0.791345,0.476099,0.608347,0.749915
1,MultinomialNB,0.246338,0.212028,0.181039,0.942446
2,GaussianNB,0.336546,0.047417,0.622996,0.14409
3,RandomForest,0.998458,0.417116,0.453566,0.857367
4,DecisionTree,0.998651,0.35081,0.58817,0.590128
5,"LinearSVC(multi_class= ""crammer_singer"")",0.736507,0.499229,0.628524,0.720989
6,"LinearSVC(C=0.5, tol=0.2) 2019 capstone base",0.692174,0.478797,0.592593,0.77992


> LAST YEARS RESULTS: Results from 2019 Capstone results on test data for baseline model (BoW CountVectorizer and BinaryRelevance(LinearSVC(C=0.5, tol=0.2))

     * Accuracy: 45%
     * Recall: 0.64
     * Precision: 0.74
  
> OUR RESULTS: Using LinearSVC(C=0.5, tol=0.2) 2019 capstone base, overfitting went down and only recall decreased in score. Our best results for the baseline model are:

     * Accuracy: 48%
     * Recall: 0.59
     * Precision: 0.78

# Using universal sentence encoder as embedding

In [346]:
import tensorflow as tf

# NOTE(review): `hub` is not imported in any visible cell; presumably
# `import tensorflow_hub as hub` must run before this line -- TODO confirm.
# Loads the Universal Sentence Encoder (version 4) module from TF Hub.
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

In [350]:
# Training comments fed to the sentence encoder.
# NOTE(review): this reads 'X_train_Q1_clean.csv', whereas the TF-IDF cells read
# 'X_train_pp.csv' and the validation embeddings are built from X_valid_Q1 --
# confirm both files contain the same rows in the same order as themes_ytrain.
X_train_raw = pd.read_csv('data/X_train_Q1_clean.csv')

In [351]:
# Embed every training comment with the sentence encoder.
# NOTE(review): `tf.Session` / `tf.global_variables_initializer` are TF1-style
# graph APIs -- confirm the TensorFlow version this notebook is expected to run on.
with tf.Session() as session:
    # run the variable and lookup-table initializers before evaluating the encoder
    session.run([tf.global_variables_initializer(), 
                 tf.tables_initializer()])
    training_embeddings = session.run(embed(X_train_raw.Comment.to_list()))

In [352]:
use = ClassifierChain(LinearSVC(C=0.5, tol=0.2))

In [353]:
use.fit(training_embeddings, themes_ytrain)

ClassifierChain(classifier=LinearSVC(C=0.5, class_weight=None, dual=True,
                                     fit_intercept=True, intercept_scaling=1,
                                     loss='squared_hinge', max_iter=1000,
                                     multi_class='ovr', penalty='l2',
                                     random_state=None, tol=0.2, verbose=0),
                order=None, require_dense=[True, True])

In [354]:
use.score(training_embeddings, themes_ytrain)

0.4995181187355436

In [355]:
#transforming the validation set with the same sentence encoder
#(the result is named `test_embeddings`, but it is built from the validation comments in X_valid_Q1)
with tf.Session() as session:
    # run the variable and lookup-table initializers before evaluating the encoder
    session.run([tf.global_variables_initializer(), 
                 tf.tables_initializer()])
    test_embeddings = session.run(embed(X_valid_Q1.Comment.to_list()))

In [356]:
use.score(test_embeddings, themes_yvalid)

0.44680030840400925

In [357]:
y_pred = use.predict(test_embeddings)

In [358]:
recall_score(np.array(themes_yvalid), y_pred, average= 'micro') 

0.5702045328911001

In [359]:
precision_score(np.array(themes_yvalid), y_pred, average= 'micro') 

0.7094222833562586

In [362]:
#Summarise the Universal Sentence Encoder model with the same keys as the
#other result rows and add it to the theme results
valid_labels = np.array(themes_yvalid)

case = {
    'Model': 'Universal Sentence Encoder LinearSVC',
    'Train Accuracy': use.score(training_embeddings, themes_ytrain),
    'Validation Accuracy': use.score(test_embeddings, themes_yvalid),
    'Recall Score': recall_score(valid_labels, y_pred, average= 'micro'),
    'Precision Score': precision_score(valid_labels, y_pred, average= 'micro'),
}

uni_sent = [case]
results_dict_themes.extend(uni_sent)


In [363]:
pd.DataFrame(results_dict_themes)

Unnamed: 0,Model,Train Accuracy,Validation Accuracy,Recall Score,Precision Score
0,LinearSVC,0.791345,0.476099,0.608347,0.749915
1,MultinomialNB,0.246338,0.212028,0.181039,0.942446
2,GaussianNB,0.336546,0.047417,0.622996,0.14409
3,RandomForest,0.998458,0.417116,0.453566,0.857367
4,DecisionTree,0.998651,0.35081,0.58817,0.590128
5,"LinearSVC(multi_class= ""crammer_singer"")",0.736507,0.499229,0.628524,0.720989
6,"LinearSVC(C=0.5, tol=0.2) 2019 capstone base",0.692174,0.478797,0.592593,0.77992
7,Universal Sentence Encoder LinearSVC,0.499518,0.4468,0.570205,0.709422


# Decision Tree Hierarchical Multi-Classifier

- Trying to get `hmc` to work, but having trouble importing its exceptions:
https://github.com/davidwarshaw/hmc

In [256]:
class ClassHierarchy:
    """
    Class hierarchy stored as a mapping from each child node to its parent.

    Parameters
    ----------
        root :
            Label of the root node. The root never appears as a child.

    Attributes
    ----------
        root :
            The root label passed to the constructor.
        nodes : dict
            Maps each child label to its parent label.
    """
    def __init__(self, root):
        self.root = root
        self.nodes = {}

    def _get_parent(self, child):
        # Return the parent of this node; unknown nodes and the root map to the root
        if child in self.nodes and child != self.root:
            return self.nodes[child]
        return self.root

    def _get_children(self, parent):
        # Return a list of children nodes in alpha order
        # (dict.items() replaces the Python 2-only iteritems())
        return sorted(child for child, childs_parent in self.nodes.items()
                      if childs_parent == parent)

    def _get_ancestors(self, child):
        # Return a list of the ancestors of this node
        # Not including root, not including the child
        ancestors = []
        while True:
            child = self._get_parent(child)
            if child == self.root:
                break
            ancestors.append(child)
        return ancestors

    def _get_descendants(self, parent):
        # Return a list of the descendants of this node
        # Not including the parent
        descendants = []
        self._depth_first(parent, descendants)
        descendants.remove(parent)
        return descendants

    def _is_descendant(self, parent, child):
        # Walk up from the child until we reach either the candidate parent or the root.
        # This class holds the hierarchy itself, so use self.root / self._get_parent.
        while child != self.root and child != parent:
            child = self._get_parent(child)
        return child == parent

    def _is_ancestor(self, parent, child):
        # `parent` is an ancestor of `child` exactly when `child` descends from `parent`
        return self._is_descendant(parent, child)

    def _depth_first_print(self, parent, indent, last):
        # Print this node with box-drawing connectors, then recurse into its children
        print(indent, end="")
        if last:
            print(u"\u2514\u2500", end="")
            indent += "  "
        else:
            print(u"\u251C\u2500", end="")
            indent += u"\u2502 "
        print(parent)
        children = self._get_children(parent)
        for node_count, node in enumerate(children, start=1):
            self._depth_first_print(node, indent, node_count == len(children))

    def _depth_first(self, parent, classes):
        # Append this node and all of its descendants to `classes` in depth-first order
        classes.append(parent)
        for node in self._get_children(parent):
            self._depth_first(node, classes)

    def add_node(self, child, parent):
        """
        Add a child-parent node to the class hierarchy.

        Re-adding an existing child with the same parent is a no-op.
        Raises ValueError if `child` is the root, or if `child` already has a
        different parent.
        """
        if child == self.root:
            raise ValueError('The hierarchy root: ' + str(child) + ' is not a valid child node.')
        if child in self.nodes:
            if self.nodes[child] != parent:
                # report the parent that is already assigned, not the child itself
                raise ValueError('Node: ' + str(child) + ' has already been assigned parent: ' +
                                 str(self.nodes[child]))
            return
        self.nodes[child] = parent

    def nodes_(self):
        """
        Return the hierarchy as the child -> parent mapping.
        """
        return self.nodes

    def classes_(self):
        """
        Return the hierarchy classes as a list of unique classes,
        in depth-first order starting at the root.
        """
        classes = []
        self._depth_first(self.root, classes)
        return classes

    def print_(self):
        """
        Pretty print the class hierarchy.
        """
        self._depth_first_print(self.root, "", True)

# =============================================================================
# Decision Tree Hierarchical Classifier
# =============================================================================


class NoSamplesForStageWarning(UserWarning):
    """Issued when a stage has no samples to fit or to predict."""


class StageNotFitWarning(UserWarning):
    """Issued when prediction reaches a stage whose tree was never fitted."""


class ClassifierNotFitError(Exception):
    """Raised when the classifier is used before `fit` has been called."""


class DecisionTreeHierarchicalClassifier:
    """
    Hierarchical classifier that fits one decision tree per stage, where a
    stage is a node of the class hierarchy that has children.

    Parameters
    ----------
        class_hierarchy :
            Object exposing `root`, `_get_children(parent)` and
            `_get_parent(child)` (e.g. a ClassHierarchy).

    Attributes
    ----------
        stages : list of dict
            One entry per stage, in depth-first order, with keys 'depth',
            'stage', 'labels', 'classes', 'target' and, after `fit`, 'tree'.
    """

    def __init__(self, class_hierarchy):
        self.stages = []
        self.class_hierarchy = class_hierarchy
        self._depth_first_stages(self.stages, self.class_hierarchy.root, 0)

    def _depth_first_class_prob(self, tree, node, indent, last, hand):
        # Print the per-class share at `node` of a fitted tree, then recurse
        # into the right and left children (-1 marks "no child").
        if node == -1:
            return
        print(indent, end="")
        if last:
            print(u"\u2514\u2500", end="")
            indent += "    "
        else:
            print(u"\u251C\u2500", end="")
            indent += u"\u2502   "
        # `node` is an integer index, so format with str() rather than .encode()
        print(hand + " " + str(node))
        for k, count in enumerate(tree.tree_.value[node][0]):
            # NOTE(review): the original called an undefined `stage(x, 2)` here;
            # it is reconstructed as str(round(x, 2)) -- confirm against upstream hmc.
            print(indent + str(tree.classes_[k]) + ":" +
                  str(round(count / tree.tree_.n_node_samples[node], 2)))
        self._depth_first_class_prob(tree, tree.tree_.children_right[node], indent, False, "R")
        self._depth_first_class_prob(tree, tree.tree_.children_left[node], indent, True, "L")

    def _depth_first_stages(self, stages, parent, depth):
        # Get the children of this parent
        children = self.class_hierarchy._get_children(parent)
        # If there are children, build a classification stage
        if len(children) > 0:
            # Assign stage props and append
            stage = {}
            stage['depth'] = depth
            stage['stage'] = parent
            stage['labels'] = children
            stage['classes'] = stage['labels'] + [stage['stage']]
            stage['target'] = 'target_stage_' + parent
            stages.append(stage)
            # Recurse through children
            for node in children:
                self._depth_first_stages(stages, node, depth + 1)

    def _recode_label(self, classes, label):
        # Reassign labels to their parents until either we hit the root, or an output class
        while label != self.class_hierarchy.root and label not in classes:
            label = self.class_hierarchy._get_parent(label)
        return label

    def _prep_data(self, X, y):
        # X and y are concatenated with pd.concat, so both must be pandas objects.
        # Design matrix columns (a list, so it can be used directly as a column selector)
        dm_cols = list(range(0, X.shape[1]))
        # Target column: first column after the design matrix
        target = X.shape[1]
        # Dataframe
        df = pd.concat([X, y], axis=1, ignore_index=True)
        # Create a target column for each stage with the recoded labels
        for stage in self.stages:
            stage_classes = stage['classes']
            df[stage['target']] = df[target].map(
                lambda label: self._recode_label(stage_classes, label))
        return df, dm_cols

    def fit(self, X, y):
        """
        Build a decision tree multi-classifier from training data (X, y).

        Stages with no matching samples keep an unfitted tree and emit a
        NoSamplesForStageWarning. Returns self.
        """
        # Prep data
        df, dm_cols = self._prep_data(X, y)
        # Fit each stage
        for stage in self.stages:
            in_stage = df[stage['target']].isin(stage['classes'])
            dm = df.loc[in_stage, dm_cols]
            y_stage = df.loc[in_stage, [stage['target']]]
            stage['tree'] = DecisionTreeClassifier()
            if dm.empty:
                warnings.warn('No samples to fit for stage ' + str(stage['stage']),
                              NoSamplesForStageWarning)
                continue
            stage['tree'] = stage['tree'].fit(dm, y_stage)
        return self

    def _check_fit(self):
        # Every stage gets a 'tree' entry in fit(); a missing one means fit() was never called
        for stage in self.stages:
            if 'tree' not in stage:
                raise ClassifierNotFitError(
                    'Estimators not fitted, call `fit` before exploiting the model.')

    def _predict_stages(self, X):
        # Start with every sample assigned to the root; created up front so the
        # result is defined even when the hierarchy has no stages.
        root = self.class_hierarchy.root
        y_hat = pd.DataFrame([root] * len(X), columns=[root], index=X.index)
        # Score each stage
        for stage_number, stage in enumerate(self.stages):
            if stage_number > 0:
                # Seed this stage's column with the previous stage's predictions
                y_hat[stage['stage']] = y_hat[self.stages[stage_number - 1]['stage']]
            dm = X[y_hat[stage['stage']].isin([stage['stage']])]
            # Skip empty matrices
            if dm.empty:
                warnings.warn('No samples to predict for stage ' + str(stage['stage']),
                              NoSamplesForStageWarning)
                continue
            # An unfitted estimator has no `tree_` attribute at all
            if getattr(stage['tree'], 'tree_', None) is None:
                warnings.warn('No tree was fit for stage ' + str(stage['stage']),
                              StageNotFitWarning)
                continue
            # combine_first reorders DataFrames, so we have to do this the ugly way
            y_hat_stage = pd.Series(stage['tree'].predict(dm), index=dm.index)
            y_hat = y_hat.assign(stage_col=y_hat_stage)
            # Samples not routed to this stage keep their previous prediction
            y_hat['stage_col'] = y_hat['stage_col'].fillna(y_hat[stage['stage']])
            y_hat = y_hat.drop(stage['stage'], axis=1)
            y_hat = y_hat.rename(columns={'stage_col': stage['stage']})
        # Return predicted class for each stage
        return y_hat

    def predict(self, X):
        """
        Predict class for X.

        Raises ClassifierNotFitError if `fit` has not been called.
        Returns a numpy array with the final-stage prediction for each row of X.
        """
        # Check that the trees have been fit
        self._check_fit()
        y_hat = self._predict_stages(X)
        # Return only final predicted class (.ix / .as_matrix were removed from pandas)
        return y_hat.iloc[:, -1].to_numpy()

    def score(self, X, y):
        """
        Returns the mean accuracy on the given test data (X, y).
        """
        # Check that the trees have been fit
        self._check_fit()
        # NOTE(review): the original called `metrics.accuracy_score(class_hierarchy, y, y_pred)`
        # from the hmc package, which is not available here; sklearn's accuracy_score
        # is used to compute the mean accuracy instead -- confirm this is equivalent.
        return accuracy_score(np.asarray(y).ravel(), self.predict(X))

In [266]:
from sklearn import tree

In [267]:
# Flat decision tree versus the hierarchical classifier on the same data.
# NOTE(review): this cell failed with "NameError: name 'y_train' is not defined".
# `y_train`, `X_test`, `y_test`, `hmc` and `ch` are not defined in any visible cell;
# a label series, a held-out split and a populated ClassHierarchy must be created first.
# NOTE(review): X_train here is the TF-IDF matrix, while
# DecisionTreeHierarchicalClassifier._prep_data concatenates X and y with pd.concat --
# presumably X must be a pandas DataFrame; confirm before running.
dt = tree.DecisionTreeClassifier()
dt = dt.fit(X_train, y_train)
dt_predicted = dt.predict(X_test)
dt_accuracy = dt.score(X_test, y_test)

# `hmc.` refers to the external hmc package; the class of the same name is defined in this notebook
dth = hmc.DecisionTreeHierarchicalClassifier(ch)
dth = dth.fit(X_train, y_train)
dth_predicted = dth.predict(X_test)
dth_accuracy = dth.score(X_test, y_test)

NameError: name 'y_train' is not defined

<br>
<br>
<br>
<br>
<br>
<br>

***Previous work:***

### Using CountVectorizer for comparison vs TF-IDF

In [85]:
#CountVectorizer Vectorizer Representation

def count_vectorizer(train, valid):
    """
    Build bag-of-words count matrices for the training and validation texts.

    The vocabulary is learned from `train` only; `valid` is transformed
    with that already-fitted vectorizer.

    Returns a tuple (train_matrix, valid_matrix).
    """
    vectorizer = CountVectorizer()
    train_matrix = vectorizer.fit_transform(train)
    return train_matrix, vectorizer.transform(valid)

#bow = pd.DataFrame(X_train_tfid, columns=sorted(tfid.vocabulary_), index=final_comments)
#X_valid_tfid = tfid.transform(X_valid)

In [86]:
#Vectorize the training and validation text with CountVectorizer
#NOTE(review): this reads the 'clean_text' column, whereas the TF-IDF cell reads 'Comment' --
#confirm that the frames loaded at the top of the notebook contain a 'clean_text' column.

cvX_train, cvX_valid = count_vectorizer(X_train_Q1['clean_text'].values.astype('U'), 
                                    X_valid_Q1['clean_text'].values.astype('U')) #had to convert type 


In [87]:
# NOTE(review): this call passes five arguments (X train, Y train, X valid, Y valid, model),
# but the Classifier_Chain defined earlier in this notebook accepts (Ytrain, Yvalid, base_model).
# Presumably it was run against an earlier version of the function that printed its scores -- confirm.
print("CountVectorizer- FOR THEMES ONLY using LinearSVC():\n")
Classifier_Chain(cvX_train, themes_ytrain, cvX_valid, themes_yvalid, LinearSVC())

CountVectorizer- FOR THEMES ONLY using LinearSVC():

Training Score for Classifer Chain: 0.9588473400154202
Validation Score for Classifer Chain: 0.41518889745566695
Validation Recall: 0.6337755666113875
Validation Precision: 0.6297720406481736


In [88]:
# NOTE(review): five-argument call; the Classifier_Chain defined earlier in this notebook
# accepts (Ytrain, Yvalid, base_model) -- presumably written for an earlier version; confirm.
print("CountVectorizer-FOR SUBTHEMES ONLY using LinearSVC():\n")
Classifier_Chain(cvX_train, subthemes_ytrain, cvX_valid, subthemes_yvalid, LinearSVC())


CountVectorizer-FOR SUBTHEMES ONLY using LinearSVC():

Training Score for Classifer Chain: 0.968195836545875
Validation Score for Classifer Chain: 0.27833461835003853
Validation Recall: 0.4777227722772277
Validation Precision: 0.5130249867091973


In [89]:
# NOTE(review): five-argument call; the Classifier_Chain defined earlier in this notebook
# accepts (Ytrain, Yvalid, base_model) -- presumably written for an earlier version; confirm.
print("CountVectorizer- FOR THEMES ONLY using Decision Tree():\n")

Classifier_Chain(cvX_train, themes_ytrain, cvX_valid, themes_yvalid, DecisionTreeClassifier())

CountVectorizer- FOR THEMES ONLY using Decision Tree():

Training Score for Classifer Chain: 0.9985543562066307
Validation Score for Classifer Chain: 0.39321511179645335
Validation Recall: 0.6252072968490879
Validation Precision: 0.5751334858886347


In [90]:
# NOTE(review): five-argument call; the Classifier_Chain defined earlier in this notebook
# accepts (Ytrain, Yvalid, base_model) -- presumably written for an earlier version; confirm.
print("CountVectorizer- FOR SUBTHEMES ONLY using Decision Tree:\n")

Classifier_Chain(cvX_train, subthemes_ytrain, cvX_valid, subthemes_yvalid, DecisionTreeClassifier())

CountVectorizer- FOR SUBTHEMES ONLY using Decision Tree:

Training Score for Classifer Chain: 0.9979760986892829
Validation Score for Classifer Chain: 0.2243639167309175
Validation Recall: 0.45024752475247526
Validation Precision: 0.44376677238350815


### ***For all themes and subthemes:***

In [79]:
#Classifier chain over ALL label columns (themes and subthemes together),
#using LinearSVC with the one-vs-the-rest multiclass strategy

ovr_svc = LinearSVC(multi_class= "ovr")
classifier_svc = ClassifierChain(classifier = ovr_svc)
classifier_svc.fit(X_train, Y_train)

ClassifierChain(classifier=LinearSVC(C=1.0, class_weight=None, dual=True,
                                     fit_intercept=True, intercept_scaling=1,
                                     loss='squared_hinge', max_iter=1000,
                                     multi_class='ovr', penalty='l2',
                                     random_state=None, tol=0.0001, verbose=0),
                order=None, require_dense=[True, True])

In [80]:
#Accuracy on the training and validation sets
train_accuracy = classifier_svc.score(X_train, Y_train)
print("Training Score for LinearSVC Classifer Chain:", train_accuracy)
valid_accuracy = classifier_svc.score(X_valid, np.array(y_valid_Q1))
print("Validation Score for LinearSVC Classifer Chain:", valid_accuracy)

#Micro-averaged recall and precision on the validation predictions
y_pred = classifier_svc.predict(X_valid)
valid_recall = recall_score(np.array(y_valid_Q1), y_pred, average= 'micro')
print("Validation Recall for LinearSVC Classifer Chain:", valid_recall)
valid_precision = precision_score(np.array(y_valid_Q1), y_pred, average= 'micro')
print("Validation Precision for LinearSVC Classifer Chain:", valid_precision)
      

Training Score for LinearSVC Classifer Chain: 0.7907671549730146
Validation Score for LinearSVC Classifer Chain: 0.3338473400154202
Validation Recall for LinearSVC Classifer Chain: 0.5321232697832332
Validation Precision for LinearSVC Classifer Chain: 0.6822367319604888


In [81]:
#Micro-averaged recall on the training set
#(note: y_pred now holds the training predictions, replacing the validation ones)

y_pred = classifier_svc.predict(X_train)
recall_score(y_true=Y_train, y_pred=y_pred, average= 'micro')

0.8729869538341413