# Baseline Model for Question 1 (data from 2013, 2018, 2020)
## Multi-Label Classification using Classifier Chains model


In [210]:
#load dependencies
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from skmultilearn.problem_transform import ClassifierChain
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from scipy.sparse import csr_matrix

from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import recall_score, precision_score


**This is after importing preprocessing comments**


In [289]:
#load in preprocessed train and validation datas
#your path here

X_train_Q1 = pd.read_csv('data/X_train_pp.csv')
X_valid_Q1 = pd.read_csv('data/X_valid_pp.csv')

y_train_Q1 = pd.read_csv('data/y_train_Q1.csv')
y_valid_Q1 = pd.read_csv('data/y_valid_Q1.csv')

## Tfid Vectorizer Representation
First we'll use Tfid as vectorizer


In [375]:
#Tfid Vectorizer Representation

def tfid_vectorizer(train, valid):
    """
    Fits the TfidVectorizer() on your X training 
    set and transform on X validation set
    Returns the matrixes.
    """
    tfid = TfidfVectorizer() 
    X = tfid.fit_transform(train)
    X_valid = tfid.transform(valid)
    return X, X_valid


#To view the representation
#bow = pd.DataFrame(X_train_tfid, columns=sorted(tfid.vocabulary_), index=final_comments)
#bow.head()

In [376]:
#Vectorize X_train and convert Y_train to an array

X_train, X_valid = tfid_vectorizer(X_train_Q1['Comment'].values.astype('U'), 
                                    X_valid_Q1['Comment'].values.astype('U')) #had to convert type 
#ytrain for all themes and subthemes
Y_train = (np.array(y_train_Q1))

In [377]:
#slice y to themes and subthemes
#y_train
subthemes_ytrain = y_train_Q1.loc[:, 'CPD_Improve_new_employee_orientation':'OTH_Covid']
themes_ytrain = y_train_Q1[['CPD','CB', 'EWC', 'Exec', 'FEW', 'SP', 'RE', 'Sup', 'SW', 'TEPE',
                            'VMG', 'OTH', 'Unrelated']]

#y_valid
subthemes_yvalid = y_valid_Q1.loc[:, 'CPD_Improve_new_employee_orientation':'OTH_Covid']
themes_yvalid = y_valid_Q1[['CPD','CB', 'EWC', 'Exec', 'FEW', 'SP', 'RE', 'Sup', 
                                      'SW', 'TEPE','VMG', 'OTH', 'Unrelated']]

#shape check: 13 themes and 62 subthemes
print('Theme columns:',themes_ytrain.shape[1])
print('Subtheme columns:', subthemes_ytrain.shape[1])

Theme columns: 13
Subtheme columns: 62


In [301]:
np.shape(subthemes_yvalid)

(2594, 62)

In [302]:
np.shape(themes_yvalid)

(2594, 13)

# Classifier Chain
## Subthemes only
### trying different models to choose best baseline classifier

**Starting with Subthemes Y only**

In [303]:
#Parts of code adapated from DSCI 573 lab 4
#Dictionary of Base Models

models = {
    'LinearSVC': LinearSVC(),
    'MultinomialNB' : MultinomialNB(),
    'GaussianNB' : GaussianNB()#,
    #'Random Forest' : RandomForestClassifier(), too slow will use function 
    #'KNeighborsClassifier': KNeighborsClassifier(),
    #'Neural Net' : MLPClassifier()
}


In [304]:
#For subthemes only
#Note takes about 15 min to run

results_dict = []

for model_name, model in models.items():

    classifier_chain = ClassifierChain(model)
    model = classifier_chain.fit(X_train, subthemes_ytrain)
    train = model.score(X_train, np.array(subthemes_ytrain))
    valid = model.score(X_valid, np.array(subthemes_yvalid))
    y_pred = model.predict(X_valid)
    recall = recall_score(np.array(subthemes_yvalid), y_pred, average= 'micro')
    precision = precision_score(np.array(subthemes_yvalid), y_pred, average= 'micro')
    
    case= {'Model': model_name,
           'Train Accuracy': train,
           'Validation Accuracy': valid,
           'Recall Score': recall,
           'Precision Score': precision}
    
    results_dict.append(case)

In [305]:
#inital look at df
pd.DataFrame(results_dict)

Unnamed: 0,Model,Train Accuracy,Validation Accuracy,Recall Score,Precision Score
0,LinearSVC,0.770625,0.319584,0.39901,0.707638
1,MultinomialNB,0.052525,0.04202,0.034653,0.952381
2,GaussianNB,0.423188,0.026214,0.314356,0.074143


In [379]:
#list-dictionary of basemodels
results_dict2 = []

In [380]:
def Classifier_Chain(Ytrain, Yvalid, base_model):
    """
    Fits a Classifier Chain Model with the base classifier of choice and 
    using either themes or subthemes Y.
    Show results of training accuracy score, validation accuracy score, 
    validation recall and precision scores.
    """
    classifier_chain = ClassifierChain(base_model)
    
    model = classifier_chain.fit(X_train, Ytrain)
    train = model.score(X_train, np.array(Ytrain))
    valid = model.score(X_valid, np.array(Yvalid))
    y_pred = model.predict(X_valid)
    recall = recall_score(np.array(Yvalid), y_pred, average= 'micro')
    precision = precision_score(np.array(Yvalid), y_pred, average= 'micro')
    
    case= {'Model': base_model,
           'Train Accuracy': train,
           'Validation Accuracy': valid,
           'Recall Score': recall,
           'Precision Score': precision}
    
    results_dict2.append(case)    

In [381]:
#Random Forest' : RandomForestClassifier()
#Note takes quite some time ~15 min on my computer
Classifier_Chain(subthemes_ytrain, subthemes_yvalid, RandomForestClassifier())
#results_dict2

[{'Model': RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                         criterion='gini', max_depth=None, max_features='auto',
                         max_leaf_nodes=None, max_samples=None,
                         min_impurity_decrease=0.0, min_impurity_split=None,
                         min_samples_leaf=1, min_samples_split=2,
                         min_weight_fraction_leaf=0.0, n_estimators=100,
                         n_jobs=None, oob_score=False, random_state=None,
                         verbose=0, warm_start=False),
  'Train Accuracy': 0.9969159599074788,
  'Validation Accuracy': 0.18696993060909792,
  'Recall Score': 0.1905940594059406,
  'Precision Score': 0.8680947012401353}]

In [383]:
results_dict2[0]['Model'] = 'RandomForest' #rename

In [384]:
#Decision Tree' : DecisionTreeClassifier()
#Note takes quite some time ~15 min on my computer
Classifier_Chain(subthemes_ytrain, subthemes_yvalid, DecisionTreeClassifier())

[{'Model': 'RandomForest',
  'Train Accuracy': 0.9969159599074788,
  'Validation Accuracy': 0.18696993060909792,
  'Recall Score': 0.1905940594059406,
  'Precision Score': 0.8680947012401353},
 {'Model': DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                         max_depth=None, max_features=None, max_leaf_nodes=None,
                         min_impurity_decrease=0.0, min_impurity_split=None,
                         min_samples_leaf=1, min_samples_split=2,
                         min_weight_fraction_leaf=0.0, presort='deprecated',
                         random_state=None, splitter='best'),
  'Train Accuracy': 0.9982652274479569,
  'Validation Accuracy': 0.20855821125674634,
  'Recall Score': 0.43094059405940593,
  'Precision Score': 0.45960929250263993}]

In [385]:
results_dict2[1]['Model'] = 'DecisionTree' #rename

In [387]:
#LinearSVC(multi_class= "crammer_singer")
#"crammer_singer" optimizes a joint objective over all classes.
Classifier_Chain(subthemes_ytrain, subthemes_yvalid, LinearSVC(multi_class= "crammer_singer"))




In [388]:
results_dict2[2]['Model'] = 'LinearSVC(multi_class= "crammer_singer")' #rename

In [389]:
#Trying last year's base classifier LinearSVC(C=0.5, tol=0.2)

#results_dict2 = []
Classifier_Chain(subthemes_ytrain, subthemes_yvalid, LinearSVC(C=0.5, tol=0.2))

In [390]:
results_dict2[3]['Model'] = 'LinearSVC(C=0.5, tol=0.2) 2019 capstone base' #rename

In [396]:
results_dict.extend(results_dict2)

In [397]:
my_results =pd.DataFrame(results_dict)
my_results

Unnamed: 0,Model,Train Accuracy,Validation Accuracy,Recall Score,Precision Score
0,LinearSVC,0.770625,0.319584,0.39901,0.707638
1,MultinomialNB,0.052525,0.04202,0.034653,0.952381
2,GaussianNB,0.423188,0.026214,0.314356,0.074143
3,RandomForest,0.996916,0.18697,0.190594,0.868095
4,DecisionTree,0.998265,0.208558,0.430941,0.459609
5,"LinearSVC(multi_class= ""crammer_singer"")",0.681766,0.338088,0.425,0.654594
6,"LinearSVC(C=0.5, tol=0.2) 2019 capstone base",0.544237,0.313801,0.37302,0.753877


***All models overfitting especially the Tree models. Best one overall is LinearSVC with default multi-class='ovr'. Still overfitting though, try tune regularization (less regularization is larger C)***

In [320]:
#Looping through various C value in LinearSVC

scores = []
for i in (10.0**np.arange(-4,4)):
    model = ClassifierChain(LinearSVC(C = i, max_iter = 100000)).fit(X_train, subthemes_ytrain)
    #train = model.score(X_train, np.array(subthemes_ytrain))
    valid = model.score(X_valid, np.array(subthemes_yvalid))
    
    scores.append(valid)

In [321]:
10.0**np.arange(-4,4)

array([1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03])

In [322]:
scores

[0.0007710100231303007,
 0.0007710100231303007,
 0.05127216653816499,
 0.24518118735543562,
 0.31958365458750965,
 0.2779491133384734,
 0.24518118735543562,
 0.23014649190439476]

> Regularization doesn't help here. The best validation score is given by the default C hyperparameter = 1.0. Let's try this on LinearSVC(multi-class='crammer_singer')

In [101]:
#Looping through various C value in LinearSVC multiclass= crammer_singer 

scores = []
for i in (10.0**np.arange(-4,4)):
    model = ClassifierChain(LinearSVC(multi_class= "crammer_singer", C = i, max_iter = 100000)).fit(X_train, subthemes_ytrain)
    #train = model.score(X_train, np.array(subthemes_ytrain))
    valid = model.score(X_valid, np.array(subthemes_yvalid))
    
    scores.append(valid)

In [102]:
scores

[0.0007710100231303007,
 0.0007710100231303007,
 0.009252120277563608,
 0.19275250578257516,
 0.3430994602929838,
 0.2679259830377795,
 0.25212027756360833,
 0.24710871241326138]

> Same results for C values of crammer_singer. Seems that Linear(SVC) with its default regularization gives the highest

> Due to lack of resources could not find proper way to GridSeach with multilabel classifier. https://stackoverflow.com/questions/26018543/gridsearch-for-multi-label-classification-in-scikit-learn

In [112]:
models

{'LinearSVC': LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
           intercept_scaling=1, loss='squared_hinge', max_iter=1000,
           multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
           verbose=0),
 'MultinomialNB': MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
 'GaussianNB': GaussianNB(priors=None, var_smoothing=1e-09)}

## For themes Y 

## working the same steps on Themes to find base classifier

In [323]:
#For themes only
#Note takes about ~15 min to run
results_dict_themes = []

for model_name, model in models.items():

    classifier_chain = ClassifierChain(model)
    model = classifier_chain.fit(X_train, themes_ytrain)
    train = model.score(X_train, np.array(themes_ytrain))
    valid = model.score(X_valid, np.array(themes_yvalid))
    y_pred = model.predict(X_valid)
    recall = recall_score(np.array(themes_yvalid), y_pred, average= 'micro')
    precision = precision_score(np.array(themes_yvalid), y_pred, average= 'micro')
    
    case= {'Model': model_name,
           'Train Accuracy': train,
           'Validation Accuracy': valid,
           'Recall Score': recall,
           'Precision Score': precision}
    
    results_dict_themes.append(case)

In [324]:
pd.DataFrame(results_dict_themes)

Unnamed: 0,Model,Train Accuracy,Validation Accuracy,Recall Score,Precision Score
0,LinearSVC,0.791345,0.476099,0.608347,0.749915
1,MultinomialNB,0.246338,0.212028,0.181039,0.942446
2,GaussianNB,0.336546,0.047417,0.622996,0.14409


In [398]:
#empty out list
results_dict3 = []

In [399]:
def Classifier_Chain(Ytrain, Yvalid, base_model):
    """
    Fits a Classifier Chain Model with the base classifier of choice and 
    using either themes or subthemes Y.
    Show results of training accuracy score, validation accuracy score, 
    validation recall and precision scores.
    """
    classifier_chain = ClassifierChain(base_model)
    
    model = classifier_chain.fit(X_train, Ytrain) #subthemes_ytrain)
    train = model.score(X_train, np.array(Ytrain)) #subthemes_ytrain
    valid = model.score(X_valid, np.array(Yvalid)) #subthemes_yvalid
    y_pred = model.predict(X_valid)
    recall = recall_score(np.array(Yvalid), y_pred, average= 'micro') #subthemes_yvalid
    precision = precision_score(np.array(Yvalid), y_pred, average= 'micro') #subthemes_yvalid
    
    case= {'Model': base_model,
           'Train Accuracy': train,
           'Validation Accuracy': valid,
           'Recall Score': recall,
           'Precision Score': precision}
    
    #print(case)
    #change results dictionary name
    results_dict3.append(case)    

In [400]:
#Random Forest' : RandomForestClassifier()
#Note takes quite some time ~15 min on my computer
Classifier_Chain(themes_ytrain, themes_yvalid, RandomForestClassifier())

In [401]:
results_dict3[0]['Model'] = 'RandomForest' #rename

In [403]:
#Decision Tree' : DecisionTreeClassifier()
#Note takes quite some time ~15 min on my computer
Classifier_Chain(themes_ytrain, themes_yvalid, DecisionTreeClassifier())

In [404]:
results_dict3[1]['Model'] = 'DecisionTree' #rename

In [406]:
#LinearSVC(multi_class= "crammer_singer")
#"crammer_singer" optimizes a joint objective over all classes.
Classifier_Chain(themes_ytrain, themes_yvalid, LinearSVC(multi_class= "crammer_singer"))




In [407]:
results_dict3[2]['Model'] = 'LinearSVC(multi_class= "crammer_singer")' #rename

In [412]:
results_dict_themes.extend(results_dict3)

In [413]:
pd.DataFrame(results_dict_themes)

Unnamed: 0,Model,Train Accuracy,Validation Accuracy,Recall Score,Precision Score
0,LinearSVC,0.791345,0.476099,0.608347,0.749915
1,MultinomialNB,0.246338,0.212028,0.181039,0.942446
2,GaussianNB,0.336546,0.047417,0.622996,0.14409
3,RandomForest,0.998169,0.410177,0.449972,0.854593
4,DecisionTree,0.998651,0.363531,0.60199,0.600496
5,"LinearSVC(multi_class= ""crammer_singer"")",0.736507,0.499229,0.628524,0.720989


> For themes, also looks like LinearSVC() is the best. Hypothesizing that the best regularization parameter C for this model is their default values. Let's see if regularizing Decision Tree will help with overfitting and get better validation results

In [174]:
#Looping through various max_depth value in LinearSVC multiclass= crammer_singer 

scores = []
for i in range(10,100,10):
    model = ClassifierChain(DecisionTreeClassifier(max_depth = i)).fit(X_train, themes_ytrain)
    #train = model.score(X_train, np.array(subthemes_ytrain))
    valid = model.score(X_valid, np.array(themes_yvalid))
    
    scores.append(valid)

In [175]:
scores

[0.3797224363916731,
 0.4055512721665382,
 0.4109483423284503,
 0.40670778720123363,
 0.3982266769468003,
 0.40400925212027755,
 0.39552814186584423,
 0.39167309175019277,
 0.38781804163454126]

> Looks like none of these max depths improves the validation accuracy with None of result 0.42

In [414]:
#reset list
results_dict3 = []

In [415]:
#Trying last year's base classifier LinearSVC(C=0.5, tol=0.2)
Classifier_Chain(themes_ytrain, themes_yvalid, LinearSVC(C=0.5, tol=0.2))

In [416]:
results_dict3[0]['Model'] = 'LinearSVC(C=0.5, tol=0.2) 2019 capstone base' #rename

In [417]:
results_dict_themes.extend(results_dict3)

In [419]:
themes_results = pd.DataFrame(results_dict_themes)

# Summary of results:
## Subthemes

In [420]:
#SUBTHEMES RESULTS 
print("SUBTHEMES RESULTS:")
my_results

SUBTHEMES RESULTS:


Unnamed: 0,Model,Train Accuracy,Validation Accuracy,Recall Score,Precision Score
0,LinearSVC,0.770625,0.319584,0.39901,0.707638
1,MultinomialNB,0.052525,0.04202,0.034653,0.952381
2,GaussianNB,0.423188,0.026214,0.314356,0.074143
3,RandomForest,0.996916,0.18697,0.190594,0.868095
4,DecisionTree,0.998265,0.208558,0.430941,0.459609
5,"LinearSVC(multi_class= ""crammer_singer"")",0.681766,0.338088,0.425,0.654594
6,"LinearSVC(C=0.5, tol=0.2) 2019 capstone base",0.544237,0.313801,0.37302,0.753877


> LinearSVC() worked the best with their default C regularization parameter. 

    * train score: 0.77
    * validation score: 0.32
    * recall score: 0.40
    * precision score: 0.71

## Themes

In [421]:
#THEMES RESULTS 
print("THEMES RESULTS:")

themes_results

THEMES RESULTS:


Unnamed: 0,Model,Train Accuracy,Validation Accuracy,Recall Score,Precision Score
0,LinearSVC,0.791345,0.476099,0.608347,0.749915
1,MultinomialNB,0.246338,0.212028,0.181039,0.942446
2,GaussianNB,0.336546,0.047417,0.622996,0.14409
3,RandomForest,0.998169,0.410177,0.449972,0.854593
4,DecisionTree,0.998651,0.363531,0.60199,0.600496
5,"LinearSVC(multi_class= ""crammer_singer"")",0.736507,0.499229,0.628524,0.720989
6,"LinearSVC(C=0.5, tol=0.2) 2019 capstone base",0.691789,0.479568,0.593145,0.779797


> LAST YEARS RESULTS: Results from 2019 Capstone results on test data for baseline model (BoW CountVectorizer and BinaryRelevance(LinearSVC(C=0.5, tol=0.2))

     * Accuracy: 45%
     * Recall: 0.64
     * Precision: 0.74
  
> OUR RESULTS: Using LinearSVC(C=0.5, tol=0.2) 2019 capstone base, overfitting went down and only recall decreased in score. Our best results for baseline model is:

     * Accuracy: 48%
     * Recall: 0.59
     * Precision: 0.78

# Using universal sentence encoder as embedding

In [346]:
import tensorflow as tf

embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

In [350]:
X_train_raw = pd.read_csv('data/X_train_Q1_clean.csv')

In [351]:
with tf.Session() as session:
    session.run([tf.global_variables_initializer(), 
                 tf.tables_initializer()])
    training_embeddings = session.run(embed(X_train_raw.Comment.to_list()))

In [352]:
use = ClassifierChain(LinearSVC(C=0.5, tol=0.2))

In [353]:
use.fit(training_embeddings, themes_ytrain)

ClassifierChain(classifier=LinearSVC(C=0.5, class_weight=None, dual=True,
                                     fit_intercept=True, intercept_scaling=1,
                                     loss='squared_hinge', max_iter=1000,
                                     multi_class='ovr', penalty='l2',
                                     random_state=None, tol=0.2, verbose=0),
                order=None, require_dense=[True, True])

In [354]:
use.score(training_embeddings, themes_ytrain)

0.4995181187355436

In [355]:
#transforming valiation set
with tf.Session() as session:
    session.run([tf.global_variables_initializer(), 
                 tf.tables_initializer()])
    test_embeddings = session.run(embed(X_valid_Q1.Comment.to_list()))

In [356]:
use.score(test_embeddings, themes_yvalid)

0.44680030840400925

In [357]:
y_pred = use.predict(test_embeddings)

In [358]:
recall_score(np.array(themes_yvalid), y_pred, average= 'micro') 

0.5702045328911001

In [359]:
precision_score(np.array(themes_yvalid), y_pred, average= 'micro') 

0.7094222833562586

In [362]:
uni_sent = []
case= {'Model': 'Universal Sentence Encoder LinearSVC',
       'Train Accuracy': use.score(training_embeddings, themes_ytrain),
       'Validation Accuracy': use.score(test_embeddings, themes_yvalid),
       'Recall Score': recall_score(np.array(themes_yvalid), y_pred, average= 'micro') ,
       'Precision Score': precision_score(np.array(themes_yvalid), y_pred, average= 'micro') }

#print(case)
uni_sent.append(case)    
results_dict_themes.extend(uni_sent)


In [363]:
pd.DataFrame(results_dict_themes)

Unnamed: 0,Model,Train Accuracy,Validation Accuracy,Recall Score,Precision Score
0,LinearSVC,0.791345,0.476099,0.608347,0.749915
1,MultinomialNB,0.246338,0.212028,0.181039,0.942446
2,GaussianNB,0.336546,0.047417,0.622996,0.14409
3,RandomForest,0.998458,0.417116,0.453566,0.857367
4,DecisionTree,0.998651,0.35081,0.58817,0.590128
5,"LinearSVC(multi_class= ""crammer_singer"")",0.736507,0.499229,0.628524,0.720989
6,"LinearSVC(C=0.5, tol=0.2) 2019 capstone base",0.692174,0.478797,0.592593,0.77992
7,Universal Sentence Encoder LinearSVC,0.499518,0.4468,0.570205,0.709422


# Decision Tree Hierachical Multi-Classifier

- trying to get hmm to work but exceptions trouble importing
https://github.com/davidwarshaw/hmc

In [262]:
y_train_Q1.columns[0:12]

Index(['CPD', 'CB', 'EWC', 'Exec', 'FEW', 'SP', 'RE', 'Sup', 'SW', 'TEPE',
       'VMG', 'OTH'],
      dtype='object')

<br>
<br>
<br>
<br>
<br>
<br>

***Previous work:***

### Using CountVecorizer for comparison vs TFID

In [422]:
#CountVectorizer Vectorizer Representation

def count_vectorizer(train, valid):
    cv = CountVectorizer() 
    X = cv.fit_transform(train)
    X_valid = cv.transform(valid)
    return X, X_valid

#bow = pd.DataFrame(X_train_tfid, columns=sorted(tfid.vocabulary_), index=final_comments)
#X_valid_tfid = tfid.transform(X_valid)

In [423]:
#Vectorize X_train and convert Y_train to an array

cvX_train, cvX_valid = count_vectorizer(X_train_Q1['Comment'].values.astype('U'), 
                                    X_valid_Q1['Comment'].values.astype('U')) #had to convert type 


In [424]:
#function with CountVec X
def Classifier_Chain(Ytrain, Yvalid, base_model):
    """
    Fits a Classifier Chain Model with the base classifier of choice and 
    using either themes or subthemes Y.
    Show results of training accuracy score, validation accuracy score, 
    validation recall and precision scores.
    """
    classifier_chain = ClassifierChain(base_model)
    
    model = classifier_chain.fit(cvX_train, Ytrain)
    train = model.score(cvX_train, np.array(Ytrain))
    valid = model.score(cvX_valid, np.array(Yvalid))
    y_pred = model.predict(cvX_valid)
    recall = recall_score(np.array(Yvalid), y_pred, average= 'micro')
    precision = precision_score(np.array(Yvalid), y_pred, average= 'micro')
    
    case= {'Model': base_model,
           'Train Accuracy': train,
           'Validation Accuracy': valid,
           'Recall Score': recall,
           'Precision Score': precision}
    
    results_dict2.append(case)    

In [430]:
print("CountVectorizer- FOR THEMES ONLY using LinearSVC():\n")
Classifier_Chain(themes_ytrain, themes_yvalid, LinearSVC())

CountVectorizer- FOR THEMES ONLY using LinearSVC():



In [431]:
pd.DataFrame(results_dict2)

Unnamed: 0,Model,Train Accuracy,Validation Accuracy,Recall Score,Precision Score
0,"LinearSVC(C=1.0, class_weight=None, dual=True,...",0.962702,0.275251,0.482673,0.513699
1,"LinearSVC(C=1.0, class_weight=None, dual=True,...",0.948824,0.406708,0.631288,0.624385


In [427]:
results_dict2 = []
print("CountVectorizer-FOR SUBTHEMES ONLY using LinearSVC():\n")
Classifier_Chain(subthemes_ytrain, subthemes_yvalid, LinearSVC())


CountVectorizer-FOR SUBTHEMES ONLY using LinearSVC():



In [428]:
pd.DataFrame(results_dict2)

Unnamed: 0,Model,Train Accuracy,Validation Accuracy,Recall Score,Precision Score
0,"LinearSVC(C=1.0, class_weight=None, dual=True,...",0.962702,0.275251,0.482673,0.513699


### ***For all themes and subthemes:***

In [79]:
#LinearSVC multi_class= "ovr"
#Multiclass as One-Vs-The-Rest:

classifier_svc = ClassifierChain(
    classifier = LinearSVC(multi_class= "ovr")
)
classifier_svc.fit(X_train, Y_train)

ClassifierChain(classifier=LinearSVC(C=1.0, class_weight=None, dual=True,
                                     fit_intercept=True, intercept_scaling=1,
                                     loss='squared_hinge', max_iter=1000,
                                     multi_class='ovr', penalty='l2',
                                     random_state=None, tol=0.0001, verbose=0),
                order=None, require_dense=[True, True])

In [80]:
#Train score
print("Training Score for LinearSVC Classifer Chain:", 
      classifier_svc.score(X_train, Y_train))
print("Validation Score for LinearSVC Classifer Chain:",
     classifier_svc.score(X_valid, np.array(y_valid_Q1)))

y_pred = classifier_svc.predict(X_valid)
print("Validation Recall for LinearSVC Classifer Chain:",
      recall_score(np.array(y_valid_Q1), y_pred, average= 'micro'))
print("Validation Precision for LinearSVC Classifer Chain:",
     precision_score(np.array(y_valid_Q1), y_pred, average= 'micro'))
      

Training Score for LinearSVC Classifer Chain: 0.7907671549730146
Validation Score for LinearSVC Classifer Chain: 0.3338473400154202
Validation Recall for LinearSVC Classifer Chain: 0.5321232697832332
Validation Precision for LinearSVC Classifer Chain: 0.6822367319604888


In [81]:
#Recall score for training 

y_pred = classifier_svc.predict(X_train)
recall_score(Y_train, y_pred, average= 'micro')

0.8729869538341413