## Baseline Model for Question 1 (data from 2013, 2018, 2020)
### Tfid Vectorizer Representation and Classifier Chains model


In [33]:
#load dependencies
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from skmultilearn.problem_transform import ClassifierChain
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from scipy.sparse import csr_matrix

from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import recall_score, precision_score


In [None]:
#### This code is from preprocessing to be used before 
#### running code below to create clean texts for fitting 
#----------------------------------------------------------------

# #Create class object
# c_pp = comment_preprocessor()

# #clean X training set
# clean_doc_train, vocab_train = c_pp.preprocess_text(list(X_train_Q1['Comment']))
# #clean X valid
# clean_doc_valid, vocab_valid = c_pp.preprocess_text(list(X_valid_Q1['Comment']))

# ## For baseline, convert list of lists into list of sentences using the following code

# clean_text = []
# for docs in clean_doc_train:
#     clean_text.append(' '.join(docs)) 

# #save clean text 
# X_train_Q1['clean_text'] = clean_text
# X_train_Q1.to_csv('data/X_train_Q1_clean.csv', index=False)

# #validation set preprocess
# clean_text_val = []
# for docs in clean_doc_valid:
#     clean_text_val.append(' '.join(docs))

# #save valid clean text 
# X_valid_Q1['clean_text'] = clean_text_val
# X_valid_Q1.to_csv('data/X_valid_Q1_clean.csv', index=False)

In [2]:
#load in preprocessed train and validation datas
#your path here

X_train_Q1 = pd.read_csv('data/X_train_Q1_clean.csv')
X_valid_Q1 = pd.read_csv('data/X_valid_Q1_clean.csv')

y_train_Q1 = pd.read_csv('data/y_train_Q1.csv')
y_valid_Q1 = pd.read_csv('data/y_valid_Q1.csv')

### Tfid Vectorizer Representation


In [3]:
#Tfid Vectorizer Representation

def tfid_vectorizer(train, valid):
    tfid = TfidfVectorizer() 
    X = tfid.fit_transform(train)
    X_valid = tfid.transform(valid)
    return X, X_valid

#bow = pd.DataFrame(X_train_tfid, columns=sorted(tfid.vocabulary_), index=final_comments)
#X_valid_tfid = tfid.transform(X_valid)

In [4]:
#list(X_train_Q1['clean_text'])

In [5]:
#Vectorize X_train and convert Y_train to an array

X_train, X_valid = tfid_vectorizer(X_train_Q1['clean_text'].values.astype('U'), 
                                    X_valid_Q1['clean_text'].values.astype('U')) #had to convert type 
Y_train = (np.array(y_train_Q1))

### Classifier Chain

In [6]:
def Classifier_Chain(base_classifier):
    classifier_chain = ClassifierChain(
        base_classifier
)
    classifier_chain.fit(X_train, Y_train)
    print("Training accuracy:", classifier_chain.score(X_train, Y_train))

In [7]:
#LinearSVC(multi_class= "crammer_singer")
#"crammer_singer" optimizes a joint objective over all classes.

Classifier_Chain(LinearSVC(multi_class= "crammer_singer"))



Training accuracy: 0.7072089437162683


In [45]:
#GaussianNB inhernetly multiclass, sample labelled only one class

classifier_gnb = ClassifierChain(
    classifier = GaussianNB()
)
classifier_gnb.fit(X_train, Y_train)

ClassifierChain(classifier=GaussianNB(priors=None, var_smoothing=1e-09),
                order=None, require_dense=[True, True])

In [46]:
predictions = classifier_gnb.predict(X_train)

In [47]:
classifier_gnb.score(X_train, Y_train)

0.3554356206630686

> train results too low, will not use during GridSearch

In [28]:
#LinearSVC multi_class= "ovr"
#Multiclass as One-Vs-The-Rest:

classifier_svc = ClassifierChain(
    classifier = LinearSVC(multi_class= "ovr")
)
classifier_svc.fit(X_train, Y_train)

ClassifierChain(classifier=LinearSVC(C=1.0, class_weight=None, dual=True,
                                     fit_intercept=True, intercept_scaling=1,
                                     loss='squared_hinge', max_iter=1000,
                                     multi_class='ovr', penalty='l2',
                                     random_state=None, tol=0.0001, verbose=0),
                order=None, require_dense=[True, True])

In [35]:
#Train score
print("Training Score for LinearSVC Classifer Chain:", 
      classifier_svc.score(X_train, Y_train))
print("Validation Score for LinearSVC Classifer Chain:",
     classifier_svc.score(X_valid, np.array(y_valid_Q1)))

y_pred = classifier_svc.predict(X_valid)
print("Validation Recall for LinearSVC Classifer Chain:",
      recall_score(y_pred, np.array(y_valid_Q1), average= 'micro'))
print("Validation Precision for LinearSVC Classifer Chain:",
     precision_score(y_pred, np.array(y_valid_Q1), average= 'micro'))
      

Training Score for LinearSVC Classifer Chain: 0.7907671549730146
Validation Score for LinearSVC Classifer Chain: 0.3338473400154202
Validation Recall for LinearSVC Classifer Chain: 0.6822367319604888
Validation Precision for LinearSVC Classifer Chain: 0.5321232697832332


In [32]:
#Recall score for training 

y_pred = classifier_svc.predict(X_train)
recall_score(Y_train, y_pred, average= 'micro')

0.8729869538341413

> decent accuracy for train set, low for validation. will tune hyperparameters during gridsearch

### Grid Search for LinearSVC(multi_class='ovr')

* next steps: find way to Grid search with multi label classification

In [24]:
#hyperparameter search for LinearSVC

parameters = [
    {'classifier': [LinearSVC()],
     'classifier__C': [np.logspace(-3,3,7)]}
    # 'classifier__penalty':['l1','l2'],
    # 'classifier__loss': ['hinge', 'squared_hinge']}
]
    
clf = GridSearchCV(ClassifierChain(), parameters, scoring= 'accuracy')
clf.fit(X_train, Y_train)
print('The best parameter combination is {}.'.format(clf.best_params_))
print('The accuracy on the valid split is {:.2f}.'.format(clf.score(X_valid, np.array(y_valid_Q1))))


ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [None]:
#grid search using Multinomial Naive Bayes or SVC which supports sparse input
parameters = [
    {
        'classifier': [MultinomialNB()],
        'classifier__alpha': [0.7, 1.0],
    },
    {
        'classifier': [LinearSVC(multi_class= "ovr")],
        'classifier__kernel': ['rbf', 'linear'],
    },
]

clf = GridSearchCV(ClassifierChain(), parameters, scoring='accuracy')
clf.fit(X_sample, Y_sample)

print (clf.best_params_, clf.best_score_)
