In [1]:
# miscellaneous
from time import time
from collections import defaultdict
import warnings
warnings.simplefilter('ignore')

# dataframes and arrays
import pandas as pd
import numpy as np

# sklearn models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.dummy import DummyClassifier

# sklearn metrics
from sklearn.metrics import accuracy_score, log_loss, hamming_loss, label_ranking_loss
from sklearn.metrics import multilabel_confusion_matrix, roc_auc_score, f1_score
from sklearn.metrics import make_scorer

# sklearn tools
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_is_fitted, check_array
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier # also implements multilabel clf just like MultiOutputClassifier (BR)
from sklearn.multioutput import MultiOutputClassifier 
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler, Binarizer
from sklearn.utils.extmath import safe_sparse_dot

from scipy import sparse

# utilities
from utilities.constants import CLASSES
from utilities.functions import *

# directories
DATA_DIR = './data/'


In [2]:
X_train_tfv = load_objects(DATA_DIR + 'X_train_tfv')
y = load_objects(DATA_DIR + 'y')

In [3]:
X_train_tfv.shape, y.shape

((159571, 30000), (159571, 6))

In [4]:
X_train, X_val, y_train, y_val = train_test_split(X_train_tfv, y, random_state=42, test_size=0.20, shuffle=True)

In [5]:
for name, data in zip(['X_train', 'X_val', 'y_train', 'y_val'],[X_train, X_val, y_train, y_val]):
    print(name, data.shape)

X_train (127656, 30000)
X_val (31915, 30000)
y_train (127656, 6)
y_val (31915, 6)


### Scorers

In [6]:
mean_log_loss_scorer = make_scorer(mean_log_loss, greater_is_better=False, needs_proba=True)
mean_roc_auc_scorer = make_scorer(mean_roc_auc, needs_proba=True)

# 0. Dummy Classifier

In [7]:
dummy_clf = MultiOutputClassifier(DummyClassifier(strategy='most_frequent'))
dummy_clf.fit(X_train, y_train)
y_pred_probas = dummy_clf.predict_proba(X_val)
y_preds = dummy_clf.predict(X_val)
print('Loss {}'.format(mean_log_loss(y_val, y_pred_probas)))
print('AUC {}'.format(roc_auc_score(y_val, y_preds)))


Loss 1.275927224490043
AUC 0.5


# 1. Multinomial Naive Bayes

In [8]:
nb_params = [
    {
        'estimator__alpha': [0.001, 0.01, 0.1, 1.0]
    }]

In [9]:
# NOTE: must set refit=False when fitting with multiple scorers

nb_br = GridSearchCV(MultiOutputClassifier(MultinomialNB()), nb_params, 
                     scoring={"loss": mean_log_loss_scorer,
                              "roc_auc": mean_roc_auc_scorer},
                     n_jobs=-1, cv=5, refit=False)
nb_br.fit(X_train_tfv, y);


In [10]:
pd.DataFrame(nb_br.cv_results_)\
.loc[:,['param_estimator__alpha', 'mean_test_loss', 'mean_test_roc_auc']]\
.sort_values(by=['mean_test_roc_auc'], ascending=False)

Unnamed: 0,param_estimator__alpha,mean_test_loss,mean_test_roc_auc
2,0.1,-0.064448,0.959666
1,0.01,-0.07295,0.950096
0,0.001,-0.085522,0.932809
3,1.0,-0.071302,0.930884


### Best scores of NB:
- Loss: 0.064448
- ROC-AUC: 0.959666

### Best NB parameter:
- alpha = 0.1

# 2. Logistic Regression 

In [11]:
lr_pipe = Pipeline([('svd', TruncatedSVD(n_components=50)),
                    ('scaler', StandardScaler()),
                    ('lr', MultiOutputClassifier(LogisticRegression()))
                   ])

In [12]:
lr_params = [
    {
        'svd__n_components': [50, 100, 150],
        'lr__estimator__C': [10**n for n in range(-3,1) ],
        'lr__estimator__penalty': ['l1', 'l2']
    }
]

In [13]:
start_time = time()
lr_br = GridSearchCV(lr_pipe, lr_params, 
                     scoring={"loss": mean_log_loss_scorer,
                              "roc_auc": mean_roc_auc_scorer}, 
                     n_jobs=-1, cv=3, refit=False)
lr_br.fit(X_train_tfv, y)
end_time = time()

In [14]:
print("LogReg train time: {:2f}".format((end_time-start_time)/60))

LogReg train time: 11.802258


In [15]:
select_cols = ['param_lr__estimator__C','param_lr__estimator__penalty', 
               'param_svd__n_components', 'mean_test_loss', 'mean_test_roc_auc']

In [16]:
df_lr_cv = pd.DataFrame(lr_br.cv_results_)\
            .loc[:,select_cols]\
            .sort_values(by=['mean_test_roc_auc'], ascending=False)
df_lr_cv.columns = ['C','penalty','svd__n_components', 'mean_test_loss', 'mean_test_roc_auc']
df_lr_cv

Unnamed: 0,C,penalty,svd__n_components,mean_test_loss,mean_test_roc_auc
5,0.001,l2,150,-0.093568,0.948473
11,0.01,l2,150,-0.072674,0.947513
17,0.1,l2,150,-0.070597,0.947198
14,0.1,l1,150,-0.070458,0.946103
23,1.0,l2,150,-0.070449,0.945154
20,1.0,l1,150,-0.07037,0.944824
8,0.01,l1,150,-0.072919,0.940309
16,0.1,l2,100,-0.073475,0.939986
13,0.1,l1,100,-0.073609,0.939736
10,0.01,l2,100,-0.076267,0.939549


### Best scores of LogReg:
1. Min Loss: 0.069918 (roc-auc: 0.946209)
2. Max ROC-AUC: 0.948924 (loss: 0.093462)

### Best LogReg parameters:
1. Min Loss: C=1, penalty=l1, n_components=150 
2. Max ROC-AUC: C=0.001, penalty=l2, n_components=150 

# 3. SVM

In [17]:
svm_pipe = Pipeline([('svd', TruncatedSVD(n_components=100)),
                    ('scaler', StandardScaler()),
                    ('svm', MultiOutputClassifier(SVC(kernel='linear', probability=True, random_state=12)))
                   ])

In [18]:
svm_params = [
    {
        'svd__n_components': [100, 150],
        'svm__estimator__C': [10**n for n in range(-3,1)],
        'svm__estimator__tol': [10**n for n in range(-5,-2)]
    }
]

In [None]:
svm_pipe.fit(X_train,y_train)

In [None]:
start_time = time()
svm_br = GridSearchCV(svm_pipe, svm_params, 
                      scoring={"loss": mean_log_loss_scorer,
                              "roc_auc": mean_roc_auc_scorer},
                      n_jobs=-1, cv=3, refit=False)
svm_br.fit(X_train_tfv, y)
end_time = time()

In [None]:
print("SVM train time: {:2f}".format((end_time-start_time)/60))

# 4. NB-SVM

Note:
1. a Multinomial Naive Bayes classifier is a **linear (binary) classifier** ([proof](https://svivek.com/teaching/lectures/slides/naive-bayes/naive-bayes-linear.pdf)):

\begin{align}
y^{(i)} &= \text{sign}\left(\textbf{w}^T \textbf{x}^{(i)}+b\right)\\
\textbf{w} &= \log \left(\frac{\textbf{p}/||\textbf{p}||_1}{\textbf{q}/||\textbf{q}||_1}\right),\quad  &&b= \log\left(\frac{\sum_i \mathcal{I}\left\{ y^{(i)}=1 \right\} }{\sum_i \mathcal{I}\left\{ y^{(i)}=-1 \right\}}\right)\\
\textbf{p} &= \alpha + \sum_{i} \textbf{x}^{(i)}\cdot \mathcal{I}\left\{ y^{(i)}=1 \right\} ,\quad &&\textbf{q} = \alpha + \sum_{i} \textbf{x}^{(i)} \cdot \mathcal{I}\left\{ y^{(i)}=-1 \right\}
\end{align}

2. SVM with NB features ([NBSVM](https://nlp.stanford.edu/pubs/sidaw12_simple_sentiment.pdf)):
\begin{align}
y^{(i)} &= \text{sign}\left({\textbf{w}^\prime}^T {\textbf{x}^\prime}^{(i)}+b\right)\\
{\textbf{x}^\prime}^{(i)} & = \textbf{r} \circ \textbf{x}^{(i)} \quad \text{with} \quad \textbf{r} = \log \left(\frac{\textbf{p}/||\textbf{p}||_1}{\textbf{q}/||\textbf{q}||_1}\right)\\
\textbf{w}^\prime & = (1-\beta) \,\bar{w} + \beta \,\textbf{w}
\end{align}

Advanced Tips:
1. **Binarization:**
 - Binarize features such that:  $\qquad\hat{\textbf{x}}^{(i)} = \mathcal{I}\left\{ \textbf{x}^{(i)}>0\right\}$. 
 - Calculate $\textbf{p}, \textbf{q}, \textbf{r}$ using $\hat{\textbf{x}}^{(i)}$.


2. **The Interpolation Parameter ($\beta$):**
 - Best to set $\beta$ in the range of $\left[1/4, 1/2\right]$. 
 - To avoid further hyperparameter tuning, set $\beta = 1$ s.t. $\textbf{w}^\prime = \textbf{w}$.




In [7]:
class NbSvmClassifier(BaseEstimator, ClassifierMixin):
    
    def __init__(self, C=1.0, alpha=1.0, beta=1.0, dual=False, n_jobs=1):
        self.C = C
        self.alpha = alpha
        self.beta = beta
        self.dual = dual
        self.n_jobs = n_jobs


    def _pr(self, X, y_i, y):
        
        p = X[y==y_i].sum(axis=0)
        
        return (self.alpha + p) / (self.alpha + (y==y_i).sum())
    
    def _predict_proba_lr(self, X):

        prob = self.decision_function(X)
        expit(prob, out=prob)
        if prob.ndim == 1:
            return np.vstack([1 - prob, prob]).T
        else:
            # OvR normalization, like LibLinear's predict_probability
            prob /= prob.sum(axis=1).reshape((prob.shape[0], -1))
            return prob

    def fit(self, X, y):
        
        # Check that X and y have correct shape
        X, y = check_X_y(X, y, accept_sparse=True)

        # Create NB features 
        self._r = sparse.csr_matrix(np.log(self._pr(X, 1, y) / self._pr(X, 0, y)))
        X_nb = X.multiply(self._r)
        
        # Train a Logistic Regression
        self._clf = LogisticRegression(C=self.C, dual=self.dual, n_jobs=self.n_jobs).fit(X_nb, y)
        
        # Define the scaled/shifted weight coefficients, intercept, classes
        self.coef_ = (1 - self.beta) * np.mean(self._clf.coef_) + self.beta * self._clf.coef_
        self.intercept_ = self._clf.intercept_
        self.classes_ = self._clf.classes_
        
        return self
    
    def decision_function(self, X):

        X = check_array(X, accept_sparse='csr')
        
        n_features = self.coef_.shape[1]
        if X.shape[1] != n_features:
            raise ValueError("X has %d features per sample; expecting %d"
                             % (X.shape[1], n_features))

        scores = safe_sparse_dot(X, self.coef_.T,
                                 dense_output=True) + self.intercept_
        return scores.ravel() if scores.shape[1] == 1 else scores
    
    
    def predict(self, X):
        
        X_nb = X.multiply(self._r)
        
        scores = self.decision_function(X_nb)
        
        if len(scores.shape) == 1:
            indices = (scores > 0).astype(np.int)
        else:
            indices = scores.argmax(axis=1)
            
        return self.classes_[indices]
    
    def predict_proba(self, X):
        
        return self._clf.predict_proba(X.multiply(self._r))
        

## 4.1 With Binarization

In [8]:
X_train_tfv_bin = Binarizer().fit_transform(X_train_tfv)

In [9]:
scores = defaultdict(list)

for beta in [0.25, 0.5, 0.75, 1]:
    for C in [0.01, 0.05, 0.1, 0.5, 1]:
        nbsvm_clf = MultiOutputClassifier(NbSvmClassifier(C=C, beta=beta, dual=True, n_jobs=-1))
        cv_results = cross_validate(nbsvm_clf, 
                                    X_train_tfv_bin, y, 
                                    scoring={"loss": mean_log_loss_scorer,
                                             "roc_auc": mean_roc_auc_scorer}
                                   )
        mean_test_loss = np.mean(cv_results['test_loss'])
        mean_test_roc_auc = np.mean(cv_results['test_roc_auc'])
        scores['beta'].append(beta)
        scores['C'].append(C)
        scores['loss'].append(mean_test_loss)
        scores['roc_auc'].append(mean_test_roc_auc)

In [10]:
df_nbsvm_bin = pd.DataFrame(scores).sort_values(by='roc_auc', ascending=False)
df_nbsvm_bin

Unnamed: 0,beta,C,loss,roc_auc
16,1.0,0.05,-0.058419,0.969499
1,0.25,0.05,-0.058419,0.969499
11,0.75,0.05,-0.058419,0.969499
6,0.5,0.05,-0.058419,0.969499
2,0.25,0.1,-0.057928,0.968727
7,0.5,0.1,-0.057928,0.968727
12,0.75,0.1,-0.057928,0.968727
17,1.0,0.1,-0.057928,0.968727
5,0.5,0.01,-0.064277,0.966037
10,0.75,0.01,-0.064277,0.966037


# Comment:
The value of the interpolation parameter ```beta``` seems to be irrelevant.

### Best scores of NB-SVM with Binarization:

In [11]:
find_best_scores(df_nbsvm_bin)

Min LogLoss: 0.057928 (roc-auc: 0.968727)
Max ROC-AUC: 0.969499 (logloss: 0.058419)


### Best parameters :
beta >= 0.25

In [12]:
find_best_params(df_nbsvm_bin, ['C'])

Best Min Loss Parametes
C = 0.01
Best Max ROC_AUC Parametes
C = 0.01


## 4.2 Without Binarization

In [13]:
scores = defaultdict(list)

for C in [0.01, 0.05, 0.1, 0.5, 1]:
    nbsvm_clf = MultiOutputClassifier(NbSvmClassifier(C=C, dual=True, n_jobs=-1))
    cv_results = cross_validate(nbsvm_clf, 
                                X_train_tfv, y, 
                                scoring={"loss": mean_log_loss_scorer,
                                         "roc_auc": mean_roc_auc_scorer}
                               )
    mean_test_loss = np.mean(cv_results['test_loss'])
    mean_test_roc_auc = np.mean(cv_results['test_roc_auc'])
    scores['C'].append(C)
    scores['loss'].append(mean_test_loss)
    scores['roc_auc'].append(mean_test_roc_auc)

In [14]:
df_nbsvm = pd.DataFrame(scores).sort_values(by='roc_auc', ascending=False)
df_nbsvm

Unnamed: 0,C,loss,roc_auc
3,0.5,-0.053391,0.978606
2,0.1,-0.060159,0.978248
4,1.0,-0.052714,0.977277
1,0.05,-0.065606,0.977228
0,0.01,-0.084353,0.974731


### Best scores of NB-SVM without Binarization:

In [15]:
find_best_scores(df_nbsvm)

Min LogLoss: 0.052714 (roc-auc: 0.977277)
Max ROC-AUC: 0.978606 (logloss: 0.053391)


### Best parameters :
beta >= 0.25

In [16]:
find_best_params(df_nbsvm, ['C'])

Best Min Loss Parametes
C = 0.1
Best Max ROC_AUC Parametes
C = 0.01
