In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
import os

In [3]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression, RidgeClassifier, RidgeClassifierCV
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, VotingClassifier
from sklearn import cross_validation
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, roc_auc_score, classification_report



In [4]:
from gensim.models import LdaMulticore
from gensim import corpora
from gensim.matutils import sparse2full

In [5]:
from scipy.sparse import hstack

For dockers + ensembles:

In [6]:
os.environ['JOBLIB_TEMP_FOLDER'] = '../data/tmp/'

### Data Import

In [7]:
train = pd.read_csv('data/labeledTrainData.tsv', sep='\t')
print(train.shape)

(25000, 3)


In [8]:
train.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [9]:
test = pd.read_csv('data/testData.tsv', sep='\t')
print(test.shape)

(25000, 2)


### Vectorize Text

In [18]:
count_vec = CountVectorizer(ngram_range=(1,2), 
                            min_df=5,
                            max_df=0.9,
                            strip_accents='unicode',
                            max_features=None)

In [26]:
tfidf_vec = TfidfVectorizer(ngram_range=(1,2), 
                            min_df=3,
                            #max_df=0.9,
                            strip_accents='unicode',
                            analyzer='word',
                            token_pattern=r'\w{1,}',
                            use_idf=1,
                            smooth_idf=1,
                            sublinear_tf=1,
                            max_features=None,
                            stop_words = 'english')

In [20]:
count_vec.fit(train["review"].fillna("").values)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.9, max_features=None, min_df=5,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip_accents='unicode', token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [21]:
X_count = count_vec.transform(train["review"].fillna("").values)

In [22]:
print(type(X_count))
print(X_count.shape)

<class 'scipy.sparse.csr.csr_matrix'>
(25000, 156877)


In [27]:
tfidf_vec.fit(train["review"].fillna("").values)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=1,
        stop_words='english', strip_accents='unicode', sublinear_tf=1,
        token_pattern='\\w{1,}', tokenizer=None, use_idf=1,
        vocabulary=None)

In [28]:
X_tfidf = tfidf_vec.transform(train["review"].fillna("").values)

In [29]:
print(type(X_tfidf))
print(X_tfidf.shape)

<class 'scipy.sparse.csr.csr_matrix'>
(25000, 165171)


In [30]:
y = train['sentiment'].as_matrix()

In [31]:
print(y.shape)

(25000,)


### Create Topic Vector

In [51]:
dictionary = corpora.Dictionary.load('models/topic_dict.dict')

In [52]:
lda = LdaMulticore.load('models/topic_lda_model')

In [58]:
text_row_count = train['review'].shape[0]
vector_size = lda.num_topics

In [73]:
train['review_topic_vector'] = train['review'].apply(lambda x: lda[dictionary.doc2bow(x.lower().split(" "))])

In [75]:
X_lda_matrix = np.reshape(np.concatenate(train['review_topic_vector']
                                         .apply(lambda x: sparse2full(x, vector_size))
                                         .as_matrix(), axis=0), (text_row_count,vector_size))

In [86]:
test['review_topic_vector'] = test['review'].apply(lambda x: lda[dictionary.doc2bow(x.lower().split(" "))])

In [87]:
X_lda_matrix_test = np.reshape(np.concatenate(test['review_topic_vector']
                                         .apply(lambda x: sparse2full(x, vector_size))
                                         .as_matrix(), axis=0), (test['review'].shape[0],vector_size))

### Concat Features

In [84]:
X_count_concat = hstack((X_count, X_lda_matrix))

In [85]:
X_tfidf_concat = hstack((X_tfidf, X_lda_matrix))

In [None]:
X_count_train, X_count_val, y_count_train, y_count_val = train_test_split(X_count_concat, y, test_size=0.1, random_state=2481632)

In [32]:
X_tfidf_train, X_tfidf_val, y_tfidf_train, y_tfidf_val = train_test_split(X_tfidf, y, test_size=0.1, random_state=2481632)

In [None]:
X_count_test = count_vec.transform(test['review'].fillna("").values)

In [35]:
X_tfidf_test = tfidf_vec.transform(test['review'].fillna("").values)

In [None]:
X_count_test = hstack((X_count_test, X_lda_matrix_test))
X_tfidf_test = hstack((X_tfidf_test, X_lda_matrix_test))

### Linear Models

In [36]:
lm_logit = LogisticRegression(penalty='l2',
                              dual=True,
                              tol=0.0001,
                              C=1.0,
                              fit_intercept=True,
                              intercept_scaling=1.0, 
                              class_weight=None,
                              random_state=42,
                              solver='liblinear',
                              max_iter=1000,
                              verbose=1,
                              n_jobs=-1)

In [37]:
lm_logit.fit(X_tfidf_train, y_tfidf_train)

  " = {}.".format(self.n_jobs))


[LibLinear]

LogisticRegression(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1.0, max_iter=1000, multi_class='ovr',
          n_jobs=-1, penalty='l2', random_state=42, solver='liblinear',
          tol=0.0001, verbose=1, warm_start=False)

In [45]:
y_val_hat = lm_logit.predict_proba(X_tfidf_val)[:,1]

In [46]:
print(accuracy_score(y_tfidf_val, y_val_hat > 0.5))
print(roc_auc_score(y_tfidf_val, y_val_hat))
print(confusion_matrix(y_tfidf_val, y_val_hat > 0.5))
print(classification_report(y_tfidf_val, y_val_hat > 0.5))

0.8964
0.963334188805
[[1107  140]
 [ 119 1134]]
             precision    recall  f1-score   support

          0       0.90      0.89      0.90      1247
          1       0.89      0.91      0.90      1253

avg / total       0.90      0.90      0.90      2500



In [40]:
print("20 Fold CV Score: {}".format(np.mean(cross_validation.cross_val_score(lm_logit, X_tfidf_train, y_tfidf_train, cv=20, scoring='roc_auc'))))

  " = {}.".format(self.n_jobs))


[LibLinear]

  " = {}.".format(self.n_jobs))


[LibLinear]

  " = {}.".format(self.n_jobs))


[LibLinear]

  " = {}.".format(self.n_jobs))


[LibLinear]

  " = {}.".format(self.n_jobs))


[LibLinear]

  " = {}.".format(self.n_jobs))


[LibLinear]

  " = {}.".format(self.n_jobs))


[LibLinear]

  " = {}.".format(self.n_jobs))


[LibLinear]

  " = {}.".format(self.n_jobs))


[LibLinear]

  " = {}.".format(self.n_jobs))


[LibLinear]

  " = {}.".format(self.n_jobs))


[LibLinear]

  " = {}.".format(self.n_jobs))


[LibLinear]

  " = {}.".format(self.n_jobs))


[LibLinear]

  " = {}.".format(self.n_jobs))


[LibLinear]

  " = {}.".format(self.n_jobs))


[LibLinear]

  " = {}.".format(self.n_jobs))


[LibLinear]

  " = {}.".format(self.n_jobs))


[LibLinear]

  " = {}.".format(self.n_jobs))


[LibLinear]

  " = {}.".format(self.n_jobs))


[LibLinear]

  " = {}.".format(self.n_jobs))


[LibLinear]20 Fold CV Score: 0.9600502389305714


In [122]:
lm_logit_tfidf = LogisticRegression(penalty='l2',
                              C=1.0,
                              class_weight=None,
                              random_state=42,
                              solver='liblinear',
                              max_iter=1000,
                              verbose=1,
                              n_jobs=-1)

In [123]:
lm_logit_tfidf.fit(X_tfidf_train, y_tfidf_train)

[LibLinear]

  " = {}.".format(self.n_jobs))


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=1000, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=1, warm_start=False)

In [124]:
y_tfidf_val_hat = lm_logit_tfidf.predict(X_tfidf_val)

In [125]:
print(accuracy_score(y_tfidf_val, y_tfidf_val_hat))
print(confusion_matrix(y_tfidf_val, y_tfidf_val_hat))
print(classification_report(y_tfidf_val, y_tfidf_val_hat))

0.8864
[[1093  154]
 [ 130 1123]]
             precision    recall  f1-score   support

          0       0.89      0.88      0.89      1247
          1       0.88      0.90      0.89      1253

avg / total       0.89      0.89      0.89      2500



In [126]:
lm_ridge = RidgeClassifierCV(alphas=(0.1, 0.5, 1.0, 5.0, 10.0), 
                             cv=5,
                             class_weight=None)

In [197]:
lm_ridge.fit(X_count_train, y_count_train)

RidgeClassifierCV(alphas=(0.1, 0.5, 1.0, 5.0, 10.0), class_weight=None, cv=5,
         fit_intercept=True, normalize=False, scoring=None)

In [198]:
lm_ridge.alpha_

10.0

In [199]:
y_ridge_val_hat = lm_ridge.predict(X_count_val)

In [200]:
print(accuracy_score(y_count_val, y_ridge_val_hat))
print(confusion_matrix(y_count_val, y_ridge_val_hat))
print(classification_report(y_count_val, y_ridge_val_hat))

0.8916
[[1105  142]
 [ 129 1124]]
             precision    recall  f1-score   support

          0       0.90      0.89      0.89      1247
          1       0.89      0.90      0.89      1253

avg / total       0.89      0.89      0.89      2500



In [202]:
lm_ridge_single = RidgeClassifier(alpha=10.0)

In [215]:
nb = MultinomialNB(alpha=0.5)

In [216]:
nb.fit(X_count_train, y_count_train)

MultinomialNB(alpha=0.5, class_prior=None, fit_prior=True)

In [217]:
y_nb_val_hat = nb.predict(X_count_val)

In [218]:
print(accuracy_score(y_count_val, y_nb_val_hat))
print(confusion_matrix(y_count_val, y_nb_val_hat))
print(classification_report(y_count_val, y_nb_val_hat))

0.8688
[[1111  136]
 [ 192 1061]]
             precision    recall  f1-score   support

          0       0.85      0.89      0.87      1247
          1       0.89      0.85      0.87      1253

avg / total       0.87      0.87      0.87      2500



### NB-SVM Model

Relevant Paper: https://nlp.stanford.edu/pubs/sidaw12_simple_sentiment.pdf

In [107]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_is_fitted
from sklearn.linear_model import LogisticRegression
from scipy import sparse

class NbSvmClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, C=1.0, dual=False, n_jobs=1):
        self.C = C
        self.dual = dual
        self.n_jobs = n_jobs
        self.coef_ = None

    def predict(self, x):
        # Verify that model has been fit
        check_is_fitted(self, ['_r', '_clf'])
        return self._clf.predict(x.multiply(self._r))

    def predict_proba(self, x):
        # Verify that model has been fit
        check_is_fitted(self, ['_r', '_clf'])
        return self._clf.predict_proba(x.multiply(self._r))

    def fit(self, x, y):
        # Check that X and y have correct shape
        y = y.values
        x, y = check_X_y(x, y, accept_sparse=True)

        def pr(x, y_i, y):
            p = x[y==y_i].sum(0)
            return (p+1) / ((y==y_i).sum()+1)

        self._r = sparse.csr_matrix(np.log(pr(x,1,y) / pr(x,0,y)))
        x_nb = x.multiply(self._r)
        self._clf = LogisticRegression(C=self.C, dual=self.dual, n_jobs=self.n_jobs).fit(x_nb, y)
        self.coef_ = self._clf.coef_
        return self

In [171]:
m = NbSvmClassifier(C=4, dual=True)

In [173]:
m.fit(x_nb, y_count_train)

LogisticRegression(C=4, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [174]:
y_hat = m.predict(X_count_val)

In [175]:
print(accuracy_score(y_count_val, y_hat))
print(confusion_matrix(y_count_val, y_hat))
print(classification_report(y_count_val, y_hat))

0.9172
[[1133  114]
 [  93 1160]]
             precision    recall  f1-score   support

          0       0.92      0.91      0.92      1247
          1       0.91      0.93      0.92      1253

avg / total       0.92      0.92      0.92      2500



### Ensemble Model

In [185]:
rf = RandomForestClassifier(n_estimators=300, n_jobs=-1)

In [189]:
rf.fit(X_count_train, y_count_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [192]:
print(accuracy_score(y_count_val, rf.predict(X_count_val)))
print(confusion_matrix(y_count_val, rf.predict(X_count_val)))
print(classification_report(y_count_val, rf.predict(X_count_val)))

0.8644
[[1067  180]
 [ 159 1094]]
             precision    recall  f1-score   support

          0       0.87      0.86      0.86      1247
          1       0.86      0.87      0.87      1253

avg / total       0.86      0.86      0.86      2500



In [219]:
gbc = GradientBoostingClassifier(n_estimators=200, verbose=1)

In [220]:
gbc.fit(X_count_train, y_count_train)

      Iter       Train Loss   Remaining Time 
         1           1.3551            3.46m
         2           1.3286            3.02m
         3           1.3051            2.91m
         4           1.2839            2.83m
         5           1.2660            2.78m
         6           1.2488            2.72m
         7           1.2332            2.68m
         8           1.2197            2.65m
         9           1.2071            2.60m
        10           1.1950            2.58m
        20           1.1073            2.34m
        30           1.0485            2.20m
        40           1.0033            2.09m
        50           0.9659            1.96m
        60           0.9349            1.84m
        70           0.9087            1.70m
        80           0.8842            1.56m
        90           0.8628            1.43m
       100           0.8433            1.30m
       200           0.7109            0.00s


GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=200,
              presort='auto', random_state=None, subsample=1.0, verbose=1,
              warm_start=False)

In [221]:
print(accuracy_score(y_count_val, gbc.predict(X_count_val)))
print(confusion_matrix(y_count_val, gbc.predict(X_count_val)))
print(classification_report(y_count_val, gbc.predict(X_count_val)))

0.8416
[[1010  237]
 [ 159 1094]]
             precision    recall  f1-score   support

          0       0.86      0.81      0.84      1247
          1       0.82      0.87      0.85      1253

avg / total       0.84      0.84      0.84      2500



In [222]:
vote_m = VotingClassifier([('lm_ridge_single',lm_ridge_single), ('rf',rf), ('gbc',gbc)],
                          voting='hard',
                          n_jobs=-1)

In [223]:
vote_m.fit(X_count_train, y_count_train)

      Iter       Train Loss   Remaining Time 
         1           1.3551            6.53m
         2           1.3286            6.37m
         3           1.3051            6.49m
         4           1.2839            6.42m
         5           1.2660            6.54m
         6           1.2488            6.30m
         7           1.2332            6.42m
         8           1.2197            6.27m
         9           1.2071            6.15m
        10           1.1950            6.13m
        20           1.1073            5.39m
        30           1.0485            4.22m
        40           1.0033            3.51m
        50           0.9659            3.02m
        60           0.9349            2.64m
        70           0.9087            2.32m
        80           0.8842            2.06m
        90           0.8628            1.83m
       100           0.8433            1.61m
       200           0.7109            0.00s


VotingClassifier(estimators=[('lm_ridge_single', RidgeClassifier(alpha=10.0, class_weight=None, copy_X=True,
        fit_intercept=True, max_iter=None, normalize=False,
        random_state=None, solver='auto', tol=0.001)), ('rf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
          ...      presort='auto', random_state=None, subsample=1.0, verbose=1,
              warm_start=False))],
         flatten_transform=None, n_jobs=-1, voting='hard', weights=None)

In [224]:
print(accuracy_score(y_count_val, vote_m.predict(X_count_val)))
print(confusion_matrix(y_count_val, vote_m.predict(X_count_val)))
print(classification_report(y_count_val, vote_m.predict(X_count_val)))

0.8824
[[1075  172]
 [ 122 1131]]
             precision    recall  f1-score   support

          0       0.90      0.86      0.88      1247
          1       0.87      0.90      0.88      1253

avg / total       0.88      0.88      0.88      2500

