In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import cross_validate

import warnings
warnings.filterwarnings('ignore')

# BOW 1-2-3

In [2]:
results = {}
for ngram in [1,2,3]:
    print('Max ngram: {}'.format(ngram))
    
    # 0. Load BOW data
    df0 = pd.read_csv('BOW-datasets/BOW_{}-gram_class0.csv'.format(ngram), header=0, index_col=0)
    df1 = pd.read_csv('BOW-datasets/BOW_{}-gram_class1.csv'.format(ngram), header=0, index_col=0)

    X0 = df0.values
    X1 = df1.values
    feat_names = df0.columns

    print('X0: ', X0.shape)
    print('X1: ', X1.shape)
    
    # 1. Create a balanced training set
    n = X1.shape[0]

    # Subsample X1
    idx = np.arange(n)
    np.random.shuffle(idx)
    X1_small = X1[idx, :]
    id1 = df1.index[idx]

    # Subsample X0
    idx = np.arange(n)
    np.random.shuffle(idx)
    X0_small = X0[idx, :]
    id0 = df0.index[idx]

    # Stack them
    X_small = np.vstack((X1_small, X0_small))
    y_small = np.hstack((np.ones(X1_small.shape[0]), np.zeros(X0_small.shape[0])))
    id_small = id1.tolist()+id0.tolist()
    print(X_small.shape)
    print(y_small.shape)
    
    # Learning phase
    pipe = Pipeline([('pp', StandardScaler()), ('mdl', LogisticRegressionCV(Cs=np.logspace(-5, 0, 10)))])

    scores = cross_validate(pipe, X_small, y_small, cv=10, n_jobs=-1)
    print('Training: avg score: {:2.3f} +- {:2.3f}'.format(np.mean(scores['train_score']),
                                                           np.std(scores['train_score'])))
    print('Validation: avg score: {:2.3f} +- {:2.3f}'.format(np.mean(scores['test_score']),
                                                             np.std(scores['test_score'])))
    
    results[ngram] = scores
    print('----------------------------------')

Max ngram: 1
('X0: ', (284718, 19))
('X1: ', (25949, 19))
(51898, 19)
(51898,)
Training: avg score: 0.899 +- 0.000
Validation: avg score: 0.899 +- 0.003
----------------------------------
Max ngram: 2
('X0: ', (284718, 129))
('X1: ', (25949, 129))
(51898, 129)
(51898,)
Training: avg score: 0.902 +- 0.001
Validation: avg score: 0.901 +- 0.004
----------------------------------
Max ngram: 3
('X0: ', (284718, 854))
('X1: ', (25949, 854))
(51898, 854)
(51898,)
Training: avg score: 0.909 +- 0.001
Validation: avg score: 0.903 +- 0.005
----------------------------------
