In [1]:
import os
import warnings

import numpy as np
import pandas as pd
from sklearn import linear_model, svm, naive_bayes, model_selection

In [2]:
# Shared random seed for the steps in this notebook that accept one.
SEED = 57

# Directory that holds train.csv and valid.csv. The default is the original
# location; set the IMDB_DATA_DIR environment variable to point the notebook at
# another copy of the data without editing this cell.
DATA_DIR = (
    os.environ.get('IMDB_DATA_DIR')
    or '~/Documents/ML-DL/imdb_review_classification'
)
TRAIN_DATASET_PATH = DATA_DIR + '/train.csv'
VALID_DATASET_PATH = DATA_DIR + '/valid.csv'

# Suppresses every warning raised through the `warnings` module from here on.
warnings.filterwarnings('ignore')

In [3]:
# Estimator classes to benchmark, keyed by the short name that also indexes
# `classifiers_params`. The evaluation loop walks this mapping in insertion order.
#
# Deliberately left out (notes carried over from earlier experiments):
#   'svc' (SVC), 'knn' (KNeighborsClassifier), 'ada' (AdaBoostClassifier),
#   'gb' (GradientBoostingClassifier)              -- too slow
#   'dt' (DecisionTreeClassifier), 'rf' (RandomForestClassifier) -- doesn't work
classifiers = dict(
    lr=linear_model.LogisticRegression,
    lsvc=svm.LinearSVC,
    mnb=naive_bayes.MultinomialNB,
    sgd=linear_model.SGDClassifier,
)

# Hyper-parameter grids, keyed by the same short names as `classifiers`.
# Each inner dict maps a constructor keyword to the candidate values to try;
# the evaluation loop expands it with `model_selection.ParameterGrid`.
# Grids for models that are commented out of `classifiers` ('svc', 'knn', 'dt',
# 'rf', 'ada', 'gb') are kept so those models can be re-enabled without
# retyping their search space.
classifiers_params = {
    'lr': {
        'C': np.logspace(-3, 3, 7)
    },
    'svc': {
        'C': np.logspace(-3, 3, 7),
        'gamma': np.logspace(-3, 1, 5)
    },
    'lsvc': {
        'C': np.logspace(-3, 3, 7)
    },
    'mnb': {
        # NOTE(review): the grid starts at alpha=0.0 (no additive smoothing);
        # how scikit-learn treats a zero alpha depends on its version -- confirm.
        'alpha': np.linspace(0, 1, 11)
    },
    'knn': {
        'n_neighbors': np.linspace(3, 7, 5, dtype=int)
    },
    'dt': {
        'criterion': ["gini", "entropy", "log_loss"],
        'min_samples_leaf': [0.1],
        'max_depth': np.linspace(3, 7, 5, dtype=int)
    },
    'rf': {
        # A stray '' literal used to sit in front of this key; it only worked
        # because adjacent string literals are concatenated.
        'criterion': ["gini", "entropy", "log_loss"],
        'min_samples_leaf': [0.1],
        'max_depth': np.linspace(3, 7, 5, dtype=int),
        'n_estimators': np.linspace(50, 300, 6, dtype=int)
    },
    'ada': {
        'n_estimators': np.linspace(50, 300, 6, dtype=int)
    },
    'gb': {
        'n_estimators': np.linspace(50, 300, 6, dtype=int),
        'min_samples_leaf': [0.1],
    },
    'sgd': {
        'penalty': ['l2', 'l1', 'elasticnet'],
        'alpha': np.logspace(-7, -1, 7)
    }
}

In [4]:
# Read the training and validation splits from their configured CSV paths.
df_tr, df_vl = (
    pd.read_csv(csv_path)
    for csv_path in (TRAIN_DATASET_PATH, VALID_DATASET_PATH)
)

In [5]:
# Split each frame into the `sentiment` label column and the remaining
# columns, which are used as model inputs.
y_train = df_tr['sentiment']
x_train = df_tr.drop(columns='sentiment')

y_valid = df_vl['sentiment']
x_valid = df_vl.drop(columns='sentiment')

In [6]:
# Fit every enabled classifier on the training split once per point of its
# hyper-parameter grid and print its score on the validation split.
best = None  # (score, classifier name, params) of the best run; ties keep the earliest
for classifier_name, classifier in classifiers.items():
    # Pass SEED only to estimators whose constructor exposes `random_state`,
    # so the stochastic ones give the same result on every run.
    seed_kwargs = (
        {'random_state': SEED}
        if 'random_state' in classifier().get_params()
        else {}
    )
    for params in model_selection.ParameterGrid(classifiers_params[classifier_name]):
        # Grid values are merged last so a grid can still set the seed itself.
        model = classifier(**{**seed_kwargs, **params})
        model.fit(x_train, y_train)
        acc = model.score(x_valid, y_valid)
        print(classifier_name, params, acc)
        if best is None or acc > best[0]:
            best = (acc, classifier_name, params)

# Report the winner from the runs above instead of transcribing it by hand.
if best is not None:
    print('best:', best[1], best[2], best[0])

lr {'C': 0.001} 0.8121
lr {'C': 0.01} 0.8378
lr {'C': 0.1} 0.866
lr {'C': 1.0} 0.8761
lr {'C': 10.0} 0.8723
lr {'C': 100.0} 0.8702
lr {'C': 1000.0} 0.8701
lsvc {'C': 0.001} 0.8351
lsvc {'C': 0.01} 0.8669
lsvc {'C': 0.1} 0.8761
lsvc {'C': 1.0} 0.8708
lsvc {'C': 10.0} 0.8712
lsvc {'C': 100.0} 0.8677
lsvc {'C': 1000.0} 0.8262
mnb {'alpha': 0.0} 0.8481
mnb {'alpha': 0.1} 0.8481
mnb {'alpha': 0.2} 0.8481
mnb {'alpha': 0.30000000000000004} 0.848
mnb {'alpha': 0.4} 0.848
mnb {'alpha': 0.5} 0.848
mnb {'alpha': 0.6000000000000001} 0.848
mnb {'alpha': 0.7000000000000001} 0.8479
mnb {'alpha': 0.8} 0.8478
mnb {'alpha': 0.9} 0.8479
mnb {'alpha': 1.0} 0.8479
sgd {'alpha': 1e-07, 'penalty': 'l2'} 0.8021
sgd {'alpha': 1e-07, 'penalty': 'l1'} 0.7879
sgd {'alpha': 1e-07, 'penalty': 'elasticnet'} 0.8633
sgd {'alpha': 1e-06, 'penalty': 'l2'} 0.8541
sgd {'alpha': 1e-06, 'penalty': 'l1'} 0.8624
sgd {'alpha': 1e-06, 'penalty': 'elasticnet'} 0.8688
sgd {'alpha': 1e-05, 'penalty': 'l2'} 0.8681
sgd {'alpha': 1e

### Best model
`lr {'C': 1.0}` reaches the highest validation accuracy, 0.8761 (tied with `lsvc {'C': 0.1}` among the results shown above).