In [None]:
from statistics import mean
import pandas as pd
import numpy as np
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn import metrics
from sklearn import model_selection
import nltk

import warnings
warnings.filterwarnings('ignore')

# Training phrases together with their pre-assigned cross-validation fold
# (the 'kfold' column is what get_accuracy() splits on).
df_tv = pd.read_csv('data/train_folds.csv')
# Rewrite the contraction suffix "n't" as "not". The TF-IDF token pattern used
# in get_accuracy() only keeps runs of two or more letters, so "n't" would
# otherwise be split at the apostrophe and both pieces dropped.
# The vectorised .str accessor replaces a per-row Python lambda; regex=False
# makes the literal match explicit. Unlike the lambda, missing values pass
# through unchanged here instead of raising.
df_tv['Phrase'] = df_tv['Phrase'].str.replace("n't", 'not', regex=False)

# Registry of candidate models: short name -> scikit-learn estimator class.
# The grid-search loop iterates over this dict and looks up each name in
# `classifiers_params`, so every enabled name needs an entry there.
# Uncomment an entry to include it in the search. With every entry commented
# out (as below) the dict is empty and the search loop does nothing.
classifiers = {
    # 'lr': LogisticRegression,
    # 'svc': SVC, # too slow!!!
    # 'lsvc': LinearSVC,
    # 'mnb': MultinomialNB,
    # 'knn': KNeighborsClassifier, # too slow!!!
    # 'dt': DecisionTreeClassifier, # doesn't work
    # 'rf': RandomForestClassifier, # doesn't work
    # 'ada': AdaBoostClassifier, # too slow!!!
    # 'gb': GradientBoostingClassifier, # too slow!!!
    # 'sgd': SGDClassifier
}

# Hyper-parameter grids, keyed by the same short names as `classifiers`.
# Each value is a {parameter name: candidate values} mapping that the search
# loop expands with model_selection.ParameterGrid; every combination is passed
# to the estimator constructor as keyword arguments.
classifiers_params = {
    'lr': {
        'C': np.logspace(-3, 3, 7)  # 1e-3 ... 1e3, one value per decade
    },
    'svc': {
        # Grid disabled: with no entries the estimator is built with its
        # default arguments only.
        # 'C': np.logspace(-3, 3, 7),
        # 'gamma': np.logspace(-3, 1, 5)
    },
    'lsvc': {
        'C': np.logspace(-3, 3, 7)  # 1e-3 ... 1e3, one value per decade
    },
    'mnb': {
        # 0.0, 0.1, ..., 1.0
        # NOTE(review): the grid includes alpha=0 (no smoothing) -- confirm
        # this is intended.
        'alpha': np.linspace(0, 1, 11)
    },
    'knn': {
        'n_neighbors': np.linspace(3, 7, 5, dtype=int)  # 3, 4, 5, 6, 7
    },
    'dt': {
        'criterion': ["gini", "entropy", "log_loss"],
        # NOTE(review): scikit-learn reads a float min_samples_leaf as a
        # fraction of the training samples; 0.1 may be why the tree models
        # are marked "doesn't work" in `classifiers` -- confirm.
        'min_samples_leaf': [0.1],
        'max_depth': np.linspace(3, 7, 5, dtype=int)  # 3, 4, 5, 6, 7
    },
    'rf': {
        # A stray '' literal used to sit in front of this key; it was silently
        # concatenated with 'criterion' and has been removed.
        'criterion': ["gini", "entropy", "log_loss"],
        'min_samples_leaf': [0.1],
        'max_depth': np.linspace(3, 7, 5, dtype=int),  # 3, 4, 5, 6, 7
        'n_estimators': np.linspace(50, 300, 6, dtype=int)  # 50, 100, ..., 300
    },
    'ada': {
        'n_estimators': np.linspace(50, 300, 6, dtype=int)  # 50, 100, ..., 300
    },
    'gb': {
        'n_estimators': np.linspace(50, 300, 6, dtype=int),  # 50, 100, ..., 300
        'min_samples_leaf': [0.1],
    },
    'sgd': {
        'penalty' : ['l2', 'l1', 'elasticnet'],
        'alpha': np.logspace(-7, -1, 7)  # 1e-7 ... 1e-1, one value per decade
    }
}

# Single Porter stemmer instance, reused by every call to tokenizer().
stemmer = nltk.stem.PorterStemmer()

In [None]:
def tokenizer(text: str):
    """Split ``text`` into stemmed word tokens.

    The text is lower-cased and tokenized with ``nltk.word_tokenize``. Tokens
    that are not purely alphabetic, or that appear in ``ENGLISH_STOP_WORDS``,
    are discarded; the survivors are reduced with the module-level Porter
    ``stemmer``.

    Args:
        text: Raw input string.

    Returns:
        List of stemmed tokens, in their original order.
    """
    stems = []
    for token in nltk.word_tokenize(text.lower()):
        # Drop punctuation, numbers and stop words before stemming.
        if not token.isalpha() or token in ENGLISH_STOP_WORDS:
            continue
        stems.append(stemmer.stem(token))
    return stems

# Vectorised folds: kfold -> (frame the features were built from, features).
# The TF-IDF features depend only on which fold is held out, not on the
# classifier or its hyper-parameters, so they are built once per fold and
# reused for every point of the parameter grid.
_fold_features_cache = {}


def _fold_features(kfold):
    """Return ``(xtr, ytr, xvl, yvl)`` for the split that holds out ``kfold``.

    Rows of ``df_tv`` whose 'kfold' equals ``kfold`` form the validation set;
    all other rows form the training set. The vectorizer is fitted on the
    training phrases only and then applied to the validation phrases.
    """
    entry = _fold_features_cache.get(kfold)
    # Only reuse features built from the current `df_tv` object, so reloading
    # the data invalidates the cache. In-place edits to `df_tv` are NOT
    # detected -- re-run this cell after making any.
    if entry is not None and entry[0] is df_tv:
        return entry[1]

    df_tr = df_tv[df_tv['kfold'] != kfold]
    df_vl = df_tv[df_tv['kfold'] == kfold]

    vectorizer = TfidfVectorizer(
        token_pattern=r'\b[^\d\W][^\d\W]+\b',  # 2+ word characters, no digits
        ngram_range=(1, 2),
        max_df=0.9,
        lowercase=True
    )
    # fit_transform gives the same matrix as fit() followed by transform(),
    # without analysing the training text twice.
    xtr = vectorizer.fit_transform(df_tr['Phrase'])
    xvl = vectorizer.transform(df_vl['Phrase'])

    features = (xtr, df_tr['Sentiment'], xvl, df_vl['Sentiment'])
    _fold_features_cache[kfold] = (df_tv, features)
    return features


def get_accuracy(classifier, params, kfold):
    """Train ``classifier(**params)`` on all folds but ``kfold`` and score it.

    Args:
        classifier: Estimator class (not an instance); it is instantiated
            with ``params`` and must provide ``fit`` and ``predict``.
        params: Keyword arguments for the estimator constructor.
        kfold: Value of the 'kfold' column identifying the validation fold.

    Returns:
        Accuracy of the fitted model on the held-out fold, as computed by
        ``metrics.accuracy_score``.
    """
    xtr, ytr, xvl, yvl = _fold_features(kfold)

    model = classifier(**params).fit(xtr, ytr)
    ypd = model.predict(xvl)
    return metrics.accuracy_score(yvl, ypd)


In [None]:
# Exhaustive grid search: for every enabled classifier, score each parameter
# combination by its mean accuracy over the five validation folds.
for name, classifier in classifiers.items():
    print(f'{name} classifier:')
    print('*' * 100)
    param_grid = model_selection.ParameterGrid(classifiers_params[name])
    for params in param_grid:
        fold_scores = [
            get_accuracy(classifier, params, fold)
            for fold in range(5)
        ]
        accuracy = mean(fold_scores)
        print(f'average accuracy: {accuracy} | params: {params}')