In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read clean_data.csv into a DataFrame; later cells use its
# 'clean_text' and 'class' columns.
df = pd.read_csv(filepath_or_buffer='clean_data.csv')

In [16]:
import sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.feature_selection import SelectKBest, chi2
# from sklearn.decomposition import TruncatedSVD, NMF
from sklearn.pipeline import Pipeline

In [5]:
# Shuffle the rows once, with the global NumPy RNG seeded so the
# permutation (and therefore every CV split below) is repeatable.
np.random.seed(42)
df_permuted = df.reindex(index=np.random.permutation(df.index))

In [6]:
# Model input is the text column; the target is the label column.
X = df_permuted['clean_text']
y = df_permuted['class']

In [11]:
import warnings
warnings.filterwarnings('ignore')

In [50]:
# Search space for the model-selection loop.
# Vectorizers are kept as classes: the loop instantiates them per candidate.
vectorizers = [
    ('tfidf', TfidfVectorizer),
    ('cnt_vec', CountVectorizer),
]
# Unigrams only, up to bigrams, up to trigrams.
ngram_ranges = [(1, upper) for upper in range(1, 4)]
# Vocabulary caps passed to the vectorizer (None = no cap).
max_features = [None, 20000, 50000, 80000]
# Feature selection step: keep the 20000 features with the best chi2 score.
dim_red = SelectKBest(chi2, k=20000)
# Candidate classifiers, all seeded identically.
classifiers = [
    (label, estimator_cls(random_state=42))
    for label, estimator_cls in (('svc', LinearSVC),
                                 ('lr', LogisticRegression),
                                 ('sgd', SGDClassifier))
]

In [52]:
# Exhaustive search over vectorizer x n-gram range x vocabulary cap x classifier.
# Each candidate pipeline is scored with 5-fold cross-validation and the k
# best-scoring pipelines are kept in `top_k_models` / `top_k_scores`.
# The two lists are parallel and NOT sorted: a better candidate simply
# overwrites whichever slot currently holds the lowest score.
from sklearn.base import clone

k = 5
top_k_models = [None] * k
top_k_scores = [0] * k

for vect_name, vect in vectorizers:
    for ngram in ngram_ranges:
        for max_feat in max_features:
            for clf_name, clf in classifiers:
                np.random.seed(42)
                # Clone the shared `dim_red` / `clf` templates so every stored
                # pipeline owns independent estimators; otherwise all kept
                # pipelines would point at the same objects and fitting one
                # directly would overwrite the fitted state of the others.
                pipe = Pipeline([('vectorizer', vect(ngram_range=ngram, max_features=max_feat)),
                                 ('dim_red', clone(dim_red)),
                                 ('clf', clone(clf))])
                cand_score = cross_val_score(pipe, X, y, cv=5).mean()
                # First slot holding the lowest score so far (ties -> lowest index).
                worst_score_idx = min(range(k), key=top_k_scores.__getitem__)
                worst_score = top_k_scores[worst_score_idx]
                if cand_score > worst_score:
                    top_k_scores[worst_score_idx] = cand_score
                    top_k_models[worst_score_idx] = pipe
                print(f"{vect_name}({ngram}, {max_feat}), {clf_name}: {cand_score}")
            # Progress report after all classifiers for this vocabulary cap.
            print(f"top {k} scores: {top_k_scores}")

tfidf((1, 1), None), svc: 0.858
tfidf((1, 1), None), lr: 0.8344999999999999
tfidf((1, 1), None), sgd: 0.8525
top 5 scores: [0.858, 0.8344999999999999, 0.8525, 0, 0]
tfidf((1, 1), 20000), svc: 0.861
tfidf((1, 1), 20000), lr: 0.8385
tfidf((1, 1), 20000), sgd: 0.8564999999999999
top 5 scores: [0.858, 0.8564999999999999, 0.8525, 0.861, 0.8385]
tfidf((1, 1), 50000), svc: 0.858
tfidf((1, 1), 50000), lr: 0.8344999999999999
tfidf((1, 1), 50000), sgd: 0.8525
top 5 scores: [0.858, 0.8564999999999999, 0.8525, 0.861, 0.858]
tfidf((1, 1), 80000), svc: 0.858
tfidf((1, 1), 80000), lr: 0.8344999999999999
tfidf((1, 1), 80000), sgd: 0.8525
top 5 scores: [0.858, 0.8564999999999999, 0.858, 0.861, 0.858]
tfidf((1, 2), None), svc: 0.852
tfidf((1, 2), None), lr: 0.829
tfidf((1, 2), None), sgd: 0.8600000000000001
top 5 scores: [0.858, 0.8600000000000001, 0.858, 0.861, 0.858]
tfidf((1, 2), 20000), svc: 0.866
tfidf((1, 2), 20000), lr: 0.8404999999999999
tfidf((1, 2), 20000), sgd: 0.8564999999999999
top 5 scores

In [117]:
# Single best model:
# the leaderboard is unordered (new entries overwrite the weakest slot), so
# slot 0 is not guaranteed to be the winner; pick the slot with the top score.
best_model_idx = max(range(len(top_k_scores)), key=top_k_scores.__getitem__)
cross_val_score(top_k_models[best_model_idx], X, y, cv=5).mean()

0.866

In [108]:
# Add stacking: combine the selected pipelines under a meta-classifier.
from mlxtend.classifier import StackingClassifier

In [110]:
# Stack the kept pipelines as base learners under a logistic-regression
# meta-classifier.
stack = StackingClassifier(
    classifiers=top_k_models,
    meta_classifier=LogisticRegression(random_state=42),
)

In [115]:
# Mean 5-fold CV score of the logistic-regression stack.
np.mean(cross_val_score(stack, X, y, cv=5))

0.8664999999999999

In [118]:
import xgboost

# Same base learners, with an XGBoost random-forest classifier as the
# meta-classifier.
stack2 = StackingClassifier(
    classifiers=top_k_models,
    meta_classifier=xgboost.XGBRFClassifier(random_state=42),
)

In [119]:
# Mean 5-fold CV score of the XGBoost-RF stack.
np.mean(cross_val_score(stack2, X, y, cv=5))

0.866

In [120]:
# Same base learners, with a linear SVM as the meta-classifier,
# scored by mean 5-fold CV.
stack3 = StackingClassifier(
    classifiers=top_k_models,
    meta_classifier=LinearSVC(random_state=42),
)
np.mean(cross_val_score(stack3, X, y, cv=5))

0.8664999999999999

In [122]:
from joblib import dump

# cross_val_score fits clones of the estimator it is given, so evaluating
# `stack` does not train `stack` itself. Fit it on the full dataset here so
# the persisted artifact is a usable, trained model rather than an unfitted one.
stack.fit(X, y)
dump(stack, 'data_only_stack.joblib')

['data_only_stack.joblib']