In [2]:
import numpy as np
import pandas as pd

In [7]:
df = pd.read_csv('clean_data.csv')
df_imdb = pd.read_csv('clean_imdb.csv')

In [8]:
import sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import accuracy_score, f1_score
from sklearn.pipeline import Pipeline

In [9]:
np.random.seed(42)
pdf = df.reindex(np.random.permutation(df.index))
pdf_imdb = df_imdb.reindex(np.random.permutation(df_imdb.index))

In [10]:
X, y = pdf_imdb['clean_text'], pdf_imdb['class']
X_test, y_test = pdf['clean_text'], pdf['class']

In [11]:
import warnings
warnings.filterwarnings('ignore')

In [12]:
vectorizers = [('tfidf', TfidfVectorizer), ('cnt_vec', CountVectorizer)]
ngram_ranges = [(1, 1), (1, 2), (1, 3)]
max_features = [None, 20000, 50000, 80000]
dim_red = SelectKBest(score_func=chi2, k=20000)
classifiers = [('svc', LinearSVC(random_state=42)),
               ('lr', LogisticRegression(random_state=42)),
               ('sgd', SGDClassifier(random_state=42))]

In [13]:
k = 5
top_k_models = [None] * k
top_k_scores = [0] * k

for vect_name, vect in vectorizers:
    for ngram in ngram_ranges:
        for max_feat in max_features:
            for clf_name, clf in classifiers:
                np.random.seed(42)
                pipe = Pipeline([('vectorizer', vect(ngram_range=ngram, max_features=max_feat)),
                                 ('dim_red', dim_red),
                                 ('clf', clf)])
                pipe.fit(X, y)
                cand_score = accuracy_score(y_test, pipe.predict(X_test)).mean()
                worst_score = 1
                worst_score_idx = 0
                for i, score in enumerate(top_k_scores):
                    if score < worst_score:
                        worst_score = score
                        worst_score_idx = i
                if cand_score > worst_score:
                    top_k_scores[worst_score_idx] = cand_score
                    top_k_models[worst_score_idx] = pipe
                print(f"{vect_name}({ngram}, {max_feat}), {clf_name}: {cand_score}")
            print(f"top {k} scores: {top_k_scores}")

tfidf((1, 1), None), svc: 0.865
tfidf((1, 1), None), lr: 0.869
tfidf((1, 1), None), sgd: 0.8645
top 5 scores: [0.865, 0.869, 0.8645, 0, 0]
tfidf((1, 1), 20000), svc: 0.863
tfidf((1, 1), 20000), lr: 0.8645
tfidf((1, 1), 20000), sgd: 0.8675
top 5 scores: [0.865, 0.869, 0.8645, 0.8675, 0.8645]
tfidf((1, 1), 50000), svc: 0.8685
tfidf((1, 1), 50000), lr: 0.8705
tfidf((1, 1), 50000), sgd: 0.863
top 5 scores: [0.865, 0.869, 0.8685, 0.8675, 0.8705]
tfidf((1, 1), 80000), svc: 0.867
tfidf((1, 1), 80000), lr: 0.8695
tfidf((1, 1), 80000), sgd: 0.863
top 5 scores: [0.8695, 0.869, 0.8685, 0.8675, 0.8705]
tfidf((1, 2), None), svc: 0.8825
tfidf((1, 2), None), lr: 0.8605
tfidf((1, 2), None), sgd: 0.8465
top 5 scores: [0.8695, 0.869, 0.8685, 0.8825, 0.8705]
tfidf((1, 2), 20000), svc: 0.8655
tfidf((1, 2), 20000), lr: 0.8805
tfidf((1, 2), 20000), sgd: 0.8795
top 5 scores: [0.8695, 0.8795, 0.8805, 0.8825, 0.8705]
tfidf((1, 2), 50000), svc: 0.8805
tfidf((1, 2), 50000), lr: 0.8805
tfidf((1, 2), 50000), sgd: 

In [38]:
top_k_scores

[0.8815, 0.885, 0.8825, 0.8825, 0.8825]

In [39]:
best_model = top_k_models[1]
print("Best single pre-trained model score:", top_k_scores[1])

Best single pre-trained model score: 0.885


In [37]:
# We need to write our own k-fold validation to enhance each training dataset with common extraneous dataset
# in order to get final evaluation
import tqdm
from sklearn.model_selection import KFold
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

In [42]:
scores = []
for train_dx, test_dx in tqdm.tqdm(kfold.split(X_test, y_test)):
    enh_train_X = np.hstack([X, X_test[train_dx]])
    enh_train_y = np.hstack([y, y_test[train_dx]])
    
    test_X = X_test[test_dx]
    test_y = y_test[test_dx]
    print("Start fitting model")
    best_model.fit(enh_train_X, enh_train_y)
    print("End fitting model")
    y_pred = best_model.predict(test_X)
    scores.append(accuracy_score(test_y, y_pred))

cv_score = np.mean(scores)
print(scores, "Mean score=", cv_score)


0it [00:00, ?it/s][A

Start fitting model
End fitting model



1it [05:23, 323.72s/it][A

Start fitting model
End fitting model



2it [10:44, 322.76s/it][A

Start fitting model
End fitting model



3it [16:01, 321.09s/it][A

Start fitting model
End fitting model



4it [21:22, 321.17s/it][A

Start fitting model
End fitting model



5it [28:06, 346.07s/it][A

[0.9, 0.8975, 0.8825, 0.905, 0.8975] 0.8965


In [45]:
print("Best single model cross-validation score:", cv_score)

Best single model cross-validation score: 0.8965


In [48]:
from mlxtend.classifier import StackingClassifier
stack = StackingClassifier(top_k_models, meta_classifier=LogisticRegression(random_state=42))

In [None]:
scores = []
for i, (train_dx, test_dx) in tqdm.tqdm(enumerate(kfold.split(X_test, y_test))):
    enh_train_X = np.hstack([X, X_test[train_dx]])
    enh_train_y = np.hstack([y, y_test[train_dx]])
    
    test_X = X_test[test_dx]
    test_y = y_test[test_dx]
    stack.fit(enh_train_X, enh_train_y)
    y_pred = stack.predict(test_X)
    acc = accuracy_score(test_y, y_pred)
    print(f"Fold {i} acc={acc}")
    scores.append(acc)

cv_score = np.mean(scores)
print(scores, "Mean score=", cv_score)


0it [00:00, ?it/s][A
1it [29:12, 1752.15s/it][A

Fold 0 acc=0.9025



2it [1:00:13, 1784.86s/it][A

Fold 1 acc=0.895
