In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
from itertools import combinations
from mlens.ensemble import SuperLearner
from sklearn.metrics import accuracy_score

[MLENS] backend: threading


In [2]:
# Load the feature table and split it into model inputs (`features`) and the
# target column (`labels`).
# NOTE(review): judging by the file name this is a pre-computed TF-IDF matrix
# plus a few per-document count columns -- confirm against the notebook that
# wrote the CSV.

# Count columns whose missing values are replaced by 0 so that the whole
# frame can be cast to integers below.
count_columns = ['num_tokens', 'mention_count', 'url_count', 'hashtag_count']

# Drop the 'Unnamed: 0' column (presumably a saved index -- verify) without
# mutating the freshly read frame in place.
df_full = pd.read_csv('full_tfidf_df.csv').drop(columns=['Unnamed: 0'])
df_full[count_columns] = df_full[count_columns].fillna(0)

# Every remaining column is cast to int; this raises if any other column
# still holds NaN or non-numeric values.
df_full = df_full.astype('int')

features = df_full.drop(columns='class')
labels = df_full['class']

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split, KFold
from sklearn.ensemble import BaggingClassifier, ExtraTreesClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC

# Display names for the base learners, in the same order as `clf_array`.
names = ['Random Forest', 'Extra Trees', 'KNeighbors', 'SVC', 'Ridge Classifier']

# Hold out 30% of the rows for the final accuracy check; the split is seeded
# so it is identical on every run.
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size = 0.3, random_state = 0)

# Logistic regression used as the meta-learner on top of the base learners.
best_clf = LogisticRegression(C=0.001, max_iter=100000)

# Base learners. The two randomized tree ensembles are seeded like the split;
# without a seed their scores change from run to run, which makes the
# "best combination" search below non-reproducible.
rf = RandomForestClassifier(random_state=0)
et = ExtraTreesClassifier(random_state=0)
knn = KNeighborsClassifier()
svc = SVC()
rg = RidgeClassifier()
clf_array = [rf, et, knn, svc, rg]


def zip_stacked_classifiers(*args):
    """Pair up the non-empty subsets of the first two argument sequences.

    Every positional argument is expanded into the list of all its non-empty
    combinations (each as a list), ordered by subset size and, within a size,
    in ``itertools.combinations`` order. The expansions of the first and the
    second argument are then zipped, so the i-th subset of one lines up with
    the i-th subset of the other.

    Arguments after the second are expanded but do not appear in the result.

    Returns:
        A one-shot ``zip`` iterator of ``(subset_of_args[0], subset_of_args[1])``.

    Raises:
        IndexError: if fewer than two sequences are passed.
    """
    subsets_per_arg = []
    for sequence in args:
        # Start at size 1 so the empty combination is never produced.
        subsets_per_arg.append([
            list(combo)
            for size in range(1, len(sequence) + 1)
            for combo in combinations(sequence, size)
        ])

    return zip(subsets_per_arg[0], subsets_per_arg[1])

# Every non-empty subset of the base learners, paired with the matching
# subset of display names.
stacked_clf_list = zip_stacked_classifiers(clf_array, names)

# [best test accuracy seen so far, names of the base learners that reached it]
best_combination = [0.00, ""]

for combo_estimators, combo_names in stacked_clf_list:

    # One stacked ensemble per subset: the chosen base learners form the
    # first layer and `best_clf` is attached as the meta-learner.
    ensemble = SuperLearner(scorer = accuracy_score,
                            random_state = 0,
                            folds = 10)
    ensemble.add(combo_estimators)
    ensemble.add_meta(best_clf)
    ensemble.fit(X_train, y_train)
    preds = ensemble.predict(X_test)
    # accuracy_score takes (y_true, y_pred).
    accuracy = accuracy_score(y_test, preds)

    if accuracy > best_combination[0]:
        best_combination[0] = accuracy
        best_combination[1] = combo_names

    # .format() must be applied to the string, not to print()'s return value
    # (print returns None on Python 3).
    print("Accuracy score: {0:.3f} {1}".format(accuracy, combo_names))

print("\nBest stacking model is {} with accuracy of: {:.3f}".format(best_combination[1], best_combination[0]))

In [4]:
from mlxtend.classifier import StackingClassifier
# Display names for the base learners, in the same order as `clf_array`.
names = ['Random Forest', 'Extra Trees', 'KNeighbors', 'SVC', 'Ridge Classifier']

# 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size = 0.3, random_state = 0)

# Logistic regression used as the meta-classifier of the stack.
best_clf = LogisticRegression(C=0.001, max_iter=100000)

# Base learners. The randomized tree ensembles are seeded so that results are
# repeatable across runs.
rf = RandomForestClassifier(random_state=0)
et = ExtraTreesClassifier(random_state=0)
knn = KNeighborsClassifier()
svc = SVC()
rg = RidgeClassifier()
clf_array = [rf, et, knn, svc, rg]

# mlxtend stacking classifier over all five base learners; it is only
# constructed here, not fitted.
sclf = StackingClassifier(classifiers= clf_array,meta_classifier=best_clf)

In [6]:
import matplotlib.gridspec as gridspec
import itertools
from mlxtend.plotting import plot_decision_regions

# Display names for the base learners, in the same order as `clf_array`.
names = ['Random Forest', 'Extra Trees', 'KNeighbors', 'SVC', 'Ridge Classifier']

# 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size = 0.3, random_state = 0)

best_clf = LogisticRegression(C=0.001, max_iter=100000)

# Base learners. The randomized tree ensembles are seeded so that the
# cross-validation scores below are repeatable.
rf = RandomForestClassifier(random_state=0)
et = ExtraTreesClassifier(random_state=0)
knn = KNeighborsClassifier()
svc = SVC()
rg = RidgeClassifier()
clf_array = [rf, et, knn, svc, rg]

# 10-fold cross-validated accuracy of every base learner on the full data.
# Iterate over (classifier, name) pairs only: zipping in a 2x2 subplot grid
# as well would stop after four items and silently skip the fifth classifier.
clf_cv_mean = []
clf_cv_std = []
for clf, label in zip(clf_array, names):
    scores = cross_val_score(clf, features, labels, cv=10, scoring='accuracy')
    print('Accuracy: %.3f(+/- %.2f) [%s]' %(scores.mean(),scores.std(),label))
    clf_cv_mean.append(scores.mean())
    clf_cv_std.append(scores.std())

# Summarize the collected scores: bar height is the mean accuracy, the error
# bar is one standard deviation across folds.
fig, ax = plt.subplots(figsize=(10, 8))
ax.bar(names, clf_cv_mean, yerr=clf_cv_std, capsize=4)
ax.set_xlabel('Classifier')
ax.set_ylabel('Mean 10-fold CV accuracy')
ax.set_title('Cross-validated accuracy of the base learners')
plt.show()

KeyboardInterrupt: 

<Figure size 720x576 with 0 Axes>

from mlens.ensemble import SuperLearner
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split


# Final stacked model: fit one SuperLearner on the training split and report
# its accuracy on the held-out split.
# NOTE: `rf`, `et`, `knn`, `rg`, `best_clf`, `features` and `labels` come from
# earlier cells; run those first.
seed = 0

ensemble = SuperLearner(scorer = accuracy_score,
                        random_state=seed,
                        folds=10,
                        verbose = 2)

X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.30, random_state=seed)

# Build the first layer (SVC is left out of this layer)
ensemble.add([rf, et, knn, rg])
# Attach the final meta estimator
ensemble.add_meta(best_clf)

ensemble.fit(X_train, y_train)
preds = ensemble.predict(X_test)
print("Fit data:\n%r" % ensemble.data)
# Format the string before printing: print() returns None on Python 3, so
# calling .format() on its result raises AttributeError.
# accuracy_score takes (y_true, y_pred).
print("Accuracy score: {:.3f}".format(accuracy_score(y_test, preds)))