In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
from itertools import combinations
from mlens.ensemble import SuperLearner
from sklearn.metrics import accuracy_score

[MLENS] backend: threading


In [2]:
df_full = pd.read_csv('full_tfidf_df.csv')
df_full.drop(columns=['Unnamed: 0'], inplace=True)
df_full[['num_tokens','mention_count','url_count','hashtag_count']] = df_full[['num_tokens','mention_count','url_count','hashtag_count']].fillna(0)
df_full=df_full.astype('int')
features = df_full.drop(columns = 'class')
labels = df_full['class']

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split, KFold
from sklearn.ensemble import BaggingClassifier, ExtraTreesClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC

In [None]:
names = ['Random Forest', 'Extra Trees', 'KNeighbors', 'SVC', 'Ridge Classifier']
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size = 0.3, random_state = 0)

best_clf = LogisticRegression(C=0.001, max_iter=100000)
rf = RandomForestClassifier()
et = ExtraTreesClassifier()
knn = KNeighborsClassifier()
svc = SVC()
rg = RidgeClassifier()
clf_array = [rf, et, knn, svc, rg]


def zip_stacked_classifiers(*args):
    to_zip = []
    for arg in args:
        combined_items = sum([list(map(list, combinations(arg, i))) for i in range(len(arg) + 1)], [])
        combined_items = filter(lambda x: len(x) > 0, combined_items)
        to_zip.append(combined_items)
    
    return zip(to_zip[0], to_zip[1])

stacked_clf_list = zip_stacked_classifiers(clf_array, names)

best_combination = [0.00, ""]

for clf in stacked_clf_list:
    
    ensemble = SuperLearner(scorer = accuracy_score, 
                            random_state = 0, 
                            folds = 10)
    ensemble.add(clf[0])
    ensemble.add_meta(best_clf)
    ensemble.fit(X_train, y_train)
    preds = ensemble.predict(X_test)
    accuracy = accuracy_score(preds, y_test)
    
    if accuracy > best_combination[0]:
        best_combination[0] = accuracy
        best_combination[1] = clf[1]
    #print("Accuracy score: {0:.3f} {1}").format(accuracy, clf[1])
    print(accuracy)
    print(clf[1])
    print(best_combination[0])
    print(best_combination[1])
    
#print("\nBest stacking model is {} with accuracy of: {:.3f}").format(best_combination[1], best_combination[0])

0.7657613967022309
['Random Forest']
0.7657613967022309
['Random Forest']
0.7657613967022309
['Extra Trees']
0.7657613967022309
['Random Forest']
0.7657613967022309
['KNeighbors']
0.7657613967022309
['Random Forest']


In [5]:
print(best_combination)

[0.7657613967022309, ['Random Forest']]


In [8]:
clf_array

[RandomForestClassifier(),
 ExtraTreesClassifier(),
 KNeighborsClassifier(),
 SVC(),
 RidgeClassifier()]

In [9]:
names

['Random Forest', 'Extra Trees', 'KNeighbors', 'SVC', 'Ridge Classifier']