In [1]:
import mltools as ml
import numpy as np
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
#from xgboost import XGBClassifier


In [32]:
X = np.genfromtxt('data/X_train.txt', delimiter=',')
Y = np.genfromtxt('data/Y_train.txt', delimiter=',')
X_test = np.genfromtxt('data/X_test.txt', delimiter=',')
X, Y = ml.shuffleData(X, Y)
Xtr,Xva,Ytr,Yva = ml.splitData(X,Y,0.5) 
X1,X2,Y1,Y2 = ml.splitData(X,Y,0.5) 
X3,X4,Y3,Y4 = ml.splitData(X2,Y2,0.5) 

In [37]:
#HELPER FUNCTIONS
def blendModelsPrediction(classifiers):

    trainClassifiers(classifiers, X, Y)
    predictions = predictClassifers(classifiers, X_test)
    w = calculateWeight(predictions)
    r = computesFinalBlendResult(predictions, w)
    return r

def threeStackedEnsemble(l1_classifiers, l2_classifiers, l3_classifiers):
    ''' 
    l1_classifiers
        trained: X1, Y1
        predict: X3 -> l1_p_Y3
        predict: X4 -> l1_p_Y4
        predict: X_test -> l1_p_Y_test

    l2_classifiers
        trained: l1_p_Y3, Y3
        predict: l1_p_Y4 -> l2_p_Y4
        predict: l1_p_Y_test -> l2_p_Y_test

    l3_classifiers
        trained: l2_p_Y4, Y4
        predict: l2_p_Y_test -> submit prediction Y_test
    '''
    trainClassifiers(l1_classifiers, X1, Y1)
    l1_p_Y3 = stackResults(predictClassifers(l1_classifiers, X3))
    l1_p_Y4 = stackResults(predictClassifers(l1_classifiers, X4))
    l1_p_Y_test = stackResults(predictClassifers(l1_classifiers, X_test))

    
    trainClassifiers(l2_classifiers, l1_p_Y3, Y3)
    l2_p_Y4 = stackResults(predictClassifers(l2_classifiers, l1_p_Y4))
    l2_p_Y_test = stackResults(predictClassifers(l2_classifiers, l1_p_Y_test))

    trainClassifiers(l3_classifiers, l2_p_Y4, Y4)
    Y_test_prediction = predictClassifers(l3_classifiers, l2_p_Y_test)

    return Y_test_prediction


def convertToFinalPredictions(pred): #CONVERTS ALL PREDICITONS TO 0 AND 1
    final_pred = []
    for i in pred:
        if i>0.5:
            final_pred.append(1)
        else:
            final_pred.append(0)
    return np.array(final_pred)


def PrintValidationErrorofPredictions(preds): #PRINTS THE ERROR OF EACH PREDICTION ON THE VALIDATION DATA
    i = 0
    for p in preds:
        fp = convertToFinalPredictions(p)
        print(i,"Prediction ERROR: ", 1-sum(fp==Yva)/float(len(Yva)))
        i+=1

def calculateWeight(predictions): #RETURNS THE WEIGHTS FOR EACH PREDICITON
    numOfleaners = len(predictions)
    predictions = np.array([np.array(convertToFinalPredictions(pred)) for pred in predictions])
    weight = np.zeros(numOfleaners)
    for indx, trueClass in enumerate(Yva):
        for i in range(numOfleaners):
            if predictions[i,indx] == trueClass:
                weight[i] += 1
            else:
                weight[i] -= 1
    return weight/sum(weight)


def computeBlendErr(pred,weight): #COMPUTES AND PRINTS THE ERROR OF BLEND
    result = 0
    for i in range(len(pred)):
        result += np.array(pred[i]) * weight[i]
    PrintValidationErrorofPredictions([result])

def computesFinalBlendResult(pred, weights): #COMPUTES BLEND FINAL RESULT AND RETURNS
    result = 0
    for i in range(len(pred)):
        result += np.array(pred[i]) * weights[i]
    return result


def submitPredictions(pred): #PUTS YOUR PREDICTIONS INTO FinalPredictions.txt FOR SUBMISSION
    Y_test = np.vstack((np.arange(X_test.shape[0]), pred)).T
    # Output a file with two columns, a row ID and a confidence in class 1:
    np.savetxt('FinalPredictions.txt', Y_test, '%d, %.2f',header='ID,Predicted', delimiter=',')


def stackResults(predictions): #STACKS PREDICTIONS TO SINGLE ARRAY FOR TRAINING 
    #EACH COLUMN OF PREDICTIONS BECOMES A ROW FOR THE NEW TRAINING
    new_outputs = []
    for pred in predictions:
        new_outputs.append([[j] for j in pred])
    return np.array(np.hstack((np.array(new_outputs))))

def finalSubmit(pred_tr, pred_te):
    weights = calculateWeight(pred_tr)
    print("final weights:",weights)
    final_prediction = computesFinalResult(pred_te, weights)
    submitPredictions(final_prediction) 

def printModelAccuracy(model):
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    n_scores = cross_val_score(model, X, Y, scoring='accuracy', cv=cv, n_jobs=-1)
    # report performance
    print(model.__class__.__name__, 'Accuracy: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))

def trainClassifiers(classifiers, trainingX, trainingY):
    for clf in classifiers:
        print("TRAINING", clf.__class__.__name__)
        clf.fit(trainingX, trainingY)

def predictClassifers(classifiers, data):
    classifier_predictions = []
    for clf in classifiers:
        classifier_predictions.append(np.array(clf.predict_proba(data)[:, 1]))
    return np.array(classifier_predictions)

In [66]:
ada = AdaBoostClassifier(n_estimators=50)
gb = GradientBoostingClassifier()
lr = LogisticRegression(penalty='l1')
svc = SVC(probability=True)
nn = MLPClassifier()
knn = KNeighborsClassifier(n_neighbors=2)
rf = RandomForestClassifier(n_estimators=100)

l1_classifiers = [
    AdaBoostClassifier(n_estimators=50), AdaBoostClassifier(n_estimators=50), AdaBoostClassifier(n_estimators=50),
    GradientBoostingClassifier(),GradientBoostingClassifier(),GradientBoostingClassifier(),GradientBoostingClassifier(),GradientBoostingClassifier(),GradientBoostingClassifier(),GradientBoostingClassifier(),GradientBoostingClassifier(),GradientBoostingClassifier(),GradientBoostingClassifier(),GradientBoostingClassifier(),GradientBoostingClassifier(),GradientBoostingClassifier(),GradientBoostingClassifier(),GradientBoostingClassifier(),GradientBoostingClassifier(),GradientBoostingClassifier(),GradientBoostingClassifier(),GradientBoostingClassifier(),GradientBoostingClassifier(),LogisticRegression(penalty='l1'),LogisticRegression(penalty='l1'),LogisticRegression(penalty='l1'),SVC(probability=True),SVC(probability=True),SVC(probability=True),SVC(probability=True),SVC(probability=True),MLPClassifier(),MLPClassifier(),MLPClassifier(),MLPClassifier(),MLPClassifier(),MLPClassifier(),MLPClassifier(),MLPClassifier(),MLPClassifier(),MLPClassifier(),MLPClassifier(),MLPClassifier(),MLPClassifier(),MLPClassifier(),KNeighborsClassifier(n_neighbors=2),KNeighborsClassifier(n_neighbors=2),RandomForestClassifier(n_estimators=100),RandomForestClassifier(n_estimators=100),RandomForestClassifier(n_estimators=100),RandomForestClassifier(n_estimators=100),RandomForestClassifier(n_estimators=100),RandomForestClassifier(n_estimators=100),RandomForestClassifier(n_estimators=100),RandomForestClassifier(n_estimators=100),RandomForestClassifier(n_estimators=100),RandomForestClassifier(n_estimators=100),RandomForestClassifier(n_estimators=100)
]


l2_classifiers = [GradientBoostingClassifier(), LogisticRegression(penalty='l1'), MLPClassifier()]
l3_classifiers = [RandomForestClassifier(n_estimators=1000)]


# for clf in classifiers:
#     printModelAccuracy(clf)

In [67]:
final_stacked_prediction = threeStackedEnsemble(l1_classifiers, l2_classifiers, l3_classifiers)


('TRAINING', 'AdaBoostClassifier')
('TRAINING', 'AdaBoostClassifier')
('TRAINING', 'AdaBoostClassifier')
('TRAINING', 'GradientBoostingClassifier')
('TRAINING', 'GradientBoostingClassifier')
('TRAINING', 'GradientBoostingClassifier')
('TRAINING', 'GradientBoostingClassifier')
('TRAINING', 'GradientBoostingClassifier')
('TRAINING', 'GradientBoostingClassifier')
('TRAINING', 'GradientBoostingClassifier')
('TRAINING', 'GradientBoostingClassifier')
('TRAINING', 'GradientBoostingClassifier')
('TRAINING', 'GradientBoostingClassifier')
('TRAINING', 'GradientBoostingClassifier')
('TRAINING', 'GradientBoostingClassifier')
('TRAINING', 'GradientBoostingClassifier')
('TRAINING', 'GradientBoostingClassifier')
('TRAINING', 'GradientBoostingClassifier')
('TRAINING', 'GradientBoostingClassifier')
('TRAINING', 'GradientBoostingClassifier')
('TRAINING', 'GradientBoostingClassifier')
('TRAINING', 'GradientBoostingClassifier')
('TRAINING', 'GradientBoostingClassifier')
('TRAINING', 'LogisticRegression')


In [64]:
final_blend_prediction = blendModelsPrediction([AdaBoostClassifier(n_estimators=50),
    GradientBoostingClassifier(),GradientBoostingClassifier(),RandomForestClassifier(n_estimators=100),RandomForestClassifier(n_estimators=100),RandomForestClassifier(n_estimators=100),RandomForestClassifier(n_estimators=100),RandomForestClassifier(n_estimators=1000)])

('TRAINING', 'AdaBoostClassifier')
('TRAINING', 'GradientBoostingClassifier')
('TRAINING', 'GradientBoostingClassifier')
('TRAINING', 'RandomForestClassifier')
('TRAINING', 'RandomForestClassifier')
('TRAINING', 'RandomForestClassifier')
('TRAINING', 'RandomForestClassifier')
('TRAINING', 'RandomForestClassifier')


In [68]:
print(final_blend_prediction)
print(final_stacked_prediction[0])
float(sum(convertToFinalPredictions(final_blend_prediction)==convertToFinalPredictions(final_stacked_prediction[0])))/len(final_blend_prediction)

[0.34203022 0.19286418 0.53705284 ... 0.44350903 0.8320947  0.48865463]
[0.371 0.027 0.384 ... 0.258 1.    0.416]


0.7400970088924818

In [69]:
submitPredictions(final_stacked_prediction[0])