In [2]:
import numpy as np
import matplotlib.pyplot as plt
import mltools as ml

from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report,confusion_matrix,roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier

from sklearn.naive_bayes import GaussianNB
from sklearn.externals import joblib
from sklearn.preprocessing import StandardScaler

from sklearn.decomposition import PCA

from imblearn.combine import SMOTETomek

# Seed NumPy's global RNG so every later random draw in the notebook is repeatable.
np.random.seed(0)

# genfromtxt's default delimiter (None) treats any run of whitespace as a separator.
X = np.genfromtxt("data/X_train.txt")
Y = np.genfromtxt("data/Y_train.txt")

# Hold out part of the labelled data for validation.
# presumably 0.75 is the fraction kept for training — confirm against mltools.splitData
X_train, X_val, Y_train, Y_val = ml.splitData(X, Y, 0.75)

# Unlabelled features that are scored for the submission file.
X_test = np.genfromtxt("data/X_test.txt")

In [3]:
class BaggedTree(ml.base.classifier):
    """Ensemble classifier that averages the soft predictions of its learners.

    The constructor only stores the learners; it does not set ``classes``.
    In this notebook ``classes`` is assigned on the instance right after
    construction.
    """

    def __init__(self, learners):
        """Constructs a BaggedTree class with a set of learners.

        Args:
            learners: sequence of already-trained models, each exposing a
                ``predictSoft(X)`` method.
        """
        self.learners = learners

    def predictSoft(self, X):
        """Predicts the probabilities with each bagged learner and averages the results.

        Args:
            X: feature data, passed unchanged to every learner's ``predictSoft``.

        Returns:
            The element-wise mean (over learners) of the per-learner soft predictions.

        Raises:
            ValueError: if the ensemble contains no learners.
        """
        # An empty ensemble would make np.mean return a scalar NaN (with only a
        # RuntimeWarning), which then fails later with an unrelated error.
        if len(self.learners) == 0:
            raise ValueError("BaggedTree has no learners to predict with")

        preds = [learner.predictSoft(X) for learner in self.learners]
        # axis=0 averages across learners and keeps each prediction's own shape.
        return np.mean(preds, axis=0)

In [13]:
# Bagging: fit n_bags decision trees, each on its own bootstrap resample of the
# training split, then wrap them in a single averaging ensemble.
n_bags = 20
bags = []

for i in range(n_bags):
    # Resample as many rows as the training split itself contains.
    Xi, Yi = ml.bootstrapData(X_train, Y_train, X_train.shape[0])

    # presumably nFeatures=4 limits each split to a random subset of 4 features
    # (random-forest style) — confirm against mltools.dtree.treeClassify
    tree = ml.dtree.treeClassify(
        Xi,
        Yi,
        minParent=64,
        maxDepth=25,
        nFeatures=4,
    )
    bags.append(tree)

model = BaggedTree(bags)
# BaggedTree's constructor does not set the class labels, so attach them here.
model.classes = np.unique(Y_train)

In [14]:
# Report the ensemble's AUC on the training split, then on the held-out validation split.
train_auc = model.auc(X_train, Y_train)
print("train auc = {}".format(train_auc))

val_auc = model.auc(X_val, Y_val)
print("val   auc = {}".format(val_auc))

train auc = 0.8940343806471489
val   auc = 0.7689064541830042


In [15]:
# Build the two-column submission table: a running row index and column 1 of the
# ensemble's soft predictions (written under the 'Prob1' header).
row_ids = np.arange(X_test.shape[0])
prob1 = model.predictSoft(X_test)[:, 1]
Yte = np.vstack((row_ids, prob1)).T

# fmt is a multi-specifier string, so savetxt uses it verbatim for each row and
# ignores `delimiter`; comments='' stops the header from being prefixed with '# '.
# NOTE(review): '%.2f' rounds each probability to two decimals — confirm this
# precision is what the submission format expects.
np.savetxt(
    'data/Y_submit_random_forest_4Features.txt',
    Yte,
    fmt='%d, %.2f',
    header='ID,Prob1',
    comments='',
    delimiter=',',
)