In [1]:
import numpy as np
import matplotlib.pyplot as plt
import mltools as ml

from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report,confusion_matrix,roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier

from sklearn.naive_bayes import GaussianNB
from sklearn.externals import joblib
from sklearn.preprocessing import StandardScaler

from sklearn.decomposition import PCA

from imblearn.combine import SMOTETomek

np.random.seed(0)

X = np.genfromtxt("data/X_train.txt", delimiter = None)
Y = np.genfromtxt("data/Y_train.txt", delimiter = None)

X_train, X_val, Y_train, Y_val = ml.splitData(X, Y, .75)
X_test = np.genfromtxt("data/X_test.txt", delimiter = None)

In [2]:
#X_train_ipca = np.genfromtxt("reduced_data/Training/training_data_IPCA.txt")
#X_val_ipca = np.genfromtxt("reduced_data/Validation/validation_data_IPCA.txt")
#X_test_ipca = np.genfromtxt("reduced_data/Test/test_data_IPCA.txt")

X_train_svd = np.genfromtxt("reduced_data/Training/transformed_train_svd.txt")
X_val_svd = np.genfromtxt("reduced_data/Validation/transformed_validation_svd.txt")
X_test_svd = np.genfromtxt("reduced_data/Test/transformed_test_svd.txt")


Y_train_svd = np.genfromtxt("reduced_data/Training/train_Y.txt")
Y_val_svd = np.genfromtxt("reduced_data/Validation/validation_Y.txt")

In [3]:
class BaggedTree(ml.base.classifier):
    def __init__(self, learners):
        """Constructs a BaggedTree class with a set of learners. """
        self.learners = learners
    
    def predictSoft(self, X):
        """Predicts the probabilities with each bagged learner and average over the results. """
        n_bags = len(self.learners)
        preds = [self.learners[l].predictSoft(X) for l in range(n_bags)]
        return np.mean(preds, axis=0)

In [8]:
minParent_list = [2**x for x in range(1, 6)]

for mp in minParent_list:
    for md in range(0, 35, 5):
        n_bags = 20
        bags = []
        for i in range(n_bags):
            Xi, Yi = ml.bootstrapData(X_train_svd, Y_train_svd, X_train_svd.shape[0])

            tree = ml.dtree.treeClassify(Xi, Yi, minParent=mp, maxDepth=md, nFeatures=5)
            bags.append(tree)

        model = BaggedTree(bags)
        model.classes = np.unique(Y_train_svd)
        print("mp = {}, md = {}".format(mp, md))
        print("val   auc = {}".format(model.auc(X_val_svd, Y_val_svd)))

mp = 2, md = 0
val   auc = 0.5
mp = 2, md = 5
val   auc = 0.6770598819553819
mp = 2, md = 10
val   auc = 0.7140122523170583
mp = 2, md = 15
val   auc = 0.7581119775544675
mp = 2, md = 20
val   auc = 0.7869464097274356
mp = 2, md = 25
val   auc = 0.7987130548273701
mp = 2, md = 30
val   auc = 0.7982678588467652
mp = 4, md = 0
val   auc = 0.5
mp = 4, md = 5
val   auc = 0.6762841820781329
mp = 4, md = 10
val   auc = 0.714439302578227
mp = 4, md = 15
val   auc = 0.7595360691038869
mp = 4, md = 20
val   auc = 0.7873214182860556
mp = 4, md = 25
val   auc = 0.7982449985197203
mp = 4, md = 30
val   auc = 0.7987298368173326
mp = 8, md = 0
val   auc = 0.5
mp = 8, md = 5
val   auc = 0.6770726295789851
mp = 8, md = 10
val   auc = 0.7145430418039808
mp = 8, md = 15
val   auc = 0.7616875950598588
mp = 8, md = 20
val   auc = 0.7891099941365802
mp = 8, md = 25
val   auc = 0.7967648252191868
mp = 8, md = 30
val   auc = 0.7978575426826788
mp = 16, md = 0
val   auc = 0.5
mp = 16, md = 5
val   auc = 0.677

In [9]:
n_bags = 20
bags = []
for i in range(n_bags):
    Xi, Yi = ml.bootstrapData(X_train_svd, Y_train_svd, X_train_svd.shape[0])

    tree = ml.dtree.treeClassify(Xi, Yi, minParent=32, maxDepth=30, nFeatures=5)
    bags.append(tree)

model = BaggedTree(bags)
model.classes = np.unique(Y_train_svd)

In [7]:
print("train auc = {}".format(model.auc(X_train, Y_train)))
print("val   auc = {}".format(model.auc(X_val, Y_val)))

train auc = 0.9236229185297911
val   auc = 0.7734124671032768


In [10]:
#n_bags = 20
#bags_ipca = []
#for i in range(n_bags):
#    Xi, Yi = ml.bootstrapData(X_train_ipca, Y_train_ipca, X_train_ipca.shape[0])
#
#    tree = ml.dtree.treeClassify(Xi, Yi, minParent=32, maxDepth=25, nFeatures=4)
#    bags_ipca.append(tree)
#    
#model_ipca = BaggedTree(bags_ipca)
#model_ipca.classes = np.unique(Y_train_ipca)

In [11]:
#print("train auc = {}".format(model_ipca.auc(X_train_ipca, Y_train_ipca)))
#print("val   auc = {}".format(model_ipca.auc(X_val_ipca, Y_val_ipca)))

In [12]:
Yte = np.vstack((np.arange(X_test_svd.shape[0]), model.predictSoft(X_test_svd)[:,1])).T
np.savetxt('data/Y_submit_random_forest_5Features_svd.txt', Yte, '%d, %.2f', header='ID,Prob1', comments='', delimiter=',')