In [1]:
import os
import numpy as np 

from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc
from sklearn import metrics
from sklearn.svm import NuSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn import neighbors, datasets
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_recall_fscore_support
from sklearn.tree import DecisionTreeClassifier

from numpy import linalg as LA
import pandas as pd
from openpyxl import load_workbook

import time
import eda as eda

from sklearn.preprocessing import normalize

RANDOM_STATE = 42

np.random.seed(RANDOM_STATE)



In [2]:
file_loc = "datasets/abalone71.csv"

In [3]:
import sys
import csv
from collections import defaultdict

def construct_line( label, line ):
    new_line = []
    new_line.append(str(label))

    for i, item in enumerate( line ):
        new_item = "%s:%s" % ( i, item )
        new_line.append( new_item )
    new_line = " ".join( new_line )
    new_line += "\n"
    return new_line

def csv2libsvm(data, output_file):
    try:
        o = open( output_file, 'w')

        reader = data

        for line in reader:
            label = len(line)
            new_line = construct_line( label, line )
            o.write( new_line )
        
        return 1
    except:
        return 0

In [4]:
class TOMBoost:
    def __init__(self, M, topics = 10, depth=None):
        self.M = M
        self.topics = topics
        self.depth = depth
        self.error = np.zeros((M,1))

    def getWeights(self, X):
        try:
            status = csv2libsvm(X_train, file_loc+".data")

            if status == 1:
                myCmd = "./lda est 0.01 " +str(self.topics)+" settings.txt " + "../" + file_loc +".data random "  + \
                        "../" +file_loc +"_output/"
        
            os.chdir("lda-c")
            returned_value = os.system(myCmd)  # returns the exit code in unix
            os.chdir("../")
            
            if returned_value != 0:
                print("ERROR!!!")
                return None

            X_lda = np.genfromtxt(file_loc +"_output/final.gamma",delimiter=' ')
            X_lda = normalize(X_lda,axis=1,norm='l1')
            X_lda = LA.norm(X_lda,axis=1)
            
            return X_lda
        
        except Exception as e:
            print(e)
            return None
    
    def fit(self, X, Y):
        self.models = []
        self.alphas = []

        N, _ = X.shape
        
        W = self.getWeights(X)

        if not W is None:
            
            W_maj = np.array([W[i] for i in range(X.shape[0]) if Y[i] == -1])
            W_min = np.array([W[i] for i in range(X.shape[0]) if Y[i] == 1])

            W_maj = (W_maj - W_maj.min()) / (W_maj.max() - W_maj.min())
            W_min = (W_min - W_min.min()) / (W_min.max() - W_min.min())

            W = np.concatenate((W_maj,W_min),axis=0)

            W = W/sum(W)

            X_maj = np.array([X[i] for i in range(X.shape[0]) if Y[i] == -1])
            X_min = np.array([X[i] for i in range(X.shape[0]) if Y[i] == 1])

            X = np.concatenate((X_maj,X_min),axis=0)
            Y = np.concatenate((np.zeros(X_maj.shape[0])-1,np.ones(X_min.shape[0])),axis=0)

            for m in range(self.M):

                resample_size = X_min.shape[0]

                W_maj_norm = W[0:X_maj.shape[0]]/sum(W[0:X_maj.shape[0]])
                W_min_norm = W[X_maj.shape[0]+1:]/sum(W[X_maj.shape[0]+1:])

                X_maj_indices = np.random.choice(X_maj.shape[0],resample_size,p=W_maj_norm)
                X_min_indices = np.random.choice(range(X_maj.shape[0]+1,X.shape[0]),resample_size,p=W_min_norm)

                X_sampled = np.concatenate((X[X_maj_indices],X[X_min_indices]),axis=0)
                y_sampled = np.concatenate((Y[X_maj_indices],Y[X_min_indices]),axis=0)
                chosen_indices = np.concatenate((X_maj_indices,X_min_indices),axis=0)

                tree = DecisionTreeClassifier(max_depth=self.depth, splitter='best')

                tree.fit(X_sampled, y_sampled, sample_weight=W[chosen_indices])

                P = tree.predict(X)

                err = np.sum(W[P != Y])

                if err > 0.5:
                    m = m - 1
                if err <= 0:
                    err = 0.0000001
                else:
                    try:
                        if (np.log(1 - err) - np.log(err)) == 0 :
                            alpha = 0
                        else:
                            alpha = 0.5 * (np.log(1 - err) - np.log(err))
                        W = W * np.exp(-alpha * Y * P)  # vectorized form
                        W = W / W.sum()  # normalize so it sums to 1
                    except:
                        alpha = 0
                        # W = W * np.exp(-alpha * Y * P)  # vectorized form
                        W = W / W.sum()  # normalize so it sums to 1

                    self.models.append(tree)
                    self.alphas.append(alpha)

                self.error[m] = err

    def predict(self, X):
        N, _ = X.shape
        FX = np.zeros(N)
        for alpha, tree in zip(self.alphas, self.models):
            FX += alpha * tree.predict(X)
        return np.sign(FX), FX

    def predict_proba(self, X):
        # if self.alphas == 'SAMME'
        proba = sum(tree.predict_proba(X) * alpha for tree , alpha in zip(self.models,self.alphas) )


        proba = np.array(proba)


        proba = proba / sum(self.alphas)

        proba = np.exp((1. / (2 - 1)) * proba)
        normalizer = proba.sum(axis=1)[:, np.newaxis]
        normalizer[normalizer == 0.0] = 1.0
        # proba =  np.linspace(proba)
        # proba = np.array(proba).astype(float)
        proba = proba /  normalizer

        # print(proba)
        return proba

    def predict_proba_samme(self, X):
        # if self.alphas == 'SAMME.R'
        proba = sum(_samme_proba(est , 2 ,X) for est in self.models )

        proba = np.array(proba)

        proba = proba / sum(self.alphas)

        proba = np.exp((1. / (2 - 1)) * proba)
        normalizer = proba.sum(axis=1)[:, np.newaxis]
        normalizer[normalizer == 0.0] = 1.0
        # proba =  np.linspace(proba)
        # proba = np.array(proba).astype(float)
        proba = proba / normalizer

        # print('proba = ',proba)
        return proba.astype(float)

In [None]:
main = eda.eda()
main.read_data_csv(file_loc,header_row=0)

if len(main.target)-sum(main.target) > sum(main.target):
    majority_class = 0
else:
    majority_class = 1

for i in range(len(main.target)):
    if main.target[i] == majority_class:
        main.target[i] = -1
    else:
        main.target[i] = 1

X = main.data
y = main.target

for t in range(0,4):
    
    print(t)            

    skf = StratifiedKFold(n_splits=5)

    for train_index, test_index in skf.split(X, y):
        
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        try:
            tree = TOMBoost(10)
            tree.fit(X_train, y_train)
            
        except Exception as e:
            import traceback
            traceback.print_exc()
            print(e)
            break

        y_predict = tree.predict(X_test)

        y_score = tree.predict_proba(X_test)
        y_score = [row[1] for row in y_score]

        fpr, tpr, _ = roc_curve(y_test, y_score,pos_label=1)
        
        print('PRECISION - RECALL - F1-Score Report')
        print(metrics.precision_recall_fscore_support(y_test, y_predict[0]))
        print('AUC :' + str(auc(fpr, tpr)))

0
PRECISION - RECALL - F1-Score Report
(array([0.94485842, 0.25454545]), array([0.83751651, 0.53164557]), array([0.88795518, 0.3442623 ]), array([757,  79]))
AUC :0.7718007457819841
PRECISION - RECALL - F1-Score Report
(array([0.94375   , 0.21538462]), array([0.79788639, 0.53846154]), array([0.86471009, 0.30769231]), array([757,  78]))
AUC :0.7927632693154489
