# In [None]:
#Program: Helminiak-project1
#Version: 1.0
#Author: David Helminiak
#Date Created: 12 October 2018
#Date Last Modified: 18 October 2018
#Changelog: 0.1 - visual decision tree construction - Oct 12, 2018
#           0.2 - accepts generic dataset and uses parallel processing - Oct 15, 2018
#           0.3 - changes un-balanced class dataset to balanced; optimizes split function - Oct 15, 2018
#           0.4 - balances classes using normal, truncated_normal, or oversampling techniques - Oct 15, 2018
#           0.5 - provides basic statistical analysis - Oct 16, 2018
#           0.6 - drops low class. corr. variables, draws from balanced df for training, term. tree gen on 1 rem. class - Oct 17, 2018
#           0.7 - fixed failure to evaluate any but the first variable's information gain - Oct 18, 2018
#           0.8 - quick function run options - Oct 18, 2018
#           0.9 - print results to file - Oct 18, 2018
#           1.0 - Finished initial construction - Oct 18, 2018
#USEFUL FUNCTIONS:
#Add Breakpoint: from IPython.core.debugger import Tracer; Tracer()() 

#LIBRARY IMPORTS
import sys, os, math, pydot, multiprocessing, random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from IPython.display import Image
from joblib import Parallel, delayed
from scipy import stats
from datetime import datetime
#Also requires installation of GraphViz package - for OSX: "brew install graphviz"

#FUNCTIONS AND CLASS DEFINITIONS

#Calculate gini index for a particular characteristic
def gini(df, column):
    """Return the Gini impurity of the values stored in ``df[column]``.

    The result starts at 1 and has the squared relative frequency of every
    distinct value subtracted from it. Frequencies are relative to the total
    number of rows in ``df``; value_counts() skips missing values, so they do
    not contribute a term.
    """
    total_rows = len(df)
    impurity = 1
    #One count per distinct value found in the column
    for occurrences in df[column].value_counts().values:
        impurity = impurity - (occurrences / total_rows) ** 2
    return impurity

#Determine split point information gain results for each projection value
def bestSplit_parhelper(i, df, originalGini, characteristic, classifierID, projectVals, splits):
    """Store the information gain of splitting ``df`` at ``projectVals[i]``.

    Parameters:
        i: position of the candidate splitting point inside ``projectVals``
           (and of the row to fill in inside ``splits``).
        df: dataframe holding the samples to split.
        originalGini: gini impurity of ``df`` before splitting.
        characteristic: name of the column the split is performed on.
        classifierID: name of the column holding the class of each sample.
        projectVals: list of candidate splitting points.
        splits: dataframe with an 'information_gain' column; row ``i`` is overwritten.

    Returns:
        ``splits`` with the information gain written into row ``i``.
    """
    #Rows strictly below the candidate go left; everything else goes right.
    #A boolean mask replaces the former row-by-row DataFrame.append(), which
    #was quadratic and no longer exists in pandas 2.x. Missing values never
    #compare as smaller, so they still end up on the right-hand side.
    goesLeft = df[characteristic] < projectVals[i]
    leftData = df[goesLeft]
    rightData = df[~goesLeft]
    #Calculate gini values for left and right nodes
    leftGini = gini(leftData, classifierID)
    rightGini = gini(rightData, classifierID)
    #Weight each side's gini by its share of the samples
    combinedGini = ((len(leftData)/len(df))*leftGini)+((len(rightData)/len(df))*rightGini)
    informationGain = originalGini-combinedGini
    #Single positional assignment; chained df[col].iloc[i]=... is not guaranteed to write back
    splits.iloc[i, splits.columns.get_loc('information_gain')] = informationGain
    return splits

#Determine the best possible information gain and splitting point for a characteristic
def bestSplit(df, originalGini, characteristic, classifierID):
    """Find the best splitting point of one characteristic.

    Candidate splitting points are the midpoints between neighbouring distinct
    values of ``df[characteristic]``. The information gain of every candidate
    is evaluated in parallel through bestSplit_parhelper.

    Parameters:
        df: dataframe holding the samples to split.
        originalGini: gini impurity of ``df`` before splitting.
        characteristic: name of the column to evaluate.
        classifierID: name of the column holding the class of each sample.

    Returns:
        (maxGain, splitPoint): the largest information gain found and the
        splitting point that produces it, or (0, 0) when the characteristic
        has fewer than two distinct values and therefore cannot be split.
    """
    #Sorted distinct values of the characteristic (missing values are ignored)
    uniqueVals = sorted(df[characteristic].dropna().unique())
    #Project mean values of neighbouring distinct values to find candidate splitting points
    projectVals = [(lower+upper)/2 for lower, upper in zip(uniqueVals, uniqueVals[1:])]
    if (len(projectVals) == 0): #If there is no data to split
        return 0, 0 #Then there is no information to be gained and the split point is negligible
    #Template handed to every worker; each worker fills in its own row
    splits = pd.DataFrame(data={'projection_values': projectVals, 'information_gain': np.nan})
    #For each of the possible splitting points calculate the resulting information gain
    num_threads = multiprocessing.cpu_count() #Determine number of available threads
    results = Parallel(n_jobs=num_threads)(delayed(bestSplit_parhelper)(i, df, originalGini, characteristic, classifierID, projectVals, splits) for i in range(0, len(projectVals))) #Perform task in parallel
    #The ith result carries the desired value in its ith row; gather them into one series
    gains = pd.Series([results[i]['information_gain'].iloc[i] for i in range(0, len(results))], dtype='float64')
    #Report the maximum gain itself (previously the most frequent gain value was returned)
    bestPosition = gains.idxmax()
    return gains.iloc[bestPosition], projectVals[bestPosition]

#Find best information gain over all of the characteristics and then split the data accordingly
def split(df, classifierID, printEverything):
    """Split ``df`` on the characteristic offering the largest information gain.

    Parameters:
        df: dataframe holding the samples of the current node.
        classifierID: name of the column holding the class of each sample.
        printEverything: accepted for interface compatibility; the progress
            line below is printed regardless of its value.

    Returns:
        (splitChar, splitPoint, leftData, rightData): the chosen column name,
        the chosen threshold, the rows with a value below the threshold and
        the remaining rows. Rows containing any missing value are left out of
        both sides.
    """
    #Calculate original gini
    originalGini = gini(df, classifierID)

    #Every column except the classifier is a candidate characteristic
    columnNames = [name for name in df.columns.values if name != classifierID]

    #Determine which is best to perform split
    gains = []
    splitPoints = []
    for i in range (0, len(columnNames)):
        print('Split Evaluation: ', i/len(columnNames)*100, '%')
        charInformationGain, charSplitPoint = bestSplit(df, originalGini, columnNames[i], classifierID)
        gains.append(charInformationGain)
        splitPoints.append(charSplitPoint)
    #Built in one go instead of filling cells through chained assignment
    charSplit = pd.DataFrame(data={'characteristic': columnNames,
                                   'information_gain': pd.Series(gains, dtype='float64'),
                                   'splitting_point': pd.Series(splitPoints, dtype='float64')})
    best = charSplit['information_gain'].idxmax()
    splitChar = charSplit['characteristic'].iloc[best]
    splitPoint = charSplit['splitting_point'].iloc[best]

    #Actually split the data
    complete = df.dropna() #Rows with missing values belong to neither side
    goesLeft = complete[splitChar] < splitPoint #Values below the split point go left
    leftData = complete[goesLeft]
    rightData = complete[~goesLeft] #Values greater than or equal to the split point go right
    return splitChar, splitPoint, leftData, rightData

#Build the full tree from each sub-tree found for each node within a decision tree object
def buildGraph(tree):
    """Merge the edges of ``tree`` and of all its descendants into one graph.

    Every node only stores the edges leading to its own children; this walks
    the tree recursively and copies those edges, node first, then the left
    sub-tree, then the right sub-tree, into a fresh pydot graph.
    """
    combined = pydot.Dot(graph_type='graph') #Blank graph that will hold every sub-tree
    for edge in tree.graph.get_edges(): #Edges stored on this node
        combined.add_edge(edge)
    if (tree.leftChild is None): #No further sub-tree below this node
        return combined
    for child in (tree.leftChild, tree.rightChild): #Left hand child first, then right hand child
        for edge in buildGraph(child).get_edges(): #Recursive call collects the child's edges
            combined.add_edge(edge)
    return combined

#Determine what the tree says the classifier ID should be
def determine(startNode, dataPoint, classifierID):
    """Walk the tree from ``startNode`` down to the leaf that ``dataPoint`` falls into.

    Parameters:
        startNode: node to start from; needs ``leftChild``/``rightChild`` and,
            when it has children, ``splitChar``/``splitPoint``.
        dataPoint: mapping from characteristic name to value (e.g. a dataframe row).
        classifierID: unused; kept so existing callers keep working.

    Returns:
        The leaf node reached. When ``startNode`` has no children it is
        returned itself (previously None was returned in that case).
    """
    node = startNode
    while (node.leftChild is not None): #Descend until a node without children is reached
        if (dataPoint[node.splitChar] < node.splitPoint):
            node = node.leftChild
        else:
            node = node.rightChild
    return node
    
#Test if a correct answer is obtained through the decision tree for a sample
def test(tree, testData, classifierID):
    """Count the rows of ``testData`` whose class matches the tree's answer.

    For every row the value stored under ``classifierID`` is compared with the
    ``classifierID`` attribute of the node returned by determine().
    Returns the number of matching rows.
    """
    return sum(
        1
        for position in range(len(testData)) #For each of the test data cases
        if testData.iloc[position][classifierID] == determine(tree, testData.iloc[position], classifierID).classifierID
    )

def distribute(df, columnNames, n, oversample, synthetic, truncated_normal, normal):
    """Generate ``n`` values for each listed characteristic of ``df``.

    Every characteristic is handled independently of the others:
      * no spread (or too few samples to measure it): the mean is repeated;
      * oversample == 1: values are drawn, with replacement, from the column;
      * synthetic == 1 and truncated_normal == 1: values are drawn from a
        normal distribution truncated to the column's [min, max] range;
      * synthetic == 1 and normal == 1: values are drawn from a normal
        distribution with the column's mean and standard deviation.

    Parameters:
        df: dataframe holding the samples to imitate.
        columnNames: names of the characteristics to generate values for.
        n: number of values to generate per characteristic (integer).
        oversample, synthetic, truncated_normal, normal: 0/1 option flags.

    Returns:
        numpy array of shape (len(columnNames), n); row m holds the values
        generated for columnNames[m].

    Raises:
        ValueError: if a characteristic has spread but the flags select no
            supported technique.
    """
    #Re-seed from system entropy/time; passing a datetime object is rejected by Python 3.11+
    random.seed()
    characteristic_value_sets = np.zeros(((len(columnNames)),n))  #Create array to hold the generated characteristic value sets
    for m in range (0, len(columnNames)): #For each of the characteristics
        column = df[columnNames[m]]
        #Calculate base statistics
        mean = column.mean()
        #Both samplers below expect a standard deviation as their scale, not a variance
        sigma = column.std()
        if not (sigma >= 0): #NaN (spread cannot be measured) fails every comparison
            sigma = 0
        if (sigma == 0): #Without variance use values equal to the mean
            distributed_values = [mean] * n
        elif (oversample == 1): #If compensating unbalance using oversampling technique
            #Uniform draw with replacement; tolist() is evaluated once instead of once per sample
            distributed_values = random.choices(column.tolist(), k=n)
        elif (synthetic == 1 and truncated_normal == 1): #Sample from a truncated normal distribution
            minimum = column.min()
            maximum = column.max()
            #truncnorm takes its bounds in units of standard deviations around loc
            distributed_values = stats.truncnorm.rvs((minimum-mean)/sigma, (maximum-mean)/sigma, scale=sigma, loc=mean, size=n)
        elif (synthetic == 1 and normal == 1): #Sample from a normal distribution
            distributed_values = np.random.normal(mean, sigma, n)
        else:
            raise ValueError('No balancing technique selected: enable oversample, or synthetic together with truncated_normal or normal')
        characteristic_value_sets[m]=distributed_values
    return characteristic_value_sets

#Create normal distribution of variable data for an unbalanced dataset given its end classifier's identification
def balanceData(df, toBalanceData, classifierID, n, oversample, synthetic, truncated_normal, normal):
    """Build a class-balanced dataset by generating samples for every class.

    Parameters:
        df: full dataset; used to list the characteristics (every column but
            the last, which is assumed to hold the classifier) and to detect
            binary characteristics (exactly two distinct values).
        toBalanceData: the data whose classes should be balanced.
        classifierID: name of the column holding the class of each sample.
        n: number of samples to generate per class, or -1 to use
            len(toBalanceData) / number of classes. Odd (or fractional)
            sizes are increased by one before use.
        oversample, synthetic, truncated_normal, normal: 0/1 option flags
            forwarded to distribute().

    Returns:
        Dataframe with the generated characteristic columns plus the
        ``classifierID`` column. When binary characteristics exist, samples
        are generated separately for each binary value and class, and the
        sample count is halved for every additional (binary characteristic,
        value) combination in which the class occurs.
    """
    def sortedUnique(series):
        #Distinct non-missing values in ascending order
        return sorted(series.dropna().unique())

    def generateClassFrame(classData, classValue, sampleCount):
        #Generate sampleCount rows imitating classData and label them with classValue
        valueSets = distribute(classData, columnNames, sampleCount, oversample, synthetic, truncated_normal, normal)
        frame = pd.DataFrame(np.transpose(valueSets), columns = columnNames)
        frame[classifierID] = classValue
        return frame

    #Get characteristic names (the classifier is assumed to be the last column)
    columnNames = list(df.columns.values)[:-1]
    #Characteristics with only 2 unique values in the whole dataset are binary
    binary_characteristics = [Binary_Characteristic(name, sortedUnique(df[name])) for name in columnNames if df[name].nunique() == 2]

    if (n == -1): #If number of samples is set to automatic
        n = len(toBalanceData)/(len(sortedUnique(toBalanceData[classifierID]))) #Make the number of samples prop. to original samples and classifiers
    if n % 2 != 0: #If the sample size is not even
        n = n+1 #Add 1 to make it so
    original_sample_size = n #Make a backup of the starting sample size

    #Frames are collected and concatenated once; DataFrame.append no longer exists in pandas 2.x
    frames = []
    if (len(binary_characteristics) != 0): #If there even is a binary characteristic
        for binary_characteristic in binary_characteristics: #For each of the binary characteristics
            for value in binary_characteristic.values: #For each of the binary characteristic's values; should always be 2
                binary_data = toBalanceData[toBalanceData[binary_characteristic.label] == value] #Load training data for the binary characteristic label's value
                for k in sortedUnique(binary_data[classifierID]): #For each of the unique classes
                    binary_class_data = binary_data[binary_data[classifierID] == k] #Load data specific to a single class
                    #If a class is unique to the binary value, then the number of samples generated should compensate
                    binary_variable_count = 0 #Counter for the number of binary variables that hold the class
                    for other_characteristic in binary_characteristics: #Every binary characteristic (the old code re-read the outer one each time)
                        for other_value in other_characteristic.values:
                            other_data = toBalanceData[toBalanceData[other_characteristic.label] == other_value]
                            if ((other_data[classifierID] == k).any()): #If the class value is present for a binary characteristic's value
                                binary_variable_count = binary_variable_count+1
                    n = original_sample_size #Reset the sample size
                    for p in range(1, binary_variable_count): #For each value of the count
                        n = n - (n/2) #Subtract half of n's current value
                    n = round(n) #Round the value for indexing
                    frames.append(generateClassFrame(binary_class_data, k, n))
    else: #If there are no binary characteristics
        n = round(n) #Round the value for indexing
        for k in sortedUnique(toBalanceData[classifierID]): #For each of the unique classes
            class_data = toBalanceData[toBalanceData[classifierID] == k]
            frames.append(generateClassFrame(class_data, k, n))

    if (len(frames) == 0): #Nothing was generated; keep the original column layout
        return pd.DataFrame(columns = df.columns.values)
    return pd.concat(frames, ignore_index=True)

#Define a decision tree object
class Decision_Tree:
    """Node of a binary decision tree; constructing a node builds its whole sub-tree.

    Attributes set on every node:
        graph: pydot graph holding only the edges from this node to its children.
        data: the samples that reached this node.
        classifierID: the most frequent class among ``data``.
        depth: depth passed in by the caller (children receive depth+1).
        gini: gini impurity of ``data`` with respect to the classifier column.
        leftChild / rightChild: child nodes, or None for a leaf.
        nodeInformation: text label used for this node in the graph.
    Nodes on which a split was attempted additionally carry splitChar,
    splitPoint, leftChildData and rightChildData.
    """
    def __init__(self, df, identifier, depth, printEverything):
        """Build the node for ``df`` and, recursively, its children.

        Parameters:
            df: samples belonging to this node.
            identifier: name of the column holding the class of each sample.
            depth: depth of this node.
            printEverything: stored on the node and forwarded to split().
        """
        #Define graph
        self.graph = pydot.Dot(graph_type='graph')
        #Define internal node variables
        self.data=df
        self.classifierID=df[identifier].value_counts().idxmax()
        self.depth = depth
        self.gini=gini(df, identifier)
        self.printEverything=printEverything
        #A node stays a leaf unless a split yields two non-empty sides
        self.leftChild = None
        self.rightChild = None
        #Use the identifier argument throughout; the node must not depend on a module-level classifierID variable
        if (len(self.data[identifier].value_counts()) > 1): #If the data has more than 1 classifier remaining
            print ('Depth: '+str(self.depth))
            print('Splitting with gini:',self.gini,'and:',len(self.data),'samples')
            self.splitChar, self.splitPoint, self.leftChildData, self.rightChildData = split(df, identifier, self.printEverything)
            if ((len(self.leftChildData) != 0) and (len(self.rightChildData) != 0)):
                self.leftChild = Decision_Tree(self.leftChildData, identifier, (self.depth+1), printEverything)
                self.rightChild = Decision_Tree(self.rightChildData, identifier, (self.depth+1), printEverything)
        #Define nodal information
        self.nodeInformation='Mode: '+str(self.classifierID)
        self.nodeInformation=self.nodeInformation+'\nNumber of Members: '+str(len(self.data))
        self.nodeInformation=self.nodeInformation+'\nGini: '+str(self.gini)
        if (self.leftChild is not None):
            self.nodeInformation = self.nodeInformation+'\n(Left): '+str(self.splitChar)+'<'+str(self.splitPoint)
            #Children are fully constructed at this point, so their labels are final
            self.leftEdge = pydot.Edge(self.nodeInformation, self.leftChild.nodeInformation)
            self.graph.add_edge(self.leftEdge)
            self.rightEdge = pydot.Edge(self.nodeInformation, self.rightChild.nodeInformation)
            self.graph.add_edge(self.rightEdge)

#Define a binary characteristic object
class Binary_Characteristic:
    """Pairs the label of a characteristic with the values it takes."""
    def __init__(self, label, values):
        #Both arguments are stored untouched
        self.label, self.values = label, values

#Define stats and classifier and regression program
def STATS_CART(printEverything, trainSplit, filename, classifierID, classifierName, shouldBalanceData, autoSampleSize, trainingSampleSize, oversample, synthetic, truncated_normal, normal):
    """Analyse a dataset, optionally re-balance it, build a decision tree and report its accuracy.

    Parameters:
        printEverything: 1 to also print dataset statistics and show plots; the
            tree-building progress and the accuracy results are always printed.
        trainSplit: percentage (%) of the (balanced) data used for training.
        filename: CSV file to read; '?' entries are treated as missing values.
        classifierID: name of the column to predict.
        classifierName: human-readable name of that column, used in plots and
            in the name of the results file.
        shouldBalanceData: truthy to re-balance the classes with balanceData().
        autoSampleSize: 1 to let balanceData() pick the per-class sample size.
        trainingSampleSize: per-class sample size used when autoSampleSize is not 1.
        oversample, synthetic, truncated_normal, normal: 0/1 balancing options.

    Side effects:
        Writes '<classifierID>_Decision_Tree.png' and a '.txt' file with the
        test results into the working directory.
    """
    #Import and split dataset
    if (printEverything == 1):
        print('ANALYSING DATASET')
        print('\n')
    df = pd.read_csv(filename, na_values='?') #Read in dataset
    df = df.astype('float64') #Convert all values to be the same data type

    if (printEverything == 1):
        print(df.info())
        print('\n')
        print('GENERATING BASE DATASET STATISTICS')
        print('\n')
        basestats = df.min().rename_axis('Variable').reset_index(name='Minimum')
        basestats['Maximum'] = df.max().values
        basestats['Average'] = df.mean().values
        basestats['#Missing'] = df.isna().sum().values
        basestats['%Missing']= ((basestats['#Missing']/len(df))*100).values
        basestats['Classifier Correlation'] = df.corr()[classifierID].values
        print(basestats)
        print('\n')
        print('Total amount of data missing is', basestats['%Missing'].sum(),'%')
        print('\n')
        print('Maximum amount of data missing for a single variable is', basestats['%Missing'].max(),'%')

        print('\n')
        print('REMOVING NA VALUES')
    df=df.dropna() #Remove all na.values

    if (printEverything == 1):
        print('\n')
        print('DETERMINING TOTAL AND MAXIMUM DATA CORRELATIONS')
        print('\n')
        #The correlation matrix is computed once and reused below
        correlations = df.corr()
        absCorrelations = correlations.abs()
        f, ax = plt.subplots(figsize=(6,6))
        sns.heatmap(correlations, fmt = ".1f", ax = ax, annot = True, cmap="Greys")
        plt.show()
        print('\n')
        absmaxcorr = pd.DataFrame((correlations[correlations != 1]).abs().max().rename_axis('1st Variable').reset_index(name='Max Correlation'))
        relatedCorrelations = [] #Create an empty list for matching correlation values
        for i in range (0, len(absmaxcorr)): #For each of the variables
            variable = absmaxcorr['Max Correlation'].loc[i] #Take its maximum correlation value
            #Keep exactly one entry per variable: the first column holding that value, or None if
            #there is none, so the list always matches the length of absmaxcorr
            relatedCorrelations.append(next((column for column in absCorrelations if variable == absCorrelations.iloc[i][column]), None))
        absmaxcorr['Cross Variable']=relatedCorrelations
        print(absmaxcorr)

    #Balance the data if indicated and graph frequency of classes
    if (shouldBalanceData): #If the balance data option has been enabled
        if (printEverything == 1):
            if (oversample):
                print('\n')
                print('DATA WILL BE RE-BALANCED USING OVERSAMPLING')
            if (synthetic):
                if (truncated_normal):
                    print('\n')
                    print('DATA WILL BE RE-BALANCED USING SYNTHETIC SAMPLE GENERATION FROM A TRUNCATED NORMAL DISTRIBUTION')
                if (normal):
                    print('\n')
                    print('DATA WILL BE RE-BALANCED USING SYNTHETIC SAMPLE GENERATION FROM A NORMAL DISTRIBUTION')
        if (autoSampleSize == 1): #If the sample size for rebalancing should be automatically determined
            if (printEverything == 1):
                print('\n')
                print('SAMPLE SIZE WILL BE SET AUTOMATICALLY')
            sampleSize = -1 #balanceData treats -1 as "choose automatically"
        else: #Otherwise use the value specified by the user
            if (printEverything == 1):
                print('\n')
                print('SAMPLE SIZE WAS SET BY THE USER')
            sampleSize = trainingSampleSize
        final_df = balanceData(df, df, classifierID, sampleSize, oversample, synthetic, truncated_normal, normal) #Use the dataset to create a balanced class dataset

        if (printEverything == 1):
            #Generate histogram for the frequency of classifier values in original set
            print('\n')
            print('GENERATING DATASET FREQUENCY HISTOGRAM')
            #Original
            f, (ax1, ax2) = plt.subplots(1, 2, figsize=(6,3), sharey=True)
            labels, counts = np.unique(df[classifierID], return_counts=True)
            ax1.bar(labels, counts, align='center', color='k')
            ax1.set_title('Original')
            ax1.set_xlabel(classifierName)
            ax1.set_ylabel('Frequency')

            #Generate histogram for the frequency of classifier values in the balanced dataset
            #Balanced
            labels, counts = np.unique(final_df[classifierID], return_counts=True)
            ax2.bar(labels, counts, align='center', color='k')
            ax2.set_title('Final')
            ax2.set_xlabel(classifierName)
            string='Frequency of '+classifierName
            plt.suptitle(string)
            plt.subplots_adjust(wspace=0, top=0.8)
            plt.show()

    else: #Otherwise dataset is entirely unbalanced
        final_df = df.copy() #Copy original for use in final
        if (printEverything == 1):
            print('\n')
            print('GENERATING DATASET FREQUENCY HISTOGRAM')
            #Generate histogram for the frequency of classifier values
            labels, counts = np.unique(final_df[classifierID], return_counts=True)
            plt.bar(labels, counts, align='center', color='k')
            plt.title("Frequency of "+classifierName)
            plt.xlabel(classifierName)
            plt.ylabel('Frequency')
            plt.show()

    #Allocate training/testing sets
    if (printEverything == 1):
        print('\n')
        print('ALLOCATING TRAINING/TESTING SETS')
    df = shuffle(df) #Randomize original dataset prior to split
    final_df = shuffle(final_df) #Randomize final dataset prior to split
    numTrain = int(len(final_df.index)*(trainSplit/100)) #Find number of training examples; round as int for indexing
    trainingData = final_df.iloc[0:numTrain] #Split off a training dataset using the balanced data
    testData = df.iloc[numTrain:(len(df.index))] #Split off a test dataset from the original set regardless of balanced or not

    if (printEverything == 1):
        print('\n')
        print('DROPPING VARIABLES LESS THAN THE MEAN CORRELATION TO THE CLASSIFIER FROM THE TRAINING SET')
    a = df.corr()[classifierID].abs().rename_axis('variable').reset_index(name='correlation')
    a = a[a['variable']!=classifierID]
    meanCorrelation = a['correlation'].mean()
    dropVariables = a[a['correlation'] <= meanCorrelation]
    keepVariables = a[a['correlation'] >= meanCorrelation]
    if (printEverything == 1):
        print(dropVariables['variable'].values)
        print('\n')
        print('KEEPING VARIABLES')
        print(keepVariables['variable'].values)
    trainingData = trainingData.drop(dropVariables['variable'].values, axis=1)

    #For each characteristic, find the greatest information gain possible using Gini impurity
    #Split the data adding that node to a tree
    #Repeat until termination criteria and then print out the tree to tree.png

    #Build tree
    if (printEverything == 1):
        print('\n')
        print('BUILDING TREE (THIS WILL TAKE A WHILE)\n')
    tree = Decision_Tree(trainingData, classifierID, 1, printEverything) #Construct a decision tree for determining class
    finalGraph = buildGraph(tree) #Reconstruct full tree from decision tree object's sub-trees
    finalGraph.write_png(classifierID+'_Decision_Tree.png') #Write the full tree to a file

    #Evaluate Tree
    print('\n')
    print('EVALUATING SUCCESS OF TREE FOR TESTING SET')
    successes = test(tree, testData, classifierID)
    numTest = len(testData)
    #The test set is empty when the training share covers every original row; the rate is then undefined
    successRate = round((successes/numTest)*100, 3) if numTest else float('nan')
    print('\nTest Samples:',numTest,'\nSuccesses:',successes,'\nFailures:',numTest-successes)
    print('Success Rate for Test Data:',successRate,'%')

    #Write results to a file
    outFilename = classifierName+'_train_'+str(trainSplit)+'_balanced_'+str(shouldBalanceData)+'_autoSamp_'+str(autoSampleSize)+'_manSampSize_'+str(trainingSampleSize)+'_OverSam_'+str(oversample)+'_SynthSam_'+str(synthetic)+'_trunNorm_'+str(truncated_normal)+'_normal_'+str(normal)+'.txt'
    outData = 'Test Samples: '+str(numTest)+'\nSuccesses: '+str(successes)+'\nFailures: '+str(numTest-successes)+'\nSuccess Rate for Test Data: '+str(successRate)+'%'
    with open(outFilename,"w") as file: #Closed even if the write fails
        file.write(outData)

    if (printEverything == 1):
        #Visualize the final tree; a bare Image(...) expression inside a function shows nothing
        from IPython.display import display
        display(Image(filename=classifierID+'_Decision_Tree.png'))

# In [None]:
#PROGRAM: STATS_CART MANUAL RUN

#Specify program parameters for a manual run
#NOTE: Program assumes the classifierID is located in the last column of the set!
filename = 'wine.csv' #Indicate filename containing dataset
classifierID = 'quality' #Indicate which variable should be predicted
classifierName = 'Wine Quality' #Indicate variable name for graphs

trainSplit = 0.1 #Percentage (%) of the data to use for training (the value is divided by 100 inside STATS_CART); the remaining rows of the original set are used for testing

printEverything = 0 #(0: No, 1: Yes) Should anything but the accuracy results and tree building progress be printed

shouldBalanceData = 1 #(0: No, 1: Yes) Should the data be re-balanced
autoSampleSize = 1 #(0: No, 1: Yes) Should the sample size for each balanced class be set automatically
#Automatic sample size: #samples/#classes
trainingSampleSize=0 #If the sample size is not set to be determined automatically, specify how many samples are desired

#Do not choose more than one of the following
oversample = 1 #(0: No, 1: Yes) Should oversampling be used to compensate for class imbalance
synthetic = 0 #(0: No, 1: Yes) Should synthetic, generated values be used to compensate for class imbalance

#Do not choose more than one of the following and only select if synthetic is enabled
truncated_normal = 0 #(0: No, 1: Yes) If synthetic, then should sample be taken from a truncated normal distribution
normal = 0 #(0: No, 1: Yes) If synthetic, then should sample be taken from a (non-truncated) normal distribution

#FUNCTION STATS_CART(printEverything, trainSplit, filename, classifierID, classifierName, shouldBalanceData, autoSampleSize, trainingSampleSize, oversample, synthetic, truncated_normal, normal):
STATS_CART(printEverything, trainSplit, filename, classifierID, classifierName, shouldBalanceData, autoSampleSize, trainingSampleSize, oversample, synthetic, truncated_normal, normal)

# In [None]:
#PROGRAM: STATS_CART AUTOMATIC RUNS

#Run all options for accuracy results only; autoSample and balanceData options are enabled
filename = 'wine.csv' #Indicate filename containing dataset
classifierID = 'quality' #Indicate which variable should be predicted
classifierName = 'Wine Quality' #Indicate variable name for graphs

#One entry per balancing technique:
#(start of the printed label, end of the printed label, (oversample, synthetic, truncated_normal, normal))
balancingModes = [
    ('Oversampling', '', (1, 0, 0, 0)),
    ('Synthetic', ', Normal Sampling', (0, 1, 0, 1)),
    ('Synthetic', ', Truncated Normal Sampling', (0, 1, 1, 0)),
]
#Training percentages evaluated for every technique
trainingPercentages = [0.1, 1, 10, 80]

#FUNCTION STATS_CART(printEverything, trainSplit, filename, classifierID, classifierName, shouldBalanceData, autoSampleSize, trainingSampleSize, oversample, synthetic, truncated_normal, normal)
firstRun = True
for modeName, modeSuffix, modeFlags in balancingModes:
    for percentage in trainingPercentages:
        if not firstRun: #Blank lines separate consecutive runs
            print('\n\n')
        firstRun = False
        print(modeName+', '+str(percentage)+'% Training'+modeSuffix)
        STATS_CART(0, percentage, filename, classifierID, classifierName, 1, 1, 0, *modeFlags)
