In [163]:
!pip install scikit-learn



In [164]:
# UTILS
%%writefile util.py

from sklearn import metrics
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.tree import export_graphviz
import IPython.display
import graphviz
from PIL import Image
import scipy.stats as stats

def myStuff(collabQuestionMark):
  """
  Import the project's third-party dependencies inside this function.

  Every name imported here is local to the call and is gone once the
  function returns; nothing is returned.

  collabQuestionMark is only mentioned by the commented-out Google Drive
  mounting code below, so it currently has no effect.
  """
  #if collabQuestionMark:
  #  from google.colab import drive
  #  drive.mount('/content/drive/')

  from sklearn import metrics
  import pandas as pd
  import numpy as np
  from sklearn.ensemble import RandomForestClassifier
  from sklearn.tree import DecisionTreeClassifier
  from sklearn.metrics import (
      accuracy_score,
      confusion_matrix,
      precision_score,
      recall_score,
      ConfusionMatrixDisplay,
  )
  from sklearn.model_selection import (
      RandomizedSearchCV,
      train_test_split,
      KFold,
      StratifiedKFold,
  )
  from sklearn.tree import export_graphviz
  import IPython.display
  import graphviz
  from PIL import Image



def kFoldSplit(feat, lab, totSplit, blockInUse, random_state):
  """
  This function helps do the cross validation method we are using:
  k fold cross validation
  In which, k blocks are made
  where one is for testing and all else training

  There is no intermediate validation, and this does not result in one model
  It is believed to be better suited for machine learning type/hyperparameter
  comparisons, and is less suited for specific deployment of models
  and slow training times

  For a situation where the training and testing is short and our goal
  is to compare training systems, this is what we should use

  Parameters
    feat, lab    : features and labels; both are indexed with the integer
                   index arrays produced by the splitter
    totSplit     : number of folds (passed as n_splits)
    blockInUse   : zero-based index of the fold to hold out for validation
    random_state : seed for the shuffled stratified split

  Returns
    featTrain, featVal, labelTrain, labelVal for the requested fold

  Raises
    ValueError if blockInUse is not in the range 0 .. totSplit-1
  """

  # Fail early with a clear message: without this check an out-of-range
  # block never matches a fold and the split variables are never assigned.
  if not 0 <= blockInUse < totSplit:
    raise ValueError(
        f"blockInUse must be between 0 and {totSplit - 1}, got {blockInUse}")

  stratifiedKfold = StratifiedKFold(n_splits=totSplit, shuffle=True, random_state=random_state)

  # The same seed yields the same folds on every call, so walking the
  # generator up to the requested block returns a consistent partition.
  for fold, (trainIndex, valIndex) in enumerate(stratifiedKfold.split(feat, lab)):
    if fold == blockInUse:
      featTrain, featVal = feat[trainIndex], feat[valIndex]
      labelTrain, labelVal = lab[trainIndex], lab[valIndex]
      return featTrain, featVal, labelTrain, labelVal

  # Only reachable when blockInUse is in range but equals no fold index
  # (for example a non-integer value).
  raise ValueError(f"no fold matches blockInUse={blockInUse}")




def oneRoundEvaluation(arborModel, featVal, labelVal):
  """
  Score one fitted model on one validation fold.

  The model predicts labels for featVal, a confusion matrix is built
  against labelVal, and its flattened entries are unpacked as
  (true negative, false positive, false negative, true positive), so
  exactly four entries are expected.

  Returns a dictionary holding the four raw counts, accuracy, precision,
  recall, the predictions themselves, and each count divided by the total
  number of samples. Zero denominators are not handled specially.
  """

  # predictions for this fold
  predicted = arborModel.predict(featVal)

  # flatten the confusion matrix straight into its four cells
  tn, fp, fn, tp = confusion_matrix(labelVal, predicted).ravel()

  # total sample count, shared by accuracy and the normalized cells
  sampleCount = tn + fp + fn + tp

  return {
      "trueNegative": tn,
      "falsePositive": fp,
      "falseNegative": fn,
      "truePositive": tp,
      "roundAccuracy": (tp + tn) / sampleCount,
      "roundPrecision": tp / (tp + fp),
      "roundRecall": tp / (tp + fn),
      "labelValPredictions": predicted,
      "normRoundFalsePositive": fp / sampleCount,
      "normRoundFalseNegative": fn / sampleCount,
      "normRoundTruePositive": tp / sampleCount,
      "normRoundTrueNegative": tn / sampleCount,
  }


def visualizeTree(arborModel,featureColumns, class_names=['<=50K', '>50K']):
  """
  one advantage of decision trees is their subjective interpretability
  Decision trees can often be used to make predictions by hand

  This function can generate an image that shows the exact decision tree created
  After training

  Since this project uses k fold cross validation, this was disabled for
  redundancy reasons

  Parameters
    arborModel     : fitted tree passed to export_graphviz
    featureColumns : feature names shown in the nodes
    class_names    : class labels shown in the nodes

  Side effects
    writes 'treeFlowChart.png' and 'deforestedTreeFlowChart.png' to the
    current working directory and displays the resized image
  """
  # TODO: REMEMBER TO FIX THE CLASS NAMES ISSUE - the default class_names
  # look specific to the income dataset (assumption, confirm) and are used
  # for every call that does not pass its own.

  # Create String representation of tree
  # using graphviz /dot format/
  stringTree = export_graphviz(arborModel, out_file=None, filled=True, rounded=True,
                            feature_names=featureColumns, class_names=class_names,
                            special_characters=False, max_depth=5)

  # Use string representation as sourcecode in
  # the graphviz "language"
  # Then we will render the code to make an image
  #print(str(stringTree)) (this can show you the original source code)
  ranStringTreeCode = graphviz.Source(stringTree)
  ranStringTreeCode.render(filename='treeFlowChart', format='png', cleanup=True)

  # the display image is waaaaay too big
  # so we have to cut it down a bit
  diagramWidth = int(400*4)
  diagramHeight = int(200*4)

  # open the rendered file in a context manager so its handle is closed
  # once the resized copy has been written
  with Image.open('treeFlowChart.png') as originalDiagramImage:
    deforestedDiagramImage = originalDiagramImage.resize((diagramWidth, diagramHeight))
    deforestedDiagramImage.save('deforestedTreeFlowChart.png')

  # finally display the new, resized image
  IPython.display.display(IPython.display.Image(filename='deforestedTreeFlowChart.png'))


def getTBTCI(quantList, confidence = 0.95):
  """
  Point estimates alone can often lead to erroneous conclusions about
  model superiority. To mitigate that risk, a confidence interval should be
  used to interpret results. This does not create pairwise comparisons with
  other results: comparing two intervals cannot be used to conclude
  statistical non-significance, but it can be used to conclude significance

  no overlap between intervals -> significance
  overlap between intervals    -> unknown if significant or not

  This CI uses the so-called theory based t interval, which comes with
  assumptions. These assumptions were assumed valid for all metrics used

  Parameters
    quantList  : sequence of numeric results (one per fold)
    confidence : confidence level, strictly between 0 and 1

  Returns
    [lowerBound, upperBound]; both are nan when only one value is given,
    because the sample standard deviation is undefined in that case

  Raises
    ValueError if quantList is empty or confidence is not in (0, 1)
  """

  nn1 = len(quantList)
  if nn1 == 0:
    raise ValueError("quantList must contain at least one value")
  if not 0 < confidence < 1:
    raise ValueError(f"confidence must be strictly between 0 and 1, got {confidence}")

  # one observation has zero degrees of freedom: no interval can be formed
  if nn1 == 1:
    return [float("nan"), float("nan")]

  # calculate relevant variables
  xbar1 = sum(quantList)/nn1
  degreesFreedom1 = nn1-1
  alpha = 1 - confidence
  ssd1 = np.std(quantList, ddof=1)  # ddof=1 -> sample standard deviation

  # obtain t and SE
  tMultiplier = stats.t.ppf(1-alpha/2, degreesFreedom1)
  standardError = ( ssd1**2  / nn1 )**(0.5)

  # finish CI and return as list
  marginOfError = tMultiplier * standardError
  return [xbar1 - marginOfError, xbar1 + marginOfError]

def calculatefBeta(beta,precision, recall):
  """
  The F score is a subjective measurement of precision and recall
  In order to combine these into one score, a decision must be made
  about the relative value of precision and recall. To adjust this
  assumption, the variable beta is used.

  Computes (1 + beta^2) * precision * recall / (beta^2 * precision + recall).

  Returns 0.0 when the denominator is zero (for example precision and
  recall both zero) instead of dividing by zero.
  """
  betaSquared = beta**2
  denominator = betaSquared * precision + recall

  # no usable precision/recall signal: score the round as 0
  if denominator == 0:
    return 0.0

  fBeta = ( (1+betaSquared)*(precision*recall) ) / denominator
  return fBeta

def statAnalysis(kfoldResultNestedData, mlMethod, dataSetInUse, beta=1,):
  """
  Summarize a completed set of cross validation rounds.

  kfoldResultNestedData holds one result dictionary per round. The values
  are regrouped so that each metric has a single list across all rounds,
  an F-beta score is added per round from that round's precision and
  recall, and then for every metric except the stored predictions the
  function prints:
  -the list of per-round values
  -the average (point estimate)
  -the 95% confidence interval from getTBTCI

  The beta value, ML method and dataset are printed first as a header.
  Nothing is returned.
  """

  # metrics copied straight out of each round's dictionary, in print order
  perRoundKeys = (
      "trueNegative",
      "falsePositive",
      "falseNegative",
      "truePositive",
      "roundAccuracy",
      "roundPrecision",
      "roundRecall",
      "labelValPredictions",
      "normRoundFalsePositive",
      "normRoundFalseNegative",
      "normRoundTruePositive",
      "normRoundTrueNegative",
  )

  # one list per metric instead of one dictionary per round
  allResultsByValue = {
      metricName: [roundResult[metricName] for roundResult in kfoldResultNestedData]
      for metricName in perRoundKeys
  }

  # F-beta is derived here rather than stored per round
  allResultsByValue["fBeta"] = [
      calculatefBeta(
          beta=beta,
          precision=roundResult["roundPrecision"],
          recall=roundResult["roundRecall"])
      for roundResult in kfoldResultNestedData
  ]

  # header: three blank lines, then the settings of this run
  for _ in range(3):
    print()
  print(f"BETA: {beta}")
  print(f"ML MODE: {mlMethod}")
  print(f"DATASET: {dataSetInUse}")

  for key, values in allResultsByValue.items():
    # the raw predictions are kept but not summarized
    if key == "labelValPredictions":
      continue
    print()
    print(key)
    print(values)
    print(f"AVERAGE {key}:", sum(values)/len(values) )
    print(f"CI {key}:", getTBTCI(quantList=values, confidence = 0.95))





Overwriting util.py


In [165]:
# MAIN
%%writefile main.py
from util import *

from sklearn import metrics
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.tree import export_graphviz
import IPython.display
import graphviz
from PIL import Image



def runTheProcess(df1, featureColumns,labelColumns,foldCount,mlMethod,reproSeed,iAmUsingCollab, visuals, dataSetInUse):
  """
  This function is the real action behind the process, once the dataset is
  prepared

  This function runs k fold cross validation on a model-dataset pair
  and then gathers the results to be analyzed

  Parameters
    df1            : prepared dataframe holding feature and label columns
    featureColumns : names of the feature columns
    labelColumns   : names of the label column(s)
    foldCount      : number of folds; used both for the split and the loop
    mlMethod       : "tree" or "forest"
    reproSeed      : seed used for the fold split and for the models
    iAmUsingCollab : accepted for interface compatibility; not used here
    visuals        : when true and mlMethod is "tree", render each fold's tree
    dataSetInUse   : dataset identifier, only used in printed messages

  Raises
    ValueError if mlMethod is not "tree" or "forest"
  """

  # reject unknown methods before any work is done; otherwise no model
  # would be assigned inside the fold loop
  if mlMethod not in ("tree", "forest"):
    raise ValueError(f'mlMethod must be "tree" or "forest", got {mlMethod!r}')

  featureData = np.array(df1[featureColumns])
  labelData = np.array(df1[labelColumns])

  #this loop runs k fold cross validation
  kfoldResultNestedData = []
  for xx1 in range(foldCount):
    #printing reminder, since these training rounds can be slow
    print("LOADING FOLD: ", xx1, " METHOD: ", mlMethod, " DATASET: ", dataSetInUse)

    # gather kfold training and testing data
    # totSplit follows foldCount so the number of folds created always
    # matches the number of folds evaluated
    featTrain, featVal, labelTrain, labelVal = kFoldSplit(feat=featureData, lab=labelData, totSplit=foldCount, blockInUse=xx1, random_state=reproSeed)

    labelTrain = labelTrain.ravel()
    labelVal = labelVal.ravel()

    # here the method is chosen, this is the only difference in the two
    # different methods code
    # the models are seeded with reproSeed so repeated runs give the same results
    if mlMethod == "tree":
      arborModel = DecisionTreeClassifier(max_depth=5, random_state=reproSeed).fit(featTrain, labelTrain)
    else:
      arborModel = RandomForestClassifier(max_depth=5, n_estimators=300, random_state=reproSeed).fit(featTrain, labelTrain)
      #arborModel = RandomForestClassifier(max_depth=None, n_estimators=300).fit(featTrain, labelTrain)

    # add the results to a list for later analysis
    kfoldResultNestedData.append(oneRoundEvaluation(arborModel=arborModel, featVal=featVal, labelVal=labelVal))

    # if a tree is being used, it may be helpful to render an image of the tree
    if visuals and (mlMethod == "tree"):
      visualizeTree(arborModel=arborModel,featureColumns=featureColumns)

  # all results are finally sent to be analyzed for
  # pointwise estimates and confidence intervals
  statAnalysis(kfoldResultNestedData, beta=1, mlMethod=mlMethod, dataSetInUse=dataSetInUse)



def mainRun(dataNumber, mlMethodNumber):
  """
  Configure and launch one experiment.

  mlMethodNumber selects the model type (0 -> "tree", 1 -> "forest") and
  dataNumber selects one of the five datasets (0-4). The chosen CSV is
  loaded, its label column is converted to 0/1 where a conversion is
  defined for that dataset, the listed columns are one-hot encoded, every
  remaining non-label column becomes a feature, and the result is handed
  to runTheProcess together with the settings below.
  """

  # MANUAL INPUTS
  # determine if you are using google collab or not
  iAmUsingCollab = True
  # determine if you want to print the tree diagram
  visuals = False
  # random seed for reproducibility
  reproSeed = 1
  # what type of machine learning method you want to use
  mlMethod = ["tree","forest"][mlMethodNumber]
  # which dataset is in use (kept as a lookup because it used to be a manual
  # input); original author's note: no 1 is having issues of null with RF, so is 2
  dataSetInUse = [0,1,2,3,4][dataNumber]
  # number of blocks for k-fold cross validation
  foldCount = 10

  # Each dataset needs slightly different preparation, so each one gets a
  # recipe: (csv path, label column, label encoder or None, columns to one-hot encode)
  datasetRecipes = {
    0: (
      '/content/drive/My Drive/INTO ML/adultDataset.csv',
      'income',
      # drop any '.' from the label text before comparing it
      lambda x: 1 if x.replace('.', '') == ">50K" else 0,
      ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country'],
    ),
    1: (
      '/content/drive/My Drive/INTO ML/creditDataset.csv',
      'Y',
      None,  # label column is used as-is
      [f"X{number}" for number in range(1, 24)],  # "X1" .. "X23"
    ),
    2: (
      '/content/drive/My Drive/INTO ML/bankDataset.csv',
      'y',
      lambda x: 1 if x == "yes" else 0,
      ["age", "job", "marital", "education", "default", "balance", "housing", "loan", "contact", "day_of_week", "month", "duration", "campaign", "pdays", "previous", "poutcome"],
    ),
    3: (
      '/content/drive/My Drive/INTO ML/obDataset.csv',
      'NObeyesdad',
      lambda x: 1 if ("Obesity" in x) else 0,
      ["Gender", "Age", "Height", "Weight", "family_history_with_overweight", "FAVC", "FCVC", "NCP", "CAEC", "SMOKE", "CH2O", "SCC", "FAF", "TUE", "CALC", "MTRANS"],
    ),
    4: (
      '/content/drive/My Drive/INTO ML/tttDataset.csv',
      'class',
      lambda x: 1 if ("positive" in x) else 0,
      ["top-left-square", "top-middle-square", "top-right-square", "middle-left-square", "middle-middle-square", "middle-right-square", "bottom-left-square", "bottom-middle-square", "bottom-right-square"],
    ),
  }

  csvPath, labelName, encodeLabel, dummyColumns = datasetRecipes[dataSetInUse]

  # Select your data
  df1 = pd.read_csv(csvPath)

  # specific dataframe considerations
  if encodeLabel is not None:
    df1[labelName] = df1[labelName].apply(encodeLabel)

  df1 = pd.get_dummies(df1, columns=dummyColumns)

  # everything except the label is a feature
  featureColumns = [col for col in df1.columns if col != labelName]
  labelColumns = [labelName]

  runTheProcess(df1, featureColumns,labelColumns,foldCount,mlMethod,reproSeed,iAmUsingCollab, visuals,dataSetInUse)


if __name__ == "__main__":
  # This is the starter loop
  # it has been adjusted to run all rounds at once:
  # every selected dataset is paired with every method
  #mainRun(dataNumber=4, mlMethodNumber=0)

  allRounds = [(dataId, methodId) for dataId in (0, 3, 4) for methodId in (0, 1)]

  for dataNumber, mlMethodNumber in allRounds:
    # four blank lines separate one round's output from the next
    for _ in range(4):
      print()
    mainRun(dataNumber=dataNumber, mlMethodNumber=mlMethodNumber)
    print(dataNumber, mlMethodNumber)

Overwriting main.py


In [166]:

# Mount Google Drive so the dataset paths under /content/drive/ that main.py
# reads are available.
from google.colab import drive
drive.mount('/content/drive/')

# Run the main.py written by the %%writefile cell as a separate Python process.
!python main.py

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).




LOADING FOLD:  0  METHOD:  tree  DATASET:  0
LOADING FOLD:  1  METHOD:  tree  DATASET:  0
LOADING FOLD:  2  METHOD:  tree  DATASET:  0
LOADING FOLD:  3  METHOD:  tree  DATASET:  0
LOADING FOLD:  4  METHOD:  tree  DATASET:  0
LOADING FOLD:  5  METHOD:  tree  DATASET:  0
LOADING FOLD:  6  METHOD:  tree  DATASET:  0
LOADING FOLD:  7  METHOD:  tree  DATASET:  0
LOADING FOLD:  8  METHOD:  tree  DATASET:  0
LOADING FOLD:  9  METHOD:  tree  DATASET:  0



BETA: 1
ML MODE: tree
DATASET: 0

trueNegative
[3495, 3532, 3537, 3551, 3556, 3544, 3549, 3509, 3531, 3548]
AVERAGE trueNegative: 3535.2
CI trueNegative: [3521.1989709910804, 3549.2010290089192]

falsePositive
[221, 184, 179, 165, 160, 171, 166, 206, 184, 167]
AVERAGE falsePositive: 180.3
CI falsePositive: [166.2736121163491, 194.3263878836509]

falseNegative
[534, 537, 526, 576, 559, 533, 552, 518, 547, 536]

In [167]:
#import IPython.display
#IPython.display.display(IPython.display.Image(filename='deforestedTreeFlowChart.png'))