In [7]:
import random
import time
from collections import defaultdict

import numpy as np
from pyspark.ml.evaluation import RegressionEvaluator, MulticlassClassificationEvaluator, BinaryClassificationEvaluator
from pyspark.sql import Row
from pyspark.sql import SparkSession, SQLContext

# Build (or reuse) a local SparkSession named 'eMM2'.
# NOTE: despite its name, `sc` is a SparkSession, not a SparkContext; the name is
# kept because later cells call `sc.read`.
sc = SparkSession.builder.appName('eMM2').master("local[*]").getOrCreate()
# SQLContext takes the SparkContext as its first argument; pass the session's
# context and the session itself instead of the session in the context slot.
sql = SQLContext(sc.sparkContext, sc)


In [3]:
class oldSpliceBinaryClassificationEvaluator(object):
    """
    Accumulates binary-classification confusion counts over one or more
    prediction dataframes and derives summary metrics from their averages.

    Labels and predictions are compared against the literal values 1 (positive)
    and 0 (negative). Call `input` once per evaluated dataframe, then
    `get_results` to obtain the metrics computed from the mean counts.
    """

    def __init__(self, spark, label_column='label', prediction_column='prediction',
                 confusion_matrix=True):
        """
        :param spark: Spark entry point; forwarded to get_confusion_matrix and used
            to build the results dataframe (presumably a SparkSession -- confirm with callers)
        :param label_column: the column in the dataframe containing the correct output
        :param prediction_column: the column in the dataframe containing the prediction
        :param confusion_matrix: whether or not to show a confusion matrix after each input
        """
        self.spark = spark
        # One entry per `input` call; the names say "avg" because get_results averages them.
        self.avg_tp = []
        self.avg_tn = []
        self.avg_fn = []
        self.avg_fp = []
        self.label_column = label_column
        self.prediction_column = prediction_column
        self.confusion_matrix = confusion_matrix

    def input(self, predictions_dataframe):
        """
        Evaluate actual vs predicted in a dataframe and record the confusion counts.

        Appends one TP/TN/FP/FN count to the running history and, when
        `confusion_matrix` is enabled, shows the matrix for this dataframe.

        :param predictions_dataframe: the dataframe containing the label and the prediction
        """
        # Keep only the actual and the predicted labels.
        pred_v_lab = predictions_dataframe.select(self.label_column,
                                                  self.prediction_column)

        # Resolve the columns by their configured names so custom
        # label/prediction column names are honoured.
        label = pred_v_lab[self.label_column]
        prediction = pred_v_lab[self.prediction_column]

        def count_where(label_value, prediction_value):
            """Number of rows whose (label, prediction) pair equals the given values."""
            return pred_v_lab.filter(
                (label == label_value) & (prediction == prediction_value)).count()

        self.avg_tp.append(count_where(1, 1))
        self.avg_tn.append(count_where(0, 0))
        # False positive: actually negative but predicted positive.
        self.avg_fp.append(count_where(0, 1))
        # False negative: actually positive but predicted negative.
        self.avg_fn.append(count_where(1, 0))

        if self.confusion_matrix:
            # get_confusion_matrix is defined outside this class; the argument
            # order (spark, TP, TN, FP, FN) is preserved from the original call.
            get_confusion_matrix(
                self.spark,
                self.avg_tp[-1],
                self.avg_tn[-1],
                self.avg_fp[-1],
                self.avg_fn[-1],
            ).show()

    def get_results(self, output_type='dataframe'):
        """
        Return the metrics computed from the mean of all recorded confusion counts.

        Metrics whose denominator is zero (or for which no input was recorded)
        are reported as NaN.

        :param output_type: 'dict' for a dictionary; any other value returns a dataframe
        :return results: computed_metrics (dict) or computed_df (single-row dataframe)
        """
        nan = float('nan')

        def mean_count(counts):
            """Mean of the recorded counts, or NaN when nothing was recorded."""
            return float(np.mean(counts)) if counts else nan

        def ratio(numerator, denominator):
            """Plain division that yields NaN for a zero denominator instead of warning."""
            if denominator == 0:
                # Counts are non-negative, so a zero denominator implies a zero
                # numerator; 0/0 was NaN before as well.
                return nan
            return float(numerator) / float(denominator)

        TP = mean_count(self.avg_tp)
        TN = mean_count(self.avg_tn)
        FP = mean_count(self.avg_fp)
        FN = mean_count(self.avg_fn)

        if self.confusion_matrix:
            get_confusion_matrix(
                self.spark,
                TP,
                TN,
                FP,
                FN
            ).show()

        mcc_denominator = float(np.sqrt(
            (TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)))

        computed_metrics = {
            'TPR': ratio(TP, TP + FN),   # sensitivity / recall
            'SPC': ratio(TN, TN + FP),   # specificity (true negative rate)
            'PPV': ratio(TP, TP + FP),   # precision
            'NPV': ratio(TN, TN + FN),
            'FPR': ratio(FP, FP + TN),
            'FDR': ratio(FP, FP + TP),
            'FNR': ratio(FN, FN + TP),
            'ACC': ratio(TP + TN, TP + FN + FP + TN),
            'F1': ratio(2 * TP, 2 * TP + FP + FN),
            'MCC': ratio(TP * TN - FP * FN, mcc_denominator),
        }

        if output_type == 'dict':
            return computed_metrics

        ordered_cols = ['TPR', 'SPC', 'PPV', 'NPV', 'FPR', 'FDR', 'FNR', 'ACC', 'F1', 'MCC']
        metrics_row = Row(*ordered_cols)
        computed_row = metrics_row(*[float(computed_metrics[col])
                                     for col in ordered_cols])
        # Older PySpark exposes a wrapped SQLContext as `_wrapped`; when it is
        # absent, create the dataframe from the session object directly.
        frame_factory = getattr(self.spark, '_wrapped', self.spark)
        computed_df = frame_factory.createDataFrame([computed_row])
        return computed_df



In [5]:
from splicemachine.ml.utilities import SpliceBinaryClassificationEvaluator


In [8]:
# Load the credit-card dataset. `inferSchema=True` makes Spark type the columns
# from the data; without it every column is read as a string.
df = sc.read.csv("creditcard.csv", header=True, inferSchema=True)


In [9]:
# FIXME: build a basic model and evaluate it with both the old and the new evaluator.


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.3598071336738,-0.0727811733098497,2.53634673796914,1.37815522427443,-0.338320769942518,0.462387777762292,0.239598554061257,0.0986979012610507,0.363786969611213,...,-0.018306777944153,0.277837575558899,-0.110473910188767,0.0669280749146731,0.128539358273528,-0.189114843888824,0.133558376740387,-0.0210530534538215,149.62,0
1,0,1.19185711131486,0.26615071205963,0.16648011335321,0.448154078460911,0.0600176492822243,-0.0823608088155687,-0.0788029833323113,0.0851016549148104,-0.255425128109186,...,-0.225775248033138,-0.638671952771851,0.101288021253234,-0.339846475529127,0.167170404418143,0.125894532368176,-0.00898309914322813,0.0147241691924927,2.69,0
2,1,-1.35835406159823,-1.34016307473609,1.77320934263119,0.379779593034328,-0.503198133318193,1.80049938079263,0.791460956450422,0.247675786588991,-1.51465432260583,...,0.247998153469754,0.771679401917229,0.909412262347719,-0.689280956490685,-0.327641833735251,-0.139096571514147,-0.0553527940384261,-0.0597518405929204,378.66,0
3,1,-0.966271711572087,-0.185226008082898,1.79299333957872,-0.863291275036453,-0.0103088796030823,1.24720316752486,0.23760893977178,0.377435874652262,-1.38702406270197,...,-0.108300452035545,0.00527359678253453,-0.190320518742841,-1.17557533186321,0.647376034602038,-0.221928844458407,0.0627228487293033,0.0614576285006353,123.5,0
4,2,-1.15823309349523,0.877736754848451,1.548717846511,0.403033933955121,-0.407193377311653,0.0959214624684256,0.592940745385545,-0.270532677192282,0.817739308235294,...,-0.00943069713232919,0.79827849458971,-0.137458079619063,0.141266983824769,-0.206009587619756,0.502292224181569,0.219422229513348,0.215153147499206,69.99,0
5,2,-0.425965884412454,0.960523044882985,1.14110934232219,-0.168252079760302,0.42098688077219,-0.0297275516639742,0.476200948720027,0.260314333074874,-0.56867137571251,...,-0.208253514656728,-0.559824796253248,-0.0263976679795373,-0.371426583174346,-0.232793816737034,0.105914779097957,0.253844224739337,0.0810802569229443,3.67,0
6,4,1.22965763450793,0.141003507049326,0.0453707735899449,1.20261273673594,0.191880988597645,0.272708122899098,-0.00515900288250983,0.0812129398830894,0.464959994783886,...,-0.167716265815783,-0.270709726172363,-0.154103786809305,-0.780055415004671,0.75013693580659,-0.257236845917139,0.0345074297438413,0.00516776890624916,4.99,0
7,7,-0.644269442348146,1.41796354547385,1.0743803763556,-0.492199018495015,0.948934094764157,0.428118462833089,1.12063135838353,-3.80786423873589,0.615374730667027,...,1.94346533978412,-1.01545470979971,0.057503529867291,-0.649709005559993,-0.415266566234811,-0.0516342969262494,-1.20692108094258,-1.08533918832377,40.8,0
8,7,-0.89428608220282,0.286157196276544,-0.113192212729871,-0.271526130088604,2.6695986595986,3.72181806112751,0.370145127676916,0.851084443200905,-0.392047586798604,...,-0.0734251001059225,-0.268091632235551,-0.204232669947878,1.0115918018785,0.373204680146282,-0.384157307702294,0.0117473564581996,0.14240432992147,93.2,0
9,9,-0.33826175242575,1.11959337641566,1.04436655157316,-0.222187276738296,0.49936080649727,-0.24676110061991,0.651583206489972,0.0695385865186387,-0.736727316364109,...,-0.246913936910008,-0.633752642406113,-0.12079408408185,-0.385049925313426,-0.0697330460416923,0.0941988339514961,0.246219304619926,0.0830756493473326,3.68,0
