In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkConf, SparkContext
from pyspark.sql.types import StructType,IntegerType,FloatType,BooleanType,StringType
from pyspark.sql.functions import rand
conf = SparkConf().setMaster("local[*]").setAppName("My App")
sc = SparkContext.getOrCreate(conf = conf)
sc._conf.set('spark.executor.memory','15g')\
    .set('spark.driver.memory','15g')\
        .set('spark.driver.maxResultsSize','0')
spark=SparkSession.builder\
    .appName('myApp')\
        .config("spark.driver.memory", "15g")\
            .getOrCreate()

In [2]:
def load_data(files,schema):
    df=spark.read.csv(files,header=True
                  ,schema=schema)
    return df

def load_record_linkage_data():
    schema = StructType() \
      .add("id_1",IntegerType(),True) \
      .add("id_2",IntegerType(),True) \
      .add("cmp_fname_c1",FloatType(),True) \
      .add("cmp_fname_c2",FloatType(),True) \
      .add("cmp_lname_c1",FloatType(),True) \
      .add("cmp_lname_c2",FloatType(),True) \
      .add("cmp_sex",IntegerType(),True) \
      .add("cmp_bd",IntegerType(),True) \
      .add("cmp_bm",IntegerType(),True) \
      .add("cmp_by",IntegerType(),True) \
      .add("cmp_plz",IntegerType(),True) \
      .add("is_match",BooleanType(),False)
    files=[f'./data/block_{id}.csv' for id in range(1,11)]
    return load_data(files,schema=schema)

In [3]:
df=load_record_linkage_data()

In [4]:
df.groupBy('cmp_plz').count().show()

+-------+-------+
|cmp_plz|  count|
+-------+-------+
|   null|  12843|
|      1|  31714|
|      0|5704575|
+-------+-------+



In [5]:
df.count()

5749132

In [6]:
df.printSchema()

root
 |-- id_1: integer (nullable = true)
 |-- id_2: integer (nullable = true)
 |-- cmp_fname_c1: float (nullable = true)
 |-- cmp_fname_c2: float (nullable = true)
 |-- cmp_lname_c1: float (nullable = true)
 |-- cmp_lname_c2: float (nullable = true)
 |-- cmp_sex: integer (nullable = true)
 |-- cmp_bd: integer (nullable = true)
 |-- cmp_bm: integer (nullable = true)
 |-- cmp_by: integer (nullable = true)
 |-- cmp_plz: integer (nullable = true)
 |-- is_match: boolean (nullable = true)



# Missing Values

### Drop Missing Values


In [24]:
miss_df=df.drop('id_1','id_2')

اگر همه داده های گم شده را حذف کنیم، کلا 20 رکورد باقی می‌ ماند !


In [None]:
miss_df.replace('?',None).na.drop().count()

20

In [None]:
miss_df=miss_df.replace('?',None)

In [None]:
miss_df.head(3)

[Row(cmp_fname_c1=0.8333333134651184, cmp_fname_c2=None, cmp_lname_c1=1.0, cmp_lname_c2=None, cmp_sex=1, cmp_bd=1, cmp_bm=1, cmp_by=1, cmp_plz=0, is_match=True),
 Row(cmp_fname_c1=1.0, cmp_fname_c2=None, cmp_lname_c1=1.0, cmp_lname_c2=None, cmp_sex=1, cmp_bd=1, cmp_bm=1, cmp_by=1, cmp_plz=1, is_match=True),
 Row(cmp_fname_c1=1.0, cmp_fname_c2=None, cmp_lname_c1=1.0, cmp_lname_c2=None, cmp_sex=1, cmp_bd=1, cmp_bm=1, cmp_by=1, cmp_plz=1, is_match=True)]

هیچ رکوردی که همه یا حداقل 2 تا از متغیرهای آن گم شده باشد، وجود ندارد

In [None]:
miss_df.na.drop(how='all').count()

5749132

In [None]:
miss_df.na.drop(how='any',thresh=2).count()

5749132

In [None]:
miss_df.filter(miss_df.cmp_lname_c1==None).show()

+------------+------------+------------+------------+-------+------+------+------+-------+--------+
|cmp_fname_c1|cmp_fname_c2|cmp_lname_c1|cmp_lname_c2|cmp_sex|cmp_bd|cmp_bm|cmp_by|cmp_plz|is_match|
+------------+------------+------------+------------+-------+------+------+------+-------+--------+
+------------+------------+------------+------------+-------+------+------+------+-------+--------+



In [None]:
miss_df.groupBy('cmp_fname_c1').count().show()

+------------+------+
|cmp_fname_c1| count|
+------------+------+
|  0.27272728|   454|
|   0.8181818|     3|
|  0.16666667|152732|
|        0.25|137039|
|       0.875| 71211|
|   0.5714286|  7414|
|  0.47058824|    11|
|        null|  1007|
|        0.75| 46521|
|         0.1| 10357|
|  0.11111111|123127|
|       0.125|155172|
|  0.36363637|   293|
|   0.7777778|  3083|
|         0.6| 19725|
|         0.9|  7780|
|         0.5| 44615|
|  0.42857143| 34463|
|   0.2857143| 78429|
|  0.33333334| 94936|
+------------+------+
only showing top 20 rows



### Fill the Missing Values

In [7]:
from pyspark.ml.feature import Imputer
from pyspark.sql.functions import when, lit
# for float variables



def convert_label_binary(input_df):
    temp = input_df.withColumn('label',
                             when(input_df['is_match']==True,
                                  lit(1)).otherwise(0)
                                  ) 
    return temp

def fill_missing_values(input_df):
    miss_df=input_df.drop('id_1','id_2')
    miss_df=miss_df.replace('?',None)
    float_cols=[
    'cmp_fname_c1', 
    'cmp_fname_c2', 
    'cmp_lname_c1', 
    'cmp_lname_c2', 
    ]
    float_imputer = Imputer(
        inputCols=float_cols,
        outputCols=[f"{col}_imputed" for col in float_cols]
    ).setStrategy('mean')

    # for binary variables
    binary_cols=[
        'cmp_sex', 
        'cmp_bd', 
        'cmp_bm', 
        'cmp_by',
        'cmp_plz',
    ]
    binary_imputer = Imputer(
        inputCols=binary_cols,
        outputCols=[f"{col}_imputed" for col in binary_cols]
    ).setStrategy('mode')
    imputed_df=float_imputer.fit(miss_df).transform(miss_df)
    output_df=binary_imputer.fit(imputed_df).transform(imputed_df)
    output_df=output_df.select([x for x in output_df.columns if '_imputed' in x or x=='is_match'])
    return output_df


def preprocessing_df(input_df):
    return convert_label_binary(fill_missing_values(input_df))

In [10]:
prep_df=preprocessing_df(df)


In [11]:
prep_df.na.drop().count()

5749132

In [34]:
prep_df.printSchema()

root
 |-- is_match: boolean (nullable = true)
 |-- cmp_fname_c1_imputed: float (nullable = true)
 |-- cmp_fname_c2_imputed: float (nullable = true)
 |-- cmp_lname_c1_imputed: float (nullable = true)
 |-- cmp_lname_c2_imputed: float (nullable = true)
 |-- cmp_sex_imputed: integer (nullable = true)
 |-- cmp_bd_imputed: integer (nullable = true)
 |-- cmp_bm_imputed: integer (nullable = true)
 |-- cmp_by_imputed: integer (nullable = true)
 |-- cmp_plz_imputed: integer (nullable = true)
 |-- label: integer (nullable = false)



In [40]:
from pyspark.sql.functions import col
prep_dff=prep_df.drop(col('is_match'))

In [41]:
prep_dff.printSchema()

root
 |-- cmp_fname_c1_imputed: float (nullable = true)
 |-- cmp_fname_c2_imputed: float (nullable = true)
 |-- cmp_lname_c1_imputed: float (nullable = true)
 |-- cmp_lname_c2_imputed: float (nullable = true)
 |-- cmp_sex_imputed: integer (nullable = true)
 |-- cmp_bd_imputed: integer (nullable = true)
 |-- cmp_bm_imputed: integer (nullable = true)
 |-- cmp_by_imputed: integer (nullable = true)
 |-- cmp_plz_imputed: integer (nullable = true)
 |-- label: integer (nullable = false)



In [14]:
prep_df.groupBy('cmp_fname_c1_imputed').count().show()

+--------------------+-------+
|cmp_fname_c1_imputed|  count|
+--------------------+-------+
|          0.27272728|    454|
|           0.8181818|      3|
|          0.16666667| 152732|
|                0.25| 137039|
|               0.875|  71211|
|           0.5714286|   7414|
|          0.47058824|     11|
|                0.75|  46521|
|                 0.1|  10357|
|          0.11111111| 123127|
|               0.125| 155172|
|          0.36363637|    293|
|           0.7777778|   3083|
|                 0.6|  19725|
|                 0.9|   7780|
|                 0.5|  44615|
|          0.42857143|  34463|
|           0.2857143|  78429|
|          0.33333334|  94936|
|                 1.0|3508203|
+--------------------+-------+
only showing top 20 rows



In [31]:
prep_df.show(1,truncate=False)

+--------+--------------------+--------------------+--------------------+--------------------+---------------+--------------+--------------+--------------+---------------+-----+
|is_match|cmp_fname_c1_imputed|cmp_fname_c2_imputed|cmp_lname_c1_imputed|cmp_lname_c2_imputed|cmp_sex_imputed|cmp_bd_imputed|cmp_bm_imputed|cmp_by_imputed|cmp_plz_imputed|label|
+--------+--------------------+--------------------+--------------------+--------------------+---------------+--------------+--------------+--------------+---------------+-----+
|true    |0.8333333           |0.9000177           |1.0                 |0.31841284          |1              |1             |1             |1             |0              |1    |
+--------+--------------------+--------------------+--------------------+--------------------+---------------+--------------+--------------+--------------+---------------+-----+
only showing top 1 row



In [32]:
prep_df[prep_df['label']>0].show(3,truncate=False)

+--------+--------------------+--------------------+--------------------+--------------------+---------------+--------------+--------------+--------------+---------------+-----+
|is_match|cmp_fname_c1_imputed|cmp_fname_c2_imputed|cmp_lname_c1_imputed|cmp_lname_c2_imputed|cmp_sex_imputed|cmp_bd_imputed|cmp_bm_imputed|cmp_by_imputed|cmp_plz_imputed|label|
+--------+--------------------+--------------------+--------------------+--------------------+---------------+--------------+--------------+--------------+---------------+-----+
|true    |0.8333333           |0.9000177           |1.0                 |0.31841284          |1              |1             |1             |1             |0              |1    |
|true    |1.0                 |0.9000177           |1.0                 |0.31841284          |1              |1             |1             |1             |1              |1    |
|true    |1.0                 |0.9000177           |1.0                 |0.31841284          |1              |

In [48]:
from pyspark.ml.feature import VectorAssembler
def feature_engineering(input_df,feature_list,label_name):
    assembler = VectorAssembler(inputCols=feature_list,
                             outputCol='features')
    assembled_df = assembler.transform(input_df)
    output_df=assembled_df.select('features', label_name)
    return output_df

In [49]:
features=list(set(prep_dff.columns) - set(['label','is_match']))
features

['cmp_lname_c2_imputed',
 'cmp_bd_imputed',
 'cmp_sex_imputed',
 'cmp_plz_imputed',
 'cmp_fname_c1_imputed',
 'cmp_lname_c1_imputed',
 'cmp_fname_c2_imputed',
 'cmp_by_imputed',
 'cmp_bm_imputed']

In [59]:
from pyspark.sql.functions import element_at
prep_df.select(features).show(10,truncate=False)

+--------------------+--------------------+--------------+--------------------+--------------------+---------------+---------------+--------------+--------------+
|cmp_lname_c2_imputed|cmp_lname_c1_imputed|cmp_by_imputed|cmp_fname_c2_imputed|cmp_fname_c1_imputed|cmp_sex_imputed|cmp_plz_imputed|cmp_bm_imputed|cmp_bd_imputed|
+--------------------+--------------------+--------------+--------------------+--------------------+---------------+---------------+--------------+--------------+
|0.31841284          |1.0                 |1             |0.9000177           |0.8333333           |1              |0              |1             |1             |
|0.31841284          |1.0                 |1             |0.9000177           |1.0                 |1              |1              |1             |1             |
|0.31841284          |1.0                 |1             |0.9000177           |1.0                 |1              |1              |1             |1             |
|0.31841284          |

In [51]:

assembled_df = feature_engineering(prep_dff,features,'label')
assembled_df.show(5, truncate=False)

+----------------------------------------------------------------------------------+-----+
|features                                                                          |label|
+----------------------------------------------------------------------------------+-----+
|[0.3184128403663635,1.0,1.0,0.0,0.8333333134651184,1.0,0.9000176787376404,1.0,1.0]|1    |
|[0.3184128403663635,1.0,1.0,1.0,1.0,1.0,0.9000176787376404,1.0,1.0]               |1    |
|[0.3184128403663635,1.0,1.0,1.0,1.0,1.0,0.9000176787376404,1.0,1.0]               |1    |
|[0.3184128403663635,1.0,1.0,1.0,1.0,1.0,0.9000176787376404,1.0,1.0]               |1    |
|[1.0,1.0,1.0,1.0,1.0,1.0,0.9000176787376404,1.0,1.0]                              |1    |
+----------------------------------------------------------------------------------+-----+
only showing top 5 rows



In [18]:
r=assembled_df.randomSplit([0.8,0.2],22)
r[0].show(1,truncate=False)

+----------------------------------------------+-----+
|features                                      |label|
+----------------------------------------------+-----+
|(9,[0,1,2,4],[0.3184128403663635,1.0,1.0,1.0])|0    |
+----------------------------------------------+-----+
only showing top 1 row



In [19]:
r[0].show(10,truncate=False)

+-------------------------------------------------------------+-----+
|features                                                     |label|
+-------------------------------------------------------------+-----+
|(9,[0,1,2,4],[0.3184128403663635,1.0,1.0,1.0])               |0    |
|(9,[0,1,2,5],[0.3184128403663635,1.0,1.0,0.4285714328289032])|0    |
|(9,[0,1,2,5],[0.3184128403663635,1.0,1.0,1.0])               |0    |
|(9,[0,1,2,6],[0.3184128403663635,1.0,1.0,0.9000176787376404])|0    |
|(9,[0,1,2,6],[0.3184128403663635,1.0,1.0,0.9000176787376404])|0    |
|(9,[0,1,2,6],[0.3184128403663635,1.0,1.0,0.9000176787376404])|0    |
|(9,[0,1,2,6],[0.3184128403663635,1.0,1.0,0.9000176787376404])|0    |
|(9,[0,1,2,6],[0.3184128403663635,1.0,1.0,0.9000176787376404])|0    |
|(9,[0,1,2,6],[0.3184128403663635,1.0,1.0,0.9000176787376404])|0    |
|(9,[0,1,2,6],[0.3184128403663635,1.0,1.0,0.9000176787376404])|0    |
+-------------------------------------------------------------+-----+
only showing top 10 

In [20]:
df3 = assembled_df.withColumn('rand', rand(seed=42)).orderBy('rand')

In [22]:
df3.show(3,truncate=False)

+----------------------------------------------------------------------------------+-----+---------------------+
|features                                                                          |label|rand                 |
+----------------------------------------------------------------------------------+-----+---------------------+
|[0.3184128403663635,0.0,1.0,0.0,1.0,0.3636363744735718,0.9000176787376404,0.0,1.0]|0    |1.0589978002295553E-6|
|[0.3184128403663635,0.0,1.0,0.0,1.0,0.1428571492433548,0.9000176787376404,1.0,0.0]|0    |1.1244146868039095E-6|
|(9,[0,2,5,6],[0.3184128403663635,1.0,0.8571428656578064,0.9000176787376404])      |0    |1.8169130715506299E-6|
+----------------------------------------------------------------------------------+-----+---------------------+
only showing top 3 rows



In [48]:
df_train=df3.filter(df3.rand < 0.3)
df_test=df3.filter(df3.rand >= 0.3)
df_train.show(3,truncate=False)

+----------------------------------------------------------------------------------+-----+---------------------+
|features                                                                          |label|rand                 |
+----------------------------------------------------------------------------------+-----+---------------------+
|[0.3184128403663635,0.3636363744735718,0.0,0.9000176787376404,1.0,1.0,0.0,1.0,0.0]|0    |1.0589978002295553E-6|
|[0.3184128403663635,0.1428571492433548,1.0,0.9000176787376404,1.0,1.0,0.0,0.0,0.0]|0    |1.1244146868039095E-6|
|(9,[0,1,3,5],[0.3184128403663635,0.8571428656578064,0.9000176787376404,1.0])      |0    |1.8169130715506299E-6|
+----------------------------------------------------------------------------------+-----+---------------------+
only showing top 3 rows



In [49]:
df_test.show(3,truncate=False)

+-------------------------------------------------------------------------------------------------+-----+-------------------+
|features                                                                                         |label|rand               |
+-------------------------------------------------------------------------------------------------+-----+-------------------+
|[0.3184128403663635,0.8888888955116272,0.0,0.9000176787376404,0.2857142984867096,1.0,0.0,1.0,0.0]|0    |0.30000020961157103|
|(9,[0,1,4,5],[0.3184128403663635,0.20000000298023224,0.25,1.0])                                  |0    |0.3000004588256735 |
|[0.3184128403663635,0.5555555820465088,0.0,0.9000176787376404,1.0,1.0,0.0,0.0,0.0]               |0    |0.3000007368526143 |
+-------------------------------------------------------------------------------------------------+-----+-------------------+
only showing top 3 rows



In [None]:
def test_train_split(input_df,train_size=0.7):
    train, test = assembled_df.randomSplit([train_size,1 - train_size], seed=42)
    return train,test

In [None]:
train, test = test_train_split(assembled_df,0.7)
train.count()

In [None]:
prep_df.groupBy('label').count().show()

### ML Models

In [None]:
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.ml.evaluation import BinaryClassificationEvaluator,MulticlassClassificationEvaluator
from pyspark.ml.classification import DecisionTreeClassifier,LogisticRegression
from pyspark.ml import Pipeline

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
import numpy as np
def evaluate_from_scratch(pred,model_name='Logistic Regression'):
    pred.groupBy('label', 'prediction').count().show()

    # Calculate the elements of the confusion matrix
    TN = pred.filter('prediction = 0 AND label = prediction').count()
    TP = pred.filter('prediction = 1 AND label = prediction').count()
    FN = pred.filter('prediction = 0 AND label = 1').count()
    FP = pred.filter('prediction = 1 AND label = 0').count()

    # Accuracy measures the proportion of correct predictions
    accuracy = (TN + TP) / (TN + TP + FN + FP)
    recall = (TP) / (TP+FN)
    precision= (TP) / (TP+FP)
    f1=2*(precision*recall)/(precision+recall)
    print(f'EVALUATION SUMMARY for {model_name}:')
    print(f" accuracy:{accuracy}")
    print(f" precision:{precision}")
    print(f" recall:{recall}")
    print(f" f1-score:{f1}")

def evaluate_from_spark(predictions,model_name='Logistic Regression'):
    eval = BinaryClassificationEvaluator(rawPredictionCol="probability", labelCol="label")
    eval2= MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="label")
    AUC  = eval.evaluate(predictions)
    ACC  = eval2.evaluate(predictions, {eval2.metricName:"accuracy"})
    PREC  = eval2.evaluate(predictions, {eval2.metricName:"weightedPrecision"})
    REC  = eval2.evaluate(predictions, {eval2.metricName:"weightedRecall"})
    F1  = eval2.evaluate(predictions, {eval2.metricName:"f1"})
    WeightedFMeasure=eval2.evaluate(predictions, {eval2.metricName:"weightedFMeasure"})
    print(f"{model_name} Performance Measure")
    print(" Accuracy = %0.8f" % ACC)
    print(" Weighted Precision = %0.8f" % PREC)
    print(" Weighted Recall = %0.8f" % REC)
    print(" F1 = %0.8f" % F1)
    print(" Weighted F Measure = %0.8f" % WeightedFMeasure)

    print(" AUC = %.8f" % AUC)
    print(" ROC curve:")
    PredAndLabels           = predictions.select("probability", "label")
    PredAndLabels_collect   = PredAndLabels.collect()
    PredAndLabels_list      = [(float(i[0][0]), 1.0-float(i[1])) for i in PredAndLabels_collect]
    PredAndLabels           = sc.parallelize(PredAndLabels_list)
    fpr = dict()                                                        # FPR: False Positive Rate
    tpr = dict()                                                        # TPR: True Positive Rate
    roc_auc = dict()
    
    y_test = [i[1] for i in PredAndLabels_list]
    y_score = [i[0] for i in PredAndLabels_list]
    
    fpr, tpr, _ = roc_curve(y_test, y_score)
    roc_auc = auc(fpr, tpr)
    
    plt.figure(figsize=(5,4))
    plt.plot(fpr, tpr, label='ROC curve (area = %0.8f)' % roc_auc)
    plt.plot([0, 1], [0, 1], 'r--')
    # plt.xlim([0.0, 1.0])
    # plt.ylim([0.0, 1.05])
    plt.yticks(np.arange(0,1.03,0.1))
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve - {model_name}')
    plt.legend(loc="lower right")
    plt.show()

    

def evaluate(predictions,model_name=None):
    print('Evaluate From Scratch:')
    evaluate_from_scratch(predictions,model_name)
    print('\nEvaluate From Spark Library:')
    evaluate_from_spark(predictions,model_name)

In [None]:
# lr=LogisticRegression(featuresCol='features', labelCol='label')
# lr_model = lr.fit(train)
# lr_result = lr_model.transform(test)

In [None]:
lr=LogisticRegression(featuresCol='features', labelCol='label')
pipeline = Pipeline(stages=[lr])
model = pipeline.fit(train)
lr_result = model.transform(test)

In [None]:
lr_result.select('label', 'prediction', 'probability').show(3)

In [None]:
evaluate(lr_result)

### Decision Tree

In [None]:
tree = DecisionTreeClassifier()
tree_pipeline = Pipeline(stages=[tree])
tree_model = tree_pipeline.fit(train)
tree_result = tree_model.transform(test)

In [None]:
evaluate(tree_result,model_name='Decision Tree')

### RandomForest

In [None]:
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(labelCol="label", featuresCol="features")
rf_pipeline = Pipeline(stages=[ rf])
rf_model=rf_pipeline.fit(train)
rf_result=rf_model.transform(test)

In [None]:
evaluate(rf_result,model_name='Random Forest')

# Not Refactored yet...


### Cross Validation 

In [None]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder, CrossValidatorModel
from pyspark.ml.evaluation import BinaryClassificationEvaluator


In [None]:
lr_cs = LogisticRegression()

In [None]:
from pyspark.ml.classification import LogisticRegression
lr_cs = LogisticRegression()
grid = ParamGridBuilder().addGrid(lr_cs.regParam, [0.1, 0.01]) \
    .addGrid(lr_cs.fitIntercept, [False, True])\
    .addGrid(lr_cs.elasticNetParam, [0.0, 1.0])\
    .build()
evaluator = BinaryClassificationEvaluator()
cv = CrossValidator(estimator=lr_cs, estimatorParamMaps=grid, evaluator=evaluator)
cvModel = cv.fit(train)
lrprediction=cvModel.transform(test)



print('Accuracy:', evaluator.evaluate(lrprediction))
print('AUC:', BinaryClassificationMetrics(lrprediction['label','prediction'].rdd).areaUnderROC)

In [None]:
evaluate(lrprediction)

In [None]:
from pyspark.ml.classification import DecisionTreeClassifier

dt = DecisionTreeClassifier()
grid = ParamGridBuilder() \
        .addGrid(dt.maxDepth,  [2, 5, 10, 20, 30]) \
        .addGrid(dt.maxBins,  [10, 20, 40, 80, 100]) \
        .build()
dtevaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
cv = CrossValidator(estimator=dt, 
                    estimatorParamMaps=grid, 
                    evaluator=dtevaluator,
                    numFolds = 3)
dtModel = cv.fit(train)
dtpredictions = dtModel.transform(test)

print('Accuracy:', dtevaluator.evaluate(dtpredictions))
print('AUC:', BinaryClassificationMetrics(dtpredictions['label','prediction'].rdd).areaUnderROC)

In [None]:
evaluate(dtpredictions)