In [125]:
import findspark
import os
import sys
findspark.init('/usr/local/spark/')
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
import pandas as pd
import sys
import os
from pyspark.sql.functions import isnan, when, count, col
from pyspark.sql.functions import to_timestamp
import time
from pyspark.sql import functions as fn
from pyspark.ml import feature, regression, Pipeline

import datetime
from pyspark.sql.functions import year, month, dayofmonth, hour, minute, second
from pyspark.mllib.stat import Statistics
import pandas as pd

import matplotlib.pyplot as plt
import numpy as np

from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

from pyspark.sql.types import FloatType
from pyspark.sql.types import IntegerType

spark = SparkSession.builder.appName('data-model').getOrCreate()

sqlContext = SQLContext(spark.sparkContext)

from pyspark.ml import Pipeline
from pyspark.ml import feature
from pyspark.ml import classification
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator, RegressionEvaluator

In [126]:
spark.stop()

In [106]:
def evaluate(predictionAndLabels):
    log = {}

    # Show Validation Score (AUROC)
    evaluator = BinaryClassificationEvaluator(metricName='areaUnderROC')
    log['AUROC'] = "%f" % evaluator.evaluate(predictionAndLabels)    
    print("Area under ROC = {}".format(log['AUROC']))

    # Show Validation Score (AUPR)
    evaluator = BinaryClassificationEvaluator(metricName='areaUnderPR')
    log['AUPR'] = "%f" % evaluator.evaluate(predictionAndLabels)
    print("Area under PR = {}".format(log['AUPR']))

    # Metrics
    predictionRDD = predictionAndLabels.select(['label', 'prediction']) \
                            .rdd.map(lambda line: (line[1], line[0]))
    metrics = MulticlassMetrics(predictionRDD)

    # Confusion Matrix
    print(metrics.confusionMatrix().toArray())

    # Overall statistics
    log['precision'] = "%s" % metrics.precision()
    log['recall'] = "%s" % metrics.recall()
    log['F1 Measure'] = "%s" % metrics.fMeasure()
    print("[Overall]\tprecision = %s | recall = %s | F1 Measure = %s" % \
            (log['precision'], log['recall'], log['F1 Measure']))

    # Statistics by class
    labels = [0.0, 1.0]
    for label in sorted(labels):
        log[label] = {}
        log[label]['precision'] = "%s" % metrics.precision(label)
        log[label]['recall'] = "%s" % metrics.recall(label)
        log[label]['F1 Measure'] = "%s" % metrics.fMeasure(label, 
                                                           beta=1.0)
        print("[Class %s]\tprecision = %s | recall = %s | F1 Measure = %s" \
                  % (label, log[label]['precision'], 
                    log[label]['recall'], log[label]['F1 Measure']))

    return log

In [8]:
train_data = spark.read.format('csv').option('header', 'true').load('/user/ananth/spark_data.csv')

In [9]:
train_data.count()

134550

In [10]:
train_data.limit(3).toPandas()

Unnamed: 0,device_id,label_id,app_id,event_id,longitude,latitude,is_installed,is_active,device_model,phone_brand,...,country,category_mapped,year,month,day,hour,minute,seconds,time_of_day,age_group
0,-4968154927622705128,713,-145658454112781034,4633,116.38,39.96,1,0,荣耀6 Plus,Huawei,...,China,industry,2016,5,1,7,48,6,morning,50-59
1,-4968154927622705128,704,-145658454112781034,4633,116.38,39.96,1,0,荣耀6 Plus,Huawei,...,China,industry,2016,5,1,7,48,6,morning,50-59
2,-4968154927622705128,548,-145658454112781034,4633,116.38,39.96,1,0,荣耀6 Plus,Huawei,...,China,industry,2016,5,1,7,48,6,morning,50-59


In [None]:
#train_data.select('town').distinct().collect()

In [11]:
training, validation, test = train_data.randomSplit([0.7, 0.2, 0.1], )

In [12]:
float_columns = ['device_id', 'app_id', 'label_id', 'event_id', 'longitude', 'latitude']
int_columns = ['is_active', 'age', 'is_installed', 'day', 'hour', 'minute', 'seconds']
string_columns = ['gender', 'phone_brand', 'device_model', 'town', 'country', 'category_mapped', 'time_of_day', 'age_group']

training = training.select(*(col(c).cast("float").alias(c) for c in float_columns), \
                                                 *(col(c).cast("int").alias(c) for c in int_columns), \
                                                 *(col(c).alias(c) for c in string_columns))

validation = validation.select(*(col(c).cast("float").alias(c) for c in float_columns), \
                                                 *(col(c).cast("int").alias(c) for c in int_columns), \
                                                 *(col(c).alias(c) for c in string_columns))

test = test.select(*(col(c).cast("float").alias(c) for c in float_columns), \
                                                 *(col(c).cast("int").alias(c) for c in int_columns), \
                                                 *(col(c).alias(c) for c in string_columns))

In [13]:
training.count()

94302

In [14]:
test.count()

13300

In [15]:
# #PCA for town

# gender_indexer = feature.StringIndexer(inputCol="gender", outputCol="gender_label",handleInvalid='skip')
# category_indexer = feature.StringIndexer(inputCol='category_mapped', outputCol='category_encoded',handleInvalid='skip')
# phone_brand_indexer = feature.StringIndexer(inputCol='phone_brand', outputCol='phone_brand_encoded',handleInvalid='skip')
# is_active_indexer = feature.StringIndexer(inputCol='is_active', outputCol='is_active_encoded',handleInvalid='skip')
# device_model_indexer = feature.StringIndexer(inputCol='device_model', outputCol='device_model_encoded',handleInvalid='skip')
# town_indexer = feature.StringIndexer(inputCol='town', outputCol='town_encoded',handleInvalid='skip')
# country_indexer = feature.StringIndexer(inputCol='country', outputCol='country_encoded',handleInvalid='skip')
# time_of_day_indexer = feature.StringIndexer(inputCol='time_of_day', outputCol='time_of_day_encoded',handleInvalid='skip')
# age_group_indexer = feature.StringIndexer(inputCol='age_group', outputCol='age_group_encoded',handleInvalid='skip')
# #area_cluster_id_indexer = feature.StringIndexer(inputCol='area_cluster_id', outputCol='area_cluster_id_encoded',handleInvalid='skip')

# vector_assembler = feature.VectorAssembler(inputCols=['device_id', 'app_id', 'label_id', 'event_id', 'is_active',\
#                                                       'device_model_encoded', 'phone_brand_encoded', 'gender_label', 'country_encoded',\
#                                                       'time_of_day_encoded', 'age_group_encoded', 'category_encoded'],
#                                         outputCol='features')
# sc = feature.StandardScaler(inputCol='features',outputCol='sfeatures')

# evaluator = BinaryClassificationEvaluator(labelCol='gender_label')

# pipe_prep_location=Pipeline(stages=[gender_indexer, category_indexer,phone_brand_indexer, is_active_indexer, device_model_indexer,\
#                            town_indexer, country_indexer, time_of_day_indexer, age_group_indexer, \
#                            vector_assembler, sc])

In [16]:
# from pyspark.ml.clustering import KMeans

# # df_kmeans.show()
# df_kmeans = pipe_prep_location.fit(training).transform(training)
# cost = np.zeros(30)
# for k in range(2,30):
#     kmeans = KMeans().setK(k).setSeed(1).setFeaturesCol("sfeatures")
#     model = kmeans.fit(df_kmeans.sample(False,0.1, seed=42))
#     cost[k] = model.computeCost(df_kmeans) # requires Spark 2.0 or later
    
# fig, ax = plt.subplots(1,1, figsize =(8,6))
# ax.plot(range(2,30),cost[2:30])
# ax.set_xlabel('k')
# ax.set_ylabel('cost')

In [17]:
# from pyspark.ml.clustering import KMeans
# k = 30
# df_kmeans = pipe_prep_location.fit(train_data).transform(train_data)
# kmeans = KMeans().setK(k).setSeed(1).setFeaturesCol("sfeatures")
# model = kmeans.fit(df_kmeans)
# centers = model.clusterCenters()

# print("Cluster Centers: ")
# for center in centers:
#     print(center)

# new_data = model.transform(df_kmeans)

In [101]:
# Gender Pipeline
gender_indexer = feature.StringIndexer(inputCol="gender", outputCol="label",handleInvalid='skip')
category_indexer = feature.StringIndexer(inputCol='category_mapped', outputCol='category_encoded',handleInvalid='skip')
phone_brand_indexer = feature.StringIndexer(inputCol='phone_brand', outputCol='phone_brand_encoded',handleInvalid='skip')
is_active_indexer = feature.StringIndexer(inputCol='is_active', outputCol='is_active_encoded',handleInvalid='skip')
device_model_indexer = feature.StringIndexer(inputCol='device_model', outputCol='device_model_encoded',handleInvalid='skip')
town_indexer = feature.StringIndexer(inputCol='town', outputCol='town_encoded',handleInvalid='skip')
country_indexer = feature.StringIndexer(inputCol='country', outputCol='country_encoded',handleInvalid='skip')
time_of_day_indexer = feature.StringIndexer(inputCol='time_of_day', outputCol='time_of_day_encoded',handleInvalid='skip')
age_group_indexer = feature.StringIndexer(inputCol='age_group', outputCol='age_group_encoded',handleInvalid='skip')
#area_cluster_id_indexer = feature.StringIndexer(inputCol='area_cluster_id', outputCol='area_cluster_id_encoded',handleInvalid='skip')

vector_assembler = feature.VectorAssembler(inputCols=['device_id', 'app_id', 'label_id', 'event_id', 'is_active',\
                                                      'device_model_encoded', 'phone_brand_encoded', 'town_encoded', 'country_encoded',\
                                                      'time_of_day_encoded', 'age_group_encoded', 'category_encoded'],
                                        outputCol='features')
sc = feature.StandardScaler(inputCol='features',outputCol='sfeatures')

evaluator = BinaryClassificationEvaluator(labelCol='label')

pipe_prep=Pipeline(stages=[gender_indexer, category_indexer,phone_brand_indexer, is_active_indexer, device_model_indexer,\
                           town_indexer, country_indexer, time_of_day_indexer, age_group_indexer, \
                           vector_assembler, sc])

In [19]:
pca=feature.PCA(k=2, inputCol='sfeatures', outputCol='pfeat')

pipe_pca=Pipeline(stages=[pipe_prep,pca]).fit(training)

pca_mod=pipe_pca.transform(training)

feat=train_data.columns
actfeat=['device_id', 'app_id', 'label_id', 'event_id', 'is_active',\
                                                      'device_model_encoded', 'phone_brand_encoded', 'town_encoded', 'country_encoded',\
                                                      'time_of_day_encoded', 'age_group_encoded', 'category_encoded']

# feat

# actfeat=pca_mod.columns

pca=pipe_pca.stages[1].pc.toArray()

pc1_df=pd.DataFrame([pca[:, 0],actfeat]).T.rename(columns={0:'pc1',1:'abs_loadings'})
pc2_df=pd.DataFrame([pca[:, 1],actfeat]).T.rename(columns={0:'pc2',1:'abs_loadings'})

In [20]:
pc1_df.pc1=pc1_df.pc1.abs()

pc1_df.sort_values(by=['pc1'],ascending=False)

Unnamed: 0,pc1,abs_loadings
6,0.678323,phone_brand_encoded
5,0.672741,device_model_encoded
4,0.15323,is_active
8,0.141971,country_encoded
10,0.138975,age_group_encoded
7,0.126475,town_encoded
0,0.0832806,device_id
1,0.0248684,app_id
11,0.0239107,category_encoded
9,0.0110473,time_of_day_encoded


In [21]:
pc2_df.pc2=pc2_df.pc2.abs()

pc2_df.sort_values(by=['pc2'],ascending=False)

Unnamed: 0,pc2,abs_loadings
2,0.702398,label_id
11,0.695127,category_encoded
4,0.115651,is_active
7,0.0800999,town_encoded
10,0.0354481,age_group_encoded
1,0.0335258,app_id
9,0.0237315,time_of_day_encoded
8,0.0206697,country_encoded
0,0.0113723,device_id
3,0.00748714,event_id


In [107]:
# logistic with default or no parameters

logistic = classification.LogisticRegression(labelCol='label', featuresCol='sfeatures')

lr_pipe = Pipeline(stages=[pipe_prep, logistic]).fit(training)

result1=evaluator.evaluate(lr_pipe.transform(test))

result1

0.6179698897928729

In [108]:
evaluate(lr_pipe.transform(test))

Area under ROC = 0.617970
Area under PR = 0.278325
[[10674.    16.]
 [ 2609.     0.]]
[Overall]	precision = 0.8026167381006091 | recall = 0.8026167381006091 | F1 Measure = 0.8026167381006091
[Class 0.0]	precision = 0.8035835278175111 | recall = 0.9985032740879326 | F1 Measure = 0.890501814541359
[Class 1.0]	precision = 0.0 | recall = 0.0 | F1 Measure = 0.0


{'AUROC': '0.617970',
 'AUPR': '0.278325',
 'precision': '0.8026167381006091',
 'recall': '0.8026167381006091',
 'F1 Measure': '0.8026167381006091',
 0.0: {'precision': '0.8035835278175111',
  'recall': '0.9985032740879326',
  'F1 Measure': '0.890501814541359'},
 1.0: {'precision': '0.0', 'recall': '0.0', 'F1 Measure': '0.0'}}

In [23]:
auroc = evaluator.evaluate(lr_pipe.transform(test), {evaluator.metricName: "areaUnderROC"})
auprc = evaluator.evaluate(lr_pipe.transform(test), {evaluator.metricName: "areaUnderPR"})
print("Area under ROC Curve: {:.4f}".format(auroc))
print("Area under PR Curve: {:.4f}".format(auprc))

Area under ROC Curve: 0.6180
Area under PR Curve: 0.2783


In [24]:
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator

lr_pipe = Pipeline(stages=[pipe_prep, logistic])

lr_param = ParamGridBuilder() \
    .addGrid(logistic.elasticNetParam, [0.0, 0.5, 1.0]) \
    .addGrid(logistic.regParam, [0.1, 0.01]) \
    .build()

evaluator = BinaryClassificationEvaluator(labelCol="label", metricName='areaUnderROC')
#evaluator = RegressionEvaluator(labelCol="town_encoded", predictionCol="prediction", metricName="r2")
#evaluator = MulticlassClassificationEvaluator(labelCol = 'town_encoded', predictionCol='prediction', metricName = 'accuracy')
crossval = CrossValidator(estimator=lr_pipe,
                         estimatorParamMaps=lr_param,
                         evaluator=evaluator,
                         numFolds=3)

cvmodel = crossval.fit(validation)
cvmodel.bestModel.stages[-1].extractParamMap()

{Param(parent='LogisticRegression_29c6309518dc', name='aggregationDepth', doc='suggested depth for treeAggregate (>= 2)'): 2,
 Param(parent='LogisticRegression_29c6309518dc', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty'): 0.0,
 Param(parent='LogisticRegression_29c6309518dc', name='family', doc='The name of family which is a description of the label distribution to be used in the model. Supported options: auto, binomial, multinomial.'): 'auto',
 Param(parent='LogisticRegression_29c6309518dc', name='featuresCol', doc='features column name'): 'sfeatures',
 Param(parent='LogisticRegression_29c6309518dc', name='fitIntercept', doc='whether to fit an intercept term'): True,
 Param(parent='LogisticRegression_29c6309518dc', name='labelCol', doc='label column name'): 'gender_label',
 Param(parent='LogisticRegression_29c6309518dc', name='maxIter', doc='maximum number of iterations (>

In [102]:
# RF

rf=classification.RandomForestClassifier(labelCol='label', featuresCol='sfeatures')

rf_pipe=Pipeline(stages=[pipe_prep,rf]).fit(training)

evaluator = BinaryClassificationEvaluator(labelCol = 'label', metricName ='areaUnderROC')

resultrf4=evaluator.evaluate(rf_pipe.transform(validation))

resultrf4

0.7248086612604546

In [26]:
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator

rf_pipe=Pipeline(stages=[pipe_prep,rf])

rfParam = ParamGridBuilder() \
.addGrid(rf.maxDepth, [4, 6, 8, 10]) \
.addGrid(rf.maxBins, [5, 10, 20]) \
.addGrid(rf.impurity, ["gini"]) \
.build()

# gbtParam = (ParamGridBuilder()
#              .addGrid(gbt.maxDepth, [4, 6, 8, 10])
#              .addGrid(gbt.maxBins, [5, 10, 20, 40])
#              .addGrid(gbt.maxIter, [5, 10, 15])
#              .build())

evaluator = BinaryClassificationEvaluator(labelCol="gender_label", metricName='areaUnderROC')
#evaluator = RegressionEvaluator(labelCol="town_encoded", predictionCol="prediction", metricName="r2")
#evaluator = MulticlassClassificationEvaluator(labelCol = 'town_encoded', predictionCol='prediction', metricName = 'accuracy')
crossval = CrossValidator(estimator=rf_pipe,
                         estimatorParamMaps=rfParam,
                         evaluator=evaluator,
                         numFolds=3)

#cv = CrossValidator(estimator=random_forest_pipeline, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)
cvmodel = crossval.fit(validation)
cvmodel.bestModel.stages[-1].extractParamMap()

{Param(parent='RandomForestClassifier_084483c7c992', name='cacheNodeIds', doc='If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees.'): False,
 Param(parent='RandomForestClassifier_084483c7c992', name='checkpointInterval', doc='set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext'): 10,
 Param(parent='RandomForestClassifier_084483c7c992', name='featureSubsetStrategy', doc='The number of features to consider for splits at each tree node. Supported options: auto, all, onethird, sqrt, log2, (0.0-1.0], [1-n].'): 'auto',
 Param(parent='RandomForestClassifier_084483c7c992', name='featuresCol', doc='features column name'): 'sfeatures',
 Param(parent='RandomForestClassifier_084483c7c992',

In [111]:
# RF
rf=classification.RandomForestClassifier(labelCol='label', featuresCol='sfeatures', \
                                         maxDepth=10, maxBins=20, numTrees=20)

rf_pipe=Pipeline(stages=[pipe_prep,rf]).fit(training)

evaluator = BinaryClassificationEvaluator(labelCol="label", metricName='areaUnderROC')
resultrf4=evaluator.evaluate(rf_pipe.transform(test))

In [112]:
evaluate(rf_pipe.transform(test))

Area under ROC = 0.908114
Area under PR = 0.813277
[[10675.    15.]
 [ 1748.   861.]]
[Overall]	precision = 0.86743364162719 | recall = 0.86743364162719 | F1 Measure = 0.86743364162719
[Class 0.0]	precision = 0.8592932463978105 | recall = 0.9985968194574368 | F1 Measure = 0.9237225803660278
[Class 1.0]	precision = 0.9828767123287672 | recall = 0.33001149865848983 | F1 Measure = 0.4941176470588235


{'AUROC': '0.908114',
 'AUPR': '0.813277',
 'precision': '0.86743364162719',
 'recall': '0.86743364162719',
 'F1 Measure': '0.86743364162719',
 0.0: {'precision': '0.8592932463978105',
  'recall': '0.9985968194574368',
  'F1 Measure': '0.9237225803660278'},
 1.0: {'precision': '0.9828767123287672',
  'recall': '0.33001149865848983',
  'F1 Measure': '0.4941176470588235'}}

In [113]:
rf_pipe.save("/user/ananth/rf")

Py4JJavaError: An error occurred while calling o116529.saveAsTextFile.
: org.apache.hadoop.mapred.FileAlreadyExistsException: Output directory hdfs://localhost:9000/user/ananth/rf/metadata already exists
	at org.apache.hadoop.mapred.FileOutputFormat.checkOutputSpecs(FileOutputFormat.java:131)
	at org.apache.spark.internal.io.HadoopMapRedWriteConfigUtil.assertConf(SparkHadoopWriter.scala:287)
	at org.apache.spark.internal.io.SparkHadoopWriter$.write(SparkHadoopWriter.scala:71)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply$mcV$sp(PairRDDFunctions.scala:1096)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply(PairRDDFunctions.scala:1094)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply(PairRDDFunctions.scala:1094)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopDataset(PairRDDFunctions.scala:1094)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply$mcV$sp(PairRDDFunctions.scala:1067)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply(PairRDDFunctions.scala:1032)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply(PairRDDFunctions.scala:1032)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:1032)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply$mcV$sp(PairRDDFunctions.scala:958)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply(PairRDDFunctions.scala:958)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply(PairRDDFunctions.scala:958)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:957)
	at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply$mcV$sp(RDD.scala:1499)
	at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply(RDD.scala:1478)
	at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply(RDD.scala:1478)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.RDD.saveAsTextFile(RDD.scala:1478)
	at org.apache.spark.api.java.JavaRDDLike$class.saveAsTextFile(JavaRDDLike.scala:550)
	at org.apache.spark.api.java.AbstractJavaRDDLike.saveAsTextFile(JavaRDDLike.scala:45)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)


In [56]:
# Random Forest one of the Tree
tree_to_json = rf_pipe.stages[-1].trees[0].toDebugString

import re

d = {"feature 0": "device_id", "feature 1": "app_id",\
     "feature 2": "label_id", "feature 3": "event_id",\
     "feature 4": "is_active", "feature 5": "device_model_encoded",\
    "feature 6": "phone_brand_encoded", "feature 7": "town_encoded",\
    "feature 8": "country_encoded", "feature 9": "time_of_day_encoded",\
    "feature 10": "age_group_encoded", "feature 11": "category_encoded"}

def replace_all(text, dic):
    for i, j in dic.items():
        text = text.replace(i, j)
    return text

tree_to_json = replace_all(tree_to_json, d)

# Parser
def parse(lines):
    block = []
    while lines :

        if lines[0].startswith('If'):
            bl = ' '.join(lines.pop(0).split()[1:]).replace('(', '').replace(')', '')
            block.append({'name':bl, 'children':parse(lines)})


            if lines[0].startswith('Else'):
                be = ' '.join(lines.pop(0).split()[1:]).replace('(', '').replace(')', '')
                block.append({'name':be, 'children':parse(lines)})
        elif not lines[0].startswith(('If','Else')):
            block2 = lines.pop(0)
            block.append({'name':block2})
        else:
            break	
    return block

# Convert Tree to JSON
def tree_json(tree):
    data = []
    for line in tree.splitlines() : 
        if line.strip():
            line = line.strip()
            data.append(line)
        else : break
        if not line : break
    res = []
    res.append({'name':'Root', 'children':parse(data[1:])})
    with open('/home/ananth/Decision-Tree-Visualization-Spark/data/sample.json', 'w') as outfile:
        json.dump(res[0], outfile)
    print ('Conversion Success !')
tree_json(tree_to_json)

Conversion Success !


In [70]:
from IPython.display import IFrame
IFrame('../Mytrees/trees/tree1.html', width=1000, height=700)

UsageError: Line magic function `%%javascript` not found.


In [None]:
#GBT without Cross Validation

gbt = classification.GBTClassifier(labelCol='gender_label', featuresCol='sfeatures')
gbt_pipe=Pipeline(stages=[pipe_prep,gbt]).fit(training)

evaluator = BinaryClassificationEvaluator(labelCol="gender_label", metricName='areaUnderROC')

resultrf4=evaluator.evaluate(gbt_pipe.transform(test))

resultrf4

In [None]:
# GBT Cross Validation

from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator

gbt_pipe = Pipeline(stages=[pipe_prep, gbt])

gbtParam = (ParamGridBuilder()
             .addGrid(gbt.maxDepth, [4, 6, 8, 10])
             .addGrid(gbt.maxBins, [5, 10, 20])
             .addGrid(gbt.maxIter, [2])
             .build())

evaluator = BinaryClassificationEvaluator(labelCol="gender_label", metricName='areaUnderROC')
#evaluator = RegressionEvaluator(labelCol="town_encoded", predictionCol="prediction", metricName="r2")
#evaluator = MulticlassClassificationEvaluator(labelCol = 'gender_label', predictionCol='prediction', metricName = 'accuracy')
crossval = CrossValidator(estimator=gbt_pipe,
                         estimatorParamMaps=gbtParam,
                         evaluator=evaluator,
                         numFolds=3)

#cv = CrossValidator(estimator=random_forest_pipeline, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)


cvmodel = crossval.fit(validation)
cvmodel.bestModel.stages[-1].extractParamMap()

In [None]:
gbt = classification.GBTClassifier(labelCol='gender_label', featuresCol='sfeatures', maxDepth=10, maxBins=20, lossType = 'logistic', maxIter=2)
gbt_pipe=Pipeline(stages=[pipe_prep,gbt]).fit(training)

evaluator = BinaryClassificationEvaluator(labelCol="gender_label", metricName='areaUnderROC')

resultrf4=evaluator.evaluate(gbt_pipe.transform(test))

resultrf4

In [None]:
auroc = evaluator.evaluate(gbt_pipe.transform(test), {evaluator.metricName: "areaUnderROC"})
auprc = evaluator.evaluate(gbt_pipe.transform(test), {evaluator.metricName: "areaUnderPR"})
print("Area under ROC Curve: {:.4f}".format(auroc))
print("Area under PR Curve: {:.4f}".format(auprc))

In [114]:
# Is Active Pipeline
gender_indexer = feature.StringIndexer(inputCol="gender", outputCol="gender_label",handleInvalid='skip')
category_indexer = feature.StringIndexer(inputCol='category_mapped', outputCol='category_encoded',handleInvalid='skip')
phone_brand_indexer = feature.StringIndexer(inputCol='phone_brand', outputCol='phone_brand_encoded',handleInvalid='skip')
is_active_indexer = feature.StringIndexer(inputCol='is_active', outputCol='label',handleInvalid='skip')
device_model_indexer = feature.StringIndexer(inputCol='device_model', outputCol='device_model_encoded',handleInvalid='skip')
town_indexer = feature.StringIndexer(inputCol='town', outputCol='town_encoded',handleInvalid='skip')
country_indexer = feature.StringIndexer(inputCol='country', outputCol='country_encoded',handleInvalid='skip')
time_of_day_indexer = feature.StringIndexer(inputCol='time_of_day', outputCol='time_of_day_encoded',handleInvalid='skip')
age_group_indexer = feature.StringIndexer(inputCol='age_group', outputCol='age_group_encoded',handleInvalid='skip')
#area_cluster_id_indexer = feature.StringIndexer(inputCol='area_cluster_id', outputCol='area_cluster_id_encoded',handleInvalid='skip')

vector_assembler = feature.VectorAssembler(inputCols=['device_id', 'app_id', 'label_id', 'event_id', 'gender_label',\
                                                      'device_model_encoded', 'phone_brand_encoded', 'town_encoded', 'country_encoded',\
                                                      'time_of_day_encoded', 'age_group_encoded', 'category_encoded'],
                                        outputCol='features')
sc = feature.StandardScaler(inputCol='features',outputCol='sfeatures')

evaluator = BinaryClassificationEvaluator(labelCol='label')

pipe_prep2=Pipeline(stages=[gender_indexer, category_indexer,phone_brand_indexer, is_active_indexer, device_model_indexer,\
                           town_indexer, country_indexer, time_of_day_indexer, age_group_indexer, \
                           vector_assembler, sc])

In [116]:
cvmodel.bestModel.stages[-1].extractParamMap()

{Param(parent='RandomForestClassifier_65295dcda0c0', name='cacheNodeIds', doc='If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees.'): False,
 Param(parent='RandomForestClassifier_65295dcda0c0', name='checkpointInterval', doc='set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext'): 10,
 Param(parent='RandomForestClassifier_65295dcda0c0', name='featureSubsetStrategy', doc='The number of features to consider for splits at each tree node. Supported options: auto, all, onethird, sqrt, log2, (0.0-1.0], [1-n].'): 'auto',
 Param(parent='RandomForestClassifier_65295dcda0c0', name='featuresCol', doc='features column name'): 'sfeatures',
 Param(parent='RandomForestClassifier_65295dcda0c0',

In [121]:
# logistic with default or no parameters

logistic = classification.LogisticRegression(labelCol='label', featuresCol='sfeatures')

lr_pipe = Pipeline(stages=[pipe_prep2, logistic]).fit(training)

evaluator = BinaryClassificationEvaluator(labelCol = 'label', metricName ='areaUnderROC')

result1=evaluator.evaluate(lr_pipe.transform(test))

result1

0.6047458831506994

In [117]:
# RF
rf=classification.RandomForestClassifier(labelCol='label', featuresCol='sfeatures', \
                                         maxDepth=10, maxBins=20, numTrees=20)

rf_pipe=Pipeline(stages=[pipe_prep2,rf]).fit(training)

evaluator = BinaryClassificationEvaluator(labelCol="label", metricName='areaUnderROC')
resultrf4=evaluator.evaluate(rf_pipe.transform(test))

In [119]:
evaluate(rf_pipe.transform(test))

Area under ROC = 0.823505
Area under PR = 0.697947
[[9123.  180.]
 [2949. 1047.]]
[Overall]	precision = 0.764719151815926 | recall = 0.764719151815926 | F1 Measure = 0.764719151815926
[Class 0.0]	precision = 0.7557157057654076 | recall = 0.980651402773299 | F1 Measure = 0.8536140350877194
[Class 1.0]	precision = 0.8533007334963325 | recall = 0.262012012012012 | F1 Measure = 0.40091901206203334


{'AUROC': '0.823505',
 'AUPR': '0.697947',
 'precision': '0.764719151815926',
 'recall': '0.764719151815926',
 'F1 Measure': '0.764719151815926',
 0.0: {'precision': '0.7557157057654076',
  'recall': '0.980651402773299',
  'F1 Measure': '0.8536140350877194'},
 1.0: {'precision': '0.8533007334963325',
  'recall': '0.262012012012012',
  'F1 Measure': '0.40091901206203334'}}

In [115]:
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator

rf_pipe=Pipeline(stages=[pipe_prep2,rf])

rfParam = ParamGridBuilder() \
.addGrid(rf.maxDepth, [4, 6, 8, 10]) \
.addGrid(rf.maxBins, [5, 10, 20]) \
.addGrid(rf.impurity, ["gini"]) \
.build()

# gbtParam = (ParamGridBuilder()
#              .addGrid(gbt.maxDepth, [4, 6, 8, 10])
#              .addGrid(gbt.maxBins, [5, 10, 20, 40])
#              .addGrid(gbt.maxIter, [5, 10, 15])
#              .build())

evaluator = BinaryClassificationEvaluator(labelCol="label", metricName='areaUnderROC')
#evaluator = RegressionEvaluator(labelCol="town_encoded", predictionCol="prediction", metricName="r2")
#evaluator = MulticlassClassificationEvaluator(labelCol = 'town_encoded', predictionCol='prediction', metricName = 'accuracy')
crossval = CrossValidator(estimator=rf_pipe,
                         estimatorParamMaps=rfParam,
                         evaluator=evaluator,
                         numFolds=3)

#cv = CrossValidator(estimator=random_forest_pipeline, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)
cvmodel = crossval.fit(validation)
cvmodel.bestModel.stages[-1].extractParamMap()

{Param(parent='RandomForestClassifier_65295dcda0c0', name='cacheNodeIds', doc='If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees.'): False,
 Param(parent='RandomForestClassifier_65295dcda0c0', name='checkpointInterval', doc='set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext'): 10,
 Param(parent='RandomForestClassifier_65295dcda0c0', name='featureSubsetStrategy', doc='The number of features to consider for splits at each tree node. Supported options: auto, all, onethird, sqrt, log2, (0.0-1.0], [1-n].'): 'auto',
 Param(parent='RandomForestClassifier_65295dcda0c0', name='featuresCol', doc='features column name'): 'sfeatures',
 Param(parent='RandomForestClassifier_65295dcda0c0',

In [None]:
# def getMeasures(dataframe):
#     Accuracy = MulticlassClassificationEvaluator(predictionCol = 'prediction', labelCol = 'gender_label', \
#                                                  metricName = 'accuracy').evaluate(dataframe)*100

#     Precision = MulticlassClassificationEvaluator(predictionCol = 'prediction', labelCol = 'gender_label', \
#                                                  metricName = 'weightedPrecision').evaluate(dataframe)*100

#     Recall = MulticlassClassificationEvaluator(predictionCol = 'prediction', labelCol = 'gender_label', \
#                                                  metricName = 'weightedRecall').evaluate(dataframe)*100

#     F1 = MulticlassClassificationEvaluator(predictionCol = 'prediction', labelCol = 'gender_label', \
#                                                  metricName = 'f1').evaluate(dataframe)*100

    
#     TP = dataframe.select("gender_label", "prediction").filter("gender_label = 0 and prediction = 0").count
#     TN = dataframe.select("gender_label", "prediction").filter("gender_label = 1 and prediction = 1").count
#     FP = dataframe.select("gender_label", "prediction").filter("gender_label = 0 and prediction = 1").count
#     FN = dataframe.select("gender_label", "prediction").filter("gender_label = 1 and prediction = 0").count
#     total = dataframe.select("gender_label").count
    
#     print("Accuracy: ", Accuracy)
#     print("Precision: ", Precision)
#     print("Recall: ", Recall)
#     print("F1: ", F1)

# # printMetrics(predictions_and_labels)

In [122]:
spark.stop()