In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
import pandas as pd
import sys
import os
from pyspark.sql.functions import isnan, when, count, col
from pyspark.sql.functions import to_timestamp
import time
from pyspark.sql import functions as fn
from pyspark.ml import feature, regression, Pipeline

import datetime
from pyspark.sql.functions import year, month, dayofmonth, hour, minute, second
from pyspark.mllib.stat import Statistics
import pandas as pd

import matplotlib.pyplot as plt
import numpy as np

from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

In [2]:
def getstats(predicted, labelCol):
    from pyspark.mllib.util import MLUtils
    from pyspark.mllib.evaluation import MulticlassMetrics, RegressionMetrics, RankingMetrics
    log = {}
    # Show Validation Score (AUROC)
    evaluator = BinaryClassificationEvaluator(metricName='areaUnderROC', labelCol=labelCol)
    log['AUROC'] = "%f" % evaluator.evaluate(predicted)    
    print("Area under ROC = {}".format(log['AUROC']))

    # Show Validation Score (AUPR)
    evaluator = BinaryClassificationEvaluator(metricName='areaUnderPR', labelCol=labelCol)
    log['AUPR'] = "%f" % evaluator.evaluate(predicted)
    print("Area under PR = {}".format(log['AUPR']))

    # Metrics
    predictionRDD = predicted.select([labelCol, 'prediction']) \
                            .rdd.map(lambda line: (line[1], line[0]))
    metrics = MulticlassMetrics(predictionRDD)
    
    metrics2 = RegressionMetrics(predictionRDD)
    
    #"print("RMSE = %s" % metrics2.rootMeanSquaredError)

    # R-squared
    #print("R-squared = %s" % abs(metrics2.r2))

    # Confusion Matrix
    print(metrics.confusionMatrix().toArray())

    # Overall statistics
    log['precision'] = "%s" % metrics.precision()
    log['recall'] = "%s" % metrics.recall()
    log['F1 Measure'] = "%s" % metrics.fMeasure()
    print("[Overall]\tprecision = %s | recall = %s | F1 Measure = %s" % \
            (log['precision'], log['recall'], log['F1 Measure']))

    # Statistics by class
    labels = [0.0, 1.0]
    for label in sorted(labels):
        log[label] = {}
        log[label]['precision'] = "%s" % metrics.precision(label)
        log[label]['recall'] = "%s" % metrics.recall(label)
        log[label]['F1 Measure'] = "%s" % metrics.fMeasure(label, 
                                                           beta=1.0)
        print("[Class %s]\tprecision = %s | recall = %s | F1 Measure = %s" \
                  % (label, log[label]['precision'], 
                    log[label]['recall'], log[label]['F1 Measure']))

In [3]:
spark = SparkSession.builder.appName('data-cleaning').\
                        config("spark.executor.instances", '3').\
                        config("spark.executor.memory", '40g').\
                        config('spark.executor.cores', '5').\
                        config('spark.cores.max', '5').appName('data_clean').\
                        getOrCreate()

In [4]:
sqlContext = SQLContext(spark.sparkContext)

In [5]:
train_data = spark.read.format('csv').option('header', 'true').load('../modeled_data/train_geo')

In [6]:
train_data = train_data.drop('latitude18').drop('longitude19').drop('_c0').withColumnRenamed('latitude5', 'latitude').withColumnRenamed('longitude4', 'longitude')
train_data.limit(5).toPandas()

Unnamed: 0,app_id,device_id,label_id,event_id,longitude,latitude,is_active,age,is_installed,day,...,gender,group,category,phone_brand,device_model,town,country,category_mapped,town_index,AgeRange
0,-1.4565846e+17,-4.968155e+18,713.0,4633.0,116.38,39.96,0,53,1,1,...,F,F43+,Services 1,Huawei,荣耀6 Plus,Beijing,China,industry,35,43+
1,-1.4565846e+17,-4.968155e+18,704.0,4633.0,116.38,39.96,0,53,1,1,...,F,F43+,Property Industry 2.0,Huawei,荣耀6 Plus,Beijing,China,industry,35,43+
2,-1.4565846e+17,-4.968155e+18,548.0,4633.0,116.38,39.96,0,53,1,1,...,F,F43+,Industry tag,Huawei,荣耀6 Plus,Beijing,China,industry,35,43+
3,-1.4565846e+17,-4.968155e+18,302.0,4633.0,116.38,39.96,0,53,1,1,...,F,F43+,unknown,Huawei,荣耀6 Plus,Beijing,China,other,35,43+
4,-1.4565846e+17,-4.968155e+18,303.0,4633.0,116.38,39.96,0,53,1,1,...,F,F43+,unknown,Huawei,荣耀6 Plus,Beijing,China,other,35,43+


In [7]:
training, test = train_data.randomSplit([0.8, 0.2], 0)

In [8]:
training.count()

103811

In [9]:
test.count()

25988

In [10]:
from pyspark.sql.types import FloatType
from pyspark.sql.types import IntegerType

float_columns = ['device_id', 'app_id', 'label_id', 'event_id', 'longitude', 'latitude']
int_columns = ['is_active', 'age', 'is_installed', 'day', 'hour', 'minute', 'second', 'town_index']
string_columns = ['gender', 'group', 'category', 'phone_brand', 'device_model', 'town', 'country', 'category_mapped', 'AgeRange']

training = train_data.select(*(col(c).cast("float").alias(c) for c in float_columns), \
                                                 *(col(c).cast("int").alias(c) for c in int_columns), \
                                                 *(col(c).alias(c) for c in string_columns))
test = train_data.select(*(col(c).cast("float").alias(c) for c in float_columns), \
                                                 *(col(c).cast("int").alias(c) for c in int_columns), \
                                                 *(col(c).alias(c) for c in string_columns))

In [11]:
training.printSchema()

root
 |-- device_id: float (nullable = true)
 |-- app_id: float (nullable = true)
 |-- label_id: float (nullable = true)
 |-- event_id: float (nullable = true)
 |-- longitude: float (nullable = true)
 |-- latitude: float (nullable = true)
 |-- is_active: integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- is_installed: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- hour: integer (nullable = true)
 |-- minute: integer (nullable = true)
 |-- second: integer (nullable = true)
 |-- town_index: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- group: string (nullable = true)
 |-- category: string (nullable = true)
 |-- phone_brand: string (nullable = true)
 |-- device_model: string (nullable = true)
 |-- town: string (nullable = true)
 |-- country: string (nullable = true)
 |-- category_mapped: string (nullable = true)
 |-- AgeRange: string (nullable = true)



In [12]:
#Correlation
indexer = feature.StringIndexer(inputCol="gender", outputCol="gender_label")
category = feature.StringIndexer(inputCol='category', outputCol='category_encoded')
brand = feature.StringIndexer(inputCol='phone_brand', outputCol='phone_brand_encoded')
group = feature.StringIndexer(inputCol='group', outputCol='group_encoded')
is_active = feature.StringIndexer(inputCol='is_active', outputCol='is_active_encoded')
device_model = feature.StringIndexer(inputCol='device_model', outputCol='device_model_encoded')
town_model = feature.StringIndexer(inputCol='town', outputCol='town_encoded')
country_model = feature.StringIndexer(inputCol='country', outputCol='country_encoded')
category_wide = feature.StringIndexer(inputCol='category_mapped', outputCol='category_wide_encoded')
age_encoded = feature.StringIndexer(inputCol='AgeRange', outputCol='AgeRange_encoded')

correlation_pipeline = Pipeline(stages=[indexer, category, age_encoded, category_wide, brand, group, is_active, device_model, town_model, country_model])
encoded_data = correlation_pipeline.fit(training).transform(training)

encoded_data = encoded_data['gender_label','device_id', 'app_id', 'AgeRange_encoded', 'category_encoded', 'category_wide_encoded', 'phone_brand_encoded', 'group_encoded', 'is_active_encoded', 'device_model_encoded',\
                           'town_encoded', 'country_encoded', 'label_id', 'event_id', 'longitude', 'latitude', 'day', 'hour', 'minute']
col_names = encoded_data.columns
features = encoded_data.rdd.map(lambda row: row[0:])
features
corr_mat=Statistics.corr(features, method="pearson")
corr_df = pd.DataFrame(corr_mat)

corr_df.columns = col_names

corr = corr_df.corr()
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,gender_label,device_id,app_id,AgeRange_encoded,category_encoded,category_wide_encoded,phone_brand_encoded,group_encoded,is_active_encoded,device_model_encoded,town_encoded,country_encoded,label_id,event_id,longitude,latitude,day,hour,minute
gender_label,1.0,-0.00688269,-0.071486,0.0925375,-0.0696375,-0.0254883,-0.0849737,0.958339,0.108322,-0.0185147,-0.0438324,-0.00830217,-0.113286,-0.0971488,-0.129968,-0.124172,-0.0378543,-0.0324042,-0.12037
device_id,-0.00688269,1.0,-0.0414471,-0.0636429,-0.0531508,-0.0574632,-0.109825,-0.0293747,-0.0857567,-0.120529,-0.135838,0.00889713,-0.0196875,-0.0686444,-0.0782537,-0.105933,0.00454076,-0.0542642,-0.0448433
app_id,-0.071486,-0.0414471,1.0,-0.0500846,-0.155553,-0.099439,-0.0863589,-0.0784336,0.168398,-0.0857694,-0.0848638,-0.0305188,-0.00677723,-0.0724822,-0.056782,-0.0683356,-0.0542724,-0.0334571,-0.121021
AgeRange_encoded,0.0925375,-0.0636429,-0.0500846,1.0,-0.0712297,-0.0527234,-0.0444038,0.338869,-0.0793519,-0.122125,-0.0465798,-0.054371,-0.0480786,-0.0912725,-0.0959637,-0.152176,-0.0649582,-0.0312524,-0.068804
category_encoded,-0.0696375,-0.0531508,-0.155553,-0.0712297,1.0,0.675515,-0.117885,-0.0748027,-0.237331,-0.141781,-0.10594,-0.0387442,-0.319589,-0.0598914,-0.0353119,-0.0470241,-0.0785286,-0.0675265,-0.023735
category_wide_encoded,-0.0254883,-0.0574632,-0.099439,-0.0527234,0.675515,1.0,-0.0805286,-0.0143005,-0.0963056,-0.0972006,-0.068773,-0.0433732,-0.65529,-0.0540599,-0.0123085,-0.0143463,-0.0750937,-0.0373427,-0.04591
phone_brand_encoded,-0.0849737,-0.109825,-0.0863589,-0.0444038,-0.117885,-0.0805286,1.0,-0.0748893,0.0152997,0.781898,0.057116,0.0832099,-0.0344173,-0.0691194,-0.296775,-0.283562,-0.147657,-0.15432,-0.0959017
group_encoded,0.958339,-0.0293747,-0.0784336,0.338869,-0.0748027,-0.0143005,-0.0748893,1.0,0.0884367,-0.0441255,-0.0474949,-0.0445774,-0.144097,-0.110284,-0.124219,-0.137771,-0.050696,-0.0323926,-0.138154
is_active_encoded,0.108322,-0.0857567,0.168398,-0.0793519,-0.237331,-0.0963056,0.0152997,0.0884367,1.0,0.0489388,-0.0255047,-0.0170066,-0.113396,-0.0935825,-0.120951,-0.117347,-0.0694805,0.15027,-0.181068
device_model_encoded,-0.0185147,-0.120529,-0.0857694,-0.122125,-0.141781,-0.0972006,0.781898,-0.0441255,0.0489388,1.0,0.1439,0.0436653,-0.0485297,-0.0792105,-0.236682,-0.210927,-0.138742,-0.166643,-0.108388


In [12]:
from pyspark.ml import Pipeline
from pyspark.ml import feature
from pyspark.ml import classification
from pyspark.ml.evaluation import BinaryClassificationEvaluator, \
    MulticlassClassificationEvaluator, \
    RegressionEvaluator

In [13]:
training.limit(5).toPandas()

Unnamed: 0,device_id,app_id,label_id,event_id,longitude,latitude,is_active,age,is_installed,day,...,town_index,gender,group,category,phone_brand,device_model,town,country,category_mapped,AgeRange
0,-4.968155e+18,-1.456585e+17,713.0,4633.0,116.379997,39.959999,0,53,1,1,...,35,F,F43+,Services 1,Huawei,荣耀6 Plus,Beijing,China,industry,43+
1,-4.968155e+18,-1.456585e+17,704.0,4633.0,116.379997,39.959999,0,53,1,1,...,35,F,F43+,Property Industry 2.0,Huawei,荣耀6 Plus,Beijing,China,industry,43+
2,-4.968155e+18,-1.456585e+17,548.0,4633.0,116.379997,39.959999,0,53,1,1,...,35,F,F43+,Industry tag,Huawei,荣耀6 Plus,Beijing,China,industry,43+
3,-4.968155e+18,-1.456585e+17,302.0,4633.0,116.379997,39.959999,0,53,1,1,...,35,F,F43+,unknown,Huawei,荣耀6 Plus,Beijing,China,other,43+
4,-4.968155e+18,-1.456585e+17,303.0,4633.0,116.379997,39.959999,0,53,1,1,...,35,F,F43+,unknown,Huawei,荣耀6 Plus,Beijing,China,other,43+


In [19]:
#Random Forest
from pyspark.ml.evaluation import BinaryClassificationEvaluator, \
    MulticlassClassificationEvaluator, \
    RegressionEvaluator

indexer = feature.StringIndexer(inputCol="gender", outputCol="gender_label")
category = feature.StringIndexer(inputCol='category', outputCol='category_encoded')
brand = feature.StringIndexer(inputCol='phone_brand', outputCol='phone_brand_encoded')
group = feature.StringIndexer(inputCol='group', outputCol='group_encoded')
is_active = feature.StringIndexer(inputCol='is_active', outputCol='is_active_encoded')
device_model = feature.StringIndexer(inputCol='device_model', outputCol='device_model_encoded')
town_model = feature.StringIndexer(inputCol='town', outputCol='town_encoded')
country_model = feature.StringIndexer(inputCol='country', outputCol='country_encoded')
category_wide = feature.StringIndexer(inputCol='category_mapped', outputCol='category_wide_encoded')
age_encoded = feature.StringIndexer(inputCol='AgeRange', outputCol='AgeRange_encoded')
age_one_hot = feature.OneHotEncoder(inputCol="AgeRange_encoded", outputCol="age_encoded_vector")
category_encoder = feature.OneHotEncoder(inputCol="category_wide_encoded", outputCol="category_wide_encoded_vector")
is_active_encoder = feature.OneHotEncoder(inputCol='is_active_encoded', outputCol='is_active_one_hot_encoded')

vector_assembler = feature.VectorAssembler(inputCols=['device_id', 'app_id', 'label_id', 'event_id', 'is_active_one_hot_encoded', 'group_encoded',
                                                         'category_wide_encoded_vector', 'phone_brand_encoded'\
                                                     ],
                                        outputCol='features')

sc = feature.StandardScaler(inputCol='features',outputCol='sfeatures')

rf = classification.RandomForestClassifier(labelCol='town_encoded', featuresCol='sfeatures')

random_forest_pipeline = Pipeline(stages=[indexer, category_wide, age_encoded, age_one_hot, category_encoder, brand, group, is_active, is_active_encoder, device_model, country_model,town_model, vector_assembler, sc, rf])

random_forest_pipeline_p=random_forest_pipeline.fit(training)

In [20]:
rf_prediction = random_forest_pipeline_p.transform(test)

In [17]:
rf_prediction.limit(5).toPandas()

Unnamed: 0,device_id,app_id,label_id,event_id,longitude,latitude,is_active,age,is_installed,day,...,is_active_encoded,is_active_one_hot_encoded,device_model_encoded,country_encoded,town_encoded,features,sfeatures,rawPrediction,probability,prediction
0,-4.968155e+18,-1.456585e+17,713.0,4633.0,116.379997,39.959999,0,53,1,1,...,0.0,(1.0),3.0,0.0,41.0,"(-4.968154834977948e+18, -1.456584557246546e+1...","(-0.9503223770935055, -0.027873368274453827, 2...","[4.626419614909939, 0.4747511198960814, 0.3707...","[0.23132098074549695, 0.02373755599480407, 0.0...",0.0
1,-4.968155e+18,-1.456585e+17,704.0,4633.0,116.379997,39.959999,0,53,1,1,...,0.0,(1.0),3.0,0.0,41.0,"(-4.968154834977948e+18, -1.456584557246546e+1...","(-0.9503223770935055, -0.027873368274453827, 2...","[4.626419614909939, 0.4747511198960814, 0.3707...","[0.23132098074549695, 0.02373755599480407, 0.0...",0.0
2,-4.968155e+18,-1.456585e+17,548.0,4633.0,116.379997,39.959999,0,53,1,1,...,0.0,(1.0),3.0,0.0,41.0,"(-4.968154834977948e+18, -1.456584557246546e+1...","(-0.9503223770935055, -0.027873368274453827, 2...","[4.626419614909939, 0.4747511198960814, 0.3707...","[0.23132098074549695, 0.02373755599480407, 0.0...",0.0
3,-4.968155e+18,-1.456585e+17,302.0,4633.0,116.379997,39.959999,0,53,1,1,...,0.0,(1.0),3.0,0.0,41.0,"(-4.968154834977948e+18, -1.456584557246546e+1...","(-0.9503223770935055, -0.027873368274453827, 1...","[4.626419614909939, 0.4747511198960814, 0.3707...","[0.23132098074549695, 0.02373755599480407, 0.0...",0.0
4,-4.968155e+18,-1.456585e+17,303.0,4633.0,116.379997,39.959999,0,53,1,1,...,0.0,(1.0),3.0,0.0,41.0,"(-4.968154834977948e+18, -1.456584557246546e+1...","(-0.9503223770935055, -0.027873368274453827, 1...","[4.626419614909939, 0.4747511198960814, 0.3707...","[0.23132098074549695, 0.02373755599480407, 0.0...",0.0


In [21]:
evaluator = BinaryClassificationEvaluator(labelCol='gender_label')
evaluator.evaluate(rf_prediction)

0.46808403733960924