In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
import pandas as pd
import sys
import os
from pyspark.sql.functions import isnan, when, count, col
from pyspark.sql.functions import to_timestamp
import time
from pyspark.sql import functions as fn
from pyspark.ml import feature, regression, Pipeline

import datetime
from pyspark.sql.functions import year, month, dayofmonth, hour, minute, second
from pyspark.mllib.stat import Statistics
import pandas as pd

import matplotlib.pyplot as plt
import numpy as np

from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

from pyspark.sql.types import FloatType
from pyspark.sql.types import IntegerType

spark = SparkSession.builder.appName('data-cleaning').\
                        config("spark.executor.instances", '3').\
                        config("spark.executor.memory", '40g').\
                        config('spark.executor.cores', '5').\
                        config('spark.cores.max', '5').appName('data_clean').\
                        getOrCreate()

sqlContext = SQLContext(spark.sparkContext)

from pyspark.ml import Pipeline
from pyspark.ml import feature
from pyspark.ml import classification
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator, RegressionEvaluator

In [2]:
train_data = spark.read.format('csv').option('header', 'true').load('../modeled_data/train_with_geo.csv')

In [3]:
train_data.limit(3).toPandas()

Unnamed: 0,_c0,app_id,device_id,label_id,event_id,longitude,latitude,is_active,age,is_installed,...,second,gender,group,category,phone_brand,device_model,town,country,category_mapped,town_index
0,0,-1.4565846e+17,-4.968155e+18,713.0,4633.0,116.38,39.96,0,53,1,...,6,F,F43+,Services 1,Huawei,荣耀6 Plus,Beijing,China,industry,35
1,1,-1.4565846e+17,-4.968155e+18,704.0,4633.0,116.38,39.96,0,53,1,...,6,F,F43+,Property Industry 2.0,Huawei,荣耀6 Plus,Beijing,China,industry,35
2,2,-1.4565846e+17,-4.968155e+18,548.0,4633.0,116.38,39.96,0,53,1,...,6,F,F43+,Industry tag,Huawei,荣耀6 Plus,Beijing,China,industry,35


In [3]:
train_data = train_data.drop('latitude18')\
    .drop('longitude19')\
    .drop('_c0')\
    .withColumnRenamed('latitude5', 'latitude')\
    .withColumnRenamed('longitude4', 'longitude')

In [4]:
training, test = train_data.randomSplit([0.8, 0.2], )

In [23]:
training.count()

103795

In [24]:
test.count()

26004

In [25]:
float_columns = ['device_id', 'app_id', 'label_id', 'event_id', 'longitude', 'latitude']
int_columns = ['is_active', 'age', 'is_installed', 'day', 'hour', 'minute', 'second', 'town_index']
string_columns = ['gender', 'group', 'category', 'phone_brand', 'device_model', 'town', 'country', 'category_mapped']

train_data=train_data.select(*(col(c).cast("float").alias(c) for c in float_columns), \
                                                 *(col(c).cast("int").alias(c) for c in int_columns), \
                                                 *(col(c).alias(c) for c in string_columns))

training = training.select(*(col(c).cast("float").alias(c) for c in float_columns), \
                                                 *(col(c).cast("int").alias(c) for c in int_columns), \
                                                 *(col(c).alias(c) for c in string_columns))
test = test.select(*(col(c).cast("float").alias(c) for c in float_columns), \
                                                 *(col(c).cast("int").alias(c) for c in int_columns), \
                                                 *(col(c).alias(c) for c in string_columns))

In [17]:
training.count()

103795

In [18]:
test.count()

26004

In [107]:
#prep for logistic

indexer = feature.StringIndexer(inputCol="gender", outputCol="gender_label",handleInvalid='skip')
category = feature.StringIndexer(inputCol='category', outputCol='category_encoded',handleInvalid='skip')
brand = feature.StringIndexer(inputCol='phone_brand', outputCol='phone_brand_encoded',handleInvalid='skip')
group = feature.StringIndexer(inputCol='group', outputCol='group_encoded',handleInvalid='skip')
# is_active = feature.StringIndexer(inputCol='is_active', outputCol='is_active_encoded',handleInvalid='skip')
device_model = feature.StringIndexer(inputCol='device_model', outputCol='device_model_encoded',handleInvalid='skip')
town_model = feature.StringIndexer(inputCol='town', outputCol='town_encoded',handleInvalid='skip')
country_model = feature.StringIndexer(inputCol='country', outputCol='country_encoded',handleInvalid='skip')
category_wide = feature.StringIndexer(inputCol='category_mapped', outputCol='category_wide_encoded',handleInvalid='skip')

vector_assembler = feature.VectorAssembler(inputCols=['device_id', 'app_id', 'label_id', 'event_id', 'longitude', 'latitude', 'is_active', 'age', 
                                                            'day', 'hour', 'minute', 'second', 'category_encoded', 'category_wide_encoded', 'phone_brand_encoded',\
                                                     'is_active', 'device_model_encoded'],
                                        outputCol='features')
sc = feature.StandardScaler(inputCol='features',outputCol='sfeatures')

evaluator = BinaryClassificationEvaluator(labelCol='gender_label')

pipe_prep=Pipeline(stages=[indexer, category,category_wide, brand, device_model, vector_assembler, sc])

In [108]:
# logistic with default or no parameters

logistic = classification.LogisticRegression(labelCol='gender_label', featuresCol='sfeatures')

lr_pipe = Pipeline(stages=[pipe_prep, logistic]).fit(training)

result1=evaluator.evaluate(lr_pipe.transform(test))

result1

0.5929850624369608

In [12]:
# logistic with regParam = 0.1

logistic = classification.LogisticRegression(labelCol='gender_label', featuresCol='sfeatures',regParam=0.1)

lr_pipe = Pipeline(stages=[pipe_prep, logistic]).fit(training)

result2=evaluator.evaluate(lr_pipe.transform(test))

result2

0.5911758719539477

In [13]:
# logistic with ElasticNetParam = 0.4 and regParam = 0.1

logistic = classification.LogisticRegression(labelCol='gender_label', featuresCol='sfeatures',regParam=0.1,elasticNetParam=0.4)

lr_pipe = Pipeline(stages=[pipe_prep, logistic]).fit(training)

result3=evaluator.evaluate(lr_pipe.transform(test))

result3

0.5

In [14]:
# logistic with ElasticNetParam = 0.8 and regParam = 0.3

logistic = classification.LogisticRegression(labelCol='gender_label', featuresCol='sfeatures',regParam=0.3,elasticNetParam=0.8)

lr_pipe = Pipeline(stages=[pipe_prep, logistic]).fit(training)

result4=evaluator.evaluate(lr_pipe.transform(test))

result4

0.5

In [15]:
# logistic with ElasticNetParam = 0.8 and regParam = 0.3 and maxIter=10

logistic = classification.LogisticRegression(labelCol='gender_label', featuresCol='sfeatures',regParam=0.3,elasticNetParam=0.8,maxIter=10)

lr_pipe = Pipeline(stages=[pipe_prep, logistic]).fit(training)

result5=evaluator.evaluate(lr_pipe.transform(test))

result5

0.5

In [16]:
# logistic with maxIter=200

logistic = classification.LogisticRegression(labelCol='gender_label', featuresCol='sfeatures',maxIter=200)

lr_pipe = Pipeline(stages=[pipe_prep, logistic]).fit(training)

result6=evaluator.evaluate(lr_pipe.transform(test))

result6

0.5929850624369604

In [17]:
# logistic with ElasticNetParam = 0.1 and regParam = 0.01

logistic = classification.LogisticRegression(labelCol='gender_label', featuresCol='sfeatures',regParam=0.01,elasticNetParam=0.1)

lr_pipe = Pipeline(stages=[pipe_prep, logistic]).fit(training)

result7=evaluator.evaluate(lr_pipe.transform(test))

result7

0.5927645654744123

In [26]:
train_data.printSchema()

root
 |-- device_id: float (nullable = true)
 |-- app_id: float (nullable = true)
 |-- label_id: float (nullable = true)
 |-- event_id: float (nullable = true)
 |-- longitude: float (nullable = true)
 |-- latitude: float (nullable = true)
 |-- is_active: integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- is_installed: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- hour: integer (nullable = true)
 |-- minute: integer (nullable = true)
 |-- second: integer (nullable = true)
 |-- town_index: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- group: string (nullable = true)
 |-- category: string (nullable = true)
 |-- phone_brand: string (nullable = true)
 |-- device_model: string (nullable = true)
 |-- town: string (nullable = true)
 |-- country: string (nullable = true)
 |-- category_mapped: string (nullable = true)



In [109]:
pca=feature.PCA(k=2, inputCol='sfeatures', outputCol='pfeat')

pipe_pca=Pipeline(stages=[pipe_prep,pca]).fit(training)

pca_mod=pipe_pca.transform(train_data)

In [110]:
feat=train_data.columns
actfeat=['device_id', 'app_id', 'label_id', 'event_id', 'longitude', 'latitude', 'age', 
                                                            'day', 'hour', 'minute', 'second', 'category_encoded', 'category_wide_encoded', 'phone_brand_encoded',\
                                                     'is_active', 'device_model_encoded']

# feat

In [111]:
# actfeat=pca_mod.columns

pca=pipe_pca.stages[1].pc.toArray()

pc1_df=pd.DataFrame([pca[:, 0],actfeat]).T.rename(columns={0:'pc1',1:'abs_loadings'})
pc2_df=pd.DataFrame([pca[:, 1],actfeat]).T.rename(columns={0:'pc2',1:'abs_loadings'})

In [115]:
pc1_df.pc1=pc1_df.pc1.abs()

pc1_df.sort_values(by=['pc1'],ascending=False)

Unnamed: 0,pc1,abs_loadings
6,0.66524,age
15,0.66524,device_model_encoded
16,0.144994,
1,0.144915,app_id
14,0.128828,is_active
12,0.127274,category_wide_encoded
9,0.124147,minute
4,0.0928638,longitude
5,0.0779171,latitude
10,0.0765541,second


In [113]:
pc2_df.pc2=pc2_df.pc2.abs()

pc2_df.sort_values(by=['pc2'],ascending=False)

Unnamed: 0,pc2,abs_loadings
4,0.613121,longitude
5,0.604944,latitude
14,0.291743,is_active
16,0.256566,
13,0.151736,phone_brand_encoded
6,0.131258,age
15,0.131258,device_model_encoded
9,0.123078,minute
2,0.118369,label_id
12,0.0987887,category_wide_encoded


In [116]:
impfeat=['age', 'device_model','app_id','is_active','category_mapped','longitude','latitude','phone_brand']

ntrain_data=train_data.select(impfeat)
ntrain_data.printSchema()

root
 |-- age: integer (nullable = true)
 |-- device_model: string (nullable = true)
 |-- app_id: float (nullable = true)
 |-- is_active: integer (nullable = true)
 |-- category_mapped: string (nullable = true)
 |-- longitude: float (nullable = true)
 |-- latitude: float (nullable = true)
 |-- phone_brand: string (nullable = true)



In [118]:
# logistic with imp feat

vc=feature.VectorAssembler(inputCols=['age','device_model_encoded','app_id','is_active',\
                                      'category_wide_encoded','longitude','latitude','phone_brand_encoded'],outputCol='features')

pipe_prep2=Pipeline(stages=[indexer, category_wide, brand, device_model, vc, sc])

logistic = classification.LogisticRegression(labelCol='gender_label', featuresCol='sfeatures')

lr_pipe = Pipeline(stages=[pipe_prep2, logistic]).fit(training)

result8=evaluator.evaluate(lr_pipe.transform(test))

result8


0.5834646953543081

In [119]:
# RF with imp feat, with default parameters

rf=classification.RandomForestClassifier(labelCol='gender_label', featuresCol='sfeatures')

rf_pipe=Pipeline(stages=[pipe_prep2,rf]).fit(training)

resultrf2=evaluator.evaluate(rf_pipe.transform(test))

resultrf2

0.7198191920332011

In [120]:
# RF with imp feat, with maxDepth=10

rf=classification.RandomForestClassifier(labelCol='gender_label', featuresCol='sfeatures',maxDepth=10)

rf_pipe=Pipeline(stages=[pipe_prep2,rf]).fit(training)

resultrf3=evaluator.evaluate(rf_pipe.transform(test))

resultrf3

0.8957866676724066

In [121]:
# RF with imp feat, with maxDepth=10 and numOftrees

rf=classification.RandomForestClassifier(labelCol='gender_label', featuresCol='sfeatures',maxDepth=10,numTrees=50)

rf_pipe=Pipeline(stages=[pipe_prep2,rf]).fit(training)

resultrf4=evaluator.evaluate(rf_pipe.transform(test))

resultrf4

0.9097333833472312