In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
import pandas as pd
import sys
import os
from pyspark.sql.functions import isnan, when, count, col

In [2]:
# Build (or reuse) the Spark session used by every cell below.
# The original chain called appName() twice; only the last value ('data_clean')
# took effect, so the overridden 'data-cleaning' call has been dropped.
# NOTE(review): spark.cores.max ('5') is lower than executor.instances * executor.cores
# (3 * 5) -- confirm this cap is intended for the target cluster manager.
spark = (
    SparkSession.builder
    .appName('data_clean')
    .config("spark.executor.instances", '3')
    .config("spark.executor.memory", '40g')
    .config('spark.executor.cores', '5')
    .config('spark.cores.max', '5')
    .getOrCreate()
)

In [3]:
# Register the spark-csv jar with the running SparkContext through addFile.
# NOTE(review): addFile distributes a file to the nodes; confirm that this is what makes the
# 'com.databricks.spark.csv' format available here, or whether the jar belongs on the classpath.
spark.sparkContext.addFile('../libraries/spark-csv_2.11-1.5.0.jar')

In [4]:
# SQLContext wrapper around the session's SparkContext; used below for every read.
sqlContext = SQLContext(spark.sparkContext)

In [5]:
def _load_csv(path):
    """Read one CSV file with a header row and inferred column types.

    Uses the same reader format and options for every source table so the
    six loads below cannot drift apart.
    """
    reader = sqlContext.read.format('com.databricks.spark.csv')
    return reader.options(header='true', inferschema='true').load(path)


# Raw source tables, one DataFrame per input file.
app_events = _load_csv('../data/app_events.csv')
phone_brands = _load_csv('../modeled_data/phone_brand_device_model_mod.csv')
events = _load_csv('../data/events.csv')
app_labels = _load_csv('../data/app_labels.csv')
label_categories = _load_csv('../data/label_categories.csv')
gender_age = _load_csv('../data/gender_age_train.csv')

In [6]:
# Preview: pull the first 5 rows of app_events to the driver as a pandas frame for display.
app_events.limit(5).toPandas()

In [7]:
# Preview: first 5 rows of phone_brands, rendered through pandas.
phone_brands.limit(5).toPandas()

In [8]:
# Preview: first 5 rows of events, rendered through pandas.
events.limit(5).toPandas()

In [9]:
# Preview: first 5 rows of app_labels, rendered through pandas.
app_labels.limit(5).toPandas()

In [10]:
# Preview: first 5 rows of label_categories, rendered through pandas.
label_categories.limit(5).toPandas()

In [11]:
# Preview: first 5 rows of gender_age, rendered through pandas.
gender_age.limit(5).toPandas()

In [12]:
# Combine all source tables into one wide DataFrame.
# Each join() is given only a column name, so it is an equi-join on that key using
# join()'s default join type; the shared key column appears once in the result.
# The original cell ended with a stray bare `join(app_events, "app_id")` line, which
# referenced an undefined name and raised NameError; it has been removed.
inner_join = (
    events
    .join(phone_brands, "device_id")
    .join(app_events, 'event_id')
    .join(app_labels, 'app_id')
    .join(label_categories, 'label_id')
    .join(gender_age, 'device_id')
)

In [13]:
# Materialise 10 rows of the joined data on the driver as a pandas frame for inspection.
sample = inner_join.limit(10).toPandas()

In [14]:
# Display the 10-row pandas sample taken from the joined data.
sample

In [15]:
# Column order for the joined table, grouped by the kind of field.
reordered_columns = [
    # device
    'device_id', 'phone_brand', 'device_model',
    # app
    'app_id', 'is_active', 'is_installed', 'category',
    # user
    'gender', 'age', 'group',
    # event
    'event_id', 'timestamp', 'latitude', 'longitude',
]

In [16]:
# Project the joined table onto the chosen columns, in the chosen order.
inner_join = inner_join.select(*reordered_columns)

In [17]:
inner_join.printSchema()

In [18]:
# Column names to scan in the missing-value check: everything except 'timestamp'.
cols = list(inner_join.columns)
try:
    cols.remove('timestamp')
except ValueError:
    # 'timestamp' is not among the columns; nothing to drop.
    pass
cols

['device_id',
 'phone_brand',
 'device_model',
 'app_id',
 'is_active',
 'is_installed',
 'category',
 'gender',
 'age',
 'group',
 'event_id',
 'latitude',
 'longitude']

In [19]:
# Per-column count of missing values.
# The original counted only NaN. Missing fields in the CSV inputs load as null rather
# than NaN, so nulls are counted too; otherwise the report understates missing data.
# when(cond, c) yields the literal column-name string for matching rows and null for
# the rest, and count() skips nulls, so each output column is the number of matches.
inner_join.select([
    count(when(col(c).isNull() | isnan(c), c)).alias(c)
    for c in cols
]).show()

In [20]:
# Keep rows where at least one coordinate is non-zero, i.e. drop rows whose latitude
# AND longitude are both 0.
# The original expressed this as `isin([0]) == 'False'`, comparing a boolean column to
# a string and relying on an implicit cast; the direct comparison states the same
# predicate without that cast.
clean_data = inner_join.filter(
    (col('latitude') != 0) | (col('longitude') != 0)
)

In [21]:
# Count of phone_brand values per brand; print 5 of the resulting rows.
brand_counts = inner_join.groupBy('phone_brand').agg({'phone_brand': 'count'})
brand_counts.limit(5).show()

In [22]:
# Write the cleaned data as CSV with a header row.
# repartition(1) collapses the data to one partition before writing.
# The original first line called `clean_data.write(...)`; `write` is a property that
# returns a DataFrameWriter, not a callable, so that line raised TypeError and is removed.
# mode('overwrite') replaces any existing output, so the cell can be re-run instead of
# failing because the path already exists.
(
    clean_data.repartition(1)
    .write
    .format('com.databricks.spark.csv')
    .mode('overwrite')
    .save("../data/clean_data.csv", header='true')
)

In [23]:
# Read the CSV output back in.
# The original pointed at one specific `part-00000-<uuid>` file; that name is generated
# per write, so the path only existed for one particular run. Loading the output
# directory reads whatever part file(s) the write produced.
load_data = (
    sqlContext.read.format('com.databricks.spark.csv')
    .options(header='true', inferschema='true')
    .load('../data/clean_data.csv')
)

# Persist the cleaned data as parquet; overwrite mode keeps the cell re-runnable.
clean_data.write.mode('overwrite').parquet('../modeled_data/clean_data.parquet')

# RDD view of the cleaned DataFrame.
clean_data_rdd = clean_data.rdd

In [24]:
# Report how many partitions back the cleaned data's RDD.
clean_data_rdd.getNumPartitions()

200

In [25]:
# Persist the cleaned data as parquet.
# The same path is already written by an earlier cell, and the writer's default mode
# raises when the target exists, so this cell failed as written; overwrite mode makes
# the write repeatable.
clean_data.write.mode('overwrite').parquet('../modeled_data/clean_data.parquet')

In [27]:
# Read the parquet output back to check the round trip.
# NOTE(review): this rebinds `sample`, which earlier held a pandas frame; from here on it is a Spark DataFrame.
sample = sqlContext.read.parquet('../modeled_data/clean_data.parquet')

In [28]:
# Preview: first 5 rows of the reloaded parquet data, rendered through pandas.
sample.limit(5).toPandas()

Unnamed: 0,device_id,phone_brand,device_model,app_id,is_active,is_installed,category,gender,age,group,event_id,timestamp,latitude,longitude
0,-2263134200994072375,Xiaomi,红米Note2,-9066890603850550053,0,1,The elimination of class,F,56,F43+,2508603,2016-05-03 08:46:04,40.73,122.1
1,-2263134200994072375,Xiaomi,红米Note2,-9066890603850550053,0,1,game,F,56,F43+,2508603,2016-05-03 08:46:04,40.73,122.1
2,-2263134200994072375,Xiaomi,红米Note2,-9066890603850550053,0,1,Tencent,F,56,F43+,2508603,2016-05-03 08:46:04,40.73,122.1
3,-2263134200994072375,Xiaomi,红米Note2,-9066890603850550053,0,1,Custom label,F,56,F43+,2508603,2016-05-03 08:46:04,40.73,122.1
4,-2263134200994072375,Xiaomi,红米Note2,-9066890603850550053,0,1,The elimination of class,F,56,F43+,36755,2016-05-04 08:36:22,40.72,122.1


In [29]:
# Shut down the Spark session now that the notebook is finished.
spark.stop()