In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
import pandas as pd
import sys
import os
from pyspark.sql.functions import isnan, when, count, col
from pyspark.sql.functions import to_timestamp
import time
from pyspark.sql import functions as fn

import datetime
from pyspark.sql.functions import year, month, dayofmonth, hour, minute, second

In [2]:
spark = SparkSession.builder.appName('data-cleaning').\
                        config("spark.executor.instances", '3').\
                        config("spark.executor.memory", '40g').\
                        config('spark.executor.cores', '5').\
                        config('spark.cores.max', '5').appName('data_clean').\
                        getOrCreate()

In [37]:
spark.sparkContext.addFile('../libraries/spark-csv_2.11-1.5.0.jar')

In [3]:
sqlContext = SQLContext(spark.sparkContext)

In [4]:
app_events = spark.read.format("csv").option("header", "true").load('../data/app_events.csv')
phone_brands = spark.read.format("csv").option("header", "true").load('../modeled_data/phone_brand_device_model_mod.csv')
events = spark.read.format("csv").option("header", "true").load('../data/events.csv')
app_labels = spark.read.format("csv").option("header", "true").load('../data/app_labels.csv')
label_categories = spark.read.format("csv").option("header", "true").load('../data/label_categories.csv')
gender_age = spark.read.format("csv").option("header", "true").load('../data/gender_age_train.csv')

In [5]:
#app_events_sampled = app_events.sample(False, 0.004, seed=0)

In [6]:
app_events.limit(5).toPandas()

Unnamed: 0,event_id,app_id,is_installed,is_active
0,2,5927333115845830913,1,1
1,2,-5720078949152207372,1,0
2,2,-1633887856876571208,1,0
3,2,-653184325010919369,1,1
4,2,8693964245073640147,1,1


In [7]:
app_events_sampled.count()

129728

In [9]:
inner_join = events.join(app_events_sampled, 'event_id', 'inner')

In [50]:
from pyspark.sql.functions import desc
app_events_sampled.select(col('event_id').cast('float').\
                          alias('event_id_int')).\
                          groupBy('event_id_int').count().\
                          sort('event_id_int').show()

+------------+-----+
|event_id_int|count|
+------------+-----+
|        39.0|    1|
|        44.0|    1|
|        61.0|    1|
|        87.0|    1|
|        99.0|    1|
|       114.0|    1|
|       116.0|    1|
|       122.0|    1|
|       127.0|    1|
|       130.0|    1|
|       136.0|    1|
|       138.0|    1|
|       139.0|    1|
|       149.0|    2|
|       163.0|    1|
|       164.0|    1|
|       172.0|    1|
|       229.0|    1|
|       243.0|    1|
|       284.0|    1|
+------------+-----+
only showing top 20 rows



In [65]:
#inner_join.limit(5).toPandas()
inner_join.select(col('event_id').cast('float').\
                          alias('event_id_int')).\
                          groupBy('event_id_int').count().\
                          sort(desc('count')).show()

+------------+-----+
|event_id_int|count|
+------------+-----+
|    127215.0|    4|
|   1575608.0|    4|
|    595011.0|    4|
|    967852.0|    4|
|   2871565.0|    4|
|   2974398.0|    4|
|   1307712.0|    4|
|   2403830.0|    4|
|   1660728.0|    4|
|   2016371.0|    4|
|    890346.0|    4|
|   3067706.0|    4|
|   1814487.0|    4|
|   2076599.0|    4|
|   2091032.0|    4|
|    640263.0|    4|
|   1550168.0|    4|
|    410166.0|    4|
|   2842067.0|    4|
|   1958728.0|    4|
+------------+-----+
only showing top 20 rows



In [10]:
inner_join.filter((inner_join.event_id == 127215.0)).toPandas()

Unnamed: 0,event_id,device_id,timestamp,longitude,latitude,app_id,is_installed,is_active
0,127215,3893593125511450725,2016-05-05 09:27:03,116.28,39.91,-3820274685127793704,1,0
1,127215,3893593125511450725,2016-05-05 09:27:03,116.28,39.91,2417928574338319794,1,0
2,127215,3893593125511450725,2016-05-05 09:27:03,116.28,39.91,5094932332409343445,1,0
3,127215,3893593125511450725,2016-05-05 09:27:03,116.28,39.91,1538962728253750182,1,0


In [11]:
#inner_join.count()
inner_join.filter((inner_join.longitude == '0.00') & (inner_join.latitude == '0.00')).count()

77944

In [22]:
full_data = inner_join.join(phone_brands, 'device_id', 'inner').\
                        join(app_labels, 'app_id', 'inner').\
                        join(label_categories, 'label_id', 'inner').\
                        join(gender_age, 'device_id', 'inner')
full_data.filter((full_data.longitude == '0.00') & (full_data.latitude == '0.00')).count()

184812

In [24]:
full_data=full_data.withColumn('time-stamp',to_timestamp('timestamp')).drop('timestamp').drop('timestamp')
full_data=full_data.withColumn('year',year(fn.col('time-stamp'))).\
    withColumn('month',month(fn.col('time-stamp'))).\
    withColumn('day',dayofmonth(fn.col('time-stamp'))).\
    withColumn('hour',hour(fn.col('time-stamp'))).\
    withColumn('minute',minute(fn.col('time-stamp'))).\
    withColumn('second',second(fn.col('time-stamp')))
full_data=full_data.drop('year').drop('month').drop('time-stamp')

In [33]:
events_train = full_data.filter((full_data.longitude != '0.00') & (full_data.latitude != '0.00'))
events_test = full_data.filter((full_data.longitude == '0.00') & (full_data.latitude == '0.00'))

In [23]:
events_train.count()

130261

In [26]:
events_train.limit(5).toPandas()

Unnamed: 0,device_id,label_id,app_id,event_id,longitude,latitude,is_installed,is_active,device_model,phone_brand,category,gender,age,group,day,hour,minute,second
0,-4968154927622705128,713,-145658454112781034,4633,116.38,39.96,1,0,荣耀6 Plus,Huawei,Services 1,F,53,F43+,1,7,48,6
1,-4968154927622705128,704,-145658454112781034,4633,116.38,39.96,1,0,荣耀6 Plus,Huawei,Property Industry 2.0,F,53,F43+,1,7,48,6
2,-4968154927622705128,548,-145658454112781034,4633,116.38,39.96,1,0,荣耀6 Plus,Huawei,Industry tag,F,53,F43+,1,7,48,6
3,-4968154927622705128,302,-145658454112781034,4633,116.38,39.96,1,0,荣耀6 Plus,Huawei,unknown,F,53,F43+,1,7,48,6
4,-4968154927622705128,303,-145658454112781034,4633,116.38,39.96,1,0,荣耀6 Plus,Huawei,unknown,F,53,F43+,1,7,48,6


In [34]:
from pyspark.sql.types import FloatType
from pyspark.sql.types import IntegerType

float_columns = ['app_id', 'device_id', 'label_id', 'event_id', 'longitude', 'latitude']
int_columns = ['is_active', 'age', 'is_installed', 'day', 'hour', 'minute', 'second']
string_columns = ['gender', 'group', 'category', 'phone_brand', 'device_model']

combined_data_set = [events_train, events_test]
events_train, events_test = [dataset.select(*(col(c).cast("float").alias(c) for c in float_columns), \
                                                 *(col(c).cast("int").alias(c) for c in int_columns), \
                                                 *(col(c).alias(c) for c in string_columns)) \
                for dataset in combined_data_set]

In [36]:
events_test.printSchema()

root
 |-- app_id: float (nullable = true)
 |-- device_id: float (nullable = true)
 |-- label_id: float (nullable = true)
 |-- event_id: float (nullable = true)
 |-- longitude: float (nullable = true)
 |-- latitude: float (nullable = true)
 |-- is_active: integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- is_installed: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- hour: integer (nullable = true)
 |-- minute: integer (nullable = true)
 |-- second: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- group: string (nullable = true)
 |-- category: string (nullable = true)
 |-- phone_brand: string (nullable = true)
 |-- device_model: string (nullable = true)



In [42]:
events_train.toPandas().to_csv("../modeled_data/train.csv", index=False)

In [41]:
events_test.toPandas().to_csv("../modeled_data/test.csv", index=False)