In [None]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
import pandas as pd
import sys
import os
from pyspark.sql.functions import isnan, when, count, col

In [None]:
# Build (or reuse) the SparkSession used by every cell below.
# The application name is set once: the builder keeps only the last appName()
# call, so 'data_clean' is the name that was effectively in use.
# NOTE(review): spark.cores.max=5 together with 5 cores per executor leaves room
# for a single executor on standalone/Mesos clusters, while
# spark.executor.instances asks for 3 — confirm the intended sizing.
spark = (
    SparkSession.builder
    .appName('data_clean')
    .config("spark.executor.instances", '3')
    .config("spark.executor.memory", '40g')
    .config('spark.executor.cores', '5')
    .config('spark.cores.max', '5')
    .getOrCreate()
)

In [None]:
# Ship the spark-csv jar to the cluster nodes as a plain file via SparkContext.addFile.
# NOTE(review): addFile distributes files but does not appear to place a jar on the
# JVM classpath (spark.jars / --jars do that) — confirm this call is still needed.
spark.sparkContext.addFile('../libraries/spark-csv_2.11-1.5.0.jar')

In [None]:
# SQLContext built on the session's SparkContext; the cells below use it as the
# entry point for reading CSV and parquet data.
sqlContext = SQLContext(spark.sparkContext)

In [None]:
def _read_csv(path):
    """Read a CSV file with a header row and inferred column types.

    Uses the 'com.databricks.spark.csv' source through the notebook's
    sqlContext and returns the resulting Spark DataFrame.
    """
    reader = sqlContext.read.format('com.databricks.spark.csv')
    return reader.options(header='true', inferschema='true').load(path)


# One Spark DataFrame per source table.
app_events = _read_csv('../data/app_events.csv')
phone_brands = _read_csv('../modeled_data/phone_brand_device_model_mod.csv')
events = _read_csv('../data/events.csv')
app_labels = _read_csv('../data/app_labels.csv')
label_categories = _read_csv('../data/label_categories.csv')
gender_age = _read_csv('../data/gender_age_train.csv')

In [None]:
# Preview the first five rows of app_events as a pandas DataFrame.
app_events.limit(5).toPandas()

In [None]:
# Preview the first five rows of phone_brands as a pandas DataFrame.
phone_brands.limit(5).toPandas()

In [None]:
# Preview the first five rows of events as a pandas DataFrame.
events.limit(5).toPandas()

In [None]:
# Preview the first five rows of app_labels as a pandas DataFrame.
app_labels.limit(5).toPandas()

In [None]:
# Preview the first five rows of label_categories as a pandas DataFrame.
label_categories.limit(5).toPandas()

In [None]:
# Preview the first five rows of gender_age as a pandas DataFrame.
gender_age.limit(5).toPandas()

In [None]:
# Combine the six source tables into one wide frame. Each join names a single
# key column, so Spark performs an inner equi-join and keeps one copy of the key.
#   events -> phone_brands      on device_id
#          -> app_events        on event_id
#          -> app_labels        on app_id
#          -> label_categories  on label_id
#          -> gender_age        on device_id
# app_events is joined exactly once (on event_id); there is no free-standing
# `join` function, so the chain is the only join expression in this cell.
inner_join = (
    events
    .join(phone_brands, "device_id")
    .join(app_events, 'event_id')
    .join(app_labels, 'app_id')
    .join(label_categories, 'label_id')
    .join(gender_age, 'device_id')
)

In [None]:
# Pull the first ten joined rows to the driver as a pandas DataFrame for inspection.
sample = inner_join.limit(10).toPandas()

In [None]:
# Display the ten-row pandas sample of the joined data.
sample

In [None]:
# Column order applied to the joined frame; only the listed columns are kept.
reordered_columns = [
    'device_id',
    'phone_brand',
    'device_model',
    'app_id',
    'is_active',
    'is_installed',
    'category',
    'gender',
    'age',
    'group',
    'event_id',
    'timestamp',
    'latitude',
    'longitude',
]

In [None]:
# Project the joined frame onto reordered_columns, in that order.
inner_join = inner_join.select(*reordered_columns)

In [None]:
# Print the column names and types of the joined frame.
inner_join.printSchema()

In [None]:
# Every column of the joined frame except 'timestamp'; this list drives the
# per-column check in the next cell.
# NOTE(review): presumably 'timestamp' is left out because isnan() does not
# accept timestamp-typed columns — confirm.
cols = [name for name in inner_join.columns if name != 'timestamp']
cols

In [None]:
# Count missing values per column and show the result as a single row.
# A value counts as missing when it is NaN or null: isnan() alone never matches
# null, and empty CSV fields are loaded as null, so a NaN-only test would
# under-report.
# The second argument of when() is the column *name* as a literal, so count()
# tallies exactly the rows where the condition holds.
inner_join.select([
    count(when(isnan(c) | col(c).isNull(), c)).alias(c)
    for c in cols
]).show()

In [None]:
# Keep rows where at least one coordinate is non-zero, i.e. drop rows located
# at exactly (0, 0).
# NOTE(review): (0, 0) is presumably a placeholder for "no location" — confirm.
# The comparison is written directly as `!= 0` rather than comparing an isin()
# result with the string 'False', which only worked through implicit
# string-to-boolean coercion. Null coordinates behave as before (the predicate
# evaluates to null unless the other coordinate is non-zero).
clean_data = inner_join.filter(
    (col('latitude') != 0) | (col('longitude') != 0)
)

In [None]:
# Show the number of joined rows for five phone brands.
(
    inner_join
    .groupBy('phone_brand')
    .agg(count('phone_brand'))
    .limit(5)
    .show()
)

In [None]:
# Export the cleaned frame as CSV with a header row. repartition(1) collapses
# the data to one partition so the output directory holds a single part file.
# DataFrame.write is a property that returns a DataFrameWriter (it cannot be
# called), so the export goes through .format(...).save(...).
# With the default save mode this raises if the target path already exists.
clean_data.repartition(1).write.format('com.databricks.spark.csv').save("../data/clean_data.csv", header='true')

In [None]:
# Read the CSV export back by pointing at its output directory. Part-file names
# embed an identifier generated per write, so a hard-coded part-file path stops
# working as soon as the export is regenerated.
load_data = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load('../data/clean_data.csv')
# The parquet export of clean_data is performed once, in its own cell further
# down; writing it here as well made that later write fail on an existing path.
# RDD view of the cleaned frame, used below to inspect its partitioning.
clean_data_rdd = clean_data.rdd

In [None]:
# Number of partitions backing the cleaned data's RDD.
clean_data_rdd.getNumPartitions()

In [None]:
# Export the cleaned frame as parquet. 'overwrite' replaces whatever is already
# at the target path, so the cell succeeds even when the path has been written
# earlier in this run or by a previous run (the default mode raises instead).
clean_data.write.parquet('../modeled_data/clean_data.parquet', mode='overwrite')

In [None]:
# Read the parquet export back into a Spark DataFrame.
# Note: this rebinds `sample`, which earlier held a pandas DataFrame.
sample = sqlContext.read.parquet('../modeled_data/clean_data.parquet')

In [None]:
# Preview the first five rows read back from parquet as a pandas DataFrame.
sample.limit(5).toPandas()

In [None]:
# Stop the SparkSession (and its underlying SparkContext) now that the work is done.
spark.stop()