In [9]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from datetime import datetime
from string import *

spark=SparkSession.builder \
      .appName("Market_campaign_analysis") \
      .enableHiveSupport() \
      .getOrCreate()
               
ad_campaign_data=[{
"campaign_id": "ABCDFAE",
"campaign_name": "Food category target campaign",
"campaign_country": "USA",
"os_type": "ios",
"device_type": "apple",
"place_id": "CASSBB-11",
"user_id": "1264374214654454321",
"event_type": "impression",
"event_time": "2018-10-12T13:10:05.000Z"
},
{
"campaign_id": "ABCDFAE",
"campaign_name": "Food category target campaign",
"campaign_country": "USA",
"os_type": "android",
"device_type": "MOTOROLA",
"place_id": "CADGBD-13",
"user_id": "1674374214654454321",
"event_type": "impression",
"event_time": "2018-10-12T13:09:04.000Z"
},
{
"campaign_id": "ABCDFAE",
"campaign_name": "Food category target campaign",
"campaign_country": "USA",
"os_type": "android",
"device_type": "SAMSUNG",
"place_id": "BADGBA-12",
"user_id": "5747421465445443",
"event_type": "video ad",
"event_time": "2018-10-12T13:10:10.000Z"
},
{
"campaign_id": "ABCDFAE",
"campaign_name": "Food category target campaign",
"campaign_country": "USA",
"os_type": "android",
"device_type": "SAMSUNG",
"place_id": "CASSBB-11",
"user_id": "1864374214654454132",
"event_type": "click",
"event_time": "2018-10-12T13:10:12.000Z"
}
]

### User profile dataset
user_profile_data=[
{
"user_id": "1264374214654454321",
"country": "USA",
"gender": "male",
"age_group": "18-25",
"category": [
"shopper",
"student"
]
},
{
"user_id": "1674374214654454321",
"country": "USA",
"gender": "female",
"age_group": "25-50",
"category": [
"parent"
]
},
{
"user_id": "5747421465445443",
"country": "USA",
"gender": "male",
"age_group": "25-50",
"category": [
"shopper",
"parent",
"professional"
]
},
{
"user_id": "1864374214654454132",
"country": "USA",
"gender": "male",
"age_group": "50+",
"category": [
"professional"
]
},
{
"user_id": "14537421465445443",
"country": "USA",
"gender": "female",
"age_group": "18-25",
"category": [
"shopper",
"student"
]
},
{
"user_id": "25547421465445443",
"country": "USA",
"gender": "female",
"age_group": "50+",
"category": [
"shopper",
"professional"
]
}
]

#store_data
store_data=[
{
"store_name": "McDonald",
"place_ids": [
"CASSBB-11",
"CADGBD-13",
"FDBEGD-14"
]
},
{
"store_name": "BurgerKing",
"place_ids": [
"CASSBB-11"
]
},
{
"store_name": "Macys",
"place_ids": [
"BADGBA-13",
"CASSBB-15",
"FDBEGD-15"
]
},
{
"store_name": "shoppers stop",
"place_ids": [
"BADGBA-12"
]
}
]

    
    
ad_campaigns_df=spark.createDataFrame(ad_campaign_data)
user_df=spark.createDataFrame(user_profile_data)
store_df=spark.createDataFrame(store_data)

ad_campaigns_df.show(5)
user_df.show(5)
store_df.show(5)

+----------------+-----------+--------------------+-----------+--------------------+----------+-------+---------+-------------------+
|campaign_country|campaign_id|       campaign_name|device_type|          event_time|event_type|os_type| place_id|            user_id|
+----------------+-----------+--------------------+-----------+--------------------+----------+-------+---------+-------------------+
|             USA|    ABCDFAE|Food category tar...|      apple|2018-10-12T13:10:...|impression|    ios|CASSBB-11|1264374214654454321|
|             USA|    ABCDFAE|Food category tar...|   MOTOROLA|2018-10-12T13:09:...|impression|android|CADGBD-13|1674374214654454321|
|             USA|    ABCDFAE|Food category tar...|    SAMSUNG|2018-10-12T13:10:...|  video ad|android|BADGBA-12|   5747421465445443|
|             USA|    ABCDFAE|Food category tar...|    SAMSUNG|2018-10-12T13:10:...|     click|android|CASSBB-11|1864374214654454132|
+----------------+-----------+--------------------+-----------

##### Analyse data for each campaign_id, date, hour, os_type & value to get all the events with counts

In [20]:
from pyspark.sql.functions import *
#ad_campaign_df_transform=ad_campaigns_df.select("campaign_country","campaign_id","campaign_name","device_type","event_time","event_type")
ad_campaign_df_transform=ad_campaigns_df.groupBy("campaign_country",\
                                                 "campaign_id",\
                                                 "campaign_name",\
                                                 to_date(col("event_time")).alias("event_date"),"event_type") \
                            .agg(count('*'))
ad_campaign_df_transform.show(5)

+----------------+-----------+--------------------+----------+----------+--------+
|campaign_country|campaign_id|       campaign_name|event_date|event_type|count(1)|
+----------------+-----------+--------------------+----------+----------+--------+
|             USA|    ABCDFAE|Food category tar...|2018-10-12|impression|       2|
|             USA|    ABCDFAE|Food category tar...|2018-10-12|  video ad|       1|
|             USA|    ABCDFAE|Food category tar...|2018-10-12|     click|       1|
+----------------+-----------+--------------------+----------+----------+--------+



##### Analyse data for each campaign_id, date, hour, store_name & value to get all the events with counts

In [43]:
#explode place_ids per store name
store_details_df=store_df.select(explode("place_ids").alias("place_ids") , "store_name")
#store_details_df.show(10)
ad_campaign_and_store_join=ad_campaigns_df.join(store_details_df, col("place_ids")==col("place_id"), how='left')
#ad_campaign_and_store_join.show(5)
ad_campaign_and_store_join=ad_campaign_and_store_join.withColumn('type', when(col('place_ids').isNotNull(),'store_name'))
#ad_campaign_and_store_join.show(5)
ad_campaign_and_store_join_transform=ad_campaign_and_store_join.groupBy("campaign_id",\
                                                 to_date("event_time").alias('event_date'),\
                                                 hour("event_time").alias("event_hour"),\
                                                 col("type").alias("type"),\
                                                "store_name",\
                                                "event_type"
                                                )\
                        .agg(count('*').alias("event_count"))
ad_campaign_and_store_join_transform.show(10)

                                                                                

+-----------+----------+----------+----------+-------------+----------+-----------+
|campaign_id|event_date|event_hour|      type|   store_name|event_type|event_count|
+-----------+----------+----------+----------+-------------+----------+-----------+
|    ABCDFAE|2018-10-12|        13|store_name|   BurgerKing|impression|          1|
|    ABCDFAE|2018-10-12|        13|store_name|     McDonald|     click|          1|
|    ABCDFAE|2018-10-12|        13|store_name|shoppers stop|  video ad|          1|
|    ABCDFAE|2018-10-12|        13|store_name|   BurgerKing|     click|          1|
|    ABCDFAE|2018-10-12|        13|store_name|     McDonald|impression|          2|
+-----------+----------+----------+----------+-------------+----------+-----------+



##### Analyse data for each campaign_id, date, hour, gender_type & value to get all the events with counts

In [46]:
ad_campaign_and_user_join=ad_campaigns_df.join(user_df,on="user_id",how="left")
#ad_campaign_and_user_join.show(10)
#ad_campaign_and_user_join.printSchema()

ad_campaign_and_user_join_transform=ad_campaign_and_user_join.groupBy("campaign_id",to_date("event_time").alias("date"),hour("event_time").alias("hour"),\
                                                                      col("gender").alias("gender_type"),\
                                                                     col('event_type').alias("type"))\
.agg(count('*').alias('value'))
ad_campaign_and_user_join_transform.show(10)

                                                                                

+-----------+----------+----+-----------+----------+-----+
|campaign_id|      date|hour|gender_type|      type|value|
+-----------+----------+----+-----------+----------+-----+
|    ABCDFAE|2018-10-12|  13|       male|  video ad|    1|
|    ABCDFAE|2018-10-12|  13|     female|impression|    1|
|    ABCDFAE|2018-10-12|  13|       male|     click|    1|
|    ABCDFAE|2018-10-12|  13|       male|impression|    1|
+-----------+----------+----+-----------+----------+-----+



##### write the data into files in json format

In [49]:
path='/tmp/market_campaign_analysis_output_files'

ad_campaign_and_user_join_transform.write.mode('append').json(path+"/ad_campaign_and_user_join_transform")
ad_campaign_and_store_join_transform.write.mode('append').json(path+"/ad_campaign_and_store_join_transform")
ad_campaign_df_transform.write.mode('append').json(path+"/ad_campaign_df_transform")

                                                                                