In [None]:
 # Shell command, not Python: run it from a terminal to start PySpark.
 # --packages pulls in the PostgreSQL JDBC driver (org.postgresql:postgresql:42.2.10) used for data extraction;
 # --conf sets spark.sql.catalogImplementation=in-memory.
pyspark --packages org.postgresql:postgresql:42.2.10 --conf spark.sql.catalogImplementation=in-memory

In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark import SparkConf
from pyspark.sql.functions import *
from pyspark.sql.types import StructType,StructField, StringType
from pyspark.sql.functions import col
from pyspark.sql.functions import rand
from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.mllib.stat import Statistics
from pyspark.sql.functions import covar_pop
from pyspark.sql.functions import input_file_name
import pandas as pd
import glob
import os
import pandas as pd

# Obtain the SparkSession used by every step below: local master with a single
# worker thread, reusing an already-active session when one exists (getOrCreate).
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName('SparkByExamples.com')
    .getOrCreate()
)

# Temp view cleanup: drop the three source views if they are still registered
# from an earlier run of this notebook.
for stale_view in ("cust_info_s", "trans_info_s", "offer_info_s"):
    spark.catalog.dropTempView(stale_view)

# Source tables to extract from the Postgres server. Each one is registered as
# a Spark temp view under the same name so the SQL further down can query it.
tablename_list = ['cust_info_s','trans_info_s','offer_info_s']

# JDBC connection settings for the source server.
url = "jdbc:postgresql://localhost:5431/postgres"

# Read through the `spark` session built above instead of the `sqlContext`
# global, which exists only when the kernel is started by the pyspark shell.
reader = (
    spark.read.format("jdbc")
    .option("url", url)
    .option("user", "postgres")
    # Prefer a password supplied through the POSTGRES_PASSWORD environment variable.
    # NOTE(review): the literal fallback keeps existing runs working but is still a
    # credential in source -- remove it once the variable is set everywhere.
    .option("password", os.environ.get("POSTGRES_PASSWORD", "puvi"))
    .option("driver", "org.postgresql.Driver")
)

for tablename in tablename_list:
    # createOrReplaceTempView replaces registerTempTable (deprecated since Spark 2.0)
    # and overwrites a view of the same name, so this cell can be re-run.
    reader.option("dbtable", tablename).load().createOrReplaceTempView(tablename)


    
# Sample view of the source tables extracted from the Postgres server.

# Build the SQLContext from the `spark` session created in this notebook rather
# than from the `sc` global, which is only injected by the pyspark shell.
# `sqlc` is kept because the transformation step below runs its query through it.
sqlc = SQLContext(spark.sparkContext, spark)

# Show the first 10 rows of each registered source view.
for preview_table in ('cust_info_s', 'trans_info_s', 'offer_info_s'):
    sqlc.sql("select * from " + preview_table).show(10)

# Transforming the data into business requirements.
#
# The query builds two CTEs and joins them to the customer view:
#   * sales_data - per customer and date: sum(sales) as total_sales and
#     count(trans_id) as visits, read from trans_info_s.
#   * offer_data - per customer: count(o.offer_id) and sum(offer_redem) over
#     offer_info_s joined to trans_info_s on cust_id.
#     NOTE(review): that join produces one row per (offer, transaction) pair of
#     a customer, so the counts may be multiplied by the number of
#     transactions -- confirm this is intended.
# The outer select adds age, the abbreviated day name (`days`) and week_number.
#
# Age is the number of whole years elapsed: months_between(...) / 12, floored.
# The previous datediff(...) / 365 ignored leap days, so the age rolled over
# several days before the actual birthday.
#
# NOTE(review): the inner join on b.cust_id drops customers without sales rows,
# so the preceding left join effectively behaves as an inner join.
# NOTE(review): week-based pattern letters such as 'w' are reportedly rejected
# by date_format on Spark 3.0+ unless the legacy time parser is enabled;
# weekofyear() would be the portable alternative -- verify on the target version.

df_transf = sqlc.sql("""  WITH sales_data AS ( select sum(sales) as total_sales,count(trans_id) as visits,cust_id,date 
            from trans_info_s
            group by date,cust_id),
            offer_data AS ( 
            select count(o.offer_id) as no_offer_received,sum(offer_redem) as no_offer_redem,o.cust_id 
            from offer_info_s o
            inner join trans_info_s t
            on t.cust_id = o.cust_id
            group by o.cust_id order by o.cust_id)
            select b.date,b.cust_id,a.cust_name,a.cust_dob,
            int(floor(months_between(current_date(),TO_DATE(CAST(UNIX_TIMESTAMP(a.cust_dob,'yyyy-MM-dd') AS TIMESTAMP)))/12)) as age,
            total_sales,c.no_offer_received,c.no_offer_redem,visits, date_format(to_date(b.date),'E') as days,
            int(date_format(to_date(b.date),'w')) AS  week_number
            from cust_info_s a
            left join sales_data b
            on a.cust_id = b.cust_id
            inner join offer_data c
            on b.cust_id = c.cust_id
            group by b.date,a.cust_id,b.cust_id,a.cust_name,a.cust_dob,c.no_offer_received,c.no_offer_redem,total_sales,visits,days
            order by a.cust_id
            """)

# Pivot on the day name: one row per grouping key and one column per distinct
# `days` value holding max(no_offer_received); nulls in numeric columns
# (e.g. days with no data for a row) are replaced with 0.
final_dfs = (
    df_transf.orderBy('cust_id')
    .groupby('cust_id','cust_name','age','date','days','no_offer_received','no_offer_redem','total_sales')
    .pivot('days')
    .max('no_offer_received')
    .fillna(0)
)

# Sample the final data before it is written out (original note: in case of a
# job failure the data can be viewed in the Spark UI).
final_dfs.show()

# Write the final data to CSV: collapse to a single partition, replace any
# previous output at the target location, and include a header row.
weekly_output = final_dfs.repartition(1)
weekly_writer = weekly_output.write.mode("overwrite").option("header", True)
weekly_writer.csv("file:///C:/hdaoopdata/ics_weekly_data/")


+-------------------+----------------+----------+--------+
|          cust_name|         cust_id|  cust_dob|  gender|
+-------------------+----------------+----------+--------+
|         RoseJacobi|6011837504027367|1981-08-06|   Edgar|
|Dr. Carley Predovic|4716744331063073|1984-07-30|   Deron|
|      Lilla Weimann|5370009829177288|1993-05-16|Leonardo|
|Cathrine Hodkiewicz| 345858730400343|1995-05-20|    Rick|
|      Celia Pfeffer| 340364615541355|2009-06-25|  Bryana|
|        Cleve Fahey|2628327277351883|2017-08-16| Ibrahim|
|     Gillian Senger| 346688355572616|1972-02-06| Keshawn|
|     Joshua Roberts|5290632464794443|1995-08-18|   Ethan|
|   Rasheed Gislason|2506922387299209|1994-07-06| Rasheed|
|      Felicity Ryan|4532251518044506|2001-09-08|   Aleen|
+-------------------+----------------+----------+--------+
only showing top 10 rows

+--------+----------------+----------+--------+---------+-----------+-----+----------+
|trans_id|         cust_id|product_id|store_id| offer_id|offe