In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.types import IntegerType

In [24]:
spark = SparkSession.builder.getOrCreate()
#specifying the data path
data_path = '/Users/hamidsakhi/git/BDM-P3/'


purchase_file_path = data_path + 'purchase.csv'
purchase_df = spark.read.format('csv').option('header','false').load(purchase_file_path)
#since the dataset doesn't have column names we should add headers here
purchase_df = purchase_df.withColumnRenamed('_c0','TransId')\
                               .withColumnRenamed('_c1','CustId')\
                               .withColumnRenamed('_c2','TransTotal')\
                               .withColumnRenamed('_c3','TransNumItem')\
                               .withColumnRenamed('_c4','TransDesc')
purchase_df.createOrReplaceTempView('purchase')

customer_file_path = data_path + 'customer.csv'
customer_df = spark.read.format('csv').option('header','false').load(customer_file_path)
#since the dataset doesn't have column names we should add headers here
customer_df = customer_df.withColumnRenamed('_c0','ID')\
                               .withColumnRenamed('_c1','Name')\
                               .withColumnRenamed('_c2','Age')\
                               .withColumnRenamed('_c3','CountryCode')\
                               .withColumnRenamed('_c4','Salary')
#creating the view 
customer_df.createOrReplaceTempView('customer')

In [28]:
T1 = spark.sql("""SELECT * FROM purchase where transtotal>600""")

purchase_df.createOrReplaceTempView('T1')

In [30]:
T2 = spark.sql(""" SELECT AVG(TransTotal) as Mean, MIN(TransTotal) As Min, Max(TransTotal) As Max 
                    FROM T1
                    GROUP BY TransNumitem""")

In [65]:
#reporting back
T2.show()

+------------------+-----+------+
|              Mean|  Min|   Max|
+------------------+-----+------+
|1004.8838597354422| 10.0|999.99|
|1004.5357413229322| 10.0|999.99|
|1004.0483739908203| 10.0|999.99|
|1004.8273086510765| 10.0|999.99|
| 1004.497867026794|10.03|999.99|
|1004.5615556615148|10.01|999.99|
|1005.7739147978656|10.01|999.99|
|1004.8441406029958| 10.0|999.99|
|1004.9988513081908| 10.0|999.97|
|1004.5041289494897|10.01|999.99|
|1005.1305228108343| 10.0|999.99|
|1004.8712438917504| 10.0|999.99|
|1006.6067367065345|10.01|999.99|
|1006.5836723757462| 10.0|999.99|
|1004.6530546314438| 10.0|999.98|
+------------------+-----+------+



In [105]:
T3 = spark.sql(""" SELECT p.custid, sum(p.TransNumItem) as total_transnumItem, Max(c.age) FROM T1 p, customer c 
                    where p.custid = c.id
                    and c.age between 18 and 25
                    group by p.custid
                    """).show()

+------+------------------+--------+
|custid|total_transnumItem|max(age)|
+------+------------------+--------+
|  1159|             767.0|      18|
| 14887|            1007.0|      23|
| 14899|             770.0|      19|
|  1572|             900.0|      18|
| 16576|             881.0|      22|
| 17427|             870.0|      20|
| 18726|             693.0|      19|
| 19132|             914.0|      20|
| 20512|             788.0|      21|
| 25969|             896.0|      24|
| 28316|             763.0|      23|
| 29539|             752.0|      18|
| 29573|             651.0|      19|
| 30923|             785.0|      21|
| 31518|             820.0|      21|
| 31713|             910.0|      22|
| 33783|             835.0|      20|
| 35844|             774.0|      20|
| 36526|             851.0|      18|
| 40874|             785.0|      25|
+------+------------------+--------+
only showing top 20 rows



In [None]:
T4 = spark.sql("SELECT custid as c1