
# iPhone Sales Analysis

In [0]:
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [0]:
# Obtain the Spark session for this notebook (getOrCreate reuses an active one if present).
spark = (
    SparkSession.builder
    .appName("iphone_sales_analysis")
    .getOrCreate()
)


### API for Sales Data Collector

##### It reads a sales data file that is pipe-delimited (`|`) and has a header row, writes it to a Hive table in Parquet format partitioned by `sale_date`, and returns the table name.

##### The sales data (10,000 rows) and the product data were generated with ChatGPT so that I could work with a larger dataset.



In [0]:
# Pipe-delimited text export of the sales data.
sales_filepath = "dbfs:/FileStore/shared_uploads/snl.adh97@gmail.com/large_sales_data_pipe_delimited.txt"

# CSV uploads: product data and sales data.
product_filepath_csv = "dbfs:/FileStore/shared_uploads/snl.adh97@gmail.com/product_data.csv"
sales_filepath_csv = "dbfs:/FileStore/shared_uploads/snl.adh97@gmail.com/sales_data.csv"

In [0]:
def sales_data_api(spark, text_file_path, hive_table_name="sale_date_partitioned"):
    """Load a pipe-delimited sales file into a Parquet table partitioned by sale_date.

    Args:
        spark: Active SparkSession used to read the file and write the table.
        text_file_path: Location of the pipe-delimited sales file; its first
            line is treated as a header.
        hive_table_name: Name of the table to (over)write. Defaults to
            "sale_date_partitioned".

    Returns:
        The name of the table that was written.

    Raises:
        ValueError: If text_file_path is empty or None.
    """
    # Fail fast on a missing path instead of surfacing an opaque Spark error later.
    if not text_file_path:
        raise ValueError("text_file_path must be a non-empty path to the sales data file")

    # Explicit schema keeps control over the column data types (no inference).
    sales_schema = StructType([
        StructField("seller_id", IntegerType(), True),
        StructField("product_id", IntegerType(), True),
        StructField("buyer_id", IntegerType(), True),
        StructField("sale_date", StringType(), True),  # read as string, cast to date below
        StructField("quantity", IntegerType(), True),
        StructField("price", IntegerType(), True)])

    # Read from the path passed by the caller (not from a notebook-level global),
    # so the function works for any input file.
    sales = (
        spark.read.format("csv")
        .option("delimiter", "|")
        .option("header", True)
        .schema(sales_schema)
        .load(text_file_path)
    )

    # Convert sale_date from string to date.
    sales = sales.withColumn("sale_date", col("sale_date").cast(DateType()))

    # Persist as a Parquet table partitioned by sale_date, replacing any existing data.
    (
        sales.write.mode("overwrite")
        .partitionBy("sale_date")
        .format("parquet")
        .saveAsTable(hive_table_name)
    )

    return hive_table_name

In [0]:
# Optional sanity check (currently disabled): count the number of rows in the table.
# NOTE: `hive_table_name` is not assigned at notebook level in the cells shown above;
# set it first (e.g. to the value returned by sales_data_api) before enabling these lines.
# row_count = spark.sql(f"SELECT COUNT(*) AS row_count FROM {hive_table_name}").collect()[0]["row_count"]
# print(f"The Hive table '{hive_table_name}' contains {row_count} rows.")

The Hive table 'sales_partitioned' contains 10000 rows.


In [0]:
# Run the collector on the pipe-delimited sales file; the cell displays the returned table name.
sales_data_api(spark=spark, text_file_path=sales_filepath)

Out[9]: 'sale_date_partitioned'