In [3]:
# Bootstrap PySpark for this notebook.
# findspark.init() has to run before any pyspark import: the pyspark imports
# below are only expected to work once it has been called.
import findspark
findspark.init()
import pyspark # only run after findspark.init()
from pyspark.sql import SparkSession
# Reuse the already-running SparkSession if there is one, otherwise start a new one.
spark = SparkSession.builder.getOrCreate()

In [4]:
# --- Input file -------------------------------------------------------------
file_location = "C:/Users/Dinesh_2/Desktop/1000 Sales Records.csv"
file_type = "csv"

# --- CSV parsing settings ---------------------------------------------------
# NOTE(review): `infer_schema` is declared but never handed to the reader in
# this cell, so no schema inference is requested here. The columns are cast
# explicitly in a later cell instead -- confirm this is intentional.
infer_schema = "true"
first_row_is_header = "true"
delimiter = ","

# Read the raw file into `df_src`. The header/sep/encoding options are
# CSV-specific; for other file types they will be ignored.
df_src = (
    spark.read.format(file_type)
    .options(
        header=first_row_is_header,
        sep=delimiter,
        encoding="UTF-8",
    )
    .load(file_location)
)


In [5]:
# Print the column names and data types of the freshly loaded DataFrame.
df_src.printSchema()


root
 |-- Row ID: string (nullable = true)
 |-- Order ID: string (nullable = true)
 |-- Order Date: string (nullable = true)
 |-- Ship Date: string (nullable = true)
 |-- Ship Mode: string (nullable = true)
 |-- Customer ID: string (nullable = true)
 |-- Customer Name: string (nullable = true)
 |-- Segment: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- City: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Postal Code: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- Product ID: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Sub-Category: string (nullable = true)
 |-- Product Name: string (nullable = true)
 |-- Sales: string (nullable = true)
 |-- Quantity: string (nullable = true)
 |-- Discount: string (nullable = true)
 |-- Profit: string (nullable = true)



In [6]:
from pyspark.sql import functions as F

# Columns that need a real type, paired with the Spark type to cast them to.
# The pairs are applied in this order, one withColumn call per pair.
column_casts = [
    ('Order Date', 'Date'),
    ('Quantity', 'int'),
    ('Sales', 'decimal(23,4)'),
    ('Discount', 'decimal(23,4)'),
    ('Profit', 'decimal(23,4)'),
]

# Replace each listed column with its cast version, keeping the same column name.
for column_name, spark_type in column_casts:
    df_src = df_src.withColumn(column_name, F.col(column_name).cast(spark_type))

# Print the schema again to confirm the new column types.
df_src.printSchema()

root
 |-- Row ID: string (nullable = true)
 |-- Order ID: string (nullable = true)
 |-- Order Date: date (nullable = true)
 |-- Ship Date: string (nullable = true)
 |-- Ship Mode: string (nullable = true)
 |-- Customer ID: string (nullable = true)
 |-- Customer Name: string (nullable = true)
 |-- Segment: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- City: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Postal Code: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- Product ID: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Sub-Category: string (nullable = true)
 |-- Product Name: string (nullable = true)
 |-- Sales: decimal(23,4) (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Discount: decimal(23,4) (nullable = true)
 |-- Profit: decimal(23,4) (nullable = true)



In [12]:
# Add a derived column: Price = Sales / Quantity, computed per row.
df_src = df_src.withColumn('Price', df_src['Sales'] / df_src['Quantity'])

# Preview the new column next to the two columns it was computed from.
df_src.select(
    'Customer Name', 'Country', 'City', 'Product Name',
    'Sales', 'Quantity', 'Price',
).show()

+------------------+-------------+---------------+--------------------+---------+--------+-------------------+
|     Customer Name|      Country|           City|        Product Name|    Sales|Quantity|              Price|
+------------------+-------------+---------------+--------------------+---------+--------+-------------------+
|       Claire Gute|United States|      Henderson|Bush Somerset Col...| 261.9600|       2|130.980000000000000|
|       Claire Gute|United States|      Henderson|Hon Deluxe Fabric...| 731.9400|       3|243.980000000000000|
|   Darrin Van Huff|United States|    Los Angeles|Self-Adhesive Add...|  14.6200|       2|  7.310000000000000|
|    Sean O'Donnell|United States|Fort Lauderdale|Bretford CR4500 S...| 957.5775|       5|191.515500000000000|
|    Sean O'Donnell|United States|Fort Lauderdale|Eldon Fold 'N Rol...|  22.3680|       2| 11.184000000000000|
|   Brosina Hoffman|United States|    Los Angeles|Eldon Expressions...|  48.8600|       7|  6.980000000000000|
|

In [13]:
# Rename 'Order Date' to 'Order_date'; the Spark SQL queries in the following
# cells refer to the column by this name.
df_src=df_src.withColumnRenamed('Order Date','Order_date')

In [16]:
# Make the DataFrame queryable from Spark SQL under the view name 'df_src'.
df_src.createOrReplaceTempView('df_src')

# Target set: rows whose Order_date is on or before (latest Order_date - 90 days).
df_tgt = spark.sql(
    "select * from df_src where Order_date<=(select date_add(max(Order_date),-90) from df_src)"
)

In [17]:
# Count the target rows that have an Order_date value; the result column is named 'c'.
df_tgt.agg(F.count('Order_date').alias('c')).collect()

[Row(c=8787)]

In [18]:
# Count the source rows that have an Order_date value; the result column is named 'c'.
df_src.agg(F.count('Order_date').alias('c')).collect()

[Row(c=9994)]

In [19]:
# Register the target rows as the SQL view 'df_tgt_table'.
df_tgt.createOrReplaceTempView('df_tgt_table')

# One-row DataFrame holding the latest Order_date found in the target set.
df_maxtgt_date = spark.sql(
    "select max(Order_date)as MAX_ORDER_DATE from df_tgt_table"
)
df_maxtgt_date.show()



+--------------+
|MAX_ORDER_DATE|
+--------------+
|    2017-10-01|
+--------------+



In [21]:
# Register the one-row max-date DataFrame so it can be used in a SQL subquery.
df_maxtgt_date.createOrReplaceTempView('df_maxtgt_date')

# Delta set: source rows with an Order_date later than the target's latest date.
df_delta = spark.sql(
    "select * from df_src where Order_date>(select MAX_ORDER_DATE from df_maxtgt_date)"
)

# Count the delta rows that have an Order_date value.
df_delta.agg(F.count('Order_date').alias('c')).collect()

[Row(c=1207)]

In [22]:
# Append the delta rows to the target rows. Both were produced by
# `select * from df_src`, so their columns line up by position.
df_final = df_tgt.union(df_delta)

# Count the combined rows that have an Order_date value.
df_final.agg(F.count('Order_date').alias('c')).collect()

[Row(c=9994)]