In [0]:
# Step 1: Import necessary libraries
from pyspark.sql import SparkSession

# Step 2: Initialize Spark Session
spark = SparkSession.builder \
    .appName("Read CSV with PySpark") \
    .getOrCreate()

# Step 3: Read the CSV file into a DataFrame
df_Location = spark.read.csv("/FileStore/tables/Location-1.csv", header=True, inferSchema=True)
df_Product = spark.read.csv("/FileStore/tables/Product-1.csv", header=True, inferSchema=True)
df_Facts= spark.read.csv("/FileStore/tables/Fact-2.csv", header=True, inferSchema=True)

# Show the DataFrame content
df_Location.display()
df_Product.display()
df_Facts.display()

# Step 4 (Optional): If you want to perform additional operations, you can do so
# For example, printing the schema
df_Location.printSchema()
df_Product.printSchema()
df_Facts.printSchema()


Area Code,State,Market,Market Size
203,Connecticut,East,Small Market
206,Washington,West,Small Market
209,California,West,Major Market
210,Texas,South,Major Market
212,New York,East,Major Market
213,California,West,Major Market
214,Texas,South,Major Market
216,Ohio,Central,Major Market
217,Illinois,Central,Major Market
224,Illinois,Central,Major Market


ProductId,Product Type,Product,Type
1,Coffee,Amaretto,Regular
2,Coffee,Columbian,Regular
3,Coffee,Decaf Irish Cream,Decaf
4,Espresso,Caffe Latte,Regular
5,Espresso,Caffe Mocha,Regular
6,Espresso,Decaf Espresso,Decaf
7,Espresso,Regular Espresso,Regular
8,Herbal Tea,Chamomile,Decaf
9,Herbal Tea,Lemon,Decaf
10,Herbal Tea,Mint,Decaf


Date,ProductId,Profit,Sales,Margin,COGS,Total Expenses,Marketing,Inventory,Budget Profit,Budget COGS,Budget Margin,Budget Sales,Area Code
1/1/2010,1,94,219,130,89,36,24,777,100,90,130,220,719
1/1/2010,2,68,190,107,83,39,27,623,80,80,110,190,970
1/1/2010,3,101,234,139,95,38,26,821,110,100,140,240,970
1/1/2010,13,30,100,56,44,26,14,623,30,30,50,80,303
1/1/2010,5,54,134,80,54,26,15,456,70,60,90,150,303
1/1/2010,6,53,180,108,72,55,23,558,80,80,130,210,720
1/1/2010,8,99,341,171,170,72,47,1091,110,140,160,300,970
1/1/2010,9,0,150,87,63,87,57,435,20,50,80,130,719
1/1/2010,10,33,140,80,60,47,19,336,40,50,70,120,970
1/1/2010,11,17,130,72,58,55,22,338,20,40,70,110,719


root
 |-- Area Code: integer (nullable = true)
 |-- State: string (nullable = true)
 |-- Market: string (nullable = true)
 |-- Market Size: string (nullable = true)

root
 |-- ProductId: integer (nullable = true)
 |-- Product Type: string (nullable = true)
 |-- Product: string (nullable = true)
 |-- Type: string (nullable = true)

root
 |-- Date: string (nullable = true)
 |-- ProductId: integer (nullable = true)
 |-- Profit: integer (nullable = true)
 |-- Sales: integer (nullable = true)
 |-- Margin: integer (nullable = true)
 |-- COGS: integer (nullable = true)
 |-- Total Expenses: integer (nullable = true)
 |-- Marketing: integer (nullable = true)
 |-- Inventory: integer (nullable = true)
 |-- Budget Profit: integer (nullable = true)
 |-- Budget COGS: integer (nullable = true)
 |-- Budget Margin: integer (nullable = true)
 |-- Budget Sales: integer (nullable = true)
 |-- Area Code: integer (nullable = true)



In [0]:
for col in df_Location.columns:
    new_col = col.replace(" ", "_")
    df_Location = df_Location.withColumnRenamed(col, new_col)

In [0]:
df_Location.display()

Area_Code,State,Market,Market_Size
203,Connecticut,East,Small Market
206,Washington,West,Small Market
209,California,West,Major Market
210,Texas,South,Major Market
212,New York,East,Major Market
213,California,West,Major Market
214,Texas,South,Major Market
216,Ohio,Central,Major Market
217,Illinois,Central,Major Market
224,Illinois,Central,Major Market


In [0]:
for col in df_Product.columns:
    new_col = col.replace(" ", "_")
    df_Product = df_Product.withColumnRenamed(col, new_col)

In [0]:
df_Product.display()

ProductId,Product_Type,Product,Type
1,Coffee,Amaretto,Regular
2,Coffee,Columbian,Regular
3,Coffee,Decaf Irish Cream,Decaf
4,Espresso,Caffe Latte,Regular
5,Espresso,Caffe Mocha,Regular
6,Espresso,Decaf Espresso,Decaf
7,Espresso,Regular Espresso,Regular
8,Herbal Tea,Chamomile,Decaf
9,Herbal Tea,Lemon,Decaf
10,Herbal Tea,Mint,Decaf


In [0]:
for col in df_Facts.columns:
    new_col = col.replace(" ", "_")
    df_Facts = df_Facts.withColumnRenamed(col, new_col)

In [0]:
df_Facts.display()

Date,ProductId,Profit,Sales,Margin,COGS,Total_Expenses,Marketing,Inventory,Budget_Profit,Budget_COGS,Budget_Margin,Budget_Sales,Area_Code
1/1/2010,1,94,219,130,89,36,24,777,100,90,130,220,719
1/1/2010,2,68,190,107,83,39,27,623,80,80,110,190,970
1/1/2010,3,101,234,139,95,38,26,821,110,100,140,240,970
1/1/2010,13,30,100,56,44,26,14,623,30,30,50,80,303
1/1/2010,5,54,134,80,54,26,15,456,70,60,90,150,303
1/1/2010,6,53,180,108,72,55,23,558,80,80,130,210,720
1/1/2010,8,99,341,171,170,72,47,1091,110,140,160,300,970
1/1/2010,9,0,150,87,63,87,57,435,20,50,80,130,719
1/1/2010,10,33,140,80,60,47,19,336,40,50,70,120,970
1/1/2010,11,17,130,72,58,55,22,338,20,40,70,110,719


In [0]:
df_Location.createOrReplaceTempView("location")
df_Product.createOrReplaceTempView("Product")
df_Facts.createOrReplaceTempView("Facts")

1)Display number of states present in location table?

In [0]:
%sql
select count (distinct state) AS COUNTOFSTATE from location

COUNTOFSTATE
20


 2)How many products are of regular type?

In [0]:
 %sql
SELECT COUNT(product) AS countofRegulartype
FROM product
WHERE type = 'Regular';

countofRegulartype
8


3)How much spending has been done onmarketingofproduct id = 1

In [0]:
%sql
 select
 sum(marketing) as marketing from facts where productid=1

marketing
4658


 4)What is the minimum sales of a product?

In [0]:
%sql
select min(sales) as min_sales from facts

min_sales
17


5)Display max Cost of Good Sold(COGS).

 

In [0]:
%sql
 select max(COGS) as max_COGS from facts

max_COGS
364


6)Display the Details of the product id where product type is coffee



In [0]:
%sql
select * from product where Product_type='Coffee'

ProductId,Product_Type,Product,Type
1,Coffee,Amaretto,Regular
2,Coffee,Columbian,Regular
3,Coffee,Decaf Irish Cream,Decaf


 7)Display the details where total_expenses is greater than 40.

 

In [0]:
%sql
select * from facts where total_expenses>40

Date,ProductId,Profit,Sales,Margin,COGS,Total_Expenses,Marketing,Inventory,Budget_Profit,Budget_COGS,Budget_Margin,Budget_Sales,Area_Code
1/1/2010,6,53,180,108,72,55,23,558,80,80,130,210,720
1/1/2010,8,99,341,171,170,72,47,1091,110,140,160,300,970
1/1/2010,9,0,150,87,63,87,57,435,20,50,80,130,719
1/1/2010,10,33,140,80,60,47,19,336,40,50,70,120,970
1/1/2010,11,17,130,72,58,55,22,338,20,40,70,110,719
1/1/2010,2,111,345,201,144,90,47,862,130,150,210,360,217
1/1/2010,3,87,234,139,95,52,30,608,100,100,140,240,309
1/1/2010,5,203,546,312,234,109,77,1310,260,270,370,640,309
1/1/2010,6,140,456,228,228,88,63,1459,180,260,270,530,630
1/1/2010,12,54,180,108,72,54,23,558,40,60,90,150,708


8)What is the average sales in Area_Code 719?

 

In [0]:
%sql
 select avg(sales) as avg_sales from facts where area_code=719

avg_sales
186.58333333333331


9)Find out the total profit generated by Colorado state.


In [0]:
%sql
select sum(profit) as total_profit from facts inner join location on facts.Area_Code=location.Area_Code where State='Colorado'

total_profit
17743


 10)Display the average inventory for each productid.




In [0]:
%sql
SELECT productid, AVG(Inventory) AS AVERAGE_Inventory FROM facts group by productid order by productid

productid,AVERAGE_Inventory
1,741.0208333333334
2,707.55
3,838.6822916666666
4,255.99074074074076
5,756.2416666666667
6,755.2058823529412
7,879.6388888888889
8,712.6458333333334
9,718.5375
10,1095.90625


In [0]:
%sql
SELECT productid,AVG(Inventory) AS AVERAGE_Inventory FROM facts
 group by productid 

productid,AVERAGE_Inventory
12,757.1736111111111
1,741.0208333333334
13,900.375
6,755.2058823529412
3,838.6822916666666
5,756.2416666666667
9,718.5375
4,255.99074074074076
8,712.6458333333334
7,879.6388888888889


 11)Display state in as equential order in alocation table.

 


In [0]:
%sql
select * from location order by state 

Area_Code,State,Market,Market_Size
209,California,West,Major Market
213,California,West,Major Market
310,California,West,Major Market
323,California,West,Major Market
408,California,West,Major Market
415,California,West,Major Market
510,California,West,Major Market
530,California,West,Major Market
559,California,West,Major Market
562,California,West,Major Market


In [0]:
%sql
select * from location order by state desc

Area_Code,State,Market,Market_Size
262,Wisconsin,Central,Small Market
414,Wisconsin,Central,Small Market
608,Wisconsin,Central,Small Market
715,Wisconsin,Central,Small Market
920,Wisconsin,Central,Small Market
206,Washington,West,Small Market
253,Washington,West,Small Market
360,Washington,West,Small Market
425,Washington,West,Small Market
509,Washington,West,Small Market


12)Display the average budget margin where average budget margin should be greater than 100


In [0]:
%sql
SELECT productid,AVG(budget_margin) AS AVG_budget_margin FROM
 facts GROUP BY productid HAVING AVG(budget_margin)> 100

productid,AVG_budget_margin
12,103.68055555555556
6,107.99019607843135
8,100.20833333333331
7,182.22222222222223
2,173.20833333333334



 13)What is the total sales done on date 2010-01-01

 

In [0]:
%sql
 select sum(sales) as total_sales from facts where date='1/1/2010'

total_sales
31555


14)Display the average total expense of each product id on individual date



In [0]:
%sql
 SELECT productid, date, avg(total_expenses) as avg_total_expense FROM
 facts group by productid, date 


productid,date,avg_total_expense
4,12/1/2010,48.22222222222222
10,1/1/2011,52.75
1,1/1/2010,46.125
6,3/1/2010,47.0
2,6/1/2010,69.35
3,10/1/2010,49.8125
2,11/1/2011,63.55
8,3/1/2011,51.8125
10,9/1/2010,52.25
5,11/1/2010,62.9


In [0]:
%sql
SELECT productid, date, avg(total_expenses)
 as avg_total_expense FROM facts group by productid ,date order by date

productid,date,avg_total_expense
1,1/1/2010,46.125
3,1/1/2010,50.0625
11,1/1/2010,43.625
2,1/1/2010,63.1
10,1/1/2010,50.25
12,1/1/2010,58.16666666666666
6,1/1/2010,46.588235294117645
7,1/1/2010,67.33333333333333
5,1/1/2010,60.1
9,1/1/2010,56.05


In [0]:
%sql
SELECT productid,date,avg(total_expenses) as avg_total_expense FROM
 facts group by productid,date order by date,productid

productid,date,avg_total_expense
1,1/1/2010,46.125
2,1/1/2010,63.1
3,1/1/2010,50.0625
4,1/1/2010,50.66666666666666
5,1/1/2010,60.1
6,1/1/2010,46.588235294117645
7,1/1/2010,67.33333333333333
8,1/1/2010,50.4375
9,1/1/2010,56.05
10,1/1/2010,50.25


 15)Display the table with the following attributes such as Date, productid, product_type, product, Sales,profit,state,area_code

 

In [0]:
%sql
select facts.Date,facts.ProductId,Product.product_type,Product.product,facts.Sales,facts.profit,Location.state,Location.Area_Code
 from facts inner join location on Facts.Area_code=Location.area_code inner
 join product on facts.productid=product.productid

Date,ProductId,product_type,product,Sales,profit,state,Area_Code
1/1/2010,1,Coffee,Amaretto,219,94,Colorado,719
1/1/2010,2,Coffee,Columbian,190,68,Colorado,970
1/1/2010,3,Coffee,Decaf Irish Cream,234,101,Colorado,970
1/1/2010,13,Tea,Green Tea,100,30,Colorado,303
1/1/2010,5,Espresso,Caffe Mocha,134,54,Colorado,303
1/1/2010,6,Espresso,Decaf Espresso,180,53,Colorado,720
1/1/2010,8,Herbal Tea,Chamomile,341,99,Colorado,970
1/1/2010,9,Herbal Tea,Lemon,150,0,Colorado,719
1/1/2010,10,Herbal Tea,Mint,140,33,Colorado,970
1/1/2010,11,Tea,Darjeeling,130,17,Colorado,719


16)Display the rank without any gap to show the Sales wiserank.


In [0]:
%sql
SELECT Date, productid, Sales, profit, area_code, DENSE_RANK() OVER
 (ORDER BY Sales aSC) AS Sales_Rank FROM Facts;

Date,productid,Sales,profit,area_code,Sales_Rank
1/1/2010,13,17,-354,702,1
5/1/2010,13,18,-380,702,2
5/1/2011,13,18,-524,702,2
9/1/2010,13,19,-430,775,3
1/1/2011,13,19,-552,775,3
9/1/2011,13,20,-638,775,4
8/1/2010,13,21,-404,775,5
10/1/2010,13,21,-363,702,5
8/1/2011,13,21,-558,775,5
10/1/2011,13,22,-539,702,6


17)Find the State wise Profit and Sales.



In [0]:
%sql
 select state, sum(sales) as Sales,sum(profit) as Profit from facts inner join location on facts.Area_Code=location.Area_Code group by state

state,Sales,Profit
Utah,35384,7751
Ohio,34517,10773
Oregon,40899,12439
Texas,37410,15766
Connecticut,25429,7621
Nevada,60159,10616
Washington,38930,11405
Illinois,69883,30821
Oklahoma,27463,8558
New Mexico,15892,799


 18)Find the State wise Profit and Sales along with the Product Name.

 

In [0]:
%sql
select Product , state, sum(sales) as Sales,sum(profit) as Profit from facts inner
 join location on facts.Area_Code=location.Area_Code inner join Product on
 Facts.ProductId=Product.ProductId group by state, Product

Product,state,Sales,Profit
Earl Grey,Ohio,6262,2724
Lemon,Connecticut,4267,988
Caffe Latte,Texas,4027,1357
Decaf Irish Cream,Texas,3807,1568
Decaf Irish Cream,Oregon,2646,834
Decaf Espresso,Missouri,2302,1011
Earl Grey,Iowa,14449,5624
Caffe Latte,Utah,2302,1008
Decaf Espresso,California,14607,6580
Chamomile,Louisiana,3678,1500


19)If there is an increase in sales of 5%.Calculate the increased sales.

 

In [0]:
%sql
SELECT sales, (sales * 0.05) AS increase_in_sales FROM Facts

sales,increase_in_sales
219,10.95
190,9.5
234,11.7
100,5.0
134,6.7
180,9.0
341,17.05
150,7.5
140,7.0
130,6.5


20)Find the maximum profit along with the Product id and Product Type.

 

In [0]:
%sql
Select Facts.ProductId, Product.Product Type, Facts.Profit from Facts inner join
 Product on Facts.ProductId = Product.ProductId where Facts.Profit = (Select
 MAX(Profit) from Facts)


ProductId,Type,Profit
2,Columbian,778


In [0]:
%sql
 SELECT Product.ProductId, product_type, MAX(profit) AS max_profit
 FROM facts inner join product on facts.ProductId=Product.ProductId group by
 Product.ProductId, Product.Product_Type

ProductId,product_type,max_profit
10,Herbal Tea,207
4,Espresso,233
13,Tea,180
9,Herbal Tea,536
1,Coffee,199
3,Coffee,207
6,Espresso,362
8,Herbal Tea,362
7,Espresso,646
2,Coffee,778


21)Create a Stored Procedure to fetch the result according to the product type from Product.

 

In [0]:
from pyspark.sql.functions import col

def PType(product_type):
    return df_Product.filter(col("Product_Type") == product_type)

result_coffee = PType("Coffee")
result_coffee.display()
result_tea = PType("Tea")
result_tea.display()



ProductId,Product_Type,Product,Type
1,Coffee,Amaretto,Regular
2,Coffee,Columbian,Regular
3,Coffee,Decaf Irish Cream,Decaf


ProductId,Product_Type,Product,Type
11,Tea,Darjeeling,Regular
12,Tea,Earl Grey,Regular
13,Tea,Green Tea,Regular


22)Write a query by creating a conditionin which if the total expenses is lessthan 60 then it is a profit or else loss.



In [0]:
%sql
SELECT total_expenses, 
       CASE 
           WHEN total_expenses < 60 THEN 'profit' 
           ELSE 'loss' 
       END AS status 
FROM facts


total_expenses,status
36,profit
39,profit
38,profit
26,profit
26,profit
55,profit
72,loss
87,loss
47,profit
55,profit


 23)Give the total weekly sales value with theDate and product id details. Use roll-upto pull the data in hierarchical order.

 

In [0]:
%sql
Select Date, productid, SUM(Sales)as Total_Sale from facts group by
 Date,productid with Rollup


Date,productid,Total_Sale
4/1/2010,7.0,954
5/1/2010,5.0,3351
12/1/2010,7.0,1008
4/1/2010,11.0,2891
3/1/2011,3.0,2633
4/1/2010,2.0,5048
9/1/2010,3.0,2404
12/1/2010,8.0,3073
3/1/2011,1.0,1100
2/1/2011,4.0,1559


In [0]:
%sql
 Select productid, SUM(Sales)as Total_Sale from facts group by productid with
 Rollup

productid,Total_Sale
11.0,73151
1.0,26269
7.0,24031
,819811
4.0,35899
12.0,66772
8.0,75578
10.0,35710
13.0,32850
6.0,78162


24)Apply union and intersection operator on the tables which consist of attribute area code.



In [0]:
%sql
select area_code from facts 
union 
select area_code from location


area_code
858
516
580
970
918
513
321
857
530
772


In [0]:
%sql
 select area_code from facts 
 intersect 
 select area_code from location

 25)Createa user-defined function for the product table to fetch a particular product type based upon the user’s preference.



In [0]:
from pyspark.sql.functions import col
def product_table(product_type):
    return df_Product.filter(col("Product_Type") == product_type)

# Step 4: Use the Function to Filter Data
result_df = product_table("Coffee")
result_df.display()

ProductId,Product_Type,Product,Type
1,Coffee,Amaretto,Regular
2,Coffee,Columbian,Regular
3,Coffee,Decaf Irish Cream,Decaf


In [0]:
df_Product.display()

ProductId,Product_Type,Product,Type
1,Coffee,Amaretto,Regular
2,Coffee,Columbian,Regular
3,Coffee,Decaf Irish Cream,Decaf
4,Espresso,Caffe Latte,Regular
5,Espresso,Caffe Mocha,Regular
6,Espresso,Decaf Espresso,Decaf
7,Espresso,Regular Espresso,Regular
8,Herbal Tea,Chamomile,Decaf
9,Herbal Tea,Lemon,Decaf
10,Herbal Tea,Mint,Decaf


In [0]:
%sql
 select * from dbo.Producttable('Coffee')

 26)Change the product type from coffee to tea where product id is 1 and undo it.



In [0]:
%sql
 select * from product
 begin transaction update product set product_type=‘Tea’ where productid =1

In [0]:
%sql
select * from product
 rollback transaction
 select * from product

 27)Display the Date, product id and sales where total expenses are between 100 to 200.



In [0]:
%sql
 select date, productid ,sales ,Total_Expenses from facts where Total_Expenses
 between 100 and 200

date,productid,sales,Total_Expenses
1/1/2010,5,546,109
1/1/2010,8,546,110
1/1/2010,12,546,126
1/1/2010,2,190,102
1/1/2010,5,190,102
1/1/2010,5,61,128
1/1/2010,2,678,145
1/1/2010,9,483,127
1/1/2010,9,190,101
1/1/2010,5,250,128


 28)Delete there cords in th eproduct table for regular type

In [0]:
%sql
DELETE FROM product WHERE Type = ‘Regular’;
 select * from product

29)Display the ASCII value of the fifth character from the column product

In [0]:
%sql
SELECT Product, (ASCII(SUBSTRING(Product,5,1))) AS ASCII_VALUE_OF_fifth_CHARACTER
 FROM Product

Product,ASCII_VALUE_OF_fifth_CHARACTER
Amaretto,101
Columbian,109
Decaf Irish Cream,102
Caffe Latte,101
Caffe Mocha,101
Decaf Espresso,102
Regular Espresso,108
Chamomile,111
Lemon,110
Mint,0
