In [1]:

# Inline sample of orders: one header row followed by five order rows.
data = """OrderID,OrderDate,Customer,Segment,Region,Product,Category,SubCategory,Quantity,UnitPrice,Discount,Profit
CA-1001,2023-01-15,Ravi,Consumer,South,Laptop,Technology,Computers,1,55000,0.10,5000
CA-1002,2023-02-20,Priya,Corporate,North,Printer,Technology,Peripherals,2,12000,0.15,1800
CA-1003,2023-01-25,Amit,Consumer,East,Notebook,Office Supplies,Paper,3,200,0.05,150
CA-1004,2023-03-01,Anita,Home Office,West,Table,Furniture,Tables,1,18000,0.20,-1500
CA-1005,2023-02-05,Divya,Consumer,South,Phone,Technology,Phones,2,20000,0.00,3000
"""

# Persist the sample to the working directory so it can be loaded back from disk.
with open("superstore.csv", mode="w") as csv_file:
    csv_file.write(data)

print(" superstore.csv created successfully!")


 superstore.csv created successfully!


# TASKS ACROSS Pandas, PySpark, and Dask


In [2]:
import pandas as pd

# Load the sample orders CSV from the working directory.
df = pd.read_csv("superstore.csv")

# First look at the data: leading rows, then dimensions and per-column dtypes.
print("Head of the DataFrame:")
print(df.head(n=5))

print("\nShape:", df.shape)
print("\nData types:\n", df.dtypes)

Head of the DataFrame:
   OrderID   OrderDate Customer      Segment Region   Product  \
0  CA-1001  2023-01-15     Ravi     Consumer  South    Laptop   
1  CA-1002  2023-02-20    Priya    Corporate  North   Printer   
2  CA-1003  2023-01-25     Amit     Consumer   East  Notebook   
3  CA-1004  2023-03-01    Anita  Home Office   West     Table   
4  CA-1005  2023-02-05    Divya     Consumer  South     Phone   

          Category  SubCategory  Quantity  UnitPrice  Discount  Profit  
0       Technology    Computers         1      55000      0.10    5000  
1       Technology  Peripherals         2      12000      0.15    1800  
2  Office Supplies        Paper         3        200      0.05     150  
3        Furniture       Tables         1      18000      0.20   -1500  
4       Technology       Phones         2      20000      0.00    3000  

Shape: (5, 12)

Data types:
 OrderID         object
OrderDate       object
Customer        object
Segment         object
Region          object
Pro

In [3]:

# Build each condition as its own boolean mask, then keep rows satisfying both.
profit_above_2000 = df['Profit'] > 2000
undiscounted = df['Discount'] == 0
filtered_orders = df[profit_above_2000 & undiscounted]

print("Orders with Profit > 2000 and Discount = 0:\n")
print(filtered_orders)


Orders with Profit > 2000 and Discount = 0:

   OrderID   OrderDate Customer   Segment Region Product    Category  \
4  CA-1005  2023-02-05    Divya  Consumer  South   Phone  Technology   

  SubCategory  Quantity  UnitPrice  Discount  Profit  
4      Phones         2      20000       0.0    3000  


In [4]:

# Order rows from the highest Profit to the lowest; the result is a new frame.
sorted_df = df.sort_values(by=['Profit'], ascending=[False])

print("Sorted by Profit (High to Low):\n")
print(sorted_df)


Sorted by Profit (High to Low):

   OrderID   OrderDate Customer      Segment Region   Product  \
0  CA-1001  2023-01-15     Ravi     Consumer  South    Laptop   
4  CA-1005  2023-02-05    Divya     Consumer  South     Phone   
1  CA-1002  2023-02-20    Priya    Corporate  North   Printer   
2  CA-1003  2023-01-25     Amit     Consumer   East  Notebook   
3  CA-1004  2023-03-01    Anita  Home Office   West     Table   

          Category  SubCategory  Quantity  UnitPrice  Discount  Profit  
0       Technology    Computers         1      55000      0.10    5000  
4       Technology       Phones         2      20000      0.00    3000  
1       Technology  Peripherals         2      12000      0.15    1800  
2  Office Supplies        Paper         3        200      0.05     150  
3        Furniture       Tables         1      18000      0.20   -1500  


In [5]:

# Per Category: total Profit and mean Discount, expressed as named aggregations
# so each output column states its source column and reducer.
by_category = df.groupby('Category')
grouped = by_category.agg(
    Profit=('Profit', 'sum'),
    Discount=('Discount', 'mean'),
)

print("Total Profit and Average Discount by Category:\n")
print(grouped)


Total Profit and Average Discount by Category:

                 Profit  Discount
Category                         
Furniture         -1500  0.200000
Office Supplies     150  0.050000
Technology         9800  0.083333


In [6]:

# TotalPrice is the element-wise product of Quantity and UnitPrice.
df['TotalPrice'] = df['Quantity'].mul(df['UnitPrice'])

# Show only the columns involved in the calculation.
price_columns = ['OrderID', 'Quantity', 'UnitPrice', 'TotalPrice']
print("DataFrame with TotalPrice column added:\n")
print(df[price_columns])


DataFrame with TotalPrice column added:

   OrderID  Quantity  UnitPrice  TotalPrice
0  CA-1001         1      55000       55000
1  CA-1002         2      12000       24000
2  CA-1003         3        200         600
3  CA-1004         1      18000       18000
4  CA-1005         2      20000       40000


In [7]:

# Remove the SubCategory column. errors='ignore' keeps this cell re-runnable:
# because the result is assigned back to df, a second execution would otherwise
# raise KeyError once the column is already gone.
df = df.drop(columns=['SubCategory'], errors='ignore')

print("DataFrame after dropping SubCategory column:\n")
print(df.head())


DataFrame after dropping SubCategory column:

   OrderID   OrderDate Customer      Segment Region   Product  \
0  CA-1001  2023-01-15     Ravi     Consumer  South    Laptop   
1  CA-1002  2023-02-20    Priya    Corporate  North   Printer   
2  CA-1003  2023-01-25     Amit     Consumer   East  Notebook   
3  CA-1004  2023-03-01    Anita  Home Office   West     Table   
4  CA-1005  2023-02-05    Divya     Consumer  South     Phone   

          Category  Quantity  UnitPrice  Discount  Profit  TotalPrice  
0       Technology         1      55000      0.10    5000       55000  
1       Technology         2      12000      0.15    1800       24000  
2  Office Supplies         3        200      0.05     150         600  
3        Furniture         1      18000      0.20   -1500       18000  
4       Technology         2      20000      0.00    3000       40000  


In [8]:

# Any missing Discount value is replaced with a 0.10 default.
default_discount = 0.10
df['Discount'] = df['Discount'].fillna(value=default_discount)

print("DataFrame after filling nulls in Discount:\n")
print(df)


DataFrame after filling nulls in Discount:

   OrderID   OrderDate Customer      Segment Region   Product  \
0  CA-1001  2023-01-15     Ravi     Consumer  South    Laptop   
1  CA-1002  2023-02-20    Priya    Corporate  North   Printer   
2  CA-1003  2023-01-25     Amit     Consumer   East  Notebook   
3  CA-1004  2023-03-01    Anita  Home Office   West     Table   
4  CA-1005  2023-02-05    Divya     Consumer  South     Phone   

          Category  Quantity  UnitPrice  Discount  Profit  TotalPrice  
0       Technology         1      55000      0.10    5000       55000  
1       Technology         2      12000      0.15    1800       24000  
2  Office Supplies         3        200      0.05     150         600  
3        Furniture         1      18000      0.20   -1500       18000  
4       Technology         2      20000      0.00    3000       40000  


In [9]:

def classify(row):
    """Label a row by its 'Profit' value.

    Args:
        row: Any object supporting ``row['Profit']`` (e.g. a DataFrame row
            or a dict) whose value can be compared with numbers.

    Returns:
        'High' when Profit is above 4000, 'Medium' when it is above 0
        (up to and including 4000), and 'Low' otherwise.
    """
    profit = row['Profit']
    if profit > 4000:
        return 'High'
    if profit > 0:
        return 'Medium'
    return 'Low'

# Run classify once per row (axis=1) and store the resulting labels as a new column.
profit_labels = df.apply(classify, axis=1)
df['ProfitCategory'] = profit_labels

label_columns = ['OrderID', 'Profit', 'ProfitCategory']
print("DataFrame with ProfitCategory column:\n")
print(df[label_columns])


DataFrame with ProfitCategory column:

   OrderID  Profit ProfitCategory
0  CA-1001    5000           High
1  CA-1002    1800         Medium
2  CA-1003     150         Medium
3  CA-1004   -1500            Low
4  CA-1005    3000         Medium


# PART 2: PySpark DataFrame Operations


In [10]:
!pip install pyspark




In [11]:
from pyspark.sql import SparkSession

# getOrCreate() returns an existing session when there is one, otherwise it
# starts a new session under the application name "Superstore".
session_builder = SparkSession.builder.appName("Superstore")
spark = session_builder.getOrCreate()

print(" Spark session started")


 Spark session started


In [12]:

# Configure the reader first: take column names from the first line and let
# Spark infer column types instead of reading everything as strings.
csv_reader = spark.read.option("header", True).option("inferSchema", True)
df_spark = csv_reader.csv("superstore.csv")

# Preview up to five rows, then print the resulting schema.
df_spark.show(n=5)
df_spark.printSchema()


+-------+----------+--------+-----------+------+--------+---------------+-----------+--------+---------+--------+------+
|OrderID| OrderDate|Customer|    Segment|Region| Product|       Category|SubCategory|Quantity|UnitPrice|Discount|Profit|
+-------+----------+--------+-----------+------+--------+---------------+-----------+--------+---------+--------+------+
|CA-1001|2023-01-15|    Ravi|   Consumer| South|  Laptop|     Technology|  Computers|       1|    55000|     0.1|  5000|
|CA-1002|2023-02-20|   Priya|  Corporate| North| Printer|     Technology|Peripherals|       2|    12000|    0.15|  1800|
|CA-1003|2023-01-25|    Amit|   Consumer|  East|Notebook|Office Supplies|      Paper|       3|      200|    0.05|   150|
|CA-1004|2023-03-01|   Anita|Home Office|  West|   Table|      Furniture|     Tables|       1|    18000|     0.2| -1500|
|CA-1005|2023-02-05|   Divya|   Consumer| South|   Phone|     Technology|     Phones|       2|    20000|     0.0|  3000|
+-------+----------+--------+---

In [13]:
from pyspark.sql.functions import col

# Project three columns, exposing Customer under the name "Client".
client_column = col("Customer").alias("Client")
df_spark.select(client_column, "Product", "Profit").show()


+------+--------+------+
|Client| Product|Profit|
+------+--------+------+
|  Ravi|  Laptop|  5000|
| Priya| Printer|  1800|
|  Amit|Notebook|   150|
| Anita|   Table| -1500|
| Divya|   Phone|  3000|
+------+--------+------+



In [14]:
# Step 4: Filter where Segment = 'Consumer' and Profit < 1000
is_consumer = col("Segment") == "Consumer"
profit_below_1000 = col("Profit") < 1000
df_spark.filter(is_consumer & profit_below_1000).show()


+-------+----------+--------+--------+------+--------+---------------+-----------+--------+---------+--------+------+
|OrderID| OrderDate|Customer| Segment|Region| Product|       Category|SubCategory|Quantity|UnitPrice|Discount|Profit|
+-------+----------+--------+--------+------+--------+---------------+-----------+--------+---------+--------+------+
|CA-1003|2023-01-25|    Amit|Consumer|  East|Notebook|Office Supplies|      Paper|       3|      200|    0.05|   150|
+-------+----------+--------+--------+------+--------+---------------+-----------+--------+---------+--------+------+



In [15]:
from pyspark.sql.functions import avg

# Mean Profit per Region, reported under the column name "Average_Profit".
average_profit = avg("Profit").alias("Average_Profit")
by_region = df_spark.groupBy("Region")
by_region.agg(average_profit).show()


+------+--------------+
|Region|Average_Profit|
+------+--------------+
| South|        4000.0|
|  East|         150.0|
|  West|       -1500.0|
| North|        1800.0|
+------+--------------+



In [16]:

# TotalPrice is Quantity multiplied by UnitPrice, evaluated per row.
total_price = col("Quantity") * col("UnitPrice")
df_spark = df_spark.withColumn("TotalPrice", total_price)

df_spark.select("OrderID", "Quantity", "UnitPrice", "TotalPrice").show()


+-------+--------+---------+----------+
|OrderID|Quantity|UnitPrice|TotalPrice|
+-------+--------+---------+----------+
|CA-1001|       1|    55000|     55000|
|CA-1002|       2|    12000|     24000|
|CA-1003|       3|      200|       600|
|CA-1004|       1|    18000|     18000|
|CA-1005|       2|    20000|     40000|
+-------+--------+---------+----------+



In [17]:
from pyspark.sql.functions import when

# Label each order: above 2000 is "High", zero or below is "Loss",
# and everything in between falls through to "Medium".
profit_band = (
    when(col("Profit") > 2000, "High")
    .when(col("Profit") <= 0, "Loss")
    .otherwise("Medium")
)

df_spark = df_spark.withColumn("ProfitCategory", profit_band)
df_spark.select("OrderID", "Profit", "ProfitCategory").show()


+-------+------+--------------+
|OrderID|Profit|ProfitCategory|
+-------+------+--------------+
|CA-1001|  5000|          High|
|CA-1002|  1800|        Medium|
|CA-1003|   150|        Medium|
|CA-1004| -1500|          Loss|
|CA-1005|  3000|          High|
+-------+------+--------------+



In [18]:

# Discard the SubCategory column, then print the schema to confirm what remains.
column_to_drop = "SubCategory"
df_spark = df_spark.drop(column_to_drop)

df_spark.printSchema()


root
 |-- OrderID: string (nullable = true)
 |-- OrderDate: date (nullable = true)
 |-- Customer: string (nullable = true)
 |-- Segment: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- UnitPrice: integer (nullable = true)
 |-- Discount: double (nullable = true)
 |-- Profit: integer (nullable = true)
 |-- TotalPrice: integer (nullable = true)
 |-- ProfitCategory: string (nullable = false)



In [19]:

# Replace null Discount values with 0.10; other columns are not touched.
discount_defaults = {'Discount': 0.10}
df_spark = df_spark.na.fill(discount_defaults)

df_spark.select("OrderID", "Discount").show()


+-------+--------+
|OrderID|Discount|
+-------+--------+
|CA-1001|     0.1|
|CA-1002|    0.15|
|CA-1003|    0.05|
|CA-1004|     0.2|
|CA-1005|     0.0|
+-------+--------+



In [20]:
from pyspark.sql.functions import to_date, year, month

# Convert OrderDate to a date column, then derive Year and Month from it.
# Each withColumn in the chain sees the columns produced by the previous one.
df_spark = (
    df_spark
    .withColumn("OrderDate", to_date(col("OrderDate")))
    .withColumn("Year", year(col("OrderDate")))
    .withColumn("Month", month(col("OrderDate")))
)

df_spark.select("OrderID", "OrderDate", "Year", "Month").show()


+-------+----------+----+-----+
|OrderID| OrderDate|Year|Month|
+-------+----------+----+-----+
|CA-1001|2023-01-15|2023|    1|
|CA-1002|2023-02-20|2023|    2|
|CA-1003|2023-01-25|2023|    1|
|CA-1004|2023-03-01|2023|    3|
|CA-1005|2023-02-05|2023|    2|
+-------+----------+----+-----+



# PART 3: Dask DataFrame Operations (Pandas Alternative)

In [21]:
!pip install dask




In [22]:
import dask.dataframe as dd

# Point Dask at the sample orders CSV in the working directory.
df_dask = dd.read_csv("superstore.csv")

# head() returns the leading rows for display.
print(df_dask.head(n=5))

print("\nColumns:", df_dask.columns)


   OrderID   OrderDate Customer      Segment Region   Product  \
0  CA-1001  2023-01-15     Ravi     Consumer  South    Laptop   
1  CA-1002  2023-02-20    Priya    Corporate  North   Printer   
2  CA-1003  2023-01-25     Amit     Consumer   East  Notebook   
3  CA-1004  2023-03-01    Anita  Home Office   West     Table   
4  CA-1005  2023-02-05    Divya     Consumer  South     Phone   

          Category  SubCategory  Quantity  UnitPrice  Discount  Profit  
0       Technology    Computers         1      55000      0.10    5000  
1       Technology  Peripherals         2      12000      0.15    1800  
2  Office Supplies        Paper         3        200      0.05     150  
3        Furniture       Tables         1      18000      0.20   -1500  
4       Technology       Phones         2      20000      0.00    3000  

Columns: Index(['OrderID', 'OrderDate', 'Customer', 'Segment', 'Region', 'Product',
       'Category', 'SubCategory', 'Quantity', 'UnitPrice', 'Discount',
       'Profit'

In [23]:

# Describe the aggregation first, then trigger execution with compute().
mean_discount_by_category = df_dask.groupby('Category')['Discount'].mean()
avg_discount = mean_discount_by_category.compute()
print("Average Discount by Category:\n", avg_discount)


Average Discount by Category:
 Category
Furniture          0.200000
Office Supplies    0.050000
Technology         0.083333
Name: Discount, dtype: float64


In [24]:

# Keep orders with more than one unit and Profit above 2000.
multi_unit = df_dask['Quantity'] > 1
profit_above_2000 = df_dask['Profit'] > 2000
filtered_dask = df_dask[multi_unit & profit_above_2000]

# compute() evaluates the filter and returns the matching rows for printing.
print(filtered_dask.compute())


   OrderID   OrderDate Customer   Segment Region Product    Category  \
4  CA-1005  2023-02-05    Divya  Consumer  South   Phone  Technology   

  SubCategory  Quantity  UnitPrice  Discount  Profit  
4      Phones         2      20000       0.0    3000  


In [26]:

# Write the filtered rows as one CSV file (single_file=True), without the index column.
filtered_dask.to_csv(
    "filtered_superstore.csv",
    index=False,
    single_file=True,
)

print("Filtered data saved as filtered_superstore.csv")


Filtered data saved as filtered_superstore.csv


# PART 4: JSON Handling (Complex Nested)


In [27]:
# Two sample orders as a JSON array; Customer and Details are nested objects.
json_data = """[
  {
    "OrderID": "CA-1001",
    "Customer": {"Name": "Ravi", "Segment": "Consumer"},
    "Details": {"Region": "South", "Profit": 5000}
  },
  {
    "OrderID": "CA-1002",
    "Customer": {"Name": "Priya", "Segment": "Corporate"},
    "Details": {"Region": "North", "Profit": 1800}
  }
]"""

# Persist the document to the working directory so it can be read back from disk.
with open("orders.json", mode="w") as json_file:
    json_file.write(json_data)

print("orders.json created successfully!")


orders.json created successfully!


In [28]:

# The file holds one JSON array spread over several lines, so the reader must
# run in multiLine mode rather than expecting one JSON record per line.
json_reader = spark.read.option("multiLine", True)
df_json = json_reader.json("orders.json")

df_json.printSchema()


root
 |-- Customer: struct (nullable = true)
 |    |-- Name: string (nullable = true)
 |    |-- Segment: string (nullable = true)
 |-- Details: struct (nullable = true)
 |    |-- Profit: long (nullable = true)
 |    |-- Region: string (nullable = true)
 |-- OrderID: string (nullable = true)



In [29]:

# Dotted paths reach into the Customer and Details structs, producing flat columns.
flat_fields = [
    "OrderID",
    "Customer.Name",
    "Customer.Segment",
    "Details.Region",
    "Details.Profit",
]
df_json.select(*flat_fields).show()


+-------+-----+---------+------+------+
|OrderID| Name|  Segment|Region|Profit|
+-------+-----+---------+------+------+
|CA-1001| Ravi| Consumer| South|  5000|
|CA-1002|Priya|Corporate| North|  1800|
+-------+-----+---------+------+------+

