# Ex-2260 - Saving and partitioning


In [1]:
import requests

! curl -L -o data.zip  https://www.kaggle.com/api/v1/datasets/download/zahidmughal2343/amazon-sales-2025
! unzip data.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100  3617  100  3617    0     0   6803      0 --:--:-- --:--:-- --:--:--  6803
Archive:  data.zip
  inflating: amazon_sales_data 2025.csv  


In [2]:
from pyspark.sql import SparkSession

# Get (or reuse) a Spark session named "myApp" with master set to local[*].
spark = (
    SparkSession.builder
    .appName("myApp")
    .master("local[*]")
    .getOrCreate()
)

In [3]:
# 2. Read the extracted CSV: the first row supplies the column names and
#    Spark infers each column's data type from the data.
df = spark.read.csv(
    "amazon_sales_data 2025.csv",
    header=True,
    inferSchema=True,
)

In [4]:
# 3. Check the structure of the DataFrame: print every column's name,
#    inferred type and nullability as a tree.
# NOTE(review): in the recorded output `Date` is inferred as string; cast it
# explicitly if date operations are needed.
df.printSchema()

root
 |-- Order ID: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Price: integer (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Total Sales: integer (nullable = true)
 |-- Customer Name: string (nullable = true)
 |-- Customer Location: string (nullable = true)
 |-- Payment Method: string (nullable = true)
 |-- Status: string (nullable = true)



In [5]:
# Redistribute the rows of the DataFrame across 4 partitions
df = df.repartition(4)

# Report how many partitions the DataFrame has now
partition_count = df.rdd.getNumPartitions()
print(f"Number of partitions: {partition_count}")

Number of partitions: 4


In [6]:
# Save the DataFrame as JSON. Spark's default save mode raises an error when
# the target path already exists, so "overwrite" keeps this cell re-runnable.
df.write.mode("overwrite").json("amazon.json")

In [7]:
# Save the DataFrame as CSV with a header row. Spark's default save mode raises
# an error when the target path already exists, so "overwrite" keeps this cell
# re-runnable.
df.write.mode("overwrite").csv("amazon.csv", header=True)

In [8]:
# Save the DataFrame as Parquet. Spark's default save mode raises an error when
# the target path already exists, so "overwrite" keeps this cell re-runnable.
# Note: the last cell of the notebook overwrites this same path with a
# Category-partitioned layout.
df.write.mode("overwrite").parquet("amazon.parquet")

In [9]:
# Collapse the DataFrame down to a single partition and report the new count
df = df.coalesce(1)
remaining_partitions = df.rdd.getNumPartitions()
print(f"Number of partitions after reduction: {remaining_partitions}")

Number of partitions after reduction: 1


In [10]:
# Save as Parquet partitioned by the "Category" column. Spark's default save
# mode raises an error when the target path already exists, so "overwrite"
# keeps this cell re-runnable.
(
    df.write
    .mode("overwrite")
    .partitionBy("Category")
    .parquet("amazon_partitioned.parquet")
)

In [11]:
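# NOTE: kept commented out. "amazon.parquet" was already written by an earlier
# cell, so a plain write with Spark's default save mode would fail because the
# path exists.
# df.write.parquet("amazon.parquet")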

In [12]:
# Replace the existing "amazon.parquet" output with a version partitioned by
# the "Category" column.
(
    df.write
    .mode("overwrite")
    .partitionBy("Category")
    .parquet("amazon.parquet")
)