In [None]:
!pip install pyspark



In [None]:
# Mount Google Drive into the Colab filesystem at the path below.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)


Mounted at /content/drive


In [None]:
# ✅ Step 2: Download CSV from Google Drive using gdown
import gdown

# Replace with your correct file ID
file_id = "1X5U6lHBsqaBziO1g3XiLlm5EEodh5cun"

# Download to large_data.csv in the current working directory
# (/content on Colab). The returned path is kept so a failed download is
# caught here instead of surfacing later as a Spark read error.
# NOTE(review): some gdown releases report failure by returning None rather
# than raising — the check below covers that case; confirm against the
# installed version.
downloaded_path = gdown.download(
    f"https://drive.google.com/uc?id={file_id}", "large_data.csv", quiet=False
)
if downloaded_path is None:
    raise RuntimeError(
        f"gdown could not download file id {file_id!r}; "
        "check the file ID and its sharing permissions."
    )

# Last expression: display the downloaded file path as the cell output.
downloaded_path


Downloading...
From (original): https://drive.google.com/uc?id=1X5U6lHBsqaBziO1g3XiLlm5EEodh5cun
From (redirected): https://drive.google.com/uc?id=1X5U6lHBsqaBziO1g3XiLlm5EEodh5cun&confirm=t&uuid=1b1cffd8-0759-43f6-be80-34d9a173dd09
To: /content/large_data.csv
100%|██████████| 625M/625M [00:12<00:00, 48.2MB/s]


'large_data.csv'

In [None]:
# ✅ Step 3: PySpark Analysis Script
import glob
import shutil

from pyspark.sql import SparkSession
# Import the functions module under an alias: importing `max` / `min` by name
# would shadow Python's builtins for every cell that runs afterwards.
from pyspark.sql import functions as F


def _export_single_csv(frame, output_dir, target_path):
    """Write `frame` as a single headered CSV and move it to `target_path`.

    The frame is coalesced to one partition and written to `output_dir`
    (overwriting any previous run); the resulting `part-*.csv` file is then
    moved to `target_path`.

    Raises:
        FileNotFoundError: if no `part-*.csv` file is found in `output_dir`
            after the write.
    """
    frame.coalesce(1).write.mode("overwrite")\
        .option("header", "true")\
        .csv(output_dir)

    part_files = glob.glob(f"{output_dir}/part-*.csv")
    if not part_files:
        raise FileNotFoundError(
            f"No part-*.csv file found in {output_dir}; the Spark write may have failed."
        )
    shutil.move(part_files[0], target_path)


# Step 1: Start Spark
spark = SparkSession.builder.appName("BigDataAnalysis").getOrCreate()

try:
    # Step 2: Load CSV (first row is the header; column types are inferred)
    df = spark.read.csv("large_data.csv", header=True, inferSchema=True)

    # Step 3: View data
    df.printSchema()
    df.show(5)

    # Step 4: Data Cleaning — drop every row that contains a null value
    df_clean = df.dropna()

    # Step 5: Basic Analysis
    # Row count per Item Type
    df_clean.groupBy("Item Type").count().show()

    # Average Unit Price by Region
    df_clean.groupBy("Region").agg(F.avg("Unit Price")).show()

    # Max and Min Total Profit
    df_clean.select(F.max("Total Profit"), F.min("Total Profit")).show()

    # ✅ Step 6: Save full cleaned output as /content/cleaned_output.csv
    _export_single_csv(df_clean, "/content/cleaned_output", "/content/cleaned_output.csv")

    # ✅ Step 7: Save only first 1000 rows as /content/sample_output.csv
    _export_single_csv(df_clean.limit(1000), "/content/sample_output", "/content/sample_output.csv")

    print("✅ cleaned_output.csv and sample_output.csv are ready to download!")
finally:
    # Step 9: Stop Spark Session — in `finally` so the session is released
    # even when one of the steps above raises.
    spark.stop()


root
 |-- Region: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Item Type: string (nullable = true)
 |-- Sales Channel: string (nullable = true)
 |-- Order Priority: string (nullable = true)
 |-- Order Date: date (nullable = true)
 |-- Order ID: integer (nullable = true)
 |-- Ship Date: date (nullable = true)
 |-- Units Sold: integer (nullable = true)
 |-- Unit Price: double (nullable = true)
 |-- Unit Cost: double (nullable = true)
 |-- Total Revenue: double (nullable = true)
 |-- Total Cost: double (nullable = true)
 |-- Total Profit: double (nullable = true)

+--------------------+-------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+
|              Region|Country|      Item Type|Sales Channel|Order Priority|Order Date| Order ID| Ship Date|Units Sold|Unit Price|Unit Cost|Total Revenue|Total Cost|Total Profit|
+--------------------+-------+---------------+------