# Setup & SparkSession Initialization

In [1]:
!pip install pyspark

from pyspark.sql import SparkSession



In [2]:
# Create (or reuse) a SparkSession that runs locally inside this process.
spark = (
    SparkSession.builder
    .appName("BotCampus PySpark Practice")
    .master("local[*]")
    .getOrCreate()
)

In [3]:

# Sample people records laid out as (name, city, age).
data = [
    ("Anjali", "Bangalore", 24),
    ("Ravi", "Hyderabad", 28),
    ("Kavya", "Delhi", 22),
    ("Meena", "Chennai", 25),
    ("Arjun", "Mumbai", 30),
]
columns = ["name", "city", "age"]

# Only column names are supplied; the column types are inferred from the values.
df = spark.createDataFrame(data, schema=columns)

print("=== Schema ===")
df.printSchema()

print("=== DataFrame ===")
df.show()


=== Schema ===
root
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- age: long (nullable = true)

=== DataFrame ===
+------+---------+---+
|  name|     city|age|
+------+---------+---+
|Anjali|Bangalore| 24|
|  Ravi|Hyderabad| 28|
| Kavya|    Delhi| 22|
| Meena|  Chennai| 25|
| Arjun|   Mumbai| 30|
+------+---------+---+



In [4]:
# View the DataFrame as an RDD of Row objects.
rdd = df.rdd

print("=== Collect Data ===")
print(df.collect())

# Project every Row down to its "name" field.
print("=== RDD Map Output (Name Only) ===")
print(rdd.map(lambda row: row["name"]).collect())

=== Collect Data ===
[Row(name='Anjali', city='Bangalore', age=24), Row(name='Ravi', city='Hyderabad', age=28), Row(name='Kavya', city='Delhi', age=22), Row(name='Meena', city='Chennai', age=25), Row(name='Arjun', city='Mumbai', age=30)]
=== RDD Map Output (Name Only) ===
['Anjali', 'Ravi', 'Kavya', 'Meena', 'Arjun']


# RDDs & Transformations

In [5]:
# Free-text customer feedback, one sentence per RDD element.
feedback_lines = [
    "Ravi from Bangalore loved the delivery",
    "Meena from Hyderabad had a late order",
    "Ajay from Pune liked the service",
    "Anjali from Delhi faced UI issues",
    "Rohit from Mumbai gave positive feedback",
]
feedback = spark.sparkContext.parallelize(feedback_lines)

In [6]:

# Lower-case each sentence and split it on whitespace into individual words.
words = feedback.flatMap(lambda sentence: sentence.lower().split())

In [7]:
# Words that carry no signal for the count and are dropped.
stop_words = {"a", "from", "had", "the"}
filtered_words = words.filter(lambda token: token not in stop_words)

In [8]:
# Word count: pair every word with 1, then sum the 1s per word.
word_counts = (
    filtered_words
    .map(lambda word: (word, 1))
    .reduceByKey(lambda total, increment: total + increment)
)

print("=== Word Counts ===")
print(word_counts.collect())

=== Word Counts ===
[('loved', 1), ('liked', 1), ('service', 1), ('anjali', 1), ('faced', 1), ('issues', 1), ('rohit', 1), ('mumbai', 1), ('positive', 1), ('feedback', 1), ('ravi', 1), ('bangalore', 1), ('delivery', 1), ('meena', 1), ('hyderabad', 1), ('late', 1), ('order', 1), ('ajay', 1), ('pune', 1), ('delhi', 1), ('ui', 1), ('gave', 1)]


In [9]:
# Negating the count turns takeOrdered's ascending order into "largest first".
# NOTE(review): every count in this sample is 1, so which three words come
# back depends on how the ties happen to be ordered — confirm that is acceptable.
top_3 = word_counts.takeOrdered(3, key=lambda pair: -pair[1])
print("=== Top 3 Words ===")
print(top_3)

=== Top 3 Words ===
[('loved', 1), ('liked', 1), ('service', 1)]


# DataFrames & Transformations (Joins)

In [10]:
from pyspark.sql.functions import col, when

# Student marks laid out as (name, section, marks).
students = [
    ("Amit", "10-A", 89),
    ("Kavya", "10-B", 92),
    ("Anjali", "10-A", 78),
    ("Rohit", "10-B", 85),
    ("Sneha", "10-C", 80),
]
columns = ["name", "section", "marks"]

# Attendance laid out as (name, days_present); shares the "name" key with students.
attendance = [
    ("Amit", 24),
    ("Kavya", 22),
    ("Anjali", 20),
    ("Rohit", 25),
    ("Sneha", 19),
]
columns2 = ["name", "days_present"]

df_students = spark.createDataFrame(students, schema=columns)
df_attendance = spark.createDataFrame(attendance, schema=columns2)


In [12]:
# Inner join on the shared "name" column; the key appears once in the result.
df_join = df_students.join(df_attendance, on="name", how="inner")

In [13]:

# Derive both columns in one chained expression.
# NOTE(review): 25 looks like the total number of class days — confirm, and
# consider lifting it into a named constant.
df_join = (
    df_join
    .withColumn("attendance_rate", col("days_present") / 25)
    .withColumn(
        "grade",
        when(col("marks") > 90, "A")
        .when(col("marks").between(80, 90), "B")  # inclusive on both ends
        .otherwise("C"),
    )
)

print("=== Students with Grades ===")
df_join.show()

=== Students with Grades ===
+------+-------+-----+------------+---------------+-----+
|  name|section|marks|days_present|attendance_rate|grade|
+------+-------+-----+------------+---------------+-----+
|  Amit|   10-A|   89|          24|           0.96|    B|
|Anjali|   10-A|   78|          20|            0.8|    C|
| Kavya|   10-B|   92|          22|           0.88|    A|
| Rohit|   10-B|   85|          25|            1.0|    B|
| Sneha|   10-C|   80|          19|           0.76|    B|
+------+-------+-----+------------+---------------+-----+



In [14]:
# Students graded A or B whose attendance rate is below 0.8.
df_poor_attendance = (
    df_join
    .where(col("grade").isin("A", "B"))
    .where(col("attendance_rate") < 0.8)
)
print("=== Good Grades but Poor Attendance ===")
df_poor_attendance.show()

=== Good Grades but Poor Attendance ===
+-----+-------+-----+------------+---------------+-----+
| name|section|marks|days_present|attendance_rate|grade|
+-----+-------+-----+------------+---------------+-----+
|Sneha|   10-C|   80|          19|           0.76|    B|
+-----+-------+-----+------------+---------------+-----+



# Ingest CSV & JSON, Save to Parquet

In [15]:
import pandas as pd


# Small employee CSV fixture, written to the working directory so that Spark
# can read it back in a later cell.
csv_data = """emp_id,name,dept,city,salary
101,Anil,IT,Bangalore,80000
102,Kiran,HR,Mumbai,65000
103,Deepa,Finance,Chennai,72000
"""
# Pin the encoding so the bytes on disk do not depend on the platform locale.
with open("employees.csv", "w", encoding="utf-8") as csv_file:
    csv_file.write(csv_data)

In [17]:
import json

# One employee record with a nested struct ("contact") and an array ("skills").
json_data = {
    "id": 201,
    "name": "Nandini",
    "contact": {"email": "nandi@example.com", "city": "Hyderabad"},
    "skills": ["Python", "Spark", "SQL"],
}

# Serialise first, then write the text out in one go.
with open("employee.json", "w") as json_file:
    json_file.write(json.dumps(json_data))

In [18]:
# The first CSV line is the header; column types are inferred from the data.
df_csv = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("employees.csv")
)
# multiLine parses the file as one JSON document instead of one record per line.
df_json = spark.read.option("multiLine", True).json("employee.json")

print("=== CSV Data ===")
df_csv.show()

print("=== JSON Data ===")
df_json.show(truncate=False)

=== CSV Data ===
+------+-----+-------+---------+------+
|emp_id| name|   dept|     city|salary|
+------+-----+-------+---------+------+
|   101| Anil|     IT|Bangalore| 80000|
|   102|Kiran|     HR|   Mumbai| 65000|
|   103|Deepa|Finance|  Chennai| 72000|
+------+-----+-------+---------+------+

=== JSON Data ===
+------------------------------+---+-------+--------------------+
|contact                       |id |name   |skills              |
+------------------------------+---+-------+--------------------+
|{Hyderabad, nandi@example.com}|201|Nandini|[Python, Spark, SQL]|
+------------------------------+---+-------+--------------------+



In [19]:
from pyspark.sql.functions import col, explode

# Promote the nested contact fields to top-level columns and emit one row
# per entry of the skills array.
df_flat = df_json.select(
    "id",
    "name",
    col("contact").getField("email").alias("email"),
    col("contact").getField("city").alias("city"),
    explode("skills").alias("skill"),
)

print("=== Flattened JSON ===")
df_flat.show()


=== Flattened JSON ===
+---+-------+-----------------+---------+------+
| id|   name|            email|     city| skill|
+---+-------+-----------------+---------+------+
|201|Nandini|nandi@example.com|Hyderabad|Python|
|201|Nandini|nandi@example.com|Hyderabad| Spark|
|201|Nandini|nandi@example.com|Hyderabad|   SQL|
+---+-------+-----------------+---------+------+



In [20]:
# Persist both frames as Parquet, one sub-directory per city, replacing any
# previous output at the same path.
df_csv.write.parquet("parquet_csv/", mode="overwrite", partitionBy="city")
df_flat.write.parquet("parquet_json/", mode="overwrite", partitionBy="city")

# Spark SQL with Temp Views

In [21]:
# Register the students frame under a name that spark.sql() queries can use.
df_students.createOrReplaceTempView(name="students_view")

In [22]:
# Average marks per section, computed over the students_view temp view.
spark.sql("SELECT section, AVG(marks) as avg_marks FROM students_view GROUP BY section").show()

+-------+---------+
|section|avg_marks|
+-------+---------+
|   10-A|     83.5|
|   10-B|     88.5|
|   10-C|     80.0|
+-------+---------+



In [23]:
# Top scorer per section: rank students inside each section by marks
# (highest first) and keep the rows ranked 1. RANK() assigns tied students
# the same rank, so a section with a tie at the top returns several rows.
spark.sql("""
SELECT section, name, marks
FROM (
    SELECT *, RANK() OVER(PARTITION BY section ORDER BY marks DESC) as rnk
    FROM students_view
) tmp
WHERE rnk=1
""").show()

+-------+-----+-----+
|section| name|marks|
+-------+-----+-----+
|   10-A| Amit|   89|
|   10-B|Kavya|   92|
|   10-C|Sneha|   80|
+-------+-----+-----+



In [24]:
# Number of students per grade band. The band is derived by the CASE
# expression (A: marks > 90, B: marks >= 80, C: everything else) and the
# GROUP BY refers to that expression through its alias, `grade`.
spark.sql("""
SELECT
    CASE
        WHEN marks>90 THEN 'A'
        WHEN marks>=80 THEN 'B'
        ELSE 'C'
    END as grade,
    COUNT(*) as student_count
FROM students_view
GROUP BY grade
""").show()

+-----+-------------+
|grade|student_count|
+-----+-------------+
|    B|            3|
|    A|            1|
|    C|            1|
+-----+-------------+



In [25]:
# Students whose marks exceed the overall average; the average is computed
# by the scalar subquery over the same view.
spark.sql("""
SELECT *
FROM students_view
WHERE marks > (SELECT AVG(marks) FROM students_view)
""").show()

+-----+-------+-----+
| name|section|marks|
+-----+-------+-----+
| Amit|   10-A|   89|
|Kavya|   10-B|   92|
|Rohit|   10-B|   85|
+-----+-------+-----+



# Partitioned Data & Incremental Loading

In [27]:
# Initial full load: one sub-directory per section, replacing any earlier output.
df_students.write.parquet("output/students/", mode="overwrite", partitionBy="section")


In [28]:
# A single late-arriving record, laid out like the students data.
incremental = [("Tejas", "10-A", 91)]
df_inc = spark.createDataFrame(incremental, schema=["name", "section", "marks"])
# "append" adds files under output/students/ instead of replacing what is there.
df_inc.write.parquet("output/students/", mode="append", partitionBy="section")

In [29]:
import os
# Show the top-level entries of the dataset directory (partition folders plus
# Spark's marker files). os.listdir returns entries in arbitrary order.
print("Files in output/students/:", os.listdir("output/students/"))

Files in output/students/: ['._SUCCESS.crc', 'section=10-A', 'section=10-B', '_SUCCESS', 'section=10-C']


In [30]:

# Load only the 10-A partition directory. Because the path points inside the
# partition, the resulting frame has no "section" column (only name and marks).
df_10A = spark.read.format("parquet").load("output/students/section=10-A")
df_10A.show()


+------+-----+
|  name|marks|
+------+-----+
|Anjali|   78|
| Tejas|   91|
|  Amit|   89|
+------+-----+



In [31]:
# Expect 3: the two students originally in 10-A plus the appended record.
print("Count after append in 10-A:", df_10A.count())

Count after append in 10-A: 3


# ETL Pipeline End to End


In [32]:
# Employee CSV fixture for the ETL walkthrough. Two rows deliberately leave
# the trailing bonus field empty so the pipeline has missing values to fill.
csv_etl = """emp_id,name,dept,salary,bonus
1,Arjun,IT,75000,5000
2,Kavya,HR,62000,
3,Sneha,Finance,68000,4000
4,Ramesh,Sales,58000,
"""
# Pin the encoding so the bytes on disk do not depend on the platform locale.
with open("etl_employees.csv", "w", encoding="utf-8") as etl_file:
    etl_file.write(csv_etl)

In [33]:
# Extract: the first line is the header; column types are inferred from the data.
df_etl = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("etl_employees.csv")
)

In [34]:
# Transform: substitute a bonus of 2000 wherever the bonus is missing.
df_etl = df_etl.na.fill({"bonus": 2000})

In [35]:
# Transform: total compensation is salary plus bonus.
df_etl = df_etl.withColumn("total_ctc", df_etl["salary"] + df_etl["bonus"])

In [36]:
# Keep only employees whose total compensation exceeds 65000.
df_filtered = df_etl.where(col("total_ctc") > 65000)

print("=== Filtered Employees ===")
df_filtered.show()


=== Filtered Employees ===
+------+-----+-------+------+-----+---------+
|emp_id| name|   dept|salary|bonus|total_ctc|
+------+-----+-------+------+-----+---------+
|     1|Arjun|     IT| 75000| 5000|    80000|
|     3|Sneha|Finance| 68000| 4000|    72000|
+------+-----+-------+------+-----+---------+



In [37]:
# Load: write the filtered rows as JSON, replacing any earlier output.
df_filtered.write.json("etl_json_output/", mode="overwrite")


In [38]:
# Load: write the same rows as Parquet, one sub-directory per department.
df_filtered.write.parquet("etl_parquet_output/", mode="overwrite", partitionBy="dept")
