# PySpark Setup & Initialization

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window

In [2]:
# Create (or reuse) the notebook's Spark session, running locally on all cores.
spark = (
    SparkSession.builder
    .appName("BotCampus Intermediate Session")
    .master("local[*]")
    .getOrCreate()
)

In [3]:
# Starter dataset: one (name, city, age) tuple per person.
columns = ["name", "city", "age"]
data = [
    ("Ananya", "Bangalore", 24),
    ("Ravi", "Hyderabad", 28),
    ("Kavya", "Delhi", 22),
    ("Meena", "Chennai", 25),
]
df = spark.createDataFrame(data, schema=columns)

print("Starter DataFrame:")
df.show()

Starter DataFrame:
+------+---------+---+
|  name|     city|age|
+------+---------+---+
|Ananya|Bangalore| 24|
|  Ravi|Hyderabad| 28|
| Kavya|    Delhi| 22|
| Meena|  Chennai| 25|
+------+---------+---+



# RDDs & Transformations

In [4]:
# Raw customer comments; each string becomes one element of the RDD.
feedback_lines = [
    "Ravi from Bangalore loved the mobile app",
    "Meena from Delhi reported poor response time",
    "Ajay from Pune liked the delivery speed",
    "Ananya from Hyderabad had an issue with UI",
    "Rohit from Mumbai gave positive feedback",
]
feedback = spark.sparkContext.parallelize(feedback_lines)

In [5]:
# Split every comment on whitespace and count the resulting tokens.
tokens = feedback.flatMap(lambda text: text.split())
word_count = tokens.count()
print("\nTotal words:", word_count)



Total words: 35


In [6]:
stop_words = {"from", "with", "the", "and", "an"}

# Lower-case and tokenise every comment, then drop the stop words.
words = (
    feedback
    .flatMap(lambda line: line.lower().split())
    .filter(lambda w: w not in stop_words)
)
# Classic word count: emit (word, 1) pairs and sum them per word.
word_freq = words.map(lambda w: (w, 1)).reduceByKey(lambda x, y: x + y)

print("\nWord Counts:")
# The loop variable is deliberately NOT called `count`: at notebook scope that
# name would rebind the `count` function brought in by
# `from pyspark.sql.functions import *` for every later cell.
for word, freq in word_freq.collect():
    print(word, ":", freq)

# Three most frequent words. When counts tie, nothing here defines which of
# the tied words are returned.
top3 = word_freq.takeOrdered(3, key=lambda x: -x[1])
print("\nTop 3 words:", top3)


Word Counts:
loved : 1
app : 1
poor : 1
response : 1
liked : 1
speed : 1
ananya : 1
issue : 1
rohit : 1
mumbai : 1
positive : 1
feedback : 1
ravi : 1
bangalore : 1
mobile : 1
meena : 1
delhi : 1
reported : 1
time : 1
ajay : 1
pune : 1
delivery : 1
hyderabad : 1
had : 1
ui : 1
gave : 1

Top 3 words: [('loved', 1), ('app', 1), ('poor', 1)]


In [7]:
# Bring the (word, count) pairs back to the driver as a plain Python dict.
word_dict = word_freq.collectAsMap()
print("\nWord → Count Dictionary:", word_dict)


Word → Count Dictionary: {'loved': 1, 'app': 1, 'poor': 1, 'response': 1, 'liked': 1, 'speed': 1, 'ananya': 1, 'issue': 1, 'rohit': 1, 'mumbai': 1, 'positive': 1, 'feedback': 1, 'ravi': 1, 'bangalore': 1, 'mobile': 1, 'meena': 1, 'delhi': 1, 'reported': 1, 'time': 1, 'ajay': 1, 'pune': 1, 'delivery': 1, 'hyderabad': 1, 'had': 1, 'ui': 1, 'gave': 1}


# DataFrame Transformations

In [8]:
# Exam results: one (name, subject, score) tuple per student/subject pair.
columns = ["name", "subject", "score"]
scores = [
    ("Ravi", "Math", 88),
    ("Ananya", "Science", 92),
    ("Kavya", "English", 79),
    ("Ravi", "English", 67),
    ("Neha", "Math", 94),
    ("Meena", "Science", 85),
]
df_scores = spark.createDataFrame(scores, schema=columns)

In [9]:
# Letter grade from the numeric score. The branches are evaluated top to
# bottom, so each one only needs its lower bound: a row reaching the "B" test
# has already failed the ">= 90" test, and so on.
score = col("score")
grade_expr = (
    when(score >= 90, "A")
    .when(score >= 80, "B")
    .when(score >= 70, "C")
    .otherwise("D")
)
df_scores = df_scores.withColumn("grade", grade_expr)
print("\nExam Scores with Grades:")
df_scores.show()


Exam Scores with Grades:
+------+-------+-----+-----+
|  name|subject|score|grade|
+------+-------+-----+-----+
|  Ravi|   Math|   88|    B|
|Ananya|Science|   92|    A|
| Kavya|English|   79|    C|
|  Ravi|English|   67|    D|
|  Neha|   Math|   94|    A|
| Meena|Science|   85|    B|
+------+-------+-----+-----+



In [10]:
# Mean score for each subject.
print("\nAverage Score per Subject:")
avg_by_subject = df_scores.groupBy("subject").agg(avg("score").alias("avg_score"))
avg_by_subject.show()


Average Score per Subject:
+-------+---------+
|subject|avg_score|
+-------+---------+
|Science|     88.5|
|   Math|     91.0|
|English|     73.0|
+-------+---------+



In [11]:
# Tag Math and Science rows as "Difficult"; every other subject is "Easy".
is_hard_subject = col("subject").isin("Math", "Science")
df_scores = df_scores.withColumn(
    "difficulty",
    when(is_hard_subject, "Difficult").otherwise("Easy"),
)


In [12]:
# Rank students within each subject, highest score first.
score_desc = col("score").desc()
windowSpec = Window.partitionBy("subject").orderBy(score_desc)
rank_in_subject = rank().over(windowSpec)
df_scores = df_scores.withColumn("rank", rank_in_subject)
print("\nRanked Scores:")
df_scores.show()


Ranked Scores:
+------+-------+-----+-----+----------+----+
|  name|subject|score|grade|difficulty|rank|
+------+-------+-----+-----+----------+----+
| Kavya|English|   79|    C|      Easy|   1|
|  Ravi|English|   67|    D|      Easy|   2|
|  Neha|   Math|   94|    A| Difficult|   1|
|  Ravi|   Math|   88|    B| Difficult|   2|
|Ananya|Science|   92|    A| Difficult|   1|
| Meena|Science|   85|    B| Difficult|   2|
+------+-------+-----+-----+----------+----+



In [13]:
from pyspark.sql.types import StringType
def to_upper(column):
    """Return ``column`` upper-cased using Spark's built-in ``upper`` expression.

    This replaces a Python UDF that called ``x.upper()`` on each value. The
    built-in yields NULL for NULL input instead of raising inside the UDF,
    and it is evaluated by Spark itself rather than by shipping every row to
    a Python worker. The ``to_upper`` name is kept so that
    ``to_upper(col(...))`` call sites keep working.

    Args:
        column: A Spark ``Column`` (or column name) holding string values.

    Returns:
        A ``Column`` expression with the upper-cased value.
    """
    return upper(column)


df_scores = df_scores.withColumn("name_upper", to_upper(col("name")))
print("\nScores with Uppercase Names:")
df_scores.show()


Scores with Uppercase Names:
+------+-------+-----+-----+----------+----+----------+
|  name|subject|score|grade|difficulty|rank|name_upper|
+------+-------+-----+-----+----------+----+----------+
| Kavya|English|   79|    C|      Easy|   1|     KAVYA|
|  Ravi|English|   67|    D|      Easy|   2|      RAVI|
|  Neha|   Math|   94|    A| Difficult|   1|      NEHA|
|  Ravi|   Math|   88|    B| Difficult|   2|      RAVI|
|Ananya|Science|   92|    A| Difficult|   1|    ANANYA|
| Meena|Science|   85|    B| Difficult|   2|     MEENA|
+------+-------+-----+-----+----------+----+----------+



# Ingest CSV & JSON, Save to Parquet


In [15]:
import pandas as pd

# Build a small sample file with pandas so the CSV reader has something to load.
columns = ["id", "name", "department", "city", "salary"]
data = [
    [1, "Amit", "IT", "Bangalore", 78000],
    [2, "Kavya", "HR", "Chennai", 62000],
    [3, "Arjun", "Finance", "Hyderabad", 55000],
]

df_pandas = pd.DataFrame(data, columns=columns)
df_pandas.to_csv("students.csv", index=False)

# Read it back with Spark, treating the first row as the header and letting
# Spark infer the column types.
csv_reader = spark.read.options(header=True, inferSchema=True)
df_csv = csv_reader.csv("students.csv")
df_csv.show()


+---+-----+----------+---------+------+
| id| name|department|     city|salary|
+---+-----+----------+---------+------+
|  1| Amit|        IT|Bangalore| 78000|
|  2|Kavya|        HR|  Chennai| 62000|
|  3|Arjun|   Finance|Hyderabad| 55000|
+---+-----+----------+---------+------+



In [16]:
# Reload the CSV (header row + inferred types) and display the resulting schema.
df_csv = spark.read.options(header=True, inferSchema=True).csv("students.csv")
print("\nCSV Schema:")
df_csv.printSchema()


CSV Schema:
root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- city: string (nullable = true)
 |-- salary: integer (nullable = true)



In [18]:
import json


# One employee record with a nested struct ("address") and an array ("skills").
data = [
    {
        "id": 101,
        "name": "Sneha",
        "address": {"city": "Mumbai", "pincode": 400001},
        "skills": ["Python", "Spark"],
    }
]

# Serialise the records to disk so Spark can read them as a JSON source.
with open("employee_nested.json", "w") as f:
    f.write(json.dumps(data))

# Load the file and show both the rows (untruncated) and the inferred schema.
df_json = spark.read.json("employee_nested.json")
df_json.show(truncate=False)
df_json.printSchema()


+----------------+---+-----+---------------+
|address         |id |name |skills         |
+----------------+---+-----+---------------+
|{Mumbai, 400001}|101|Sneha|[Python, Spark]|
+----------------+---+-----+---------------+

root
 |-- address: struct (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- pincode: long (nullable = true)
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- skills: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [19]:

# Reload the nested JSON file and display the schema Spark inferred for it.
json_path = "employee_nested.json"
df_json = spark.read.json(json_path)
print("\nJSON Schema:")
df_json.printSchema()



JSON Schema:
root
 |-- address: struct (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- pincode: long (nullable = true)
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- skills: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [20]:

# Flatten the nested record: lift the address fields to top-level columns and
# emit one output row per entry in the skills array.
address = col("address")
df_flat = df_json.select(
    col("id"),
    col("name"),
    address.getField("city").alias("city"),
    address.getField("pincode").alias("pincode"),
    explode(col("skills")).alias("skill"),
)
print("\nFlattened JSON:")
df_flat.show()


Flattened JSON:
+---+-----+------+-------+------+
| id| name|  city|pincode| skill|
+---+-----+------+-------+------+
|101|Sneha|Mumbai| 400001|Python|
|101|Sneha|Mumbai| 400001| Spark|
+---+-----+------+-------+------+



In [21]:

# Persist both datasets as Parquet, replacing whatever a previous run left behind.
df_csv.write.parquet("/tmp/output/students", mode="overwrite")
df_flat.write.parquet("/tmp/output/employees", mode="overwrite")

# Spark SQL Temp Views & Queries

In [22]:

# Register the enriched scores DataFrame so the SQL cells can query it by name.
df_scores.createOrReplaceTempView(name="exam_scores")

In [23]:

# Top scorer(s) in each subject: rank rows within a subject by score and keep
# only those ranked first.
top_per_subject = spark.sql("""
SELECT subject, name, score
FROM (
  SELECT *, RANK() OVER(PARTITION BY subject ORDER BY score DESC) as rnk
  FROM exam_scores
)
WHERE rnk=1
""")
top_per_subject.show()

+-------+------+-----+
|subject|  name|score|
+-------+------+-----+
|English| Kavya|   79|
|   Math|  Neha|   94|
|Science|Ananya|   92|
+-------+------+-----+



In [24]:

# Number of exam results that received each letter grade.
grade_counts = spark.sql("SELECT grade, COUNT(*) as cnt FROM exam_scores GROUP BY grade")
grade_counts.show()

+-----+---+
|grade|cnt|
+-----+---+
|    B|  2|
|    C|  1|
|    A|  2|
|    D|  1|
+-----+---+



In [25]:

# Students who appear under more than one distinct subject.
multi_subject = spark.sql("""
SELECT name
FROM exam_scores
GROUP BY name
HAVING COUNT(DISTINCT subject) > 1
""")
multi_subject.show()

+----+
|name|
+----+
|Ravi|
+----+



In [26]:

# Subjects whose average score is above 85.
high_avg_subjects = spark.sql("""
SELECT subject, AVG(score) as avg_score
FROM exam_scores
GROUP BY subject
HAVING AVG(score)>85
""")
high_avg_subjects.show()


+-------+---------+
|subject|avg_score|
+-------+---------+
|Science|     88.5|
|   Math|     91.0|
+-------+---------+



In [29]:
# Days present per student, as (name, days_present) pairs.
attendance = [
    ("Ravi", 25),
    ("Ananya", 18),
    ("Kavya", 22),
    ("Neha", 20),
    ("Meena", 19),
]
df_attendance = spark.createDataFrame(attendance, schema=["name", "days_present"])

# Left join keeps every score row even when a student has no attendance record.
df_joined = df_scores.join(df_attendance, on="name", how="left")

In [28]:
def downgrade(grade):
    """Return the letter grade one step below ``grade``.

    "A" becomes "B", "B" becomes "C", and "C" becomes "D". "D" and any value
    not listed (including ``None``) map to "D".
    """
    one_step_down = {"A": "B", "B": "C", "C": "D"}
    return one_step_down.get(grade, "D")

# Wrap the Python helper so it can be applied to a DataFrame column.
downgrade_udf = udf(downgrade, returnType=StringType())

# Students present for fewer than 20 days drop one grade; everyone else
# (including rows with no attendance value) keeps the original grade.
low_attendance = col("days_present") < 20
adjusted_grade = when(low_attendance, downgrade_udf(col("grade"))).otherwise(col("grade"))
df_final = df_joined.withColumn("adj_grade", adjusted_grade)
print("\nAttendance-adjusted Grades:")
df_final.show()


Attendance-adjusted Grades:
+------+-------+-----+-----+----------+----+----------+------------+---------+
|  name|subject|score|grade|difficulty|rank|name_upper|days_present|adj_grade|
+------+-------+-----+-----+----------+----+----------+------------+---------+
|Ananya|Science|   92|    A| Difficult|   1|    ANANYA|          18|        B|
|  Ravi|English|   67|    D|      Easy|   2|      RAVI|          25|        D|
|  Ravi|   Math|   88|    B| Difficult|   2|      RAVI|          25|        B|
|  Neha|   Math|   94|    A| Difficult|   1|      NEHA|          20|        A|
| Meena|Science|   85|    B| Difficult|   2|     MEENA|          19|        C|
| Kavya|English|   79|    C|      Easy|   1|     KAVYA|          22|        C|
+------+-------+-----+-----+----------+----+----------+------------+---------+



# Partitioned Load (Full + Incremental)

In [30]:
# Full load: rewrite the whole dataset, one sub-directory per subject value.
df_scores.write.parquet("/tmp/scores/", mode="overwrite", partitionBy="subject")

In [31]:

# Incremental load: append one new result to the existing partitioned dataset.
# NOTE: this frame has only name/subject/score, none of the derived columns
# (grade, difficulty, rank, name_upper) that were added to df_scores earlier.
incremental = [("Meena", "Math", 93)]
df_inc = spark.createDataFrame(incremental, schema=["name", "subject", "score"])
df_inc.write.parquet("/tmp/scores/", mode="append", partitionBy="subject")

In [32]:

import os
# List every entry in the output directory. os.listdir returns all entries,
# so anything Spark wrote alongside the partition folders appears here too.
scores_dir_entries = os.listdir("/tmp/scores/")
print("\nPartitions in /tmp/scores/:", scores_dir_entries)


Partitions in /tmp/scores/: ['._SUCCESS.crc', 'subject=Science', 'subject=English', '_SUCCESS', 'subject=Math']


In [33]:

# Read a single partition directory directly rather than the dataset root.
math_partition_path = "/tmp/scores/subject=Math"
math_df = spark.read.parquet(math_partition_path)
print("\nMath Partition Data:")
math_df.show()


Math Partition Data:
+-----+-----+-----+----------+----+----------+
| name|score|grade|difficulty|rank|name_upper|
+-----+-----+-----+----------+----+----------+
| Neha|   94|    A| Difficult|   1|      NEHA|
| Ravi|   88|    B| Difficult|   2|      RAVI|
|Meena|   93| NULL|      NULL|NULL|      NULL|
+-----+-----+-----+----------+----+----------+



# ETL: Clean, Transform, Load

In [35]:
import pandas as pd

# Raw employee rows; the second record intentionally has no bonus value.
columns = ["emp_id", "name", "dept", "salary", "bonus"]
data = [
    [1, "Arjun", "IT", 78000, 5000],
    [2, "Kavya", "HR", 62000, None],
    [3, "Sneha", "Finance", 55000, 3000],
]

# Write the sample to disk with pandas to act as the ETL input file.
df_pandas = pd.DataFrame(data, columns=columns)
df_pandas.to_csv("raw_employees.csv", index=False)

# Extract: load the CSV with a header row and inferred column types.
employee_reader = spark.read.options(header=True, inferSchema=True)
df_emp = employee_reader.csv("raw_employees.csv")
df_emp.show()


+------+-----+-------+------+------+
|emp_id| name|   dept|salary| bonus|
+------+-----+-------+------+------+
|     1|Arjun|     IT| 78000|5000.0|
|     2|Kavya|     HR| 62000|  NULL|
|     3|Sneha|Finance| 55000|3000.0|
+------+-----+-------+------+------+



In [36]:
# Reload the raw employee file (header row, inferred types) as the ETL input.
df_emp = spark.read.options(header=True, inferSchema=True).csv("raw_employees.csv")

In [37]:
# Clean: employees with no recorded bonus get a default of 2000.
df_emp = df_emp.na.fill({"bonus": 2000})


In [38]:
# Transform: total cost-to-company is salary plus bonus.
ctc_expr = col("salary") + col("bonus")
df_emp = df_emp.withColumn("total_ctc", ctc_expr)

In [39]:
# Keep only employees whose total CTC is strictly above 60000.
ctc_above_threshold = col("total_ctc") > 60000
df_final = df_emp.where(ctc_above_threshold)
print("\nFiltered Employee Data:")
df_final.show()


Filtered Employee Data:
+------+-----+----+------+------+---------+
|emp_id| name|dept|salary| bonus|total_ctc|
+------+-----+----+------+------+---------+
|     1|Arjun|  IT| 78000|5000.0|  83000.0|
|     2|Kavya|  HR| 62000|2000.0|  64000.0|
+------+-----+----+------+------+---------+



In [40]:
# Load: write the filtered employees in both Parquet and JSON form,
# replacing the output of any earlier run.
df_final.write.parquet("/tmp/etl/employees_parquet", mode="overwrite")
df_final.write.json("/tmp/etl/employees_json", mode="overwrite")