In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, col, count, explode, lower, regexp_extract, split

# Build the Spark session used by every step below (or reuse a running one)
spark = (
    SparkSession.builder
    .appName("BDA Lab: Titanic MapReduce")
    .getOrCreate()
)

# Read the Titanic CSV: the first line supplies the column names and Spark
# infers each column's type from the data
df = spark.read.options(header=True, inferSchema=True).csv("Titanic-Dataset.csv")

# 1. Basic exploration: preview five rows, then report the table's dimensions
df.show(5)
row_total = df.count()
print(f"Total rows: {row_total}")
column_total = len(df.columns)
print(f"Total columns: {column_total}")

# 2. Keep only the columns the later steps work with
relevant_columns = ["PassengerId", "Survived", "Pclass", "Name", "Sex", "Age", "Fare"]
df_selected = df.select(*relevant_columns)
df_selected.show(5)

# 3. Number of passengers in each class
grouped_by_class = df_selected.groupBy("Pclass")
passengers_per_class = grouped_by_class.count()
passengers_per_class.show()

# 4. Survival rate (%) per class: 100 times the mean of the Survived column
survival_percent = (avg("Survived") * 100).alias("survival_rate_percent")
survival_rate = df_selected.groupBy("Pclass").agg(survival_percent)
survival_rate.show()

# 5. Mean passenger age within each class
mean_age = avg("Age").alias("avg_age")
avg_age_per_class = df_selected.groupBy("Pclass").agg(mean_age)
avg_age_per_class.show()

# 6. Most common first names among survivors
#    - Names look like "Surname, Title. Given names", so the first name is the
#      first word after the title's period. (Index 1 of a comma/period/space
#      split is the title itself, e.g. "mr", not the first name.)
#    - NOTE(review): for "Mrs." entries the word after the title may be the
#      husband's given name rather than the passenger's own; confirm this is
#      the intended definition of "first name".
survivors = df_selected.filter(col("Survived") == 1)
names_df = survivors.withColumn(
    "first_name",
    lower(
        # Skip "Surname, Title." (plus an optional opening parenthesis) and
        # capture the next word; yields "" when a name does not fit the shape.
        regexp_extract(col("Name"), r"^[^,]+,\s*[^.]+\.\s*\(?([^\s()]+)", 1)
    )
)
# Filter only alphabetic names (this also drops the "" from non-matching rows)
names_df = names_df.filter(col("first_name").rlike("^[a-z]+$"))

# Count and show the top 10 first names; ties on count are broken
# alphabetically so the result is the same on every run
first_name_counts = names_df.groupBy("first_name").count()
top_first_names = first_name_counts.orderBy(
    col("count").desc(), col("first_name")
).limit(10)
top_first_names.show()

# Stop the Spark session now that every result above has been displayed;
# no DataFrame defined in this script can be used after this call
spark.stop()


+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| NULL|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| NULL|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| NULL|       S|
+-----------+--------+------+--------------------+------+----+-----+-----+------