<a href="https://colab.research.google.com/github/ravi-3690/ML-WORKSHOP-PROJECTS/blob/main/pySpark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install PySpark
# !pip install pyspark

# Import Libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, isnan, when, count

# Obtain the Spark session used by the rest of the notebook
# (getOrCreate hands back an existing session when one is already running)
session_builder = SparkSession.builder.appName("TitanicDataAnalysis")
spark = session_builder.getOrCreate()

# Load Dataset
# Only prompt for an upload when 'titanic.csv' is not already in the working
# directory. Re-uploading on every run forces a manual step, and Colab stores a
# re-upload of an existing name under a suffixed file name, so the read below
# would keep using the older copy anyway.
import os
from google.colab import files

DATA_PATH = "titanic.csv"
if not os.path.exists(DATA_PATH):
    files.upload()  # Upload 'titanic.csv'

# header=True: first line holds the column names; inferSchema=True: let Spark
# pick column types from the data instead of reading everything as string.
df = spark.read.csv(DATA_PATH, header=True, inferSchema=True)

# First look at the loaded frame: column types, row count, and the first 5 rows
print("Schema:")
df.printSchema()
row_total = df.count()
print(f"Total Rows: {row_total}")
df.show(n=5)

# Count Missing Data
# One output column per input column, holding the number of missing entries.
# isnan() is only defined for float/double columns, so the NaN test is limited
# to those; every other type (string, int, date, ...) is checked for null only.
# This avoids relying on an implicit cast for strings and avoids the analysis
# error Spark raises for types that cannot be treated as floating point.
print("Missing Data:")
missing_counts = [
    count(
        when(
            (isnan(col(name)) | col(name).isNull())
            if dtype in ("float", "double")
            else col(name).isNull(),
            name,  # any non-null marker works: count() only counts non-null values
        )
    ).alias(name)
    for name, dtype in df.dtypes
]
df.select(missing_counts).show()

# Data Cleaning
# Replace missing Age with 30 and missing Embarked with "S",
# then store Age and Fare as float columns.
age_embarked_defaults = {"Age": 30, "Embarked": "S"}
df = df.na.fill(age_embarked_defaults)
for numeric_col in ("Age", "Fare"):
    df = df.withColumn(numeric_col, col(numeric_col).cast("float"))

# Analysis: number of passengers for each value of Survived
# (the printed heading says "Rate", but the table holds raw counts)
print("Survival Rate:")
survived_counts = df.groupBy("Survived").count()
survived_counts.show()

# The same counts, broken down further by Sex
print("Survival Rate by Gender:")
sex_survived_counts = df.groupBy("Sex", "Survived").count()
sex_survived_counts.show()

# Mean Fare within each passenger class
print("Average Fare by Pclass:")
fare_by_class = df.groupBy("Pclass").avg("Fare")
fare_by_class.show()

# Correlation coefficient between the Age and Fare columns
age_fare_corr = df.stat.corr("Age", "Fare")
print(f"Correlation between Age and Fare: {age_fare_corr}")

# The five rows with the highest Fare
print("Top 5 Most Expensive Tickets:")
df.orderBy("Fare", ascending=False).show(5)

# Save Processed Data
# Spark writes a directory of part files at this path (despite the .csv name).
# The default save mode raises an error when the path already exists, which
# made every re-run of this cell fail; overwrite so the notebook can be
# executed repeatedly.
df.write.mode("overwrite").csv("processed_titanic_data.csv", header=True)
print("Processed data saved!")


Saving titanic.csv to titanic.csv
Schema:
root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)

Total Rows: 891
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| NULL|       S|
|          2|      

In [2]:
# Passenger counts for every (Pclass, Survived) combination
class_survived_counts = df.groupBy("Pclass", "Survived").count()
class_survived_counts.show()

# Survival counts by Age Group
# Buckets: Child (Age < 18), Adult (18 <= Age <= 60), Senior (Age > 60).
# Senior is matched explicitly instead of through otherwise(), so a row whose
# Age is null ends up with a null Age_Group rather than being counted as Senior.
age_col = col("Age")
df = df.withColumn(
    "Age_Group",
    when(age_col < 18, "Child")
    .when((age_col >= 18) & (age_col <= 60), "Adult")
    .when(age_col > 60, "Senior"),
)
df.groupBy("Age_Group", "Survived").count().show()

# Passenger counts per survival outcome, first split by port of embarkation,
# then by sex (the second table repeats the breakdown shown in the first cell)
for group_keys in (("Embarked", "Survived"), ("Sex", "Survived")):
    df.groupBy(*group_keys).count().show()

# Mean Age and mean Fare for each value of Survived
avg_by_survival = df.groupBy("Survived").avg("Age", "Fare")
avg_by_survival.show()

# Correlation coefficient between the Age and Fare columns
age_fare_corr = df.stat.corr("Age", "Fare")
print(f"Correlation between Age and Fare: {age_fare_corr}")

# The five rows with the highest Fare
df.orderBy("Fare", ascending=False).show(5)

+------+--------+-----+
|Pclass|Survived|count|
+------+--------+-----+
|     1|       0|   80|
|     3|       1|  119|
|     1|       1|  136|
|     2|       1|   87|
|     2|       0|   97|
|     3|       0|  372|
+------+--------+-----+

+---------+--------+-----+
|Age_Group|Survived|count|
+---------+--------+-----+
|    Adult|       0|  480|
|    Adult|       1|  276|
|   Senior|       0|   17|
|   Senior|       1|    5|
|    Child|       1|   61|
|    Child|       0|   52|
+---------+--------+-----+

+--------+--------+-----+
|Embarked|Survived|count|
+--------+--------+-----+
|       Q|       1|   30|
|       S|       0|  427|
|       S|       1|  219|
|       C|       1|   93|
|       Q|       0|   47|
|       C|       0|   75|
+--------+--------+-----+

+------+--------+-----+
|   Sex|Survived|count|
+------+--------+-----+
|  male|       0|  468|
|female|       1|  233|
|female|       0|   81|
|  male|       1|  109|
+------+--------+-----+

+--------+------------------+-----