<a href="https://colab.research.google.com/github/rahulku91058/Training-AIML/blob/main/Computer_vision/PySpark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
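# Install PySpark (uncomment if the runtime does not already provide it).
# %pip targets the running kernel's environment, unlike !pip.
# %pip install -q pyspark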

In [2]:
# Import Libraries
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, isnan, when, count

In [None]:
# Start (or reuse) the Spark session that every later cell works through
spark = (
    SparkSession.builder
    .appName("TitanicDataAnalysis")
    .getOrCreate()
)

In [None]:
# Load Dataset (Replace with your file path or upload to Colab)
DATA_PATH = "titanic.csv"

# Only ask for an upload when the file is not already on disk, so re-running
# the cell (or Run All) does not block on an upload prompt each time, and the
# notebook also works outside Colab when the CSV is already present.
if not os.path.exists(DATA_PATH):
    # Imported lazily: google.colab exists only inside a Colab runtime.
    from google.colab import files
    files.upload()  # Upload 'titanic.csv'

# header=True takes column names from the first row; inferSchema=True lets
# Spark choose column types from the data instead of reading all as strings.
df = spark.read.csv(DATA_PATH, header=True, inferSchema=True)

In [None]:
# First look at the data: column types, row count, and a small sample
print("Schema:")
df.printSchema()
row_total = df.count()
print(f"Total Rows: {row_total}")
df.show(n=5)

In [None]:
# Count Missing Data
print("Missing Data:")

# isnan() only accepts float/double input. Calling it on every column relies
# on implicit casts and fails analysis for types that cannot be cast (e.g.
# date/timestamp), so NaN is checked only where it can actually occur.
float_columns = {name for name, dtype in df.dtypes if dtype in ("float", "double")}

missing_counts = [
    # count() skips nulls, and when() yields null for rows that do not match,
    # so each expression counts the rows where the column is missing.
    count(
        when(
            (isnan(col(c)) | col(c).isNull()) if c in float_columns else col(c).isNull(),
            c,
        )
    ).alias(c)
    for c in df.columns
]
df.select(missing_counts).show()

In [None]:
# Data Cleaning
# Replace missing Age/Embarked values first, then store Age and Fare as floats
fill_defaults = {"Age": 30, "Embarked": "S"}
df = df.fillna(fill_defaults)
for numeric_column in ("Age", "Fare"):
    df = df.withColumn(numeric_column, col(numeric_column).cast("float"))

In [None]:
# Analysis: Survival Rate
# Show each outcome's passenger count alongside its share of all passengers,
# so the output contains the rate the heading promises rather than counts only.
print("Survival Rate:")
total_passengers = df.count()
(
    df.groupBy("Survived")
    .count()
    .withColumn("rate", col("count") / total_passengers)
    .orderBy("Survived")  # deterministic row order across runs
    .show()
)

In [None]:
# Survival Rate by Gender
print("Survival Rate by Gender:")

# Raw passenger counts for every (Sex, Survived) combination
df.groupBy("Sex", "Survived").count().orderBy("Sex", "Survived").show()

# Per-gender rate, since counts alone are not a rate.
# Assumes Survived is coded 1 = survived — TODO confirm against the dataset.
(
    df.groupBy("Sex")
    .agg(
        count("*").alias("passengers"),
        # when() without otherwise() is null for non-matching rows, and
        # count() ignores nulls, so this counts only the survivors.
        count(when(col("Survived") == 1, 1)).alias("survived"),
    )
    .withColumn("survival_rate", col("survived") / col("passengers"))
    .orderBy("Sex")
    .show()
)

In [None]:
# Mean ticket price within each passenger class
print("Average Fare by Pclass:")
passengers_by_class = df.groupBy("Pclass")
passengers_by_class.avg("Fare").show()

In [None]:
# Correlation between the Age and Fare columns
age_fare_corr = df.stat.corr("Age", "Fare")
print(f"Correlation between Age and Fare: {age_fare_corr}")

In [None]:
# The five rows with the highest Fare
print("Top 5 Most Expensive Tickets:")
by_fare_desc = df.sort(col("Fare").desc())
by_fare_desc.show(5)

In [None]:
# Save Processed Data
# Spark writes a directory of part files at this path, not a single CSV file.
# mode("overwrite") makes the cell re-runnable: the default save mode raises
# an error when the target path already exists.
df.write.mode("overwrite").csv("processed_titanic_data.csv", header=True)
print("Processed data saved!")