# 1. Build a Classification Model with Spark Using a Dataset of Your Choice

 Install & Import Required Libraries

In [None]:
pip install pyspark



In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

Initialize Spark Session

In [None]:
# Create (or reuse) the Spark session that the later cells access as `spark`.
# SparkSession is already imported in the imports cell at the top of the notebook.
try:
    spark = (
        SparkSession.builder
        .appName("ClassificationModel")
        .getOrCreate()
    )
    # Restrict Spark's own logging to errors so cell output stays readable.
    spark.sparkContext.setLogLevel("ERROR")
    print("Spark session initialized successfully.")
except Exception as e:
    print(f"Error initializing Spark session: {e}")
    # Re-raise instead of calling exit(): the failure should surface as a
    # normal cell error with a traceback and stop "Run All", rather than
    # asking the kernel itself to shut down.
    raise

Spark session initialized successfully.


Load the Titanic Dataset

In [None]:
import os
import urllib.request

titanic_url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
local_path = "titanic.csv"  # Local copy of the CSV that Spark reads from

# Download only when no local copy exists, so re-running the notebook does not
# fetch the file again. The download goes to a temporary name first and is
# renamed on success, so an interrupted transfer never leaves a truncated
# file at local_path that a later run would mistake for a complete one.
if not os.path.exists(local_path):
    partial_path = local_path + ".part"
    urllib.request.urlretrieve(titanic_url, partial_path)
    os.replace(partial_path, local_path)

# Read the CSV into a PySpark DataFrame; the first row is the header and
# column types are inferred from the data.
titanic_df = spark.read.csv(local_path, header=True, inferSchema=True)

# Do not delete local_path while titanic_df (or anything derived from it) is
# still in use: Spark evaluates DataFrames lazily, so later actions read the
# CSV from this path again.

titanic_df.show(5)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| NULL|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| NULL|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| NULL|       S|
+-----------+--------+------+--------------------+------+----+-----+-----+------

Data Preprocessing (Vector Assembler + Label Indexing)

In [None]:
# Keep the label column (Survived) together with the columns that are later
# assembled into the feature vector, then discard every row that has a null
# in any of the kept columns.
columns = ['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']
df = titanic_df.select(columns).dropna()

In [None]:
# Encode the categorical Sex column as a numeric index column, SexIndexed.
indexer = StringIndexer(inputCol="Sex", outputCol="SexIndexed")

# Remove any SexIndexed column left by a previous run of this cell. Without
# this, re-running the cell fails because the indexer refuses to write to an
# output column that already exists. drop() is a no-op on the first run,
# when the column is not there yet.
df_unindexed = df.drop("SexIndexed")
df = indexer.fit(df_unindexed).transform(df_unindexed)

In [None]:
# Combine the numeric input columns into the single vector column "features"
# that the classifier consumes.
assembler = VectorAssembler(
    inputCols=["Pclass", "SexIndexed", "Age", "SibSp", "Parch", "Fare"],
    outputCol="features"
)

# Remove any "features" column left by a previous run of this cell so the
# cell can be re-run; the assembler raises if its output column already
# exists. drop() is a no-op on the first run, when the column is absent.
df = assembler.transform(df.drop("features"))

Train-Test Split

In [None]:
# Randomly partition the rows with weights 0.8 (train) and 0.2 (test); the
# seed is fixed so the split does not change from run to run.
split_weights = [0.8, 0.2]
train_data, test_data = df.randomSplit(split_weights, seed=42)

Train a Classifier (Random Forest)

In [None]:
# Random forest that predicts Survived from the assembled feature vector.
# The seed is set explicitly so the forest's random sampling, and therefore
# the reported accuracy, is repeatable across kernel sessions; it matches the
# seed used for the train/test split.
rf = RandomForestClassifier(labelCol="Survived", featuresCol="features", seed=42)
model = rf.fit(train_data)

Evaluate the Model

In [None]:
# Evaluator that compares the model's "prediction" column with the true
# Survived label and reports accuracy.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="Survived",
    predictionCol="prediction",
)

# Score the held-out test rows and report the metric to two decimal places.
predictions = model.transform(test_data)
accuracy = evaluator.evaluate(predictions)
print(f"Test Accuracy = {accuracy:.2f}")


Test Accuracy = 0.82


In [None]:
# Stop the Spark session now that the analysis is complete. Cells that use
# `spark` cannot be re-run after this until the session cell is executed again.
spark.stop()