In [None]:
pip install pyspark



**Loading the Dataset and Exploring**

In [None]:
from pyspark.sql import SparkSession
# Obtain the Spark session for this notebook (an existing one is reused if present).
spark = (
    SparkSession.builder
    .appName("Titanic_Survival_Prediction")
    .getOrCreate()
)

In [None]:
# Load the training CSV: the first row supplies column names and the
# column types are inferred from the data.
train_data = spark.read.csv(
    "/content/train.csv",
    header=True,
    inferSchema=True,
)

In [None]:
# Preview the first five rows of the loaded data.
train_data.show(n=5)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| NULL|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| NULL|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| NULL|       S|
+-----------+--------+------+--------------------+------+----+-----+-----+------

In [None]:
# describe must be called (and its result shown) to get summary statistics;
# the bare attribute only displays the bound-method repr.
train_data.describe().show()

<bound method DataFrame.describe of DataFrame[PassengerId: int, Survived: int, Pclass: int, Name: string, Sex: string, Age: double, SibSp: int, Parch: int, Ticket: string, Fare: double, Cabin: string, Embarked: string]>

In [None]:
# Print each column's name, inferred type and nullability.
train_data.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



**Preprocessing the dataset**



> Exploring the dataset



In [None]:
# Convert the whole Spark DataFrame to a pandas DataFrame and display it.
# NOTE(review): this displays every row; a small sample may be enough here.
train_data.toPandas()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"""Johnston, Miss. Catherine Helen """"Carrie""""""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C




> Plots



In [None]:
import numpy as np
from google.colab import autoviz

def heatmap(df, x_colname, y_colname, figscale=1, mpl_palette_name='viridis'):
  """Draw a heatmap of `y_colname` value counts within each `x_colname` group.

  Args:
    df: DataFrame supporting `groupby` and column indexing (used as a pandas
      DataFrame here).
    x_colname: Column whose groups become the heatmap columns.
    y_colname: Column whose value counts become the heatmap rows.
    figscale: Multiplier applied to the 8x8 base figure size.
    mpl_palette_name: Colormap name passed to seaborn as `cmap`.

  Returns:
    The chart object built by `autoviz.MplChart.from_current_mpl_state()`.
  """
  from matplotlib import pyplot as plt
  import seaborn as sns
  import pandas as pd

  side = 8 * figscale
  plt.subplots(figsize=(side, side))

  # One Series of value counts per group; pandas aligns them on the y values.
  counts_by_group = {}
  for x_label, grp in df.groupby(x_colname):
    counts_by_group[x_label] = grp[y_colname].value_counts()
  df_2dhist = pd.DataFrame(counts_by_group)

  sns.heatmap(df_2dhist, cmap=mpl_palette_name)
  plt.xlabel(x_colname)
  plt.ylabel(y_colname)
  return autoviz.MplChart.from_current_mpl_state()

# Plot from the loaded training data; `_df_15` is never assigned in this
# notebook, so referencing it fails on a fresh kernel.
chart = heatmap(train_data.toPandas(), 'Sex', 'Embarked')
chart

In [None]:
import numpy as np
from google.colab import autoviz

def categorical_histogram(df, colname, figscale=1, mpl_palette_name='Dark2'):
  """Draw a horizontal bar chart of the number of rows per `colname` value.

  Args:
    df: DataFrame supporting `groupby` (used as a pandas DataFrame here).
    colname: Column whose distinct values are counted.
    figscale: Multiplier applied to the 8x4.8 base figure size.
    mpl_palette_name: Palette name used to colour the bars.

  Returns:
    The chart object built by `autoviz.MplChart.from_current_mpl_state()`.
  """
  from matplotlib import pyplot as plt
  import seaborn as sns

  group_sizes = df.groupby(colname).size()
  bar_colors = sns.palettes.mpl_palette(mpl_palette_name)
  group_sizes.plot(
      kind='barh',
      color=bar_colors,
      figsize=(8*figscale, 4.8*figscale),
  )
  # Hide the top and right borders of the axes.
  plt.gca().spines[['top', 'right',]].set_visible(False)
  return autoviz.MplChart.from_current_mpl_state()

# Plot from the loaded training data; `_df_4` is never assigned in this
# notebook, so referencing it fails on a fresh kernel.
chart = categorical_histogram(train_data.toPandas(), 'Sex')
chart



> Treating NULL



In [None]:
from pyspark.sql.functions import isnull, when, count

In [None]:
# Build one aggregate per column that counts the rows where that column is NULL.
null_count_exprs = [
    count(when(isnull(name), name)).alias(name)
    for name in train_data.columns
]
null_counts = train_data.select(null_count_exprs)
null_counts.show()

+-----------+--------+------+----+---+---+-----+-----+------+----+-----+--------+
|PassengerId|Survived|Pclass|Name|Sex|Age|SibSp|Parch|Ticket|Fare|Cabin|Embarked|
+-----------+--------+------+----+---+---+-----+-----+------+----+-----+--------+
|          0|       0|     0|   0|  0|177|    0|    0|     0|   0|  687|       2|
+-----------+--------+------+----+---+---+-----+-----+------+----+-----+--------+





> Imputing missing values in the Age column with the mean age



In [None]:
from pyspark.sql.functions import mean
# Mean of the Age column.
mean_age = train_data.select(mean(train_data['Age'])).collect()[0][0]
# Apply the imputation: replace missing ages with the mean. Without this step
# the computed mean is never used and the NULL ages stay in the data.
train_data = train_data.na.fill(mean_age, subset=['Age'])



> Dropping rows with missing values in the Embarked column



In [None]:
# Keep only the rows whose Embarked value is present.
train_data = train_data.dropna(subset=["Embarked"])

In [None]:
# Re-count NULL entries per column to confirm the cleaning steps took effect.
null_count_exprs = [
    count(when(isnull(name), name)).alias(name)
    for name in train_data.columns
]
null_counts = train_data.select(null_count_exprs)
null_counts.show()

+--------+------+---+---+-----+-----+----+--------+
|Survived|Pclass|Sex|Age|SibSp|Parch|Fare|Embarked|
+--------+------+---+---+-----+-----+----+--------+
|       0|     0|  0|  0|    0|    0|   0|       0|
+--------+------+---+---+-----+-----+----+--------+





> Dropping the unwanted columns





In [None]:
# Remove identifier and free-text columns that are not used as features.
# 'PassengerId' is spelled as in the schema so the drop does not depend on
# case-insensitive column resolution. 'Title' is not in the loaded schema;
# it is kept in the list because dropping a missing column is a no-op.
train_data = train_data.drop('PassengerId', 'Cabin', 'Name', 'Ticket', 'Title')

In [None]:
# Display the first 20 rows of the cleaned data.
train_data.show(n=20)

+--------+------+------+-----------------+-----+-----+-------+--------+
|Survived|Pclass|   Sex|              Age|SibSp|Parch|   Fare|Embarked|
+--------+------+------+-----------------+-----+-----+-------+--------+
|       0|     3|  male|             22.0|    1|    0|   7.25|       S|
|       1|     1|female|             38.0|    1|    0|71.2833|       C|
|       1|     3|female|             26.0|    0|    0|  7.925|       S|
|       1|     1|female|             35.0|    1|    0|   53.1|       S|
|       0|     3|  male|             35.0|    0|    0|   8.05|       S|
|       0|     3|  male|29.69911764705882|    0|    0| 8.4583|       Q|
|       0|     1|  male|             54.0|    0|    0|51.8625|       S|
|       0|     3|  male|              2.0|    3|    1| 21.075|       S|
|       1|     3|female|             27.0|    0|    2|11.1333|       S|
|       1|     2|female|             14.0|    1|    0|30.0708|       C|
|       1|     3|female|              4.0|    1|    1|   16.7|  

**Modeling**

In [None]:
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import LogisticRegression,RandomForestClassifier, GBTClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder



> String Indexer



In [None]:
# Fit an indexer that maps the string categories of Sex and Embarked to
# numeric indices, written to the Sex_ and Embarked_ columns.
stringIndex = StringIndexer(
    inputCols=['Sex', 'Embarked'],
    outputCols=['Sex_', 'Embarked_'],
)
stringIndex_model = stringIndex.fit(train_data)

In [None]:
# Add the indexed columns, then discard the original string columns.
train_data = (
    stringIndex_model
    .transform(train_data)
    .drop('Sex', 'Embarked')
)



> Vector assembler



In [None]:
# Select the feature columns by name (everything except the label) instead of
# by position, so the label can never end up in the feature vector if the
# column order changes or this cell is run again.
feature_cols = [name for name in train_data.columns if name != 'Survived']
vector_assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')

# Keep only the assembled feature vector and the label.
train_data = vector_assembler.transform(train_data).select('features', 'Survived')

In [None]:
# 70/30 train/validation split. A fixed seed makes the split, and therefore
# the metrics reported below, reproducible across runs.
train_df, valid_df = train_data.randomSplit([0.7, 0.3], seed=42)



> Evaluator



In [None]:
# Accuracy evaluator that compares predictions against the Survived label.
evaluator = MulticlassClassificationEvaluator(
    labelCol='Survived',
    metricName='accuracy',
)

***GBT Classifier***

In [None]:
# Gradient-boosted trees: 75 boosting iterations with trees of depth 3.
gb = GBTClassifier(
    labelCol='Survived',
    maxIter=75,
    maxDepth=3,
)
model = gb.fit(train_df)

# Score the validation split and report accuracy.
pred_gb = model.transform(valid_df)
evaluator.evaluate(pred_gb)


0.8110236220472441

***Logistic Regression (L1 / Lasso penalty)***

In [None]:
# Logistic regression with elasticNetParam=1 (pure L1 penalty) and a small
# regularisation strength.
lasso = LogisticRegression(
    labelCol='Survived',
    maxIter=100,
    elasticNetParam=1,
    regParam=0.0003,
)
model = lasso.fit(train_df)

# Score the validation split and report accuracy.
pred_la = model.transform(valid_df)
evaluator.evaluate(pred_la)

0.8031496062992126

***Random Forest Classifier***

In [None]:
# Random forest: 100 trees, each limited to depth 3.
rf = RandomForestClassifier(
    labelCol='Survived',
    numTrees=100,
    maxDepth=3,
)
model = rf.fit(train_df)

# Score the validation split and report accuracy.
pred_rf = model.transform(valid_df)
evaluator.evaluate(pred_rf)

0.8188976377952756

***Logistic Regression (L2 / Ridge penalty)***

In [None]:
# Logistic regression with elasticNetParam=0 (pure L2 penalty).
ridge = LogisticRegression(
    labelCol='Survived',
    maxIter=100,
    elasticNetParam=0,
    regParam=0.03,
)
model = ridge.fit(train_df)

# Score the validation split and report accuracy.
pred_rg = model.transform(valid_df)
evaluator.evaluate(pred_rg)

0.7913385826771654

**Model Evaluation**

In [None]:
from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier, RandomForestClassifier, GBTClassifier, LinearSVC
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Candidate models, all trained on the same split and compared by ROC AUC.
classifiers = [LogisticRegression(labelCol='Survived', maxIter=100, elasticNetParam=0, regParam=0.03),
               DecisionTreeClassifier(labelCol='Survived'),
               RandomForestClassifier(labelCol='Survived'),
               GBTClassifier(labelCol='Survived', maxIter=10),
               LinearSVC(labelCol='Survived')]

# The evaluator does not depend on the classifier, so it is built once outside
# the loop. The metric is named explicitly so it is clear the scores are AUC.
# NOTE: this rebinds `evaluator`, which earlier cells used for accuracy.
evaluator = BinaryClassificationEvaluator(labelCol='Survived', metricName='areaUnderROC')

# Maps classifier class name -> AUC on the validation split.
results = {}

for classifier in classifiers:
    model = classifier.fit(train_df)
    predictions = model.transform(valid_df)

    auc = evaluator.evaluate(predictions)
    results[classifier.__class__.__name__] = auc

# The best model is the one with the highest AUC.
best_classifier = max(results, key=results.get)
best_auc = results[best_classifier]

print("Results:")
for classifier_name, auc in results.items():
    print(f"{classifier_name}: AUC = {auc}")

print(f"Best Classifier: {best_classifier} with AUC = {best_auc}")


Results:
LogisticRegression: AUC = 0.8592506309454483
DecisionTreeClassifier: AUC = 0.6118229470005824
RandomForestClassifier: AUC = 0.8669190448456608
GBTClassifier: AUC = 0.8421342134213424
LinearSVC: AUC = 0.8119459004724002
Best Classifier: RandomForestClassifier with AUC = 0.8669190448456608


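Among the five classifiers compared, the Random Forest model achieved the highest area under the ROC curve on the validation split (AUC ≈ 0.867), showing that it separates survivors from non-survivors well on the Titanic dataset. Note that this figure is an AUC, not an accuracy rate: the Random Forest evaluated for accuracy earlier (100 trees, maximum depth 3) reached an accuracy of about 81.9%.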