In [55]:
from pyspark.sql.types import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import itertools
import seaborn as sns
from pyspark.ml import Pipeline
from pyspark.ml.classification import *
from pyspark.ml.regression import *
from pyspark.ml.tree import *
from pyspark.ml.feature import *

In [56]:
spark = SparkSession.builder.appName("GenderSubmission").config("spark.driver.memory","4g").config("spark.executor.memory","4g").getOrCreate()

In [57]:
read_df = spark.read.csv("D:/DataSets/Titanic_MLLearning_From_Disaster/train.csv", header=True)

In [58]:
read_df.printSchema()

root
 |-- PassengerId: string (nullable = true)
 |-- Survived: string (nullable = true)
 |-- Pclass: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- SibSp: string (nullable = true)
 |-- Parch: string (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: string (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [59]:
print("Total number of records are: ", read_df.count())
print("Total number of columns are: ", len(read_df.columns))

Total number of records are:  891
Total number of columns are:  12


In [60]:
read_df.show(5,False)

+-----------+--------+------+---------------------------------------------------+------+---+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|Name                                               |Sex   |Age|SibSp|Parch|Ticket          |Fare   |Cabin|Embarked|
+-----------+--------+------+---------------------------------------------------+------+---+-----+-----+----------------+-------+-----+--------+
|1          |0       |3     |Braund, Mr. Owen Harris                            |male  |22 |1    |0    |A/5 21171       |7.25   |null |S       |
|2          |1       |1     |Cumings, Mrs. John Bradley (Florence Briggs Thayer)|female|38 |1    |0    |PC 17599        |71.2833|C85  |C       |
|3          |1       |3     |Heikkinen, Miss. Laina                             |female|26 |0    |0    |STON/O2. 3101282|7.925  |null |S       |
|4          |1       |1     |Futrelle, Mrs. Jacques Heath (Lily May Peel)       |female|35 |1    |0    |113803          |53.1   |C

In [61]:
read_df.limit(5).toPandas()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,,S


In [62]:
print("Survived people are: ",read_df.select("Survived").where(col("Survived")==1).count())
print("Deceased people are: ",read_df.select("Survived").where(col("Survived")==0).count())

Survived people are:  342
Deceased people are:  549


In [63]:
#Data Preparation and Feature Engineering

In [64]:
dataset = read_df.select(col("Survived").cast(FloatType()),
                            col("Pclass").cast(FloatType()),
                            col("Sex"),
                            col("Age").cast(FloatType()),
                            col("Fare").cast(FloatType()),
                            col("Embarked"))

In [65]:
dataset.show(5,False)

+--------+------+------+----+-------+--------+
|Survived|Pclass|Sex   |Age |Fare   |Embarked|
+--------+------+------+----+-------+--------+
|0.0     |3.0   |male  |22.0|7.25   |S       |
|1.0     |1.0   |female|38.0|71.2833|C       |
|1.0     |3.0   |female|26.0|7.925  |S       |
|1.0     |1.0   |female|35.0|53.1   |S       |
|0.0     |3.0   |male  |35.0|8.05   |S       |
+--------+------+------+----+-------+--------+
only showing top 5 rows



In [66]:
from pyspark.sql.functions import isnull, when, count, col
dataset.select([count(when(isnull(c), c)).alias(c) for c in dataset.columns]).show()

+--------+------+---+---+----+--------+
|Survived|Pclass|Sex|Age|Fare|Embarked|
+--------+------+---+---+----+--------+
|       0|     0|  0|177|   0|       2|
+--------+------+---+---+----+--------+



In [67]:
dataset = dataset.replace('?', None) \
                .dropna(how='any')

In [68]:
dataset.count()

712

In [69]:
from pyspark.ml.feature import StringIndexer

dataset = StringIndexer(
        inputCol='Sex',
        outputCol='Gender',
        handleInvalid='keep').fit(dataset).transform(dataset)

dataset = StringIndexer(
    inputCol='Embarked', 
    outputCol='Boarded', 
    handleInvalid='keep').fit(dataset).transform(dataset)

dataset.show(5,False)

+--------+------+------+----+-------+--------+------+-------+
|Survived|Pclass|Sex   |Age |Fare   |Embarked|Gender|Boarded|
+--------+------+------+----+-------+--------+------+-------+
|0.0     |3.0   |male  |22.0|7.25   |S       |0.0   |0.0    |
|1.0     |1.0   |female|38.0|71.2833|C       |1.0   |1.0    |
|1.0     |3.0   |female|26.0|7.925  |S       |1.0   |0.0    |
|1.0     |1.0   |female|35.0|53.1   |S       |1.0   |0.0    |
|0.0     |3.0   |male  |35.0|8.05   |S       |0.0   |0.0    |
+--------+------+------+----+-------+--------+------+-------+
only showing top 5 rows



In [70]:
dataset.dtypes

[('Survived', 'float'),
 ('Pclass', 'float'),
 ('Sex', 'string'),
 ('Age', 'float'),
 ('Fare', 'float'),
 ('Embarked', 'string'),
 ('Gender', 'double'),
 ('Boarded', 'double')]

In [71]:
#drop unnecessary columns

dataset = dataset.drop('Sex')  
dataset = dataset.drop('Embarked')

In [72]:
dataset.show(10,False)

+--------+------+----+-------+------+-------+
|Survived|Pclass|Age |Fare   |Gender|Boarded|
+--------+------+----+-------+------+-------+
|0.0     |3.0   |22.0|7.25   |0.0   |0.0    |
|1.0     |1.0   |38.0|71.2833|1.0   |1.0    |
|1.0     |3.0   |26.0|7.925  |1.0   |0.0    |
|1.0     |1.0   |35.0|53.1   |1.0   |0.0    |
|0.0     |3.0   |35.0|8.05   |0.0   |0.0    |
|0.0     |1.0   |54.0|51.8625|0.0   |0.0    |
|0.0     |3.0   |2.0 |21.075 |0.0   |0.0    |
|1.0     |3.0   |27.0|11.1333|1.0   |0.0    |
|1.0     |2.0   |14.0|30.0708|1.0   |1.0    |
|1.0     |3.0   |4.0 |16.7   |1.0   |0.0    |
+--------+------+----+-------+------+-------+
only showing top 10 rows



In [73]:
required_features = ['Pclass', 'Age', 'Fare', 'Gender', 'Boarded']

from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols=required_features, outputCol='Features')
transformed_data = assembler.transform(dataset)

In [74]:
transformed_data.show(5,False)

+--------+------+----+-------+------+-------+-------------------------------------+
|Survived|Pclass|Age |Fare   |Gender|Boarded|Features                             |
+--------+------+----+-------+------+-------+-------------------------------------+
|0.0     |3.0   |22.0|7.25   |0.0   |0.0    |[3.0,22.0,7.25,0.0,0.0]              |
|1.0     |1.0   |38.0|71.2833|1.0   |1.0    |[1.0,38.0,71.2833023071289,1.0,1.0]  |
|1.0     |3.0   |26.0|7.925  |1.0   |0.0    |[3.0,26.0,7.925000190734863,1.0,0.0] |
|1.0     |1.0   |35.0|53.1   |1.0   |0.0    |[1.0,35.0,53.099998474121094,1.0,0.0]|
|0.0     |3.0   |35.0|8.05   |0.0   |0.0    |[3.0,35.0,8.050000190734863,0.0,0.0] |
+--------+------+----+-------+------+-------+-------------------------------------+
only showing top 5 rows



In [75]:
(training_data, test_data) = transformed_data.randomSplit([0.75,0.25])

In [76]:
print(transformed_data.count())
print(training_data.count())
print(test_data.count())

712
527
185


In [77]:
from pyspark.ml.classification import RandomForestClassifier

randomForest = RandomForestClassifier(labelCol='Survived', featuresCol='Features', maxDepth=5)

In [78]:
model = randomForest.fit(transformed_data)

In [79]:
predictions = model.transform(test_data)

In [80]:
#Evaluate our model

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(labelCol='Survived', predictionCol='prediction', metricName='accuracy')

In [81]:
accuracy = evaluator.evaluate(predictions)
print("Test Accuracy is: ", accuracy)

Test Accuracy is:  0.8486486486486486
