In [1]:
%matplotlib inline
from pyspark.sql import SparkSession
import pandas as pd

# Obtain the Spark session for this notebook under the application name "titanic".
spark = (
    SparkSession.builder
    .appName("titanic")
    .getOrCreate()
)

In [2]:
# Both CSV files are read with the same options: the first row holds the
# column names and the column types are inferred from the data.
csvOptions = {'header': "true", 'inferSchema': "true"}
train = spark.read.csv('./train.csv', **csvOptions)
test = spark.read.csv('./test.csv', **csvOptions)

# Print the inferred schema of the training frame, then of the test frame.
for frame in (train, test):
    frame.printSchema()


root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)

root
 |-- PassengerId: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [3]:
# Preview the first rows of the training data as a text table.
train.show()

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| null|       S|
|          6|       0|     3|    Moran, Mr. James|  male|null|    0|    0|      

In this section we will explore missing data.

In [4]:
from pyspark.sql.functions import col

# Find columns with missing values
def findNullColumns(df):
    """Report the columns of ``df`` that contain at least one null value.

    Args:
        df: Spark DataFrame to inspect.

    Returns:
        A list of ``(columnName, nonNullFraction)`` tuples, in column order,
        with one entry for every column whose non-null row count is lower
        than the total row count. ``nonNullFraction`` is the share of rows
        that are NOT null, so a value of 0.2 means 80% of the column is
        missing.
    """
    numRows = df.count()
    nullCols = []
    for k in df.columns:
        # Count once and reuse the result: each count() is a separate Spark job.
        notNullCount = df.filter(col(k).isNotNull()).count()
        if notNullCount != numRows:
            nullCols.append((k, notNullCount / numRows))
    return nullCols

# List the training columns that have nulls, each with its fraction of non-null rows.
findNullColumns(train)

[('Age', 0.8013468013468014),
 ('Cabin', 0.22895622895622897),
 ('Embarked', 0.9977553310886644)]

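The values above are the fraction of non-null entries in each column, so almost 80% (about 77%) of the Cabin column is missing; we will therefore drop the Cabin column. About 20% of the Age column is missing; we will fill those values with the median age.
Very little data is missing in the Embarked column, so we will simply drop those rows.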

In [5]:
from pyspark.ml.feature import Imputer
# Configure a median imputer that writes the filled ages to 'imputedAge'.
# It is only constructed here; nothing is fitted in this cell.
ageImputer = Imputer(
    strategy='median',
    inputCols=['Age'],
    outputCols=['imputedAge'],
)

In [6]:
# Keep only the rows that have an Embarked value and remove the Cabin column.
# Rows with a missing Age are kept.
train = train.filter(train.Embarked.isNotNull()).drop('Cabin')
train.printSchema()
train.count()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Embarked: string (nullable = true)



889

In the next few sections, we will explore the training data and the relationship between the different features and the label.
As we already know, most of the passengers on the Titanic did not survive. Our training data shows the same: only about 38% of the passengers survived. The cross-tabulations below show that survival also differs noticeably by passenger class and sex.

In [7]:
# Name of the label column; the cells below refer to it through this variable.
labelCol = 'Survived'

# Class balance: number of rows for each label value.
train.groupBy(labelCol).count().show()

+--------+-----+
|Survived|count|
+--------+-----+
|       1|  340|
|       0|  549|
+--------+-----+



In [8]:
# Contingency table of the label against Sex (rows: label value, columns: Sex value).
train.crosstab(labelCol, 'Sex').show()

+------------+------+----+
|Survived_Sex|female|male|
+------------+------+----+
|           1|   231| 109|
|           0|    81| 468|
+------------+------+----+



In [9]:
# Contingency table of the label against Pclass (rows: label value, columns: Pclass value).
train.crosstab(labelCol, 'Pclass').show()

+---------------+---+---+---+
|Survived_Pclass|  1|  2|  3|
+---------------+---+---+---+
|              1|134| 87|119|
|              0| 80| 97|372|
+---------------+---+---+---+



In [10]:
# Contingency table of the label against Embarked (rows: label value, columns: Embarked value).
train.crosstab(labelCol, 'Embarked').show()

+-----------------+---+---+---+
|Survived_Embarked|  C|  Q|  S|
+-----------------+---+---+---+
|                1| 93| 30|217|
|                0| 75| 47|427|
+-----------------+---+---+---+



In [11]:
# Contingency table of the label against SibSp (rows: label value, columns: SibSp value).
train.crosstab(labelCol, 'SibSp').show()

+--------------+---+---+---+---+---+---+---+
|Survived_SibSp|  0|  1|  2|  3|  4|  5|  8|
+--------------+---+---+---+---+---+---+---+
|             1|208|112| 13|  4|  3|  0|  0|
|             0|398| 97| 15| 12| 15|  5|  7|
+--------------+---+---+---+---+---+---+---+



In [12]:
# Contingency table of the label against Parch (rows: label value, columns: Parch value).
train.crosstab(labelCol, 'Parch').show()

+--------------+---+---+---+---+---+---+---+
|Survived_Parch|  0|  1|  2|  3|  4|  5|  6|
+--------------+---+---+---+---+---+---+---+
|             1|231| 65| 40|  3|  0|  1|  0|
|             0|445| 53| 40|  2|  4|  4|  1|
+--------------+---+---+---+---+---+---+---+



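Now we will calculate the entropy of the categorical features. Entropy measures how evenly a feature's values are spread across its categories, so it plays a role for categorical features similar to the one variance plays for numeric features.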

In [13]:
def calcEntropy(df, *columns):
    """Compute the entropy of the value distribution of each given column.

    Args:
        df: DataFrame supporting ``groupby(column).count().collect()``
            (a Spark DataFrame), where each collected row exposes the group
            size under the key ``'count'``.
        *columns: Names of the columns to measure.

    Returns:
        A pandas DataFrame indexed by column name with a single float
        ``Entropy`` column (scipy's default, i.e. natural-log entropy).
    """
    from scipy.stats import entropy

    entropies = {}
    for column in columns:
        rows = df.groupby(column).count().collect()
        # scipy.stats.entropy normalizes its input to sum to 1, so the raw
        # group sizes yield the same entropy as the probabilities would; no
        # separate total row count (an extra Spark job) is needed.
        counts = [row['count'] for row in rows]
        entropies[column] = entropy(counts)
    # Build the frame once instead of growing it one row at a time.
    return pd.DataFrame({'Entropy': pd.Series(entropies, dtype='float64')})

# Entropy of each categorical feature in the training data.
calcEntropy(train, 'Sex', 'Pclass', 'Embarked', 'SibSp', 'Parch')

Unnamed: 0,Entropy
Sex,0.648037
Pclass,0.99671
Embarked,0.760292
SibSp,0.929045
Parch,0.783248


In [14]:
from pyspark.ml import Pipeline
from pyspark.ml.stat import ChiSquareTest
from pyspark.ml.feature import Bucketizer, OneHotEncoderEstimator, StringIndexer, VectorAssembler, VectorIndexer

# Chi-square test of the engineered features against the Survived label.

# Map the string-valued columns to numeric indices.
edaEmbarkedIndexer = StringIndexer(inputCol='Embarked', outputCol='indexedEmbarked')
edaSexIndexer = StringIndexer(inputCol='Sex', outputCol='indexedSex')

# Fill missing ages with the median so that the age column can be bucketed.
edaAgeImputer = Imputer(strategy='median', inputCols=['Age'], outputCols=['imputedAge'])

# Bin the continuous columns using fixed split points.
ageSplits = [0, 16, 32, 48, 64, 200]
fareSplits = [-float('inf'), 7.91, 14.454, 31, float('inf')]
edaAgeBucketizer = Bucketizer(inputCol='imputedAge', outputCol='bucketedAge', splits=ageSplits)
edaFareBucketizer = Bucketizer(inputCol='Fare', outputCol='bucketedFare', splits=fareSplits)

# One-hot encode the indexed and bucketed columns, then assemble them with
# Pclass into the single 'features' vector that the test consumes.
oneHotEncoderEstimator = OneHotEncoderEstimator(
    inputCols=['indexedSex', 'indexedEmbarked', 'bucketedFare', 'bucketedAge'],
    outputCols=['oneHotSex', 'oneHotEmbarked', 'oneHotFare', 'oneHotAge'],
)
inputCols = ['Pclass', 'oneHotSex', 'oneHotEmbarked', 'oneHotFare', 'oneHotAge']
edaAssembler = VectorAssembler(inputCols=inputCols, outputCol='features')

edaStages = [
    edaEmbarkedIndexer,
    edaSexIndexer,
    edaAgeImputer,
    edaAgeBucketizer,
    edaFareBucketizer,
    oneHotEncoderEstimator,
    edaAssembler,
]
pipeline = Pipeline(stages=edaStages)
chiSqTrain = pipeline.fit(train).transform(train)

# The result holds one entry per slot of the assembled feature vector.
r = ChiSquareTest.test(chiSqTrain, 'features', 'Survived').head()
print("pValues: " + str(r.pValues))
print("degreesOfFreedom: " + str(r.degreesOfFreedom))
print("statistics: " + str(r.statistics))

pValues: [0.0,0.0,6.02813466444e-06,4.02603175464e-07,4.93843854699e-11,0.0101897422598,0.0315461645121,4.25298058386e-05,0.00150622342036,0.612884928604,0.116808580457]
degreesOfFreedom: [2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
statistics: [100.980407261,260.756342249,20.4792462347,25.6818141585,43.2014376768,6.60142083307,4.62299205997,16.755010532,10.0709867693,0.255995235761,2.45959925254]


In [15]:
from pyspark.ml.feature import StringIndexer

# Indexers for the modelling pipeline. With handleInvalid='skip', rows whose
# value is invalid for the fitted indexer are filtered out instead of raising.
embarkedIndexer = StringIndexer(
    inputCol='Embarked',
    outputCol='indexedEmbarked',
    handleInvalid='skip',
)
sexFeatureIndexer = StringIndexer(
    inputCol='Sex',
    outputCol='indexedSex',
    handleInvalid='skip',
)

In [16]:
from pyspark.ml.feature import Bucketizer

# Split points for the age and fare bins.
ageSplits = [0, 16, 32, 48, 64, 200]
fareSplits = [-float('inf'), 7.91, 14.454, 31, float('inf')]

# NOTE(review): handleInvalid='skip' filters out rows with invalid values at
# transform time -- confirm no test rows are lost before building a submission.
ageBucketizer = Bucketizer(
    inputCol='imputedAge',
    outputCol='bucketedAge',
    splits=ageSplits,
    handleInvalid='skip',
)
fareBucketizer = Bucketizer(
    inputCol='Fare',
    outputCol='bucketedFare',
    splits=fareSplits,
    handleInvalid='skip',
)

In [17]:
from pyspark.ml.feature import OneHotEncoderEstimator, VectorIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier

# One-hot encoder stage producing the oneHot* columns.
# NOTE(review): the assembler below feeds the indexed/bucketed columns to the
# model, not these one-hot outputs -- confirm that this is intended.
oneHotEncoderEstimator = OneHotEncoderEstimator(inputCols=['indexedSex', 'indexedEmbarked', 'bucketedFare', 'bucketedAge'], 
                                                outputCols=['oneHotSex', 'oneHotEmbarked','oneHotFare', 'oneHotAge'])

# Assemble the model inputs into the single 'features' vector column.
assembler = VectorAssembler(inputCols=['Pclass', 'SibSp', 'Parch', 'bucketedAge', 
                                       'bucketedFare', 'indexedEmbarked', 'indexedSex'], outputCol='features')

# Fixed seed so that the forest's random sampling is the same on every run
# (Restart & Run All then reproduces the reported results).
rf = RandomForestClassifier(labelCol=labelCol, featuresCol='features', seed=42)

In [18]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml import Pipeline

# Full modelling pipeline: impute -> index -> bucketize -> encode -> assemble -> classify.
pipeline = Pipeline(stages=[ageImputer, embarkedIndexer, sexFeatureIndexer, ageBucketizer, 
                            fareBucketizer, oneHotEncoderEstimator, assembler, rf])

# 4 values of numTrees x 2 values of maxDepth = 8 candidate settings.
grid = ParamGridBuilder().addGrid(rf.numTrees, [15, 20, 25, 30])\
                         .addGrid(rf.maxDepth, [5, 8])\
                         .build()

# 10-fold cross-validation scored by area under the ROC curve. The seed fixes
# the random fold assignment so that model selection is reproducible.
cv = CrossValidator(estimator=pipeline, 
                    estimatorParamMaps=grid, 
                    evaluator=BinaryClassificationEvaluator(labelCol=labelCol, metricName='areaUnderROC'), 
                    numFolds=10,
                    seed=42)

model = cv.fit(train)
# NOTE(review): `train` is replaced by its scored version (extra feature and
# prediction columns), so this cell is presumably not safe to re-run without
# re-running the cells that load and clean `train` first -- confirm.
train = model.transform(train)

In [19]:
# Reuse the evaluator that cross-validation was configured with (area under ROC).
evaluator = model.getEvaluator()
# `train` here is the scored training frame, so this score is computed on the
# same data the model was fitted on, not on held-out data.
evaluator.evaluate(train)

0.925950926818815

In [20]:
# Score the test data with the cross-validated model; `test` is replaced by the scored frame.
test = model.transform(test)

In [21]:
# Preview the scored test rows, including the intermediate feature and prediction columns.
test.show()

+-----------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+----------+---------------+----------+-----------+------------+-------------+--------------+-------------+-------------+--------------------+--------------------+--------------------+----------+
|PassengerId|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|imputedAge|indexedEmbarked|indexedSex|bucketedAge|bucketedFare|    oneHotSex|oneHotEmbarked|   oneHotFare|    oneHotAge|            features|       rawPrediction|         probability|prediction|
+-----------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+----------+---------------+----------+-----------+------------+-------------+--------------+-------------+-------------+--------------------+--------------------+--------------------+----------+
|        892|     3|    Kelly, Mr. James|  male|34.5|    0|    0|          330911| 7.8292| null|    

Write the predictions to a CSV file in the format specified by Kaggle.

In [22]:
import os.path
from pyspark.sql.types import IntegerType

csvPath = 'prediction.csv'

# Nothing is written when the output path already exists.
if not os.path.exists(csvPath):
    # Keep the passenger id and expose the prediction as an integer
    # 'Survived' column; coalesce(1) puts all rows in a single partition.
    submission = (
        test.select('PassengerId', 'prediction')
            .coalesce(1)
            .withColumn('Survived', test['prediction'].cast(IntegerType()))
            .drop('prediction')
    )
    submission.write.csv(csvPath, header='true')