In [1]:
import findspark
findspark.init()

from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('regression').getOrCreate()

In [2]:
df = spark.read.csv('titanic_train.csv', header=True, inferSchema=True)

In [3]:
df.show(4)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
only showing top 4 rows



In [4]:
df.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [5]:
print('Number of rows: \t', df.count())
print('Number of columns: \t', len(df.columns))

Number of rows: 	 891
Number of columns: 	 12


In [6]:
df.groupBy('Survived').count().show()

+--------+-----+
|Survived|count|
+--------+-----+
|       1|  342|
|       0|  549|
+--------+-----+



In [7]:
df.groupBy('Survived').mean('Fare', 'Age').show()

+--------+------------------+------------------+
|Survived|         avg(Fare)|          avg(Age)|
+--------+------------------+------------------+
|       1| 48.39540760233917|28.343689655172415|
|       0|22.117886885245877| 30.62617924528302|
+--------+------------------+------------------+



In [8]:
df.groupBy('Survived').pivot('Sex').count().show()

+--------+------+----+
|Survived|female|male|
+--------+------+----+
|       1|   233| 109|
|       0|    81| 468|
+--------+------+----+



In [9]:
df.groupBy('Survived').pivot('Pclass').count().show()

+--------+---+---+---+
|Survived|  1|  2|  3|
+--------+---+---+---+
|       1|136| 87|119|
|       0| 80| 97|372|
+--------+---+---+---+



In [10]:
df.groupBy('Survived').pivot('SibSp').count().show()

+--------+---+---+---+---+---+----+----+
|Survived|  0|  1|  2|  3|  4|   5|   8|
+--------+---+---+---+---+---+----+----+
|       1|210|112| 13|  4|  3|null|null|
|       0|398| 97| 15| 12| 15|   5|   7|
+--------+---+---+---+---+---+----+----+



In [11]:
df.groupBy('Survived').pivot('Parch').count().show()

+--------+---+---+---+---+----+---+----+
|Survived|  0|  1|  2|  3|   4|  5|   6|
+--------+---+---+---+---+----+---+----+
|       1|233| 65| 40|  3|null|  1|null|
|       0|445| 53| 40|  2|   4|  4|   1|
+--------+---+---+---+---+----+---+----+



In [12]:
df.groupBy('Survived').pivot('Embarked').count().show()

+--------+----+---+---+---+
|Survived|null|  C|  Q|  S|
+--------+----+---+---+---+
|       1|   2| 93| 30|217|
|       0|null| 75| 47|427|
+--------+----+---+---+---+



In [13]:
for col in df.columns:
    print(col, '\t\t', df.filter(df[col].isNull()).count())

PassengerId 		 0
Survived 		 0
Pclass 		 0
Name 		 0
Sex 		 0
Age 		 177
SibSp 		 0
Parch 		 0
Ticket 		 0
Fare 		 0
Cabin 		 687
Embarked 		 2


In [14]:
df = df.fillna({'Embarked': 'S'})

In [15]:
df = df.withColumn('FamilySize', df['Parch'] + df['SibSp']).\
            drop('Parch', 'SibSp')

In [16]:
df = df.drop('PassengerID', 'Age', 'Cabin', 'Name', 'Ticket')

In [17]:
df.show(4)

+--------+------+------+-------+--------+----------+
|Survived|Pclass|   Sex|   Fare|Embarked|FamilySize|
+--------+------+------+-------+--------+----------+
|       0|     3|  male|   7.25|       S|         1|
|       1|     1|female|71.2833|       C|         1|
|       1|     3|female|  7.925|       S|         0|
|       1|     1|female|   53.1|       S|         1|
+--------+------+------+-------+--------+----------+
only showing top 4 rows



In [18]:
for col in df.columns:
    print(col, '\t\t', df.filter(df[col].isNull()).count())

Survived 		 0
Pclass 		 0
Sex 		 0
Fare 		 0
Embarked 		 0
FamilySize 		 0


In [19]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [20]:
stringIndex = StringIndexer(inputCols=['Sex', 'Embarked'], 
                       outputCols=['SexNum', 'EmbNum'])

stringIndex_model = stringIndex.fit(df)

df = stringIndex_model.transform(df).drop('Sex', 'Embarked')
df.show(4)

+--------+------+-------+----------+------+------+
|Survived|Pclass|   Fare|FamilySize|SexNum|EmbNum|
+--------+------+-------+----------+------+------+
|       0|     3|   7.25|         1|   0.0|   0.0|
|       1|     1|71.2833|         1|   1.0|   1.0|
|       1|     3|  7.925|         0|   1.0|   0.0|
|       1|     1|   53.1|         1|   1.0|   0.0|
+--------+------+-------+----------+------+------+
only showing top 4 rows



In [21]:
vec_asmbl = VectorAssembler(inputCols=df.columns[1:], 
                           outputCol='features')

df = vec_asmbl.transform(df).select('features', 'Survived')
df.show(4, truncate=False)

+-------------------------+--------+
|features                 |Survived|
+-------------------------+--------+
|[3.0,7.25,1.0,0.0,0.0]   |0       |
|[1.0,71.2833,1.0,1.0,1.0]|1       |
|[3.0,7.925,0.0,1.0,0.0]  |1       |
|[1.0,53.1,1.0,1.0,0.0]   |1       |
+-------------------------+--------+
only showing top 4 rows



In [22]:
train_df, test_df = df.randomSplit([0.7, 0.3])

In [23]:
train_df.show(4, truncate=False)

+-------------+--------+
|features     |Survived|
+-------------+--------+
|(5,[0],[1.0])|0       |
|(5,[0],[1.0])|0       |
|(5,[0],[2.0])|0       |
|(5,[0],[2.0])|0       |
+-------------+--------+
only showing top 4 rows



The regularization function is given by 

$$ \alpha (\lambda | {\bf{w}} |_1) + (1 - \alpha) \left(\frac\lambda2 |{\bf{w}}|_2^2 \right) $$

In spark API $\alpha$ is eleasticNetParam and $\lambda$ is regParam.

In [24]:
ridge = LogisticRegression(labelCol='Survived', 
                        maxIter=100, 
                        elasticNetParam=0, # Ridge regression is choosen 
                        regParam=0.3)

model = ridge.fit(train_df)
pred = model.evaluate(test_df)
pred.accuracy

0.8282442748091603

In [25]:
lasso = LogisticRegression(labelCol='Survived', 
                           maxIter=100,
                           elasticNetParam=1, # Lasso
                           regParam=0.0003)

model = lasso.fit(train_df)
pred = model.evaluate(test_df)
pred.accuracy

0.8015267175572519

In [26]:
evaluator = MulticlassClassificationEvaluator(labelCol='Survived', 
                                          metricName='accuracy')

In [27]:
rf = RandomForestClassifier(labelCol='Survived', 
                           numTrees=100, maxDepth=3)

model = rf.fit(train_df)
pred = model.transform(test_df)
evaluator.evaluate(pred)

0.8129770992366412

In [28]:
gb = GBTClassifier(labelCol='Survived', maxIter=100, maxDepth=4)

model = gb.fit(train_df)
pred = model.transform(test_df)
evaluator.evaluate(pred)

0.851145038167939