In [17]:
# import necessary libraries
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.sql.functions import mean,col,split, col, regexp_extract, when, lit
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import QuantileDiscretizer

# create sparksession
spark = SparkSession \
    .builder \
    .appName("Spark ML example on titanic data") \
    .getOrCreate()

In [18]:
spark

In [19]:
titanic_df = spark.read.json('data_titanic.json')

In [20]:
titanic_df.printSchema()

root
 |-- age: double (nullable = true)
 |-- boat: string (nullable = true)
 |-- body: long (nullable = true)
 |-- cabin: string (nullable = true)
 |-- embarked: string (nullable = true)
 |-- fare: double (nullable = true)
 |-- homeDest: string (nullable = true)
 |-- name: string (nullable = true)
 |-- parch: long (nullable = true)
 |-- pclass: long (nullable = true)
 |-- sex: string (nullable = true)
 |-- sibsp: long (nullable = true)
 |-- survived: long (nullable = true)
 |-- ticket: string (nullable = true)



In [23]:
titanic_df.count()

1309

In [24]:
titanic_df.show(5)

+----+----+----+-------+--------+-------+--------------------+--------------------+-----+------+------+-----+--------+--------+
| age|boat|body|  cabin|embarked|   fare|            homeDest|                name|parch|pclass|   sex|sibsp|survived|  ticket|
+----+----+----+-------+--------+-------+--------------------+--------------------+-----+------+------+-----+--------+--------+
| 2.0|    |null|C22 C26|       S| 151.55|Montreal, PQ / Ch...|Allison, Miss. He...|    2|     1|female|    1|       0|  113781|
|30.0|    | 135|C22 C26|       S| 151.55|Montreal, PQ / Ch...|Allison, Mr. Huds...|    2|     1|  male|    1|       0|  113781|
|25.0|    |null|C22 C26|       S| 151.55|Montreal, PQ / Ch...|Allison, Mrs. Hud...|    2|     1|female|    1|       0|  113781|
|39.0|    |null|    A36|       S|    0.0|         Belfast, NI|Andrews, Mr. Thom...|    0|     1|  male|    0|       0|  112050|
|71.0|    |  22|       |       C|49.5042| Montevideo, Uruguay|Artagaveytia, Mr....|    0|     1|  male| 

In [25]:
titanic_df.describe().show()

+-------+------------------+-----------------+-----------------+-----+--------+------------------+-------------------+--------------------+------------------+------------------+------+------------------+------------------+------------------+
|summary|               age|             boat|             body|cabin|embarked|              fare|           homeDest|                name|             parch|            pclass|   sex|             sibsp|          survived|            ticket|
+-------+------------------+-----------------+-----------------+-----+--------+------------------+-------------------+--------------------+------------------+------------------+------+------------------+------------------+------------------+
|  count|              1309|             1309|              121| 1309|    1309|              1309|               1309|                1309|              1309|              1309|  1309|              1309|              1309|              1309|
|   mean| 29.50318311688312|9.40

Exploratory Data Analysis (EDA)

In [28]:
titanic_df.groupBy("Survived").count().show()

+--------+-----+
|Survived|count|
+--------+-----+
|       0|  809|
|       1|  500|
+--------+-----+



In [29]:
gropuBy_output = titanic_df.groupBy("Survived").count()

In [30]:
titanic_df.groupBy("Sex","Survived").count().show()

+------+--------+-----+
|   Sex|Survived|count|
+------+--------+-----+
|  male|       0|  682|
|female|       1|  339|
|female|       0|  127|
|  male|       1|  161|
+------+--------+-----+



In [31]:
titanic_df.groupBy("Pclass","Survived").count().show()

+------+--------+-----+
|Pclass|Survived|count|
+------+--------+-----+
|     3|       0|  528|
|     1|       0|  123|
|     1|       1|  200|
|     2|       0|  158|
|     2|       1|  119|
|     3|       1|  181|
+------+--------+-----+



Here it can be seen that the Pclass1 people were given priority to pclass3 people, even though
We can clearly see that Passenegers Of Pclass 1 were given a very high priority while rescue. Even though the the number of Passengers in Pclass 3 were a lot higher, still the number of survival from them is very low.

In [32]:
# This function use to print feature with null values and null count 
def null_value_count(df):
  null_columns_counts = []
  numRows = df.count()
  for k in df.columns:
    nullRows = df.where(col(k).isNull()).count()
    if(nullRows > 0):
      temp = k,nullRows
      null_columns_counts.append(temp)
  return(null_columns_counts)

In [33]:
null_columns_count_list = null_value_count(titanic_df)

In [34]:
spark.createDataFrame(null_columns_count_list, ['Column_With_Null_Value', 'Null_Values_Count']).show()

+----------------------+-----------------+
|Column_With_Null_Value|Null_Values_Count|
+----------------------+-----------------+
|                  body|             1188|
+----------------------+-----------------+



In [35]:
mean_age = titanic_df.select(mean('Age')).collect()[0][0]
print(mean_age)

29.50318311688312


In [36]:
titanic_df.select("Name").show()

+--------------------+
|                Name|
+--------------------+
|Allison, Miss. He...|
|Allison, Mr. Huds...|
|Allison, Mrs. Hud...|
|Andrews, Mr. Thom...|
|Artagaveytia, Mr....|
|Astor, Col. John ...|
| Baumann, Mr. John D|
|Baxter, Mr. Quigg...|
|Beattie, Mr. Thomson|
| Birnbaum, Mr. Jakob|
|Blackwell, Mr. St...|
|Borebank, Mr. Joh...|
|Brady, Mr. John B...|
|  Brandeis, Mr. Emil|
|Brewe, Dr. Arthur...|
|Butt, Major. Arch...|
|Cairns, Mr. Alexa...|
|Carlsson, Mr. Fra...|
|Carrau, Mr. Franc...|
|Carrau, Mr. Jose ...|
+--------------------+
only showing top 20 rows



To replace these NaN values, we can assign them the mean age of the dataset.But the problem is, there were many people with many different ages. We just cant assign a 4 year kid with the mean age that is 29 years.
we can check the Name feature. Looking upon the feature, we can see that the names have a salutation like Mr or Mrs. Thus we can assign the mean values of Mr and Mrs to the respective groups

In [37]:
titanic_df = titanic_df.withColumn("Initial",regexp_extract(col("Name"),"([A-Za-z]+)\.",1))

Using the Regex ""[A-Za-z]+)." we extract the initials from the Name. It looks for strings which lie between A-Z or a-z and followed by a .(dot).

In [38]:
titanic_df.show()

+----+----+----+-----------+--------+--------+--------------------+--------------------+-----+------+------+-----+--------+--------+-------+
| age|boat|body|      cabin|embarked|    fare|            homeDest|                name|parch|pclass|   sex|sibsp|survived|  ticket|Initial|
+----+----+----+-----------+--------+--------+--------------------+--------------------+-----+------+------+-----+--------+--------+-------+
| 2.0|    |null|    C22 C26|       S|  151.55|Montreal, PQ / Ch...|Allison, Miss. He...|    2|     1|female|    1|       0|  113781|   Miss|
|30.0|    | 135|    C22 C26|       S|  151.55|Montreal, PQ / Ch...|Allison, Mr. Huds...|    2|     1|  male|    1|       0|  113781|     Mr|
|25.0|    |null|    C22 C26|       S|  151.55|Montreal, PQ / Ch...|Allison, Mrs. Hud...|    2|     1|female|    1|       0|  113781|    Mrs|
|39.0|    |null|        A36|       S|     0.0|         Belfast, NI|Andrews, Mr. Thom...|    0|     1|  male|    0|       0|  112050|     Mr|
|71.0|    |  

In [39]:
titanic_df.select("Initial").distinct().show()

+--------+
| Initial|
+--------+
|    Dona|
|     Don|
|    Miss|
|Countess|
|     Col|
|     Rev|
|    Lady|
|  Master|
|    Capt|
|     Mme|
|      Mr|
|      Dr|
|     Mrs|
|     Sir|
|Jonkheer|
|    Mlle|
|   Major|
|      Ms|
+--------+



There are some misspelled Initials like Mlle or Mme that stand for Miss. I will replace them with Miss and same thing for other values.

In [40]:
titanic_df = titanic_df.replace(['Mlle','Mme', 'Ms', 'Dr','Major','Lady','Countess','Jonkheer','Col','Rev','Capt','Sir','Don'],
               ['Miss','Miss','Miss','Mr','Mr',  'Mrs',  'Mrs',  'Other',  'Other','Other','Mr','Mr','Mr'])

In [41]:
titanic_df.select("Initial").distinct().show()

+-------+
|Initial|
+-------+
|   Dona|
|   Miss|
|  Other|
| Master|
|     Mr|
|    Mrs|
+-------+



lets check the average age by Initials

In [42]:
titanic_df.groupby('Initial').avg('Age').collect()

[Row(Initial='Dona', avg(Age)=39.0),
 Row(Initial='Miss', avg(Age)=23.021069433962264),
 Row(Initial='Other', avg(Age)=44.92307692307692),
 Row(Initial='Master', avg(Age)=8.435791803278688),
 Row(Initial='Mr', avg(Age)=31.50064935064935),
 Row(Initial='Mrs', avg(Age)=35.80904522613066)]

Let's impute missing values in age feature based on average age of Initials

In [43]:
titanic_df = titanic_df.withColumn("Age",when((titanic_df["Initial"] == "Miss") & (titanic_df["Age"].isNull()), 22).otherwise(titanic_df["Age"]))
titanic_df = titanic_df.withColumn("Age",when((titanic_df["Initial"] == "Other") & (titanic_df["Age"].isNull()), 46).otherwise(titanic_df["Age"]))
titanic_df = titanic_df.withColumn("Age",when((titanic_df["Initial"] == "Master") & (titanic_df["Age"].isNull()), 5).otherwise(titanic_df["Age"]))
titanic_df = titanic_df.withColumn("Age",when((titanic_df["Initial"] == "Mr") & (titanic_df["Age"].isNull()), 33).otherwise(titanic_df["Age"]))
titanic_df = titanic_df.withColumn("Age",when((titanic_df["Initial"] == "Mrs") & (titanic_df["Age"].isNull()), 36).otherwise(titanic_df["Age"]))

Check the imputation

In [44]:
titanic_df.filter(titanic_df.Age==46).select("Initial").show()

+-------+
|Initial|
+-------+
|     Mr|
|     Mr|
|     Mr|
|     Mr|
|     Mr|
|     Mr|
+-------+



In [45]:
titanic_df.select("Age").show()

+----+
| Age|
+----+
| 2.0|
|30.0|
|25.0|
|39.0|
|71.0|
|47.0|
|28.0|
|24.0|
|36.0|
|25.0|
|45.0|
|42.0|
|41.0|
|48.0|
|28.0|
|45.0|
|28.0|
|33.0|
|28.0|
|17.0|
+----+
only showing top 20 rows



Embarked feature has only two missining values. Let's check values within Embarked

In [47]:
titanic_df.groupBy("Embarked").count().show()

+--------+-----+
|Embarked|count|
+--------+-----+
|       Q|  123|
|       C|  270|
|       S|  914|
|        |    2|
+--------+-----+



Majority Passengers boarded from "S". We can impute with "S"

In [48]:
titanic_df = titanic_df.na.fill({"Embarked" : 'S'})

We can drop Cabin features as it has lots of null values

In [49]:
titanic_df = titanic_df.drop("Cabin")

In [50]:
titanic_df.printSchema()

root
 |-- Age: double (nullable = true)
 |-- boat: string (nullable = true)
 |-- body: long (nullable = true)
 |-- embarked: string (nullable = false)
 |-- fare: double (nullable = true)
 |-- homeDest: string (nullable = true)
 |-- name: string (nullable = true)
 |-- parch: long (nullable = true)
 |-- pclass: long (nullable = true)
 |-- sex: string (nullable = true)
 |-- sibsp: long (nullable = true)
 |-- survived: long (nullable = true)
 |-- ticket: string (nullable = true)
 |-- Initial: string (nullable = true)



We can create a new feature called "Family_size" and "Alone" and analyse it. This feature is the summation of Parch(parents/children) and SibSp(siblings/spouses). It gives us a combined data so that we can check if survival rate have anything to do with family size of the passengers

In [51]:
titanic_df = titanic_df.withColumn("Family_Size",col('SibSp')+col('Parch'))

In [52]:
titanic_df.groupBy("Family_Size").count().show()

+-----------+-----+
|Family_Size|count|
+-----------+-----+
|          0|  790|
|          7|    8|
|          6|   16|
|          5|   25|
|          1|  235|
|         10|   11|
|          3|   43|
|          2|  159|
|          4|   22|
+-----------+-----+



In [53]:
titanic_df = titanic_df.withColumn('Alone',lit(0))

In [54]:
titanic_df = titanic_df.withColumn("Alone",when(titanic_df["Family_Size"] == 0, 1).otherwise(titanic_df["Alone"]))

In [55]:
titanic_df.columns

['Age',
 'boat',
 'body',
 'embarked',
 'fare',
 'homeDest',
 'name',
 'parch',
 'pclass',
 'sex',
 'sibsp',
 'survived',
 'ticket',
 'Initial',
 'Family_Size',
 'Alone']

Lets convert Sex, Embarked & Initial columns from string to number using StringIndexer

In [57]:
indexers = [StringIndexer(inputCol=column, outputCol=column+"_index").fit(titanic_df) for column in ["sex","embarked","Initial"]]
pipeline = Pipeline(stages=indexers)
titanic_df = pipeline.fit(titanic_df).transform(titanic_df)

In [58]:
titanic_df.show()

+----+----+----+--------+--------+--------------------+--------------------+-----+------+------+-----+--------+--------+-------+-----------+-----+---------+--------------+-------------+
| Age|boat|body|embarked|    fare|            homeDest|                name|parch|pclass|   sex|sibsp|survived|  ticket|Initial|Family_Size|Alone|sex_index|embarked_index|Initial_index|
+----+----+----+--------+--------+--------------------+--------------------+-----+------+------+-----+--------+--------+-------+-----------+-----+---------+--------------+-------------+
| 2.0|    |null|       S|  151.55|Montreal, PQ / Ch...|Allison, Miss. He...|    2|     1|female|    1|       0|  113781|   Miss|          3|    0|      1.0|           0.0|          1.0|
|30.0|    | 135|       S|  151.55|Montreal, PQ / Ch...|Allison, Mr. Huds...|    2|     1|  male|    1|       0|  113781|     Mr|          3|    0|      0.0|           0.0|          0.0|
|25.0|    |null|       S|  151.55|Montreal, PQ / Ch...|Allison, Mrs. H

In [59]:
titanic_df.printSchema()

root
 |-- Age: double (nullable = true)
 |-- boat: string (nullable = true)
 |-- body: long (nullable = true)
 |-- embarked: string (nullable = false)
 |-- fare: double (nullable = true)
 |-- homeDest: string (nullable = true)
 |-- name: string (nullable = true)
 |-- parch: long (nullable = true)
 |-- pclass: long (nullable = true)
 |-- sex: string (nullable = true)
 |-- sibsp: long (nullable = true)
 |-- survived: long (nullable = true)
 |-- ticket: string (nullable = true)
 |-- Initial: string (nullable = true)
 |-- Family_Size: long (nullable = true)
 |-- Alone: integer (nullable = false)
 |-- sex_index: double (nullable = false)
 |-- embarked_index: double (nullable = false)
 |-- Initial_index: double (nullable = false)



Drop Columns not required

In [71]:
titanic_df = titanic_df.drop("PassengerId","Name","Ticket","Cabin","Embarked","Sex","Initial", "Boat", "homeDest", "body")

In [72]:
titanic_df.show()

+----+--------+-----+------+-----+--------+-----------+-----+---------+--------------+-------------+
| Age|    fare|parch|pclass|sibsp|survived|Family_Size|Alone|sex_index|embarked_index|Initial_index|
+----+--------+-----+------+-----+--------+-----------+-----+---------+--------------+-------------+
| 2.0|  151.55|    2|     1|    1|       0|          3|    0|      1.0|           0.0|          1.0|
|30.0|  151.55|    2|     1|    1|       0|          3|    0|      0.0|           0.0|          0.0|
|25.0|  151.55|    2|     1|    1|       0|          3|    0|      1.0|           0.0|          2.0|
|39.0|     0.0|    0|     1|    0|       0|          0|    1|      0.0|           0.0|          0.0|
|71.0| 49.5042|    0|     1|    0|       0|          0|    1|      0.0|           1.0|          0.0|
|47.0| 227.525|    0|     1|    1|       0|          1|    0|      0.0|           1.0|          4.0|
|28.0|  25.925|    0|     1|    0|       0|          0|    1|      0.0|           0.0|     

Let's put all features into vector

In [73]:
feature = VectorAssembler(inputCols=titanic_df.columns[1:],outputCol="features")
feature_vector= feature.transform(titanic_df)

In [74]:
feature_vector.show()

+----+--------+-----+------+-----+--------+-----------+-----+---------+--------------+-------------+--------------------+
| Age|    fare|parch|pclass|sibsp|survived|Family_Size|Alone|sex_index|embarked_index|Initial_index|            features|
+----+--------+-----+------+-----+--------+-----------+-----+---------+--------------+-------------+--------------------+
| 2.0|  151.55|    2|     1|    1|       0|          3|    0|      1.0|           0.0|          1.0|[151.55,2.0,1.0,1...|
|30.0|  151.55|    2|     1|    1|       0|          3|    0|      0.0|           0.0|          0.0|(10,[0,1,2,3,5],[...|
|25.0|  151.55|    2|     1|    1|       0|          3|    0|      1.0|           0.0|          2.0|[151.55,2.0,1.0,1...|
|39.0|     0.0|    0|     1|    0|       0|          0|    1|      0.0|           0.0|          0.0|(10,[2,6],[1.0,1.0])|
|71.0| 49.5042|    0|     1|    0|       0|          0|    1|      0.0|           1.0|          0.0|(10,[0,2,6,8],[49...|
|47.0| 227.525|    0|   

Now that the data is all set, let's split it into training and test. I'll be using 80% of it.

In [75]:
(trainingData, testData) = feature_vector.randomSplit([0.8, 0.2],seed = 11)

Modelling
----------------

Here is the list of few Classification Algorithms from Spark ML
LogisticRegression

DecisionTreeClassifier

RandomForestClassifier

Gradient-boosted tree classifier

NaiveBayes

Support Vector Machine

LogisticRegression
------------------

In [79]:
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(labelCol="survived", featuresCol="features")
#Training algo
lrModel = lr.fit(trainingData)
lr_prediction = lrModel.transform(testData)
lr_prediction.select("prediction", "survived", "features").show()
evaluator = MulticlassClassificationEvaluator(labelCol="survived", predictionCol="prediction", metricName="accuracy")

+----------+--------+--------------------+
|prediction|survived|            features|
+----------+--------+--------------------+
|       1.0|       1|[14.5,1.0,2.0,1.0...|
|       1.0|       1|[29.0,2.0,2.0,0.0...|
|       1.0|       1|[27.75,2.0,2.0,1....|
|       1.0|       1|[151.55,2.0,1.0,1...|
|       0.0|       0|[46.9,2.0,3.0,5.0...|
|       1.0|       1|[12.2875,1.0,3.0,...|
|       1.0|       1|[31.3875,2.0,3.0,...|
|       1.0|       1|[11.1333,1.0,3.0,...|
|       1.0|       1|[13.4167,1.0,3.0,...|
|       0.0|       0|[29.125,1.0,3.0,4...|
|       0.0|       0|[31.275,2.0,3.0,4...|
|       0.0|       0|[21.075,1.0,3.0,3...|
|       0.0|       0|[31.3875,2.0,3.0,...|
|       0.0|       0|[46.9,2.0,3.0,5.0...|
|       0.0|       0|[46.9,2.0,3.0,5.0...|
|       0.0|       0|[31.275,2.0,3.0,4...|
|       0.0|       0|[14.5,1.0,3.0,1.0...|
|       1.0|       1|[11.2417,0.0,3.0,...|
|       0.0|       0|(10,[0,2,6],[65.0...|
|       0.0|       0|[14.4542,0.0,3.0,...|
+----------

Evaluating accuracy of LogisticRegression.

In [80]:
lr_accuracy = evaluator.evaluate(lr_prediction)
print("Accuracy of LogisticRegression is = %g"% (lr_accuracy))
print("Test Error of LogisticRegression = %g " % (1.0 - lr_accuracy))

Accuracy of LogisticRegression is = 1
Test Error of LogisticRegression = 0 


DecisionTreeClassifier
----------------------

In [82]:
from pyspark.ml.classification import DecisionTreeClassifier
dt = DecisionTreeClassifier(labelCol="survived", featuresCol="features")
dt_model = dt.fit(trainingData)
dt_prediction = dt_model.transform(testData)
dt_prediction.select("prediction", "survived", "features").show()

+----------+--------+--------------------+
|prediction|survived|            features|
+----------+--------+--------------------+
|       1.0|       1|[14.5,1.0,2.0,1.0...|
|       1.0|       1|[29.0,2.0,2.0,0.0...|
|       1.0|       1|[27.75,2.0,2.0,1....|
|       1.0|       1|[151.55,2.0,1.0,1...|
|       0.0|       0|[46.9,2.0,3.0,5.0...|
|       1.0|       1|[12.2875,1.0,3.0,...|
|       1.0|       1|[31.3875,2.0,3.0,...|
|       1.0|       1|[11.1333,1.0,3.0,...|
|       1.0|       1|[13.4167,1.0,3.0,...|
|       0.0|       0|[29.125,1.0,3.0,4...|
|       0.0|       0|[31.275,2.0,3.0,4...|
|       0.0|       0|[21.075,1.0,3.0,3...|
|       0.0|       0|[31.3875,2.0,3.0,...|
|       0.0|       0|[46.9,2.0,3.0,5.0...|
|       0.0|       0|[46.9,2.0,3.0,5.0...|
|       0.0|       0|[31.275,2.0,3.0,4...|
|       0.0|       0|[14.5,1.0,3.0,1.0...|
|       1.0|       1|[11.2417,0.0,3.0,...|
|       0.0|       0|(10,[0,2,6],[65.0...|
|       0.0|       0|[14.4542,0.0,3.0,...|
+----------

Evaluating accuracy of DecisionTreeClassifier.

In [83]:
dt_accuracy = evaluator.evaluate(dt_prediction)
print("Accuracy of DecisionTreeClassifier is = %g"% (dt_accuracy))
print("Test Error of DecisionTreeClassifier = %g " % (1.0 - dt_accuracy))

Accuracy of DecisionTreeClassifier is = 1
Test Error of DecisionTreeClassifier = 0 


RandomForestClassifier
----------------------

In [85]:
from pyspark.ml.classification import RandomForestClassifier
rf = DecisionTreeClassifier(labelCol="survived", featuresCol="features")
rf_model = rf.fit(trainingData)
rf_prediction = rf_model.transform(testData)
rf_prediction.select("prediction", "survived", "features").show()

+----------+--------+--------------------+
|prediction|survived|            features|
+----------+--------+--------------------+
|       1.0|       1|[14.5,1.0,2.0,1.0...|
|       1.0|       1|[29.0,2.0,2.0,0.0...|
|       1.0|       1|[27.75,2.0,2.0,1....|
|       1.0|       1|[151.55,2.0,1.0,1...|
|       0.0|       0|[46.9,2.0,3.0,5.0...|
|       1.0|       1|[12.2875,1.0,3.0,...|
|       1.0|       1|[31.3875,2.0,3.0,...|
|       1.0|       1|[11.1333,1.0,3.0,...|
|       1.0|       1|[13.4167,1.0,3.0,...|
|       0.0|       0|[29.125,1.0,3.0,4...|
|       0.0|       0|[31.275,2.0,3.0,4...|
|       0.0|       0|[21.075,1.0,3.0,3...|
|       0.0|       0|[31.3875,2.0,3.0,...|
|       0.0|       0|[46.9,2.0,3.0,5.0...|
|       0.0|       0|[46.9,2.0,3.0,5.0...|
|       0.0|       0|[31.275,2.0,3.0,4...|
|       0.0|       0|[14.5,1.0,3.0,1.0...|
|       1.0|       1|[11.2417,0.0,3.0,...|
|       0.0|       0|(10,[0,2,6],[65.0...|
|       0.0|       0|[14.4542,0.0,3.0,...|
+----------

Evaluating accuracy of RandomForestClassifier.

In [86]:
rf_accuracy = evaluator.evaluate(rf_prediction)
print("Accuracy of RandomForestClassifier is = %g"% (rf_accuracy))
print("Test Error of RandomForestClassifier  = %g " % (1.0 - rf_accuracy))

Accuracy of RandomForestClassifier is = 1
Test Error of RandomForestClassifier  = 0 


Gradient-boosted tree classifier

In [87]:
from pyspark.ml.classification import GBTClassifier
gbt = GBTClassifier(labelCol="survived", featuresCol="features",maxIter=10)
gbt_model = gbt.fit(trainingData)
gbt_prediction = gbt_model.transform(testData)
gbt_prediction.select("prediction", "survived", "features").show()

+----------+--------+--------------------+
|prediction|survived|            features|
+----------+--------+--------------------+
|       1.0|       1|[14.5,1.0,2.0,1.0...|
|       1.0|       1|[29.0,2.0,2.0,0.0...|
|       1.0|       1|[27.75,2.0,2.0,1....|
|       1.0|       1|[151.55,2.0,1.0,1...|
|       0.0|       0|[46.9,2.0,3.0,5.0...|
|       1.0|       1|[12.2875,1.0,3.0,...|
|       1.0|       1|[31.3875,2.0,3.0,...|
|       1.0|       1|[11.1333,1.0,3.0,...|
|       1.0|       1|[13.4167,1.0,3.0,...|
|       0.0|       0|[29.125,1.0,3.0,4...|
|       0.0|       0|[31.275,2.0,3.0,4...|
|       0.0|       0|[21.075,1.0,3.0,3...|
|       0.0|       0|[31.3875,2.0,3.0,...|
|       0.0|       0|[46.9,2.0,3.0,5.0...|
|       0.0|       0|[46.9,2.0,3.0,5.0...|
|       0.0|       0|[31.275,2.0,3.0,4...|
|       0.0|       0|[14.5,1.0,3.0,1.0...|
|       1.0|       1|[11.2417,0.0,3.0,...|
|       0.0|       0|(10,[0,2,6],[65.0...|
|       0.0|       0|[14.4542,0.0,3.0,...|
+----------

Evaluate accuracy of Gradient-boosted.

In [88]:
gbt_accuracy = evaluator.evaluate(gbt_prediction)
print("Accuracy of Gradient-boosted tree classifie is = %g"% (gbt_accuracy))
print("Test Error of Gradient-boosted tree classifie %g"% (1.0 - gbt_accuracy))

Accuracy of Gradient-boosted tree classifie is = 1
Test Error of Gradient-boosted tree classifie 0


NaiveBayes
------------

In [90]:
from pyspark.ml.classification import NaiveBayes
nb = NaiveBayes(labelCol="survived", featuresCol="features")
nb_model = nb.fit(trainingData)
nb_prediction = nb_model.transform(testData)
nb_prediction.select("prediction", "survived", "features").show()

+----------+--------+--------------------+
|prediction|survived|            features|
+----------+--------+--------------------+
|       1.0|       1|[14.5,1.0,2.0,1.0...|
|       1.0|       1|[29.0,2.0,2.0,0.0...|
|       1.0|       1|[27.75,2.0,2.0,1....|
|       1.0|       1|[151.55,2.0,1.0,1...|
|       0.0|       0|[46.9,2.0,3.0,5.0...|
|       1.0|       1|[12.2875,1.0,3.0,...|
|       0.0|       1|[31.3875,2.0,3.0,...|
|       1.0|       1|[11.1333,1.0,3.0,...|
|       1.0|       1|[13.4167,1.0,3.0,...|
|       0.0|       0|[29.125,1.0,3.0,4...|
|       0.0|       0|[31.275,2.0,3.0,4...|
|       0.0|       0|[21.075,1.0,3.0,3...|
|       0.0|       0|[31.3875,2.0,3.0,...|
|       0.0|       0|[46.9,2.0,3.0,5.0...|
|       0.0|       0|[46.9,2.0,3.0,5.0...|
|       0.0|       0|[31.275,2.0,3.0,4...|
|       0.0|       0|[14.5,1.0,3.0,1.0...|
|       1.0|       1|[11.2417,0.0,3.0,...|
|       1.0|       0|(10,[0,2,6],[65.0...|
|       0.0|       0|[14.4542,0.0,3.0,...|
+----------

Evaluating accuracy of NaiveBayes.

In [91]:
nb_accuracy = evaluator.evaluate(nb_prediction)
print("Accuracy of NaiveBayes is  = %g"% (nb_accuracy))
print("Test Error of NaiveBayes  = %g " % (1.0 - nb_accuracy))

Accuracy of NaiveBayes is  = 0.938776
Test Error of NaiveBayes  = 0.0612245 


Support Vector Machine
-----------------------

In [92]:
from pyspark.ml.classification import LinearSVC
svm = LinearSVC(labelCol="survived", featuresCol="features")
svm_model = svm.fit(trainingData)
svm_prediction = svm_model.transform(testData)
svm_prediction.select("prediction", "survived", "features").show()

+----------+--------+--------------------+
|prediction|survived|            features|
+----------+--------+--------------------+
|       1.0|       1|[14.5,1.0,2.0,1.0...|
|       1.0|       1|[29.0,2.0,2.0,0.0...|
|       1.0|       1|[27.75,2.0,2.0,1....|
|       1.0|       1|[151.55,2.0,1.0,1...|
|       0.0|       0|[46.9,2.0,3.0,5.0...|
|       1.0|       1|[12.2875,1.0,3.0,...|
|       1.0|       1|[31.3875,2.0,3.0,...|
|       1.0|       1|[11.1333,1.0,3.0,...|
|       1.0|       1|[13.4167,1.0,3.0,...|
|       0.0|       0|[29.125,1.0,3.0,4...|
|       0.0|       0|[31.275,2.0,3.0,4...|
|       0.0|       0|[21.075,1.0,3.0,3...|
|       0.0|       0|[31.3875,2.0,3.0,...|
|       0.0|       0|[46.9,2.0,3.0,5.0...|
|       0.0|       0|[46.9,2.0,3.0,5.0...|
|       0.0|       0|[31.275,2.0,3.0,4...|
|       0.0|       0|[14.5,1.0,3.0,1.0...|
|       1.0|       1|[11.2417,0.0,3.0,...|
|       0.0|       0|(10,[0,2,6],[65.0...|
|       0.0|       0|[14.4542,0.0,3.0,...|
+----------

Evaluating the accuracy of Support Vector Machine.

In [93]:
svm_accuracy = evaluator.evaluate(svm_prediction)
print("Accuracy of Support Vector Machine is = %g"% (svm_accuracy))
print("Test Error of Support Vector Machine = %g " % (1.0 - svm_accuracy))

Accuracy of Support Vector Machine is = 1
Test Error of Support Vector Machine = 0 
