In [1]:
from  pyspark.sql import SparkSession
import  databricks.koalas as ks
ss = SparkSession.builder.appName("titanic").getOrCreate()

In [2]:
titanic_kdf = ks.read_csv("titanic_dataset.csv")

In [3]:
titanic_kdf.shape

(891, 12)

In [4]:
titanic_kdf.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
titanic_kdf.dtypes

PassengerId      int32
Survived         int32
Pclass           int32
Name            object
Sex             object
Age            float64
SibSp            int32
Parch            int32
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [6]:
titanic_kdf.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.0,0.0,2.0,20.0,0.0,0.0,7.8958
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,669.0,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [7]:
# drop passengerId 
titanic_kdf = titanic_kdf.drop(['PassengerId'],axis=1)

In [8]:
ss

In [9]:
titanic_kdf.isna().sum()

Survived      0
Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       687
Embarked      2
dtype: int64

In [10]:
titanic_kdf.Pclass.value_counts()

3    491
1    216
2    184
Name: Pclass, dtype: int64

In [11]:
titanic_kdf.SibSp.value_counts()

0    608
1    209
2     28
4     18
3     16
8      7
5      5
Name: SibSp, dtype: int64

In [12]:
titanic_kdf.Cabin.value_counts()

C23 C25 C27        4
B96 B98            4
G6                 4
E101               3
C22 C26            3
F33                3
F2                 3
D                  3
C78                2
F G73              2
B22                2
E8                 2
C124               2
D20                2
B28                2
E67                2
B20                2
E25                2
E33                2
D17                2
C52                2
C123               2
D33                2
F4                 2
D35                2
E24                2
C92                2
B77                2
C65                2
E44                2
B18                2
C126               2
B5                 2
C2                 2
E121               2
C125               2
C83                2
B58 B60            2
B35                2
B49                2
C93                2
C68                2
B51 B53 B55        2
D36                2
D26                2
B57 B59 B63 B66    2
A23                1
B79          

In [13]:
# drop cabin 
titanic_kdf = titanic_kdf.drop(['Cabin'],axis=1)

In [35]:
# ks.ml.VectorAssembler

In [14]:
titanic_kdf.columns

Index(['Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket',
       'Fare', 'Embarked'],
      dtype='object')

#### Lets consider only three columns for simplicity
sex,Age,Fare

In [17]:
titanic_kdf[['Sex','Age','Fare']].isnull().sum()

Sex       0
Age     177
Fare      0
dtype: int64

In [21]:
# age has missing values , lets fill those values by mean of the data 
titanic_kdf['Age'] = titanic_kdf.Age.fillna(titanic_kdf.Age.mean())

#### To train classification model

In [24]:
# converting into Spark Dataframe for further processing
titanic_spark_df = titanic_kdf.to_spark()

In [35]:
# Train test split
splits = titanic_spark_df.randomSplit([0.8,0.2])
df_train = splits[0]
df_test = splits[1]

In [43]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier

In [37]:
indexer = StringIndexer(inputCol="Sex", outputCol="Sex_index")
fit_indexer = indexer.fit(df_train)
df_train = fit_indexer.transform(df_train)

In [38]:
encoder = OneHotEncoder(inputCol="Sex_index", outputCol="SexOHE")
df_train = encoder.transform(df_train)

In [39]:
vectorAssembler = VectorAssembler(inputCols=[
                                    "SexOHE","Age","Fare"],
                                  outputCol="features")

In [40]:
df_train = vectorAssembler.transform(df_train)

In [41]:
df_train.head(1)

[Row(Survived=0, Pclass=1, Name='Allison, Miss. Helen Loraine', Sex='female', Age=2.0, SibSp=1, Parch=2, Ticket='113781', Fare=151.55, Embarked='S', Sex_index=1.0, SexOHE=SparseVector(1, {}), features=DenseVector([0.0, 2.0, 151.55]))]

In [44]:
rf = RandomForestClassifier(labelCol="Survived", numTrees=10)

In [45]:
model = rf.fit(df_train)

In [47]:
# prediction on test set 
df_test = fit_indexer.transform(df_test)
df_test = encoder.transform(df_test)
df_test = vectorAssembler.transform(df_test)

In [48]:
df_test.head(1)

[Row(Survived=0, Pclass=1, Name='Carrau, Mr. Francisco M', Sex='male', Age=28.0, SibSp=0, Parch=0, Ticket='113059', Fare=47.1, Embarked='S', Sex_index=0.0, SexOHE=SparseVector(1, {0: 1.0}), features=DenseVector([1.0, 28.0, 47.1]))]

In [49]:
predictions = model.transform(df_test)

In [51]:
predictions.head(1)

[Row(Survived=0, Pclass=1, Name='Carrau, Mr. Francisco M', Sex='male', Age=28.0, SibSp=0, Parch=0, Ticket='113059', Fare=47.1, Embarked='S', Sex_index=0.0, SexOHE=SparseVector(1, {0: 1.0}), features=DenseVector([1.0, 28.0, 47.1]), rawPrediction=DenseVector([6.6854, 3.3146]), probability=DenseVector([0.6685, 0.3315]), prediction=0.0)]

In [52]:
def classification_metrics(prediction):
    from pyspark.ml.evaluation import BinaryClassificationEvaluator
    mcEval =BinaryClassificationEvaluator(labelCol='Survived')
    accuracy = mcEval.evaluate(prediction)
    print("Accuracy on test data = %g" % accuracy)

In [53]:
classification_metrics(predictions)

Accuracy on test data = 0.859442


#### use pipeline feature

In [56]:
from pyspark.ml import Pipeline
pipeline = Pipeline(stages=[indexer,encoder,vectorAssembler,rf])

In [58]:
# Train test split again so that we can refresh the variable
splits = titanic_spark_df.randomSplit([0.8,0.2])
df_train = splits[0]
df_test = splits[1]

In [59]:
model = pipeline.fit(df_train)

In [60]:
prediction =model.transform(df_test)

In [61]:
prediction.head()

Row(Survived=0, Pclass=1, Name='Allison, Miss. Helen Loraine', Sex='female', Age=2.0, SibSp=1, Parch=2, Ticket='113781', Fare=151.55, Embarked='S', Sex_index=1.0, SexOHE=SparseVector(1, {}), features=DenseVector([0.0, 2.0, 151.55]), rawPrediction=DenseVector([1.6265, 8.3735]), probability=DenseVector([0.1627, 0.8373]), prediction=1.0)

In [62]:
classification_metrics(prediction)

Accuracy on test data = 0.851756
