## Establishing Spark session

In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("Logistic_Regression_Titanic").getOrCreate()

## Reading the file

In [2]:
df = spark.read.csv("titanic.csv", inferSchema = True, header = True)

In [3]:
df.describe().show()

+-------+-----------------+-------------------+------------------+--------------------+------+------------------+------------------+-------------------+------------------+-----------------+-----+--------+
|summary|      PassengerId|           Survived|            Pclass|                Name|   Sex|               Age|             SibSp|              Parch|            Ticket|             Fare|Cabin|Embarked|
+-------+-----------------+-------------------+------------------+--------------------+------+------------------+------------------+-------------------+------------------+-----------------+-----+--------+
|  count|              891|                891|               891|                 891|   891|               714|               891|                891|               891|              891|  204|     889|
|   mean|            446.0| 0.3838383838383838| 2.308641975308642|                null|  null| 29.69911764705882|0.5230078563411896|0.38159371492704824|260318.54916792738| 32.20420

In [4]:
df.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [5]:
df.show()

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| null|       S|
|          6|       0|     3|    Moran, Mr. James|  male|null|    0|    0|      

In [6]:
df.columns

['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

In [7]:
df.head()

Row(PassengerId=1, Survived=0, Pclass=3, Name='Braund, Mr. Owen Harris', Sex='male', Age=22.0, SibSp=1, Parch=0, Ticket='A/5 21171', Fare=7.25, Cabin=None, Embarked='S')

## Checking if the data is balanced/imbalanced

In [8]:
df.groupBy("Survived").count().show()

+--------+-----+
|Survived|count|
+--------+-----+
|       1|  342|
|       0|  549|
+--------+-----+



In [9]:
df_a = df.filter(df['Survived'] == 0)  # 0 is the mayority class
df_b = df.filter(df['Survived'] == 1)  # 1 is the minority class to be oversampled

In [10]:
a_count = df_a.count() # 0 is the majority class
b_count = df_b.count() # 1 is the minority class

print("Survived 0 count: ", a_count)
print("Survived 1 count: ", b_count)

Survived 0 count:  549
Survived 1 count:  342


## Implementing Under sampling

In [11]:
# ratio = minority class count / majority class count:
# ratio = b_count / a_count # a_count: 0 is the majority class and b_count: 1 is the minority class

# print(ratio)

In [12]:
# Undersample for 0. Here we need to decrease the count of majority class:
# majority class dataframe.sample(withReplacement=False, fraction=ratio, seed=1)

# df_a_underampled = df_a.sample(withReplacement=False, fraction=ratio, seed=1) # df_a is the majority class

# df = df_b.union(df_a_underampled) # minority class.union(undersample majority class)

In [13]:
# target_0 = df.filter(df["Survived"]==0).count()
# target_1 = df.filter(df["Survived"]==1).count()

# print("Survived Count for 0: ", target_0)
# print("Survived Count for 1: ", target_1)

## Implementing Over sampling

In [14]:
# ratio = majority class count / minority class count:
ratio = a_count / b_count  # Oversamping for 1  # a_count: 0 is the majority clas and b_count: a is the minority class

print(ratio)

1.605263157894737


In [15]:
#Oversampling for 1, here we are increasing the count of minority class
#minority class dataframe.sample(withReplacement=True, fraction=ratio, seed=1)

df_b_oversampled = df_b.sample(withReplacement=True, fraction=ratio, seed=1) # df_b is the minority class

df = df_a.unionAll(df_b_oversampled) # majority class.union(oversampling minority class)

In [16]:
target_0 = df.filter(df["Survived"]==0).count()
target_1 = df.filter(df["Survived"]==1).count()

print("Survived Count for 0: ", target_0)
print("Survived Count for 1: ", target_1)

Survived Count for 0:  549
Survived Count for 1:  505


## Selecting the required features

In [17]:
my_cols = df.select(["Survived", "Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"])

## Checking for Null in the features

In [18]:
from pyspark.sql.functions import *

In [19]:
my_cols.select([count(when(isnan(x)|col(x).isNull(), x)).alias(x) for x in my_cols.columns]).show()

+--------+------+---+---+-----+-----+----+--------+
|Survived|Pclass|Sex|Age|SibSp|Parch|Fare|Embarked|
+--------+------+---+---+-----+-----+----+--------+
|       0|     0|  0|188|    0|    0|   0|       4|
+--------+------+---+---+-----+-----+----+--------+



## Handling missing values - dropping them

In [20]:
my_final_data = my_cols.na.drop()

## Handling Catergorical data and converting them into Numerical

In [21]:
from pyspark.ml.feature import VectorAssembler, VectorIndexer, OneHotEncoder, StringIndexer

In [22]:
# 1. StringIndexer - This convert string/text into numeric, the input is the actual column which has text data.
# 2. OneHotEncoder - This is use to do One Hot Encoding on the output of String Indexer because string indexer 
#                    will convert the text to numbers only.

## Applying StringIndexer

In [23]:
gender_indexer = StringIndexer(inputCol = "Sex", outputCol = "SexIndex") #Converting Sex column values into numeric

## OneHot Encoding on the output of StringIndexer

In [24]:
gender_encoder = OneHotEncoder(inputCol = "SexIndex", outputCol = "SexVec") #Creating OneHot encoding on string indexer 
                                                                            #output (Sex column value which was 
                                                                            #string indexed)

In [25]:
embark_indexer = StringIndexer(inputCol = "Embarked", outputCol = "EmbarkIndex")

embark_encoder = OneHotEncoder(inputCol = "EmbarkIndex", outputCol = "EmbarkVec")

## Converting all the data in the form of array for Pyspark ML algorithim

In [26]:
assembler = VectorAssembler(inputCols = ["Pclass", "SexVec", "EmbarkVec", "Age", "SibSp", "Parch", "Fare"], 
                            outputCol = "features")

## Creating Train and Test dataset

In [27]:
train_data, test_data = my_final_data.randomSplit([0.7,0.3])

## Creating Logistic Regression Model

In [28]:
from pyspark.ml.classification import LogisticRegression

In [29]:
log_reg_titanic = LogisticRegression(featuresCol = "features", labelCol = "Survived")

## Creating Pipeline

In [30]:
from pyspark.ml import Pipeline

pipeline = Pipeline(stages = [gender_indexer, embark_indexer, gender_encoder, embark_encoder, assembler, log_reg_titanic])

In [31]:
fit_model = pipeline.fit(train_data)

In [32]:
prediction  = fit_model.transform(test_data)

In [33]:
prediction.select("Survived", "prediction").show()

+--------+----------+
|Survived|prediction|
+--------+----------+
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       1.0|
+--------+----------+
only showing top 20 rows



In [34]:
prediction.head()

Row(Survived=0, Pclass=1, Sex='female', Age=2.0, SibSp=1, Parch=2, Fare=151.55, Embarked='S', SexIndex=1.0, EmbarkIndex=0.0, SexVec=SparseVector(1, {}), EmbarkVec=SparseVector(2, {0: 1.0}), features=DenseVector([1.0, 0.0, 1.0, 0.0, 2.0, 1.0, 2.0, 151.55]), rawPrediction=DenseVector([-4.7865, 4.7865]), probability=DenseVector([0.0083, 0.9917]), prediction=1.0)

## Model Evaluation

In [35]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

In [36]:
my_eval = BinaryClassificationEvaluator(rawPredictionCol = "prediction", labelCol = "Survived")

## AUC Score

In [37]:
AUC = my_eval.evaluate(prediction)

In [38]:
print("AUC Score: ", AUC)

AUC Score:  0.7657433712121213


## Confusion Matrix

In [39]:
import pandas as pd
from pyspark.mllib.evaluation import MulticlassMetrics

predictionAndLabels = prediction.select("Survived", "prediction").rdd

metrics = MulticlassMetrics(predictionAndLabels.map(lambda x: tuple(map(float, x))))

confusion_matrix = metrics.confusionMatrix().toArray()
labels = [int(l) for l in metrics.call('labels')]
confusion_matrix = pd.DataFrame(confusion_matrix , index=labels, columns=labels)

print(confusion_matrix)

       0     1
0  101.0  34.0
1   27.0  98.0


## F1 Score

In [40]:
from sklearn.metrics import f1_score

print("F1 Score evaluator : ", f1_score(prediction.select('Survived').toPandas(), 
                                        prediction.select('prediction').toPandas()))

F1 Score evaluator :  0.7626459143968871


## Classification Report

In [41]:
from sklearn.metrics import classification_report

print(classification_report(prediction.select('Survived').toPandas(), prediction.select('prediction').toPandas()))

             precision    recall  f1-score   support

          0       0.75      0.79      0.77       128
          1       0.78      0.74      0.76       132

avg / total       0.77      0.77      0.77       260



## Sklearn Confusion Matrix

In [42]:
from sklearn.metrics import confusion_matrix

print(confusion_matrix(prediction.select('Survived').toPandas(), prediction.select('prediction').toPandas()))

[[101  27]
 [ 34  98]]


## Accuracy Score

In [43]:
from sklearn.metrics import accuracy_score

print(accuracy_score(prediction.select('Survived').toPandas(), prediction.select('prediction').toPandas()))

0.7653846153846153
