In [1]:
import pyspark
from pyspark.sql import SparkSession

In [3]:
# Start (or reuse) the Spark session that backs every DataFrame in this notebook.
spark = (
    SparkSession.builder
    .appName("Mlib")
    .getOrCreate()
)

In [4]:
# Bare expression: the notebook displays the session object as the cell output.
spark

In [31]:
# Load the Titanic CSV; the first row supplies the column names and Spark infers
# each column's type from the data.
train = spark.read.csv(
    "titanic.csv",
    header=True,
    inferSchema=True,
)

In [32]:
# Preview the loaded rows as a text table (long string values are truncated).
train.show()

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| null|       S|
|          6|       0|     3|    Moran, Mr. James|  male|null|    0|    0|      

In [33]:
# List the column names of the loaded DataFrame.
train.columns

['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

In [34]:
from pyspark.ml.feature import VectorAssembler

# Pack the chosen input columns into a single vector column, which is the layout
# the regression estimator below reads its features from.
# NOTE(review): PassengerId looks like a row identifier rather than a predictor —
# confirm it is really meant to be a feature.
featureassembler = VectorAssembler(
    inputCols=["PassengerId", "Survived"],
    outputCol="Independent Features",
)

In [35]:
# Append the assembled "Independent Features" vector column to every row of `train`.
output = featureassembler.transform(train)

In [36]:
# Confirm the assembled vector column was added alongside the original columns.
output.columns

['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked',
 'Independent Features']

In [37]:
# Preview the rows together with their assembled feature vectors.
output.show()

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+--------------------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|Independent Features|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+--------------------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|           [1.0,0.0]|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|           [2.0,1.0]|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|           [3.0,1.0]|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|           [4.0,1.0]|
|          5|       0|     3|Allen, Mr. Willia..

In [39]:
# Keep only what the regression needs: the feature vector and the Parch column,
# which is used as the label when the model is fitted.
finalized_data = output.select(
    "Independent Features",
    "Parch",
)

In [40]:
# Preview the two-column modelling DataFrame (feature vector, label).
finalized_data.show()

+--------------------+-----+
|Independent Features|Parch|
+--------------------+-----+
|           [1.0,0.0]|    0|
|           [2.0,1.0]|    0|
|           [3.0,1.0]|    0|
|           [4.0,1.0]|    0|
|           [5.0,0.0]|    0|
|           [6.0,0.0]|    0|
|           [7.0,0.0]|    0|
|           [8.0,0.0]|    1|
|           [9.0,1.0]|    2|
|          [10.0,1.0]|    0|
|          [11.0,1.0]|    1|
|          [12.0,1.0]|    0|
|          [13.0,0.0]|    0|
|          [14.0,0.0]|    5|
|          [15.0,0.0]|    0|
|          [16.0,1.0]|    0|
|          [17.0,0.0]|    1|
|          [18.0,1.0]|    0|
|          [19.0,0.0]|    0|
|          [20.0,1.0]|    0|
+--------------------+-----+
only showing top 20 rows



In [41]:
from pyspark.ml.regression import LinearRegression

# Train/test split (75% / 25%). A fixed seed makes the split — and therefore the
# fitted coefficients and the error metrics — repeatable across kernel restarts.
train_data, test_data = finalized_data.randomSplit([0.75, 0.25], seed=42)

# Keep the unfitted estimator under its own name so that `regressor` only ever
# refers to the fitted model, which is what the later cells read the coefficients,
# intercept and evaluation metrics from.
linear_regression = LinearRegression(
    featuresCol="Independent Features",
    labelCol="Parch",
)
regressor = linear_regression.fit(train_data)

In [42]:
### Coefficients: one fitted weight per entry of the feature vector
regressor.coefficients

DenseVector([-0.0, 0.1932])

In [43]:
### Intercept of the fitted linear model
regressor.intercept

0.29854754182799936

In [44]:
### Evaluate the fitted model on the held-out test split
pred_results = regressor.evaluate(test_data)

In [45]:
# Show the test rows with the model's `prediction` column next to the Parch label.
pred_results.predictions.show()

+--------------------+-----+-------------------+
|Independent Features|Parch|         prediction|
+--------------------+-----+-------------------+
|           [2.0,1.0]|    0| 0.4916417303914017|
|           [3.0,1.0]|    0| 0.4915920908353375|
|          [10.0,1.0]|    0| 0.4912446139428881|
|          [11.0,1.0]|    1| 0.4911949743868239|
|          [15.0,0.0]|    0| 0.2978029484870365|
|          [18.0,1.0]|    0| 0.4908474974943746|
|          [21.0,0.0]|    0| 0.2975051111506513|
|          [27.0,0.0]|    0|0.29720727381426615|
|          [29.0,1.0]|    0| 0.4903014623776685|
|          [35.0,0.0]|    0| 0.2968101573657526|
|          [42.0,0.0]|    0|0.29646268047330326|
|          [45.0,1.0]|    0| 0.4895072294806414|
|          [48.0,1.0]|    0|0.48935831081244885|
|          [57.0,1.0]|    0| 0.4889115548078711|
|          [60.0,0.0]|    2| 0.2955691684641478|
|          [70.0,0.0]|    0|0.29507277290350586|
|          [77.0,0.0]|    0| 0.2947252960110565|
|          [80.0,1.0

In [46]:
# Mean absolute error and mean squared error of the evaluation, displayed as a tuple.
pred_results.meanAbsoluteError,pred_results.meanSquaredError

(0.6206186825855146, 0.8056766817394527)