Installing the Pyspark Library

In [1]:
pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.0.tar.gz (281.3 MB)
[K     |████████████████████████████████| 281.3 MB 44 kB/s 
[?25hCollecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 45.9 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.0-py2.py3-none-any.whl size=281764026 sha256=11d5eb2fee2ada32aa8513a233964b2a4c8e16d25cb982754dc427c382c5381c
  Stored in directory: /root/.cache/pip/wheels/7a/8e/1b/f73a52650d2e5f337708d9f6a1750d451a7349a867f928b885
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.5 pyspark-3.3.0


Creating the Spark Session

In [44]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[1]").appName("Insurance_Premium_Prediction").getOrCreate()

Importing the Source Dataset

In [45]:
df = spark.read.csv("Medicalpremium.csv", header = True, inferSchema = True)

In [46]:
df.show()

+---+--------+---------------------+--------------+------------------+------+------+--------------+-----------------------+----------------------+------------+
|Age|Diabetes|BloodPressureProblems|AnyTransplants|AnyChronicDiseases|Height|Weight|KnownAllergies|HistoryOfCancerInFamily|NumberOfMajorSurgeries|PremiumPrice|
+---+--------+---------------------+--------------+------------------+------+------+--------------+-----------------------+----------------------+------------+
| 45|       0|                    0|             0|                 0|   155|    57|             0|                      0|                     0|       25000|
| 60|       1|                    0|             0|                 0|   180|    73|             0|                      0|                     0|       29000|
| 36|       1|                    1|             0|                 0|   158|    59|             0|                      0|                     1|       23000|
| 52|       1|                    1|    

Checking for the Correlation between Independent features and Dependent Feature

In [47]:
columns_list = list(df.columns)
for column in columns_list :
  if column == columns_list[-1]:
    pass
  else :
    print(f"Correlation between Independent feature {column} and Dependent feature  {columns_list[-1]} is ", end = "" )
    print(df.corr(column, columns_list[-1]))

Correlation between Independent feature Age and Dependent feature  PremiumPrice is 0.6975399655058024
Correlation between Independent feature Diabetes and Dependent feature  PremiumPrice is 0.07620924095592171
Correlation between Independent feature BloodPressureProblems and Dependent feature  PremiumPrice is 0.16709674701094776
Correlation between Independent feature AnyTransplants and Dependent feature  PremiumPrice is 0.2890559369634019
Correlation between Independent feature AnyChronicDiseases and Dependent feature  PremiumPrice is 0.20860986049455094
Correlation between Independent feature Height and Dependent feature  PremiumPrice is 0.026909513982139487
Correlation between Independent feature Weight and Dependent feature  PremiumPrice is 0.14150740525639777
Correlation between Independent feature KnownAllergies and Dependent feature  PremiumPrice is 0.01210279064300931
Correlation between Independent feature HistoryOfCancerInFamily and Dependent feature  PremiumPrice is 0.083139

Since No correlation between independent features with dependent feature is greater than 0.9 or less than -0.9, their is no necessity for dropping any independent feature

Applying VectorAssembler

In [48]:
from pyspark.ml.feature import VectorAssembler
independent_cols = list(df.columns)
independent_cols = independent_cols[:-1]
assembler = VectorAssembler(inputCols= independent_cols, outputCol="independent_features")
df = assembler.transform(df)

In [49]:
df.show(truncate = False)

+---+--------+---------------------+--------------+------------------+------+------+--------------+-----------------------+----------------------+------------+---------------------------------------------+
|Age|Diabetes|BloodPressureProblems|AnyTransplants|AnyChronicDiseases|Height|Weight|KnownAllergies|HistoryOfCancerInFamily|NumberOfMajorSurgeries|PremiumPrice|independent_features                         |
+---+--------+---------------------+--------------+------------------+------+------+--------------+-----------------------+----------------------+------------+---------------------------------------------+
|45 |0       |0                    |0             |0                 |155   |57    |0             |0                      |0                     |25000       |(10,[0,5,6],[45.0,155.0,57.0])               |
|60 |1       |0                    |0             |0                 |180   |73    |0             |0                      |0                     |29000       |(10,[0,1,5,6],[60

In [52]:
df_new = df.select(["independent_features", "PremiumPrice"])

Splitting the Data into train and test data

In [53]:
df_new.show()

+--------------------+------------+
|independent_features|PremiumPrice|
+--------------------+------------+
|(10,[0,5,6],[45.0...|       25000|
|(10,[0,1,5,6],[60...|       29000|
|[36.0,1.0,1.0,0.0...|       23000|
|[52.0,1.0,1.0,0.0...|       28000|
|(10,[0,4,5,6,9],[...|       23000|
|(10,[0,5,6,7,9],[...|       23000|
|(10,[0,5,6],[33.0...|       21000|
|(10,[0,5,6,7],[23...|       15000|
|(10,[0,1,5,6,7],[...|       23000|
|(10,[0,5,6],[38.0...|       23000|
|(10,[0,2,5,6,9],[...|       28000|
|(10,[0,1,5,6],[66...|       25000|
|(10,[0,5,6,7,9],[...|       15000|
|(10,[0,2,5,6],[46...|       35000|
|(10,[0,3,5,6,9],[...|       15000|
|(10,[0,5,6,7,9],[...|       23000|
|(10,[0,4,5,6],[42...|       30000|
|(10,[0,1,5,6],[38...|       23000|
|(10,[0,1,5,6],[57...|       25000|
|(10,[0,2,5,6],[21...|       15000|
+--------------------+------------+
only showing top 20 rows



In [54]:
train_data , test_data = df_new.randomSplit([0.7,0.3])

In [55]:
train_data.show()

+--------------------+------------+
|independent_features|PremiumPrice|
+--------------------+------------+
|(10,[0,1,2,5,6],[...|       26000|
|(10,[0,1,2,5,6],[...|       15000|
|(10,[0,1,2,5,6],[...|       15000|
|(10,[0,1,2,5,6],[...|       15000|
|(10,[0,1,2,5,6],[...|       15000|
|(10,[0,1,2,5,6],[...|       15000|
|(10,[0,1,2,5,6],[...|       15000|
|(10,[0,1,2,5,6],[...|       15000|
|(10,[0,1,2,5,6],[...|       15000|
|(10,[0,1,2,5,6],[...|       15000|
|(10,[0,1,2,5,6],[...|       15000|
|(10,[0,1,2,5,6],[...|       23000|
|(10,[0,1,2,5,6],[...|       23000|
|(10,[0,1,2,5,6],[...|       23000|
|(10,[0,1,2,5,6],[...|       23000|
|(10,[0,1,2,5,6],[...|       23000|
|(10,[0,1,2,5,6],[...|       23000|
|(10,[0,1,2,5,6],[...|       23000|
|(10,[0,1,2,5,6],[...|       23000|
|(10,[0,1,2,5,6],[...|       23000|
+--------------------+------------+
only showing top 20 rows



In [56]:
test_data.show()

+--------------------+------------+
|independent_features|PremiumPrice|
+--------------------+------------+
|(10,[0,1,2,5,6],[...|       15000|
|(10,[0,1,2,5,6],[...|       32000|
|(10,[0,1,2,5,6],[...|       23000|
|(10,[0,1,2,5,6],[...|       23000|
|(10,[0,1,2,5,6],[...|       23000|
|(10,[0,1,2,5,6],[...|       23000|
|(10,[0,1,2,5,6],[...|       23000|
|(10,[0,1,2,5,6],[...|       29000|
|(10,[0,1,2,5,6],[...|       25000|
|(10,[0,1,2,5,6],[...|       25000|
|(10,[0,1,2,5,6],[...|       29000|
|(10,[0,1,2,5,6],[...|       29000|
|(10,[0,1,3,5,6],[...|       38000|
|(10,[0,1,3,5,6],[...|       38000|
|(10,[0,1,4,5,6],[...|       30000|
|(10,[0,1,5,6],[19...|       15000|
|(10,[0,1,5,6],[20...|       15000|
|(10,[0,1,5,6],[21...|       15000|
|(10,[0,1,5,6],[23...|       15000|
|(10,[0,1,5,6],[24...|       15000|
+--------------------+------------+
only showing top 20 rows



In [58]:
from pyspark.ml.regression import LinearRegression

In [64]:
lr = LinearRegression(featuresCol = "independent_features", labelCol = "PremiumPrice")

In [65]:
trained_model = lr.fit(train_data)

In [66]:
results = trained_model.evaluate(train_data)

METRICS

In [67]:
print(results.r2)

0.6503596806947105


In [68]:
print(results.meanSquaredError)

13499367.13302893


In [69]:
print(results.meanAbsoluteError)

2612.9963731209623


In [70]:
print(results.r2adj)

0.6453790493655753


Testing the test dataset

In [71]:
test_data = test_data.select(["independent_features"])

In [72]:
predictions = trained_model.transform(test_data)

Predictions of Test Data

In [73]:
predictions.show()

+--------------------+------------------+
|independent_features|        prediction|
+--------------------+------------------+
|(10,[0,1,2,5,6],[...| 17207.75157392525|
|(10,[0,1,2,5,6],[...|16696.686028407443|
|(10,[0,1,2,5,6],[...| 18841.56618788353|
|(10,[0,1,2,5,6],[...|19763.421282430376|
|(10,[0,1,2,5,6],[...|19247.805736424223|
|(10,[0,1,2,5,6],[...|22342.489155623858|
|(10,[0,1,2,5,6],[...| 23951.98010349158|
|(10,[0,1,2,5,6],[...|25813.592062170857|
|(10,[0,1,2,5,6],[...|27424.366390282186|
|(10,[0,1,2,5,6],[...|29225.897845923726|
|(10,[0,1,2,5,6],[...|31247.290732606038|
|(10,[0,1,2,5,6],[...|31058.345854618892|
|(10,[0,1,3,5,6],[...|32565.978224589147|
|(10,[0,1,3,5,6],[...|34378.592921208525|
|(10,[0,1,4,5,6],[...|30958.673862223535|
|(10,[0,1,5,6],[19...|15270.562883223252|
|(10,[0,1,5,6],[20...|14570.612106191867|
|(10,[0,1,5,6],[21...|15543.097772920983|
|(10,[0,1,5,6],[23...| 18288.07023825466|
|(10,[0,1,5,6],[24...|17196.289420378722|
+--------------------+------------