# Linear Regression

In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.stat import Correlation
import pyspark.sql.functions as F

In [2]:
spark = SparkSession.builder.getOrCreate()

In [3]:
df = spark.read.csv("insurance.csv", inferSchema=True,
                   header=True)

In [4]:
df.show()

+---+------+------+--------+------+---------+-----------+
|age|gender|   bmi|children|smoker|   region|    charges|
+---+------+------+--------+------+---------+-----------+
| 19|female|  27.9|       0|   yes|southwest|  16884.924|
| 18|  male| 33.77|       1|    no|southeast|  1725.5523|
| 28|  male|  33.0|       3|    no|southeast|   4449.462|
| 33|  male|22.705|       0|    no|northwest|21984.47061|
| 32|  male| 28.88|       0|    no|northwest|  3866.8552|
| 31|female| 25.74|       0|    no|southeast|  3756.6216|
| 46|female| 33.44|       1|    no|southeast|  8240.5896|
| 37|female| 27.74|       3|    no|northwest|  7281.5056|
| 37|  male| 29.83|       2|    no|northeast|  6406.4107|
| 60|female| 25.84|       0|    no|northwest|28923.13692|
| 25|  male| 26.22|       0|    no|northeast|  2721.3208|
| 62|female| 26.29|       0|   yes|southeast| 27808.7251|
| 23|  male|  34.4|       0|    no|southwest|   1826.843|
| 56|female| 39.82|       0|    no|southeast| 11090.7178|
| 27|  male| 4

In [5]:
df.count()

1338

In [6]:
len(df.columns)

7

In [7]:
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- bmi: double (nullable = true)
 |-- children: integer (nullable = true)
 |-- smoker: string (nullable = true)
 |-- region: string (nullable = true)
 |-- charges: double (nullable = true)



In [8]:
df.describe().show()

+-------+------------------+------+------------------+-----------------+------+---------+------------------+
|summary|               age|gender|               bmi|         children|smoker|   region|           charges|
+-------+------------------+------+------------------+-----------------+------+---------+------------------+
|  count|              1338|  1338|              1338|             1338|  1338|     1338|              1338|
|   mean| 39.20702541106129|  null|30.663396860986538|  1.0949177877429|  null|     null|13270.422265141257|
| stddev|14.049960379216147|  null| 6.098186911679012|1.205492739781914|  null|     null|12110.011236693992|
|    min|                18|female|             15.96|                0|    no|northeast|         1121.8739|
|    max|                64|  male|             53.13|                5|   yes|southwest|       63770.42801|
+-------+------------------+------+------------------+-----------------+------+---------+------------------+



In [9]:
df.head(5)

[Row(age=19, gender='female', bmi=27.9, children=0, smoker='yes', region='southwest', charges=16884.924),
 Row(age=18, gender='male', bmi=33.77, children=1, smoker='no', region='southeast', charges=1725.5523),
 Row(age=28, gender='male', bmi=33.0, children=3, smoker='no', region='southeast', charges=4449.462),
 Row(age=33, gender='male', bmi=22.705, children=0, smoker='no', region='northwest', charges=21984.47061),
 Row(age=32, gender='male', bmi=28.88, children=0, smoker='no', region='northwest', charges=3866.8552)]

In [10]:
df.corr('age', 'charges')

0.299008193330648

In [11]:
df.corr('bmi', 'charges')

0.19834096883362903

In [12]:
df.columns

['age', 'gender', 'bmi', 'children', 'smoker', 'region', 'charges']

In [13]:
from pyspark.ml.feature import StringIndexer

In [14]:
indexer=StringIndexer(inputCol= "gender",outputCol='gender_cat')
indexed=indexer.fit(df).transform(df)

In [15]:
indexer=StringIndexer(inputCol= "smoker",outputCol='smoker_cat')
indexed=indexer.fit(indexed).transform(indexed)

In [16]:
indexer=StringIndexer(inputCol= "region",outputCol='region_cat')
indexed=indexer.fit(indexed).transform(indexed)

In [17]:
indexed.show()

+---+------+------+--------+------+---------+-----------+----------+----------+----------+
|age|gender|   bmi|children|smoker|   region|    charges|gender_cat|smoker_cat|region_cat|
+---+------+------+--------+------+---------+-----------+----------+----------+----------+
| 19|female|  27.9|       0|   yes|southwest|  16884.924|       1.0|       1.0|       2.0|
| 18|  male| 33.77|       1|    no|southeast|  1725.5523|       0.0|       0.0|       0.0|
| 28|  male|  33.0|       3|    no|southeast|   4449.462|       0.0|       0.0|       0.0|
| 33|  male|22.705|       0|    no|northwest|21984.47061|       0.0|       0.0|       1.0|
| 32|  male| 28.88|       0|    no|northwest|  3866.8552|       0.0|       0.0|       1.0|
| 31|female| 25.74|       0|    no|southeast|  3756.6216|       1.0|       0.0|       0.0|
| 46|female| 33.44|       1|    no|southeast|  8240.5896|       1.0|       0.0|       0.0|
| 37|female| 27.74|       3|    no|northwest|  7281.5056|       1.0|       0.0|       1.0|

In [18]:
from pyspark.ml.linalg import Vector
from pyspark.ml.feature import VectorAssembler

In [19]:
indexed.columns

['age',
 'gender',
 'bmi',
 'children',
 'smoker',
 'region',
 'charges',
 'gender_cat',
 'smoker_cat',
 'region_cat']

In [20]:
assembler= VectorAssembler(inputCols=['age',
 'bmi',
 'children',
 'gender_cat',
 'smoker_cat',
 'region_cat'],outputCol='features')

In [21]:
assembler

VectorAssembler_c20a04e93788

In [22]:
output= assembler.transform(indexed)

In [23]:
output

DataFrame[age: int, gender: string, bmi: double, children: int, smoker: string, region: string, charges: double, gender_cat: double, smoker_cat: double, region_cat: double, features: vector]

In [24]:
output.show()

+---+------+------+--------+------+---------+-----------+----------+----------+----------+--------------------+
|age|gender|   bmi|children|smoker|   region|    charges|gender_cat|smoker_cat|region_cat|            features|
+---+------+------+--------+------+---------+-----------+----------+----------+----------+--------------------+
| 19|female|  27.9|       0|   yes|southwest|  16884.924|       1.0|       1.0|       2.0|[19.0,27.9,0.0,1....|
| 18|  male| 33.77|       1|    no|southeast|  1725.5523|       0.0|       0.0|       0.0|[18.0,33.77,1.0,0...|
| 28|  male|  33.0|       3|    no|southeast|   4449.462|       0.0|       0.0|       0.0|[28.0,33.0,3.0,0....|
| 33|  male|22.705|       0|    no|northwest|21984.47061|       0.0|       0.0|       1.0|[33.0,22.705,0.0,...|
| 32|  male| 28.88|       0|    no|northwest|  3866.8552|       0.0|       0.0|       1.0|[32.0,28.88,0.0,0...|
| 31|female| 25.74|       0|    no|southeast|  3756.6216|       1.0|       0.0|       0.0|[31.0,25.74,0.

In [25]:
output.select('features','charges').show(5)

+--------------------+-----------+
|            features|    charges|
+--------------------+-----------+
|[19.0,27.9,0.0,1....|  16884.924|
|[18.0,33.77,1.0,0...|  1725.5523|
|[28.0,33.0,3.0,0....|   4449.462|
|[33.0,22.705,0.0,...|21984.47061|
|[32.0,28.88,0.0,0...|  3866.8552|
+--------------------+-----------+
only showing top 5 rows



In [26]:
#final data consist of features and label which is crew.
final_data=output.select('features','charges')

In [27]:
#splitting data into train and test
train_data,test_data=final_data.randomSplit([0.7,0.3])

+-------+------------------+
|summary|           charges|
+-------+------------------+
|  count|               945|
|   mean| 12921.72271917354|
| stddev|11938.423901817978|
|    min|         1121.8739|
|    max|       60021.39897|
+-------+------------------+



In [29]:
train_data.describe().show()

+-------+------------------+
|summary|           charges|
+-------+------------------+
|  count|               945|
|   mean| 12921.72271917354|
| stddev|11938.423901817978|
|    min|         1121.8739|
|    max|       60021.39897|
+-------+------------------+



In [28]:
test_data.describe().show()

+-------+------------------+
|summary|           charges|
+-------+------------------+
|  count|               393|
|   mean|14108.898272620865|
| stddev|12488.618390157812|
|    min|         1146.7966|
|    max|       63770.42801|
+-------+------------------+



In [30]:
#import LinearRegression library
from pyspark.ml.regression import LinearRegression

In [31]:
#creating an object of class LinearRegression
#object takes features and label as input arguments
ship_lr=LinearRegression(featuresCol='features',labelCol='charges')

In [32]:
#pass train_data to train model
trained_ship_model=ship_lr.fit(train_data)

In [33]:
#evaluating model trained for Rsquared error
ship_results=trained_ship_model.evaluate(train_data)

In [34]:
print('Rsquared Error :',ship_results.r2)

Rsquared Error : 0.7702513727961525


In [37]:
print(ship_results.meanSquaredError)

32710493.912332077


In [35]:
#testing Model on unlabeled data
#create unlabeled data from test_data
#testing model on unlabeled data
unlabeled_data=test_data.select('features')
unlabeled_data.show(5)

+--------------------+
|            features|
+--------------------+
|(6,[0,1],[18.0,41...|
|(6,[0,1],[21.0,23...|
|(6,[0,1],[21.0,31...|
|(6,[0,1],[21.0,36...|
|(6,[0,1],[23.0,41...|
+--------------------+
only showing top 5 rows



In [36]:
predictions=trained_ship_model.transform(unlabeled_data)
predictions.show()

+--------------------+-------------------+
|            features|         prediction|
+--------------------+-------------------+
|(6,[0,1],[18.0,41...|  5177.842655937637|
|(6,[0,1],[21.0,23...| 255.32823496406127|
|(6,[0,1],[21.0,31...|  2749.883809561443|
|(6,[0,1],[21.0,36...|  4612.016844120053|
|(6,[0,1],[23.0,41...|  6764.489796396754|
|(6,[0,1],[24.0,32...| 3870.5181045139198|
|(6,[0,1],[25.0,25...|  2135.987736035502|
|(6,[0,1],[34.0,34...|   7254.61989877841|
|(6,[0,1],[40.0,41...| 11252.617766568843|
|(6,[0,1],[41.0,33...|  8920.799443609363|
|(6,[0,1],[41.0,40...|  11064.00916262965|
|(6,[0,1],[53.0,31...|  11435.79983218753|
|(6,[0,1],[56.0,34...| 13223.991252736489|
|(6,[0,1],[58.0,36...| 14287.292052724193|
|(6,[0,1],[59.0,26...|  11463.58953195678|
|(6,[0,1],[60.0,25...|  11520.92302692155|
|(6,[0,1],[61.0,31...| 13651.197069791191|
|(6,[0,1],[62.0,38...| 16238.220724911052|
|(6,[0,1],[62.0,39...|  16589.56658048815|
|[18.0,21.47,0.0,0...|-238.72613167946292|
+----------