<a href="https://colab.research.google.com/github/saishshinde15/PySpark_Codes/blob/main/Pyspark_ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install pyspark



In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Machine_Learning").getOrCreate()

In [12]:
df=spark.read.csv('/content/insurance.csv',header=True,inferSchema=True)

In [13]:
df.show()

+---+------+----+--------+------+---------+--------+
|age|   sex| bmi|children|smoker|   region|expenses|
+---+------+----+--------+------+---------+--------+
| 19|female|27.9|       0|   yes|southwest|16884.92|
| 18|  male|33.8|       1|    no|southeast| 1725.55|
| 28|  male|33.0|       3|    no|southeast| 4449.46|
| 33|  male|22.7|       0|    no|northwest|21984.47|
| 32|  male|28.9|       0|    no|northwest| 3866.86|
| 31|female|25.7|       0|    no|southeast| 3756.62|
| 46|female|33.4|       1|    no|southeast| 8240.59|
| 37|female|27.7|       3|    no|northwest| 7281.51|
| 37|  male|29.8|       2|    no|northeast| 6406.41|
| 60|female|25.8|       0|    no|northwest|28923.14|
| 25|  male|26.2|       0|    no|northeast| 2721.32|
| 62|female|26.3|       0|   yes|southeast|27808.73|
| 23|  male|34.4|       0|    no|southwest| 1826.84|
| 56|female|39.8|       0|    no|southeast|11090.72|
| 27|  male|42.1|       0|   yes|southeast|39611.76|
| 19|  male|24.6|       1|    no|southwest| 18

In [14]:
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- sex: string (nullable = true)
 |-- bmi: double (nullable = true)
 |-- children: integer (nullable = true)
 |-- smoker: string (nullable = true)
 |-- region: string (nullable = true)
 |-- expenses: double (nullable = true)



In [15]:
df.describe().show()

+-------+------------------+------+------------------+-----------------+------+---------+------------------+
|summary|               age|   sex|               bmi|         children|smoker|   region|          expenses|
+-------+------------------+------+------------------+-----------------+------+---------+------------------+
|  count|              1338|  1338|              1338|             1338|  1338|     1338|              1338|
|   mean| 39.20702541106129|  NULL|30.665470852017993|  1.0949177877429|  NULL|     NULL|13270.422414050803|
| stddev|14.049960379216147|  NULL|  6.09838219000336|1.205492739781914|  NULL|     NULL|12110.011239706473|
|    min|                18|female|              16.0|                0|    no|northeast|           1121.87|
|    max|                64|  male|              53.1|                5|   yes|southwest|          63770.43|
+-------+------------------+------+------------------+-----------------+------+---------+------------------+



In [16]:
from pyspark.sql.functions import isnan, count, when, col #to calculte the null values in the dataframe
df.select([count(when(col(c).isNull(), c)).alias(c) for c in df.columns]).show()


+---+---+---+--------+------+------+--------+
|age|sex|bmi|children|smoker|region|expenses|
+---+---+---+--------+------+------+--------+
|  0|  0|  0|       0|     0|     0|       0|
+---+---+---+--------+------+------+--------+



In [17]:
from pyspark.ml.feature import StringIndexer

In [18]:
indexer=StringIndexer(inputCol='sex',outputCol='sex_new')
indexed=indexer.fit(df).transform(df)

In [19]:
indexer=StringIndexer(inputCol='smoker',outputCol='smoker_new')
indexed=indexer.fit(indexed).transform(indexed)

In [20]:
indexer=StringIndexer(inputCol='region',outputCol='region_new')
indexed=indexer.fit(indexed).transform(indexed)

In [22]:
indexed.show()

+---+------+----+--------+------+---------+--------+-------+----------+----------+
|age|   sex| bmi|children|smoker|   region|expenses|sex_new|smoker_new|region_new|
+---+------+----+--------+------+---------+--------+-------+----------+----------+
| 19|female|27.9|       0|   yes|southwest|16884.92|    1.0|       1.0|       2.0|
| 18|  male|33.8|       1|    no|southeast| 1725.55|    0.0|       0.0|       0.0|
| 28|  male|33.0|       3|    no|southeast| 4449.46|    0.0|       0.0|       0.0|
| 33|  male|22.7|       0|    no|northwest|21984.47|    0.0|       0.0|       1.0|
| 32|  male|28.9|       0|    no|northwest| 3866.86|    0.0|       0.0|       1.0|
| 31|female|25.7|       0|    no|southeast| 3756.62|    1.0|       0.0|       0.0|
| 46|female|33.4|       1|    no|southeast| 8240.59|    1.0|       0.0|       0.0|
| 37|female|27.7|       3|    no|northwest| 7281.51|    1.0|       0.0|       1.0|
| 37|  male|29.8|       2|    no|northeast| 6406.41|    0.0|       0.0|       3.0|
| 60

In [23]:
 from pyspark.ml.linalg import Vector
 from pyspark.ml.feature import VectorAssembler

In [24]:
assembler= VectorAssembler(inputCols=['age','bmi','children','sex_new','smoker_new','region_new'],outputCol='features')

In [25]:
assembler

VectorAssembler_04109e226069

In [28]:
output = assembler.transform(indexed)
output.show()

+---+------+----+--------+------+---------+--------+-------+----------+----------+--------------------+
|age|   sex| bmi|children|smoker|   region|expenses|sex_new|smoker_new|region_new|            features|
+---+------+----+--------+------+---------+--------+-------+----------+----------+--------------------+
| 19|female|27.9|       0|   yes|southwest|16884.92|    1.0|       1.0|       2.0|[19.0,27.9,0.0,1....|
| 18|  male|33.8|       1|    no|southeast| 1725.55|    0.0|       0.0|       0.0|[18.0,33.8,1.0,0....|
| 28|  male|33.0|       3|    no|southeast| 4449.46|    0.0|       0.0|       0.0|[28.0,33.0,3.0,0....|
| 33|  male|22.7|       0|    no|northwest|21984.47|    0.0|       0.0|       1.0|[33.0,22.7,0.0,0....|
| 32|  male|28.9|       0|    no|northwest| 3866.86|    0.0|       0.0|       1.0|[32.0,28.9,0.0,0....|
| 31|female|25.7|       0|    no|southeast| 3756.62|    1.0|       0.0|       0.0|[31.0,25.7,0.0,1....|
| 46|female|33.4|       1|    no|southeast| 8240.59|    1.0|    

In [30]:
output.select('features','expenses').show()

+--------------------+--------+
|            features|expenses|
+--------------------+--------+
|[19.0,27.9,0.0,1....|16884.92|
|[18.0,33.8,1.0,0....| 1725.55|
|[28.0,33.0,3.0,0....| 4449.46|
|[33.0,22.7,0.0,0....|21984.47|
|[32.0,28.9,0.0,0....| 3866.86|
|[31.0,25.7,0.0,1....| 3756.62|
|[46.0,33.4,1.0,1....| 8240.59|
|[37.0,27.7,3.0,1....| 7281.51|
|[37.0,29.8,2.0,0....| 6406.41|
|[60.0,25.8,0.0,1....|28923.14|
|[25.0,26.2,0.0,0....| 2721.32|
|[62.0,26.3,0.0,1....|27808.73|
|[23.0,34.4,0.0,0....| 1826.84|
|[56.0,39.8,0.0,1....|11090.72|
|[27.0,42.1,0.0,0....|39611.76|
|[19.0,24.6,1.0,0....| 1837.24|
|[52.0,30.8,1.0,1....|10797.34|
|[23.0,23.8,0.0,0....| 2395.17|
|[56.0,40.3,0.0,0....|10602.39|
|[30.0,35.3,0.0,0....|36837.47|
+--------------------+--------+
only showing top 20 rows



In [31]:
final_data=output.select('features','expenses')

In [32]:
final_data.show()

+--------------------+--------+
|            features|expenses|
+--------------------+--------+
|[19.0,27.9,0.0,1....|16884.92|
|[18.0,33.8,1.0,0....| 1725.55|
|[28.0,33.0,3.0,0....| 4449.46|
|[33.0,22.7,0.0,0....|21984.47|
|[32.0,28.9,0.0,0....| 3866.86|
|[31.0,25.7,0.0,1....| 3756.62|
|[46.0,33.4,1.0,1....| 8240.59|
|[37.0,27.7,3.0,1....| 7281.51|
|[37.0,29.8,2.0,0....| 6406.41|
|[60.0,25.8,0.0,1....|28923.14|
|[25.0,26.2,0.0,0....| 2721.32|
|[62.0,26.3,0.0,1....|27808.73|
|[23.0,34.4,0.0,0....| 1826.84|
|[56.0,39.8,0.0,1....|11090.72|
|[27.0,42.1,0.0,0....|39611.76|
|[19.0,24.6,1.0,0....| 1837.24|
|[52.0,30.8,1.0,1....|10797.34|
|[23.0,23.8,0.0,0....| 2395.17|
|[56.0,40.3,0.0,0....|10602.39|
|[30.0,35.3,0.0,0....|36837.47|
+--------------------+--------+
only showing top 20 rows



In [33]:
train_data,test_data=final_data.randomSplit([0.7,0.3])

In [35]:
from pyspark.ml.regression import LinearRegression

In [36]:
lr=LinearRegression(featuresCol='features',labelCol='expenses')

In [37]:
model=lr.fit(train_data) # similar to lr.fit(x_train,y_train)

In [39]:
prediction_train=model.evaluate(train_data)

In [40]:
prediction_test=model.evaluate(test_data)

In [41]:
prediction_train.r2

0.7437209293258027

In [42]:
prediction_test.r2

0.7563327351987863

In [43]:
# not much overfitting

In [45]:
prediction_train.meanSquaredError

34363770.14563621

In [44]:
prediction_test.meanSquaredError

42746539.22052919

In [46]:
# Print evaluation metrics
print(f"Training R-squared: {prediction_train.r2}")
print(f"Test R-squared: {prediction_test.r2}")
print(f"Training Mean Squared Error: {prediction_train.meanSquaredError}")
print(f"Test Mean Squared Error: {prediction_test.meanSquaredError}")

Training R-squared: 0.7437209293258027
Test R-squared: 0.7563327351987863
Training Mean Squared Error: 34363770.14563621
Test Mean Squared Error: 42746539.22052919


In [47]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator

# Create a ParamGrid for CrossValidator
paramGrid = ParamGridBuilder() \
    .addGrid(lr.regParam, [0.01, 0.1, 0.5]) \
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0]) \
    .build()

# Create a RegressionEvaluator
evaluator = RegressionEvaluator(labelCol="expenses", predictionCol="prediction", metricName="r2")

# Create a CrossValidator
cv = CrossValidator(estimator=lr, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=3) # Use 3 folds for cross-validation

# Run cross-validation
cvModel = cv.fit(train_data)

# Make predictions on the test data using the best model from cross-validation
predictions = cvModel.transform(test_data)

# Evaluate the best model
r2 = evaluator.evaluate(predictions)
mse = evaluator.evaluate(predictions, {evaluator.metricName: "mse"})

# Print evaluation metrics
print(f"Cross-validated R-squared: {r2}")
print(f"Cross-validated Mean Squared Error: {mse}")

Cross-validated R-squared: 0.7563236989060906
Cross-validated Mean Squared Error: 42748124.457021415
