In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.stat import Correlation
import pyspark.sql.functions as F

In [2]:
spark=SparkSession.builder.appName('Demo').getOrCreate()

22/03/28 20:49:30 WARN Utils: Your hostname, shital-VivoBook-ASUS-Laptop-X505ZA-X505ZA resolves to a loopback address: 127.0.1.1; using 192.168.0.241 instead (on interface wlp1s0)
22/03/28 20:49:30 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/03/28 20:49:33 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
df = spark.read.csv("headbrain.csv", inferSchema=True, header=True)

In [6]:
df.show()

+------+---------+---------------+-------------------+
|Gender|Age Range|Head Size(cm^3)|Brain Weight(grams)|
+------+---------+---------------+-------------------+
|     1|        1|           4512|               1530|
|     1|        1|           3738|               1297|
|     1|        1|           4261|               1335|
|     1|        1|           3777|               1282|
|     1|        1|           4177|               1590|
|     1|        1|           3585|               1300|
|     1|        1|           3785|               1400|
|     1|        1|           3559|               1255|
|     1|        1|           3613|               1355|
|     1|        1|           3982|               1375|
|     1|        1|           3443|               1340|
|     1|        1|           3993|               1380|
|     1|        1|           3640|               1355|
|     1|        1|           4208|               1522|
|     1|        1|           3832|               1208|
|     1|  

In [7]:
df.printSchema()

root
 |-- Gender: integer (nullable = true)
 |-- Age Range: integer (nullable = true)
 |-- Head Size(cm^3): integer (nullable = true)
 |-- Brain Weight(grams): integer (nullable = true)



In [11]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [8]:
df.columns

['Gender', 'Age Range', 'Head Size(cm^3)', 'Brain Weight(grams)']

In [12]:
assembler = VectorAssembler(inputCols=['Gender','Age Range'], outputCol='features')

In [13]:
output = assembler.transform(df)

In [14]:
final_df = output.select('features', 'Gender')

In [15]:
train_data, test_data = final_df.randomSplit([0.7, 0.3])

In [16]:
train_data.describe().show()

+-------+------------------+
|summary|            Gender|
+-------+------------------+
|  count|               175|
|   mean|1.4228571428571428|
| stddev|0.4954306812605761|
|    min|                 1|
|    max|                 2|
+-------+------------------+



In [17]:
test_data.describe().show()

+-------+------------------+
|summary|            Gender|
+-------+------------------+
|  count|                62|
|   mean| 1.467741935483871|
| stddev|0.5030315290279456|
|    min|                 1|
|    max|                 2|
+-------+------------------+



In [18]:
from pyspark.ml.regression import LinearRegression

In [19]:
lm = LinearRegression(labelCol='Gender')

In [20]:
model = lm.fit(train_data)

22/03/28 20:51:08 WARN Instrumentation: [a9a9964d] regParam is zero, which might cause numerical instability and overfitting.
22/03/28 20:51:08 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
22/03/28 20:51:08 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS
22/03/28 20:51:09 WARN InstanceBuilder$NativeLAPACK: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK


In [23]:
import pandas as pd

In [24]:
pd.DataFrame({"Coefficients":model.coefficients}, index=['Gender','Age Range'])

Unnamed: 0,Coefficients
Gender,1.0
Age Range,0.0


In [25]:
res = model.evaluate(test_data)

In [26]:
res.residuals.show()



+---------+
|residuals|
+---------+
|      0.0|
|      0.0|
|      0.0|
|      0.0|
|      0.0|
|      0.0|
|      0.0|
|      0.0|
|      0.0|
|      0.0|
|      0.0|
|      0.0|
|      0.0|
|      0.0|
|      0.0|
|      0.0|
|      0.0|
|      0.0|
|      0.0|
|      0.0|
+---------+
only showing top 20 rows



In [35]:
unlabeled_data = test_data.select('features')

In [36]:
predictions = model.transform(unlabeled_data)

In [37]:
predictions.show()

+---------+------------------+
| features|        prediction|
+---------+------------------+
|[1.0,1.0]|0.9999999999999988|
|[1.0,1.0]|0.9999999999999988|
|[1.0,1.0]|0.9999999999999988|
|[1.0,1.0]|0.9999999999999988|
|[1.0,1.0]|0.9999999999999988|
|[1.0,1.0]|0.9999999999999988|
|[1.0,1.0]|0.9999999999999988|
|[1.0,1.0]|0.9999999999999988|
|[1.0,1.0]|0.9999999999999988|
|[1.0,1.0]|0.9999999999999988|
|[1.0,1.0]|0.9999999999999988|
|[1.0,1.0]|0.9999999999999988|
|[1.0,1.0]|0.9999999999999988|
|[1.0,1.0]|0.9999999999999988|
|[1.0,1.0]|0.9999999999999988|
|[1.0,1.0]|0.9999999999999988|
|[1.0,1.0]|0.9999999999999988|
|[1.0,1.0]|0.9999999999999988|
|[1.0,2.0]|1.0000000000000007|
|[1.0,2.0]|1.0000000000000007|
+---------+------------------+
only showing top 20 rows



In [38]:
print("MAE: ", res.meanAbsoluteError)
print("MSE: ", res.meanSquaredError)
print("RMSE: "), res.rootMeanSquaredError
print("R2", res.r2)
print("Adj R2", res.r2)

MAE:  8.382183835919932e-16
MSE:  7.984135177451724e-31
RMSE: 
R2 1.0
Adj R2 1.0
