In [1]:
import findspark

In [2]:
# Locate the local Spark installation and make it importable; this has to run
# before the `import pyspark` in the next cell.
findspark.init()

In [3]:
import pyspark

In [4]:
from pyspark.sql import SparkSession

In [7]:
# Reuse the active session if there is one, otherwise start a session named 'Line_reg'.
spark = (
    SparkSession.builder
    .appName('Line_reg')
    .getOrCreate()
)

In [14]:
# Load the CSV: the first line supplies column names and column types are inferred from the data.
df = spark.read.csv(
    r'linreg.csv',
    header=True,
    inferSchema=True,
)

In [15]:
# Preview the loaded data; with no arguments only the first 20 rows are printed.
df.show()

+-----+-----+-----+-----+-----+------+
|var_1|var_2|var_3|var_4|var_5|output|
+-----+-----+-----+-----+-----+------+
|  734|  688|   81|0.328|0.259| 0.418|
|  700|  600|   94| 0.32|0.247| 0.389|
|  712|  705|   93|0.311|0.247| 0.417|
|  734|  806|   69|0.315| 0.26| 0.415|
|  613|  759|   61|0.302| 0.24| 0.378|
|  748|  676|   85|0.318|0.255| 0.422|
|  669|  588|   97|0.315|0.251| 0.411|
|  667|  845|   68|0.324|0.251| 0.381|
|  758|  890|   64| 0.33|0.274| 0.436|
|  726|  670|   88|0.335|0.268| 0.422|
|  583|  794|   55|0.302|0.236| 0.371|
|  676|  746|   72|0.317|0.265|   0.4|
|  767|  699|   89|0.332|0.274| 0.433|
|  637|  597|   86|0.317|0.252| 0.374|
|  609|  724|   69|0.308|0.244| 0.382|
|  776|  733|   83|0.325|0.259| 0.437|
|  701|  832|   66|0.325| 0.26|  0.39|
|  650|  709|   74|0.316|0.249| 0.386|
|  804|  668|   95|0.337|0.265| 0.453|
|  713|  614|   94| 0.31|0.238| 0.404|
+-----+-----+-----+-----+-----+------+
only showing top 20 rows



In [16]:
# Total number of rows loaded from the CSV.
df.count()

1232

In [18]:
# Check the column types that inferSchema=True produced.
df.printSchema()

root
 |-- var_1: integer (nullable = true)
 |-- var_2: integer (nullable = true)
 |-- var_3: integer (nullable = true)
 |-- var_4: double (nullable = true)
 |-- var_5: double (nullable = true)
 |-- output: double (nullable = true)



In [20]:
# Per-column summary statistics: count, mean, stddev, min and max.
df.describe().show()

+-------+-----------------+-----------------+------------------+--------------------+--------------------+-------------------+
|summary|            var_1|            var_2|             var_3|               var_4|               var_5|             output|
+-------+-----------------+-----------------+------------------+--------------------+--------------------+-------------------+
|  count|             1232|             1232|              1232|                1232|                1232|               1232|
|   mean|715.0819805194806|715.0819805194806| 80.90422077922078|  0.3263311688311693| 0.25927272727272715|0.39734172077922014|
| stddev| 91.5342940441652|93.07993263118064|11.458139049993724|0.015012772334166148|0.012907228928000298|0.03326689862173776|
|    min|              463|              472|                40|               0.277|               0.214|              0.301|
|    max|             1009|             1103|               116|               0.373|               0.294|     

## Check the correlation of each input variable with the output variable

In [27]:
from pyspark.sql.functions import corr

In [31]:
# Correlation of every input column (all columns but the last) against 'output',
# printed as one single-cell table per input column.
feature_names = df.columns[:-1]
for feature_name in feature_names:
    pair_corr = corr(df[feature_name], df['output'])
    df.select(pair_corr).show()

+-------------------+
|corr(var_1, output)|
+-------------------+
| 0.9187399607627283|
+-------------------+

+-------------------+
|corr(var_2, output)|
+-------------------+
|0.43652698913681093|
+-------------------+

+-------------------+
|corr(var_3, output)|
+-------------------+
| 0.4014958408311139|
+-------------------+

+-------------------+
|corr(var_4, output)|
+-------------------+
| 0.7909100204842113|
+-------------------+

+-------------------+
|corr(var_5, output)|
+-------------------+
| 0.7904806260381185|
+-------------------+



## Feature Engineering

In [34]:
from pyspark.ml.linalg import Vector
from pyspark.ml.feature import VectorAssembler

In [35]:
# List the column names; 'output' comes last, so df.columns[:-1] is the set of input columns.
df.columns

['var_1', 'var_2', 'var_3', 'var_4', 'var_5', 'output']

In [36]:
# Pack every column except the last one ('output') into a single 'features' vector column.
feature_cols = df.columns[:-1]
vec_asmblr = VectorAssembler(
    inputCols=feature_cols,
    outputCol='features',
)

In [37]:
# Append the assembled 'features' vector column to the frame.
# NOTE(review): this rebinds `df`, so the cell is not safely re-runnable — a second run
# would presumably fail because the 'features' output column already exists; confirm.
df = vec_asmblr.transform(df)

In [39]:
# Keep only the feature vector and the 'output' column; the raw var_* columns are dropped.
df = df.select('features','output')

In [42]:
# Verify the frame now holds just the 'features' vector column and 'output'.
df.printSchema()

root
 |-- features: vector (nullable = true)
 |-- output: double (nullable = true)



## Splitting dataset

In [43]:
# Hold out roughly a quarter of the rows for evaluation. The weights are proportions,
# not exact counts. A fixed seed makes the split repeatable, so the row counts and
# metrics computed from it can be reproduced after a kernel restart.
train_df, test_df = df.randomSplit([0.75, 0.25], seed=42)

In [44]:
# Row counts of the two splits, as (train, test).
train_df.count(),test_df.count()

(922, 310)

## Train and evaluate the model

In [45]:
from pyspark.ml.regression import LinearRegression

In [51]:
lin_reg = LinearRegression(labelCol='output')

In [52]:
lin_reg = lin_reg.fit(train_df)

In [54]:
# Fitted weights, one per input column in assembler order (var_1 ... var_5).
# NOTE(review): in the recorded run the var_4 weight is negative although var_4 correlates
# positively with 'output' — possibly collinearity between the inputs; verify before
# interpreting individual coefficients.
lin_reg.coefficients

DenseVector([0.0003, 0.0001, 0.0003, -0.7147, 0.481])

In [55]:
# Fitted intercept term of the model.
lin_reg.intercept

0.19502440626807166

In [56]:
# Score the model on the held-out split. Despite its name, `test_predictions` is used
# below as a metrics summary (its .r2 and .meanSquaredError are read), not as a frame
# of per-row predictions.
test_predictions = lin_reg.evaluate(test_df)

In [57]:
# R-squared on the held-out test split.
test_predictions.r2

0.8813641574262024

In [58]:
# Mean squared error on the held-out test split.
test_predictions.meanSquaredError

0.00013318356462116516