In [34]:
from __future__ import print_function
import findspark
findspark.init()

import pyspark


from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.sql.functions import col, count, isnan, when

In [35]:
if __name__ == "__main__":
    spark = SparkSession\
    .builder\
    .appName('LinearRegressionWithSpark')\
    .getOrCreate()

In [36]:
spark

In [37]:
dataset = spark.read.csv('Admission_Prediction.csv', header=True)

In [38]:
dataset

DataFrame[Serial No.: string, GRE Score: string, TOEFL Score: string, University Rating: string, SOP: string, LOR : string, CGPA: string, Research: string, Chance of Admit : string]

In [39]:
dataset.show(5)

+----------+---------+-----------+-----------------+---+----+----+--------+----------------+
|Serial No.|GRE Score|TOEFL Score|University Rating|SOP|LOR |CGPA|Research|Chance of Admit |
+----------+---------+-----------+-----------------+---+----+----+--------+----------------+
|         1|      337|        118|                4|4.5| 4.5|9.65|       1|            0.92|
|         2|      324|        107|                4|  4| 4.5|8.87|       1|            0.76|
|         3|      316|        104|                3|  3| 3.5|   8|       1|            0.72|
|         4|      322|        110|                3|3.5| 2.5|8.67|       1|             0.8|
|         5|      314|        103|                2|  2|   3|8.21|       0|            0.65|
+----------+---------+-----------+-----------------+---+----+----+--------+----------------+
only showing top 5 rows



In [40]:
dataset.printSchema()

root
 |-- Serial No.: string (nullable = true)
 |-- GRE Score: string (nullable = true)
 |-- TOEFL Score: string (nullable = true)
 |-- University Rating: string (nullable = true)
 |-- SOP: string (nullable = true)
 |-- LOR : string (nullable = true)
 |-- CGPA: string (nullable = true)
 |-- Research: string (nullable = true)
 |-- Chance of Admit : string (nullable = true)



In [41]:
dataset = dataset.drop("Serial No.")   

In [42]:
dataset.show()

+---------+-----------+-----------------+---+----+----+--------+----------------+
|GRE Score|TOEFL Score|University Rating|SOP|LOR |CGPA|Research|Chance of Admit |
+---------+-----------+-----------------+---+----+----+--------+----------------+
|      337|        118|                4|4.5| 4.5|9.65|       1|            0.92|
|      324|        107|                4|  4| 4.5|8.87|       1|            0.76|
|      316|        104|                3|  3| 3.5|   8|       1|            0.72|
|      322|        110|                3|3.5| 2.5|8.67|       1|             0.8|
|      314|        103|                2|  2|   3|8.21|       0|            0.65|
|      330|        115|                5|4.5|   3|9.34|       1|             0.9|
|      321|        109|                3|  3|   4| 8.2|       1|            0.75|
|      308|        101|                2|  3|   4| 7.9|       0|            0.68|
|      302|        102|                1|  2| 1.5|   8|       0|             0.5|
|      323|     

In [43]:
dataset.columns

['GRE Score',
 'TOEFL Score',
 'University Rating',
 'SOP',
 'LOR ',
 'CGPA',
 'Research',
 'Chance of Admit ']

In [44]:
#Casting to convert string to float.
new_data = dataset.select(*(col(c).cast("float").alias(c) for c in dataset.columns))

In [45]:
new_data.printSchema() # converted from string to float

root
 |-- GRE Score: float (nullable = true)
 |-- TOEFL Score: float (nullable = true)
 |-- University Rating: float (nullable = true)
 |-- SOP: float (nullable = true)
 |-- LOR : float (nullable = true)
 |-- CGPA: float (nullable = true)
 |-- Research: float (nullable = true)
 |-- Chance of Admit : float (nullable = true)



In [46]:
new_data.printSchema()

root
 |-- GRE Score: float (nullable = true)
 |-- TOEFL Score: float (nullable = true)
 |-- University Rating: float (nullable = true)
 |-- SOP: float (nullable = true)
 |-- LOR : float (nullable = true)
 |-- CGPA: float (nullable = true)
 |-- Research: float (nullable = true)
 |-- Chance of Admit : float (nullable = true)



In [47]:
new_data.select([count(when(col(c).isNull(), c)).alias(c) for c in new_data.columns]).show()

+---------+-----------+-----------------+---+----+----+--------+----------------+
|GRE Score|TOEFL Score|University Rating|SOP|LOR |CGPA|Research|Chance of Admit |
+---------+-----------+-----------------+---+----+----+--------+----------------+
|        0|          0|                0|  0|   0|   0|       0|               0|
+---------+-----------+-----------------+---+----+----+--------+----------------+



No Null Values to deal so no need for imputer.

In [48]:
features = new_data.drop('Chance of Admit ')

In [49]:
features.show(5)

+---------+-----------+-----------------+---+----+----+--------+
|GRE Score|TOEFL Score|University Rating|SOP|LOR |CGPA|Research|
+---------+-----------+-----------------+---+----+----+--------+
|    337.0|      118.0|              4.0|4.5| 4.5|9.65|     1.0|
|    324.0|      107.0|              4.0|4.0| 4.5|8.87|     1.0|
|    316.0|      104.0|              3.0|3.0| 3.5| 8.0|     1.0|
|    322.0|      110.0|              3.0|3.5| 2.5|8.67|     1.0|
|    314.0|      103.0|              2.0|2.0| 3.0|8.21|     0.0|
+---------+-----------+-----------------+---+----+----+--------+
only showing top 5 rows



In [50]:
# Assembling features together with VectorAssemble

assembler = VectorAssembler(inputCols=features.columns, outputCol="features")

In [51]:
output = assembler.transform(new_data)

In [52]:
output = output.select('features', 'Chance of Admit ')

In [53]:
output

DataFrame[features: vector, Chance of Admit : float]

In [54]:
train_df, test_df = output.randomSplit([0.7, 0.3]) # Splitting data into 30:70 ratio for train and test

In [55]:
train_df.show()

+--------------------+----------------+
|            features|Chance of Admit |
+--------------------+----------------+
|[290.0,104.0,4.0,...|            0.45|
|[293.0,97.0,2.0,2...|            0.64|
|[294.0,93.0,1.0,1...|            0.46|
|[295.0,93.0,1.0,2...|            0.46|
|[295.0,96.0,2.0,1...|            0.47|
|[295.0,99.0,1.0,2...|            0.37|
|[295.0,99.0,2.0,2...|            0.57|
|[295.0,101.0,2.0,...|            0.69|
|[296.0,95.0,2.0,3...|            0.44|
|[296.0,97.0,2.0,1...|            0.49|
|[296.0,99.0,2.0,2...|            0.61|
|[296.0,99.0,2.0,3...|            0.47|
|[297.0,96.0,2.0,2...|            0.43|
|[297.0,96.0,2.0,2...|            0.34|
|[297.0,98.0,2.0,2...|            0.59|
|[297.0,99.0,4.0,3...|            0.54|
|[297.0,100.0,1.0,...|            0.52|
|[297.0,101.0,3.0,...|            0.57|
|[298.0,92.0,1.0,2...|            0.51|
|[298.0,97.0,2.0,2...|            0.45|
+--------------------+----------------+
only showing top 20 rows



In [56]:
test_df.show()

+--------------------+----------------+
|            features|Chance of Admit |
+--------------------+----------------+
|[290.0,100.0,1.0,...|            0.47|
|[294.0,95.0,1.0,1...|            0.49|
|[296.0,101.0,1.0,...|             0.6|
|[298.0,98.0,2.0,4...|            0.34|
|[298.0,99.0,2.0,4...|            0.46|
|[298.0,101.0,2.0,...|            0.54|
|[299.0,94.0,1.0,1...|            0.42|
|[299.0,100.0,2.0,...|            0.51|
|[299.0,100.0,3.0,...|            0.63|
|[299.0,102.0,3.0,...|            0.56|
|[299.0,106.0,2.0,...|            0.64|
|[300.0,97.0,2.0,3...|            0.65|
|[300.0,99.0,1.0,3...|            0.36|
|[301.0,96.0,1.0,3...|            0.54|
|[301.0,98.0,1.0,2...|            0.67|
|[301.0,99.0,2.0,3...|            0.64|
|[301.0,104.0,2.0,...|            0.68|
|[301.0,107.0,3.0,...|            0.62|
|[302.0,99.0,1.0,2...|            0.57|
|[302.0,99.0,2.0,1...|            0.56|
+--------------------+----------------+
only showing top 20 rows



In [57]:
lin_reg = LinearRegression(featuresCol= 'features', labelCol='Chance of Admit ')
linear_model = lin_reg.fit(train_df)

In [58]:
print("Coefficients: " ,(linear_model.coefficients))
print("Intercept: " , (linear_model.intercept))

Coefficients:  [0.0011084437349106852,0.003338467894893564,0.004345385880682187,0.005535324353859607,0.01564562441979079,0.12567956807290276,0.02727877346248619]
Intercept:  -1.1654401748903265


In [59]:
trainsummary = linear_model.summary
print('RMSE: %f' % trainsummary.rootMeanSquaredError)
print("r2: %f" % trainsummary.r2)

RMSE: 0.059540
r2: 0.826545


In [60]:
# Predictions:

prediction = linear_model.transform(test_df)
prediction.select('prediction', 'Chance of Admit ', 'features').show()

+-------------------+----------------+--------------------+
|         prediction|Chance of Admit |            features|
+-------------------+----------------+--------------------+
|0.48393244641388433|            0.47|[290.0,100.0,1.0,...|
| 0.4739054255264059|            0.49|[294.0,95.0,1.0,1...|
| 0.5301840592777554|             0.6|[296.0,101.0,1.0,...|
| 0.5790217523141461|            0.34|[298.0,98.0,2.0,4...|
| 0.5126724030922283|            0.46|[298.0,99.0,2.0,4...|
| 0.5381877444620913|            0.54|[298.0,101.0,2.0,...|
| 0.4278148674545825|            0.42|[299.0,94.0,1.0,1...|
| 0.5412389714433488|            0.51|[299.0,100.0,2.0,...|
| 0.5631795400028929|            0.63|[299.0,100.0,3.0,...|
| 0.6798032300594015|            0.56|[299.0,102.0,3.0,...|
| 0.6689849894320847|            0.64|[299.0,106.0,2.0,...|
| 0.6084412722658195|            0.65|[300.0,97.0,2.0,3...|
|0.43174373528888466|            0.36|[300.0,99.0,1.0,3...|
| 0.5223656912886985|            0.54|[3

In [61]:
from pyspark.ml.evaluation import RegressionEvaluator
pred_evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='Chance of Admit ', metricName='r2')

print('R Squared (R2) on test data = %g' % pred_evaluator.evaluate(prediction))


R Squared (R2) on test data = 0.80444


In [33]:
spark.stop()