### Pyspark
#### Linear Regression

In [0]:
#Import the libraries
import pyspark
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

In [0]:
# Read the admissions CSV from DBFS, treating the first row as the header.
# No schema is supplied or inferred here; the columns are cast to float in a
# later step.
df1 = (
    spark.read.format("csv")
    .option("header", True)
    .load("dbfs:/FileStore/shared_uploads/praneethsanthosh555@gmail.com/Admission_Prediction.csv")
)

In [0]:
# Preview the first five rows of the raw dataset.
df1.show(n=5)

In [0]:
# Print each column's name, data type and nullable flag for the raw data.
df1.printSchema()

In [0]:
# Cast every column to float while keeping the original column names.
from pyspark.sql.functions import col

float_columns = [col(name).cast("float").alias(name) for name in df1.columns]
new_data = df1.select(*float_columns)

In [0]:
# Print the schema of the converted data: each column's name, its data type,
# and whether the column is nullable (true/false). This does not count nulls;
# the null counts are computed separately.
new_data.printSchema()

In [0]:
# Count the null entries in every column of the float-converted data.
from pyspark.sql.functions import col, when, count, isnan

null_counts = [
    count(when(col(name).isNull(), name)).alias(name)
    for name in df1.columns
]
new_data.select(null_counts).show()

In [0]:
# Fill missing values in the three score/rating columns. The output column
# names equal the input names, so the columns are replaced in place. No
# strategy is passed, so the Imputer's default strategy applies.
# NOTE(review): the remaining columns are not imputed -- confirm they never
# contain nulls, since they are later fed to the VectorAssembler.
from pyspark.ml.feature import Imputer

impute_columns = ["GRE Score", "TOEFL Score", "University Rating"]
imputer = Imputer(inputCols=impute_columns, outputCols=impute_columns)
model = imputer.fit(new_data)
imputed_data = model.transform(new_data)

In [0]:
# Re-count the nulls per column on the imputed data to check the result.
remaining_nulls = [
    count(when(col(name).isNull(), name)).alias(name)
    for name in new_data.columns
]
imputed_data.select(remaining_nulls).show()

In [0]:
# Preview the first five rows after imputation.
imputed_data.show(n=5)

In [0]:
# Build the predictor set (x) by dropping the target column; the target (y)
# stays available in imputed_data.
features = imputed_data.drop("Chance of Admit")
features.show(n=5)

In [0]:
# Configure an assembler that packs every predictor column into a single
# vector column named "features".
feature_names = features.columns
assemble = VectorAssembler(outputCol="features", inputCols=feature_names)

In [0]:
# Run the assembler over the imputed data; the result carries the assembled
# "features" vector column used in the following steps.
output=assemble.transform(imputed_data)

In [0]:
# Keep only the assembled feature vector and the target, then preview.
model_columns = ["features", "Chance of Admit"]
output = output.select(model_columns)
output.show(n=5)

In [0]:
# Split roughly 70/30 into training and test sets. A fixed seed makes the
# split repeatable for the same input data, so the fitted model and the
# reported metrics do not change from run to run.
train_df, test_df = output.randomSplit([0.7, 0.3], seed=42)

In [0]:
# Preview the first five rows of the training split, then of the test split.
for split_df in (train_df, test_df):
    split_df.show(n=5)

In [0]:
# Configure a linear regression on the assembled feature vector with
# "Chance of Admit" as the label, and fit it to the training split.
linear_reg = LinearRegression(labelCol="Chance of Admit", featuresCol="features")
linear_model = linear_reg.fit(train_df)

In [0]:
# Report the fitted parameters. Print the whole coefficient vector (one
# weight per assembled feature) rather than only its first element, which is
# what the "Coefficients" label promises.
print("Coefficients: ",linear_model.coefficients)
print("Intercept: ",linear_model.intercept)

In [0]:
# Report the fit metrics taken from the model's summary object.
train_summary = linear_model.summary
for template, value in (
    ("RMSE: %f", train_summary.rootMeanSquaredError),
    ('R2:  %f', train_summary.r2),
    ("MAE: %f ", train_summary.meanAbsoluteError),
):
    print(template % value)

In [0]:
# Score the test split and show each prediction next to the actual target
# value and the input feature vector.
prediction = linear_model.transform(test_df)
comparison = prediction.select(["prediction", "Chance of Admit", "features"])
comparison.show()

In [0]:
# Evaluate the test-set predictions with the R2 metric.
from pyspark.ml.evaluation import RegressionEvaluator

pred_evaluater = RegressionEvaluator(
    predictionCol="prediction",
    labelCol="Chance of Admit",
    metricName="r2",
)
# R2 is a goodness-of-fit score (higher is better), not an error measure, so
# the message labels it as a score and states that it is computed on the
# test-split predictions.
print("R2 score (r2) on test data is found to be %g" % pred_evaluater.evaluate(prediction))