In [0]:
#Pyspark-DecisionTreeRegressor
#Import the required libraries
from __future__ import print_function
import pyspark
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import StandardScaler,VectorAssembler,Imputer,VectorIndexer
from pyspark.sql import SparkSession

In [0]:
# Load the admissions CSV from DBFS into a Spark DataFrame.
# getOrCreate() returns the already-running session when one exists (e.g. the
# `spark` object that Databricks predefines) and starts one otherwise, so this
# cell does not depend on a pre-injected `spark` global.
spark = SparkSession.builder.getOrCreate()

# header=True takes the column names from the first row of the file.
# No schema is supplied or inferred here; a later cell casts the columns.
df1 = spark.read.format("csv").load("dbfs:/FileStore/shared_uploads/praneethsanthosh555@gmail.com/Admission_Prediction-1.csv",header=True)

In [0]:
# Preview the first five rows of the loaded data.
df1.show(n=5)

In [0]:
# List the column names of the loaded DataFrame.
df1.columns

In [0]:
# Get the data type of each column as a list of (column name, type) pairs.
# (A stray trailing "." after `dtypes` made this cell a syntax error.)
df1.dtypes

In [0]:
from pyspark.sql.functions import col,when,count

# Re-project every column of df1 as float, keeping each column's original name.
float_columns=[col(name).cast('float').alias(name) for name in df1.columns]
new_df=df1.select(*float_columns)

# Preview the first five rows of the converted data.
new_df.show(n=5)

In [0]:
# Count the null entries in each column of new_df.
# The result is a single row with one count per input column.
null_counts=[
    count(when(col(name).isNull(),name)).alias(name)
    for name in new_df.columns
]
missing_data=new_df.select(null_counts)
missing_data.show()

In [0]:
# Impute missing values in the three score/rating columns.
# The input and output column lists are identical, so the imputed values are
# written back under the same column names. No strategy is passed, so the
# Imputer's default strategy applies.
impute_cols=["GRE Score","TOEFL Score","University Rating"]
impute_data=Imputer(inputCols=impute_cols,outputCols=impute_cols)

# Fit on new_df, then apply the fitted model to that same frame.
model_impute=impute_data.fit(new_df)
imputed_data=model_impute.transform(new_df)

In [0]:
# After imputing, re-run the per-column null count to check whether any
# missing values remain in the data.
remaining_nulls=[
    count(when(col(name).isNull(),name)).alias(name)
    for name in imputed_data.columns
]
imputed_data.select(remaining_nulls).show()

In [0]:
# Every column except the label "Chance of Admit" is used as a feature.
# NOTE(review): this includes any identifier-like column present in the data;
# confirm that is intended.
features=imputed_data.drop("Chance of Admit")
feature_names=features.columns

# Assemble all the feature columns into a single vector column "features".
assemble_data=VectorAssembler(inputCols=feature_names,outputCol="features")

# Despite its name, `assembler` holds the transformed DataFrame, not the
# VectorAssembler itself.
assembler=assemble_data.transform(imputed_data)

In [0]:
# Preview the assembled feature vector next to the label column.
assembled_preview=assembler.select("features","Chance of Admit")
assembled_preview.show(n=5)

In [0]:
# Configure a VectorIndexer that reads "features" and writes "IndexedFeatures",
# with maxCategories=4.
vector_indexer=VectorIndexer(
    inputCol="features",
    outputCol="IndexedFeatures",
    maxCategories=4,
)

# `indexer` is the fitted model; applying it adds the "IndexedFeatures" column.
indexer=vector_indexer.fit(assembler)
indexer_df=indexer.transform(assembler)

In [0]:
# Keep only the indexed feature vector and the label for modelling.
model_columns=["IndexedFeatures","Chance of Admit"]
new_output=indexer_df.select(*model_columns)

# Show the first five rows without truncating the vector column.
new_output.show(n=5,truncate=False)

In [0]:
# Split the data into training (weight 0.7) and testing (weight 0.3) sets.
# A fixed seed is passed so the sampling uses the same seed on every run;
# without it, each run draws a different split and the metrics reported
# further down cannot be compared between runs.
train_df,test_df=new_output.randomSplit([0.7,0.3],seed=42)

# Show the first five rows of the train and test data.
train_df.show(5)
test_df.show(5)

In [0]:
# Configure a decision-tree regressor that reads the indexed feature vector
# and predicts the "Chance of Admit" label.
dt=DecisionTreeRegressor(
    featuresCol="IndexedFeatures",
    labelCol="Chance of Admit",
)

# Fit the regressor on the training split.
model=dt.fit(train_df)

In [0]:
# Apply the fitted model to the test split.
predictions=model.transform(test_df)

# Compare the first five predictions with the actual label values.
prediction_vs_label=predictions.select("prediction","Chance of Admit")
prediction_vs_label.show(n=5)

In [0]:
# Evaluate the test-set predictions with the "mae" metric.
evaluter=RegressionEvaluator(
    labelCol="Chance of Admit",
    predictionCol="prediction",
    metricName="mae",
)
predictions_text=evaluter.evaluate(predictions)
print('MAE Score: %s' % predictions_text)

In [0]:
# Evaluate the test-set predictions with the "r2" metric.
evaluter=RegressionEvaluator(
    labelCol="Chance of Admit",
    predictionCol="prediction",
    metricName="r2",
)
predictions_text=evaluter.evaluate(predictions)
print('R2 Score: %s' % predictions_text)

In [0]:
# Evaluate the test-set predictions with the "rmse" metric.
evaluter=RegressionEvaluator(
    labelCol="Chance of Admit",
    predictionCol="prediction",
    metricName="rmse",
)
predictions_text=evaluter.evaluate(predictions)
print('RMSE Score: %s' % predictions_text)