### Pyspark
#### LogisticRegression

In [0]:
#Pyspark-Logistic Regression
#Import the required libraries
import pyspark
from pyspark.ml.classification import LogisticRegression
from pyspark.sql.functions import col,count, when
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.ml.feature import StandardScaler
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler

In [0]:
# Read the diabetes CSV from DBFS; the first row is used as the header.
df1 = (
    spark.read.format("csv")
    .option("header", True)
    .load("dbfs:/FileStore/shared_uploads/praneethsanthosh555@gmail.com/diabetes.csv")
)
# Preview the first five rows.
df1.show(5)

In [0]:
# Print the column names and types of the raw DataFrame.
# NOTE(review): no schema/inferSchema option was passed to the reader, so the
# columns are presumably all read as strings -- confirm against the output.
df1.printSchema()

In [0]:
# Cast every column of the raw data to float, keeping the original column names.
float_columns = [col(name).cast("float").alias(name) for name in df1.columns]
new_data = df1.select(*float_columns)
new_data.printSchema()

In [0]:
# Check whether any null values are present in the data.
# For each column, `when(isNull, c)` yields a non-null value only for null
# cells, so `count` returns the number of nulls in that column.
# `show()` returns None, so keep the DataFrame itself in `data_with_null`
# and display it in a separate step.
data_with_null = df1.select(
    [count(when(col(c).isNull(), c)).alias(c) for c in df1.columns]
)
data_with_null.show()

In [0]:
# Build the feature-vector assembler.
# Every column except the label ("Outcome") is used as an input feature.
# The list is stored under its own name so that it does not overwrite the
# `col` function imported from pyspark.sql.functions, which other cells call.
feature_cols = [name for name in new_data.columns if name != "Outcome"]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

In [0]:
# Run the assembler over the float-cast data to add the "features" vector column.
data = assembler.transform(new_data)
# Show the assembled vector next to the label, without truncating the output.
features_and_label = data.select("features", "Outcome")
features_and_label.show(truncate=False)

In [0]:
# Standardize the assembled feature vector (StandardScaler default settings),
# writing the result to "scaled_features".
# NOTE: the scaler is fitted on the full dataset, i.e. before the train/test
# split that happens in a later cell.
standardScaler = StandardScaler(inputCol="features", outputCol="scaled_features")
scaler_model = standardScaler.fit(data)
data = scaler_model.transform(data)

In [0]:
# Keep only the scaled feature vector and the label for modelling.
model_columns = ["scaled_features", "Outcome"]
assemble_data = data.select(model_columns)
assemble_data.show(n=5)

In [0]:
# Split the data into train (70%) and test (30%) sets.
# A fixed seed is passed so the split -- and therefore the reported metrics --
# can be repeated from run to run instead of changing on every execution.
train_df, test_df = assemble_data.randomSplit([0.7, 0.3], seed=42)

In [0]:
# Preview the first five rows of the training split, then of the test split.
for split_df in (train_df, test_df):
    split_df.show(5)

In [0]:
# Configure a logistic regression on the scaled features with "Outcome" as
# the label, capped at 10 iterations, and fit it on the training split.
log_reg = LogisticRegression(
    featuresCol="scaled_features",
    labelCol="Outcome",
    maxIter=10,
)
log_model = log_reg.fit(train_df)

In [0]:
# Get the predictions: apply the fitted model to the held-out test split.
# The resulting DataFrame carries a "prediction" column alongside "Outcome".
prediction=log_model.transform(test_df)

In [0]:
# Compare the predicted class with the true label for the first ten test rows.
predicted_vs_actual = prediction.select("prediction", "Outcome")
predicted_vs_actual.show(10)

In [0]:
# Get the results or performance on the test data.
# BinaryClassificationMetrics expects an RDD of (score, label) pairs, where
# the score is a continuous value for the positive class. Use the model's
# probability for class 1 as the score and "Outcome" as the label; passing the
# label first, or the hard 0/1 prediction as the score, yields wrong curves.
# Values are converted to plain Python floats for the metrics constructor.
pred = prediction.select("probability", "Outcome").rdd.map(
    lambda row: (float(row["probability"][1]), float(row["Outcome"]))
)

In [0]:
# Build the binary-classification metrics from the `pred` RDD.
metrics = BinaryClassificationMetrics(pred)

# Area under the ROC curve
roc_area = metrics.areaUnderROC
print("Area under ROC = %s" % roc_area)
# Area under the precision-recall curve
pr_area = metrics.areaUnderPR
print("Area under PR = %s" % pr_area)