In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install pyspark

In [None]:
from pyspark.sql import SparkSession

# Reuse the active Spark session if one exists; otherwise start a new one.
# No builder options are set here.
session_builder = SparkSession.builder
spark = session_builder.getOrCreate()

In [None]:
# Load the diabetes CSV: the first row supplies the column names and Spark
# infers each column's type from the data.
csv_path = '/kaggle/input/diabetes-data-set/diabetes.csv'
df = spark.read.csv(csv_path, header=True, inferSchema=True)

In [None]:
# Bring the first five rows back to the driver as a list of Row objects.
df.limit(5).collect()

In [None]:
# Render the first 20 rows as a text table, truncating long cell values
# (these are show()'s defaults, spelled out).
df.show(n=20, truncate=True)

In [None]:
# Print df's column names and types; the types were inferred when the CSV was
# read (inferSchema=True), so this is the place to check they came out as expected.
df.printSchema()

In [None]:
# Report the dataset shape as "<rows> <columns>".
n_rows = df.count()
n_cols = len(df.columns)
print(n_rows, n_cols)

In [None]:
# Class balance: number of rows for each value of the 'Outcome' label.
print('Total number of diabetic and non-diabetic:')
# show() prints the table itself and returns None, so it must not be wrapped
# in print() -- doing so emits a stray "None" line after the table.
df.groupBy('Outcome').count().show()

In [None]:
# Summary statistics for the columns of df, shown as a table.
descriptive_stats = df.describe()
descriptive_stats.show()

In [None]:
# Missing-value check: for every column, print how many rows hold a null there.
for column_name in df.columns:
    rows_with_null = df.filter(df[column_name].isNull())
    print(column_name + ': ', rows_with_null.count())

In [None]:
# Correlation of each column with the 'Outcome' label.
# The loop runs over all of df.columns, so 'Outcome' against itself is printed too.
for column_name in df.columns:
    correlation = df.corr('Outcome', column_name)
    print('Correlation between Outcome column and ' + column_name + ' is', correlation)

In [None]:
# Feature selection: pack the chosen predictor columns into a single vector column.

from pyspark.ml.feature import VectorAssembler

# Only the columns listed here go into the model; any other column of df is
# left out of the 'features' vector.
feature_columns = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
]
assembler = VectorAssembler(outputCol='features', inputCols=feature_columns)
output = assembler.transform(df)

In [None]:
# Preview the assembled frame: first 20 rows, long values truncated
# (show()'s defaults, written explicitly).
output.show(n=20, truncate=True)

In [None]:
# Print the schema of the assembled frame; it should list the original df
# columns plus the 'features' column added by the VectorAssembler.
output.printSchema()

In [None]:
from pyspark.ml.classification import LogisticRegression

# Keep only what the classifier needs: the assembled feature vector and the label.
model_columns = ['features', 'Outcome']
final_data = output.select(model_columns)

In [None]:
# Confirm final_data holds just the two selected columns, 'features' and 'Outcome'.
final_data.printSchema()

In [None]:
# Fix the split seed so the 80/20 train/test partition -- and therefore every
# metric reported from it -- is the same on each Restart & Run All. Without a
# seed, randomSplit draws a new random split on every run.
SPLIT_SEED = 42
train, test = final_data.randomSplit([0.8, 0.2], seed=SPLIT_SEED)

# Fit a logistic regression with 'Outcome' as the label. The features column is
# left at the estimator's default name, 'features', which matches final_data.
logistic_regression = LogisticRegression(labelCol='Outcome')
model = logistic_regression.fit(train)

In [None]:
# Training-set view: summary statistics of the predictions the fitted model
# produced on the data it was trained on.
summary = model.summary
training_predictions = summary.predictions
training_predictions.describe().show()

In [None]:
#evaluate model

from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Score the held-out test split. Despite its name, `predictions` is the
# evaluation summary object; the scored rows are in its `.predictions` attribute.
predictions = model.evaluate(dataset=test)

In [None]:
# Display the scored test rows held by the evaluation summary.
test_predictions = predictions.predictions
test_predictions.show()

In [None]:
# BinaryClassificationEvaluator does not compute accuracy: its default metric is
# the area under the ROC curve. Name the metric explicitly and label the printed
# value for what it is, instead of reporting it as "accuracy".
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol='rawPrediction',
    labelCol='Outcome',
    metricName='areaUnderROC',
)
test_scored = model.transform(test)
print('Model AUC (area under ROC): ', evaluator.evaluate(test_scored))

# Actual classification accuracy on the test split, taken from the model's
# evaluation summary.
print('Model accuracy: ', model.evaluate(test).accuracy)