In [19]:
import pandas as pd
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

# load csv data as pandas dataframe
def load_diabetes_data(path):
    """Read the CSV at *path* and return its contents as a pandas DataFrame.

    *path* is passed to ``pandas.read_csv`` unchanged.
    """
    frame = pd.read_csv(path)
    return frame

# Absolute location of the diabetes CSV. Adjacent string literals inside the
# parentheses are joined by the parser into one path string.
file_path = (
    "/Users/pepijnschouten/Desktop/Python_Scripts"
    "/Python_Scripts_Books/Distributed_ML_with_PySpark"
    "/Python_Own_Files/Chapter 2/data/diabetes.csv"
)
pandas_df = load_diabetes_data(path=file_path)

# Get (or create) a Spark session named "diabetes_data", then build a Spark
# DataFrame from the pandas DataFrame loaded above.
spark = (
    SparkSession.builder
    .appName("diabetes_data")
    .getOrCreate()
)
spark_df = spark.createDataFrame(pandas_df)

# Preview the first five rows of each DataFrame.
print(pandas_df.head())
spark_df.show(5)

# Structural information.  pandas_df.info() and printSchema() write their
# own output; count() only returns a number, so it has to be printed
# explicitly -- a bare expression that is not the last statement of a
# notebook cell (or anywhere in a script) is evaluated and then discarded.
pandas_df.info()
spark_df.printSchema()
print(spark_df.count())

# Descriptive statistics of every column except the "Outcome" column.
print(pandas_df.drop(columns=["Outcome"]).describe())
spark_df.drop("Outcome").summary().show()

# Number of rows per distinct "Outcome" value.
print(pandas_df["Outcome"].value_counts())
spark_df.groupBy("Outcome").count().show()

# Number of null values per column.
print(pandas_df.isnull().sum())
spark_df.select([
    F.sum(F.col(c).isNull().cast("int")).alias(c) for c in spark_df.columns]).show()

# Number of zero values per column.
print((pandas_df == 0).sum())
spark_df.select(
    [F.sum((F.col(c) == 0).cast("int")).alias(c) for c in spark_df.columns]).show()

# Keep only the rows in which Glucose, BloodPressure and BMI are all
# non-zero, and restrict both DataFrames to the columns listed below (the
# SkinThickness and Insulin columns are not carried over).
kept_columns = ["Pregnancies", "Glucose", "BloodPressure",
                "BMI", "DiabetesPedigreeFunction", "Age", "Outcome"]

# pandas: boolean row mask + column list in a single .loc lookup.
nonzero_rows = (
    (pandas_df["Glucose"] != 0)
    & (pandas_df["BloodPressure"] != 0)
    & (pandas_df["BMI"] != 0)
)
pandas_df = pandas_df.loc[nonzero_rows, kept_columns]

# Spark: the same condition as a Column expression, then the same columns.
nonzero_condition = (
    (spark_df["Glucose"] != 0)
    & (spark_df["BloodPressure"] != 0)
    & (spark_df["BMI"] != 0)
)
spark_df = spark_df.filter(nonzero_condition).select(kept_columns)

# Report (rows, columns) of both filtered DataFrames.
print(pandas_df.shape)
spark_row_total = spark_df.count()
print(spark_row_total, len(spark_df.columns))

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|  