In [24]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, DoubleType, IntegerType, BooleanType
from pyspark.ml import feature, classification, evaluation
schema = StructType([
    StructField("ph", DoubleType(), True),
    StructField("hardness", DoubleType(), True),
    StructField("solids", DoubleType(), True),
    StructField("chloramines", DoubleType(), True),
    StructField("sulfate", DoubleType(), True),
    StructField("conductivity", DoubleType(), True),
    StructField("organic_carbon", DoubleType(), True),
    StructField("trihalomethanes", DoubleType(), True),
    StructField("turbidity", DoubleType(), True),
    StructField("potability", IntegerType(), True),
])

spark = SparkSession.builder.appName("water").getOrCreate()
#df = spark.read.csv("./water_potability.csv", header=False, schema=schema)
df = spark.read.csv("./water_potability.csv", header=False, schema=schema)
df.createOrReplaceTempView("df")
df_train, df_eval = df.randomSplit([0.8,0.2], 42)




In [25]:
df_train.show(3)
df_train = df_train.dropna()
df_train.show(3)

+----+------------------+-----------------+------------------+----------------+------------------+------------------+-----------------+-----------------+----------+
|  ph|          hardness|           solids|       chloramines|         sulfate|      conductivity|    organic_carbon|  trihalomethanes|        turbidity|potability|
+----+------------------+-----------------+------------------+----------------+------------------+------------------+-----------------+-----------------+----------+
|null|              null|             null|              null|            null|              null|              null|             null|             null|      null|
|null|  98.3679148956603|28415.57583214058|10.558949998467961|296.843207792478|505.24026927891407|12.882614472289333|85.32995534051292|4.119087300328971|         1|
|null|105.85926357195498|37928.14217716675| 5.609440345508508|            null|358.88876761151056|12.207108489369546|71.11989017420973|3.873853349593973|         0|
+----+----

In [31]:
print(df.columns[:-1])
vect = feature.VectorAssembler(inputCols=df.columns[:-1], outputCol="features_raw")
df_train_vectorized = vect.transform(df_train)
df_train_vectorized.show(1)
df_train_vectorized = df_train_vectorized.select("potability", "features_raw")

df_train_vectorized.show(3)



['ph', 'hardness', 'solids', 'chloramines', 'sulfate', 'conductivity', 'organic_carbon', 'trihalomethanes', 'turbidity']
+-------------------+-----------------+------------------+------------------+------------------+-----------------+------------------+-----------------+-----------------+----------+--------------------+
|                 ph|         hardness|            solids|       chloramines|           sulfate|     conductivity|    organic_carbon|  trihalomethanes|        turbidity|potability|        features_raw|
+-------------------+-----------------+------------------+------------------+------------------+-----------------+------------------+-----------------+-----------------+----------+--------------------+
|0.22749905020219874|152.5301111764229|39028.599340290755|3.4624920476792767|283.69378223429663|443.0292321286284|13.201943203829217|62.32271110691731|3.545741437567914|         1|[0.22749905020219...|
+-------------------+-----------------+------------------+-------------

In [34]:
scaler = feature.StandardScaler(inputCol="features_raw", outputCol="features")
scaler_t = scaler.fit(df_train_vectorized)
df_train_scaled = scaler_t.transform(df_train_vectorized)
df_train_scaled = df_train_scaled.select("potability", "features")
df_train_scaled.show(2)

+----------+--------------------+
|potability|            features|
+----------+--------------------+
|         1|[0.14545424324450...|
|         1|[0.63291223270976...|
+----------+--------------------+
only showing top 2 rows



In [42]:
forest = classification.RandomForestClassifier(featuresCol='features', labelCol='potability',maxDepth=8, minInstancesPerNode=5, seed=42)
forest_t = forest.fit(df_train_scaled)
prediction_train = forest_t.transform(df_train_scaled)
prediction_train.show(3)


+----------+--------------------+--------------------+--------------------+----------+
|potability|            features|       rawPrediction|         probability|prediction|
+----------+--------------------+--------------------+--------------------+----------+
|         1|[0.14545424324450...|[9.34231037970611...|[0.46711551898530...|       1.0|
|         1|[0.63291223270976...|[7.67136337670512...|[0.38356816883525...|       1.0|
|         1|[1.12338272955014...|[10.4771090484300...|[0.52385545242150...|       0.0|
+----------+--------------------+--------------------+--------------------+----------+
only showing top 3 rows



In [45]:
binaryevaluator = evaluation.BinaryClassificationEvaluator(
    labelCol='potability'
    
)
multievaluator = evaluation.MulticlassClassificationEvaluator(
    labelCol='potability',
    metricName="accuracy"
)
print(binaryevaluator.evaluate(prediction_train))
print(multievaluator.evaluate(prediction_train))

0.9113604816195348
0.7664009809932557
