In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('water_potability').getOrCreate()
spark

In [2]:
data = spark.read.csv("C:\\ML\\Machine-Learning-Projects-master\\Water Quality Classification\\water_potability.csv",header = True,inferSchema = True)
data.show(5)

+-----------------+------------------+------------------+-----------------+------------------+------------------+------------------+------------------+------------------+----------+
|               ph|          Hardness|            Solids|      Chloramines|           Sulfate|      Conductivity|    Organic_carbon|   Trihalomethanes|         Turbidity|Potability|
+-----------------+------------------+------------------+-----------------+------------------+------------------+------------------+------------------+------------------+----------+
|             null| 204.8904554713363|20791.318980747026|7.300211873184757|368.51644134980336| 564.3086541722439|  10.3797830780847|  86.9909704615088|2.9631353806316407|         0|
| 3.71608007538699|129.42292051494425|18630.057857970347|   6.635245883862|              null| 592.8853591348523|15.180013116357259| 56.32907628451764| 4.500656274942408|         0|
|8.099124189298397|224.23625939355776|19909.541732292393|9.275883602694089|              n

In [3]:
data.printSchema()

root
 |-- ph: double (nullable = true)
 |-- Hardness: double (nullable = true)
 |-- Solids: double (nullable = true)
 |-- Chloramines: double (nullable = true)
 |-- Sulfate: double (nullable = true)
 |-- Conductivity: double (nullable = true)
 |-- Organic_carbon: double (nullable = true)
 |-- Trihalomethanes: double (nullable = true)
 |-- Turbidity: double (nullable = true)
 |-- Potability: integer (nullable = true)



In [4]:
from pyspark.sql.functions import isnan,count,when,col
data.select([count(when(col(c).isNull(),c)).alias(c) for c in data.columns]).show()

+---+--------+------+-----------+-------+------------+--------------+---------------+---------+----------+
| ph|Hardness|Solids|Chloramines|Sulfate|Conductivity|Organic_carbon|Trihalomethanes|Turbidity|Potability|
+---+--------+------+-----------+-------+------------+--------------+---------------+---------+----------+
|491|       0|     0|          0|    781|           0|             0|            162|        0|         0|
+---+--------+------+-----------+-------+------------+--------------+---------------+---------+----------+



In [5]:
#TO fill null values 
from pyspark.ml.feature import Imputer
imputer = Imputer(inputCols = ['ph','Sulfate','Trihalomethanes'],
                 outputCols =['{}_imputed'.format(c) for c in ['ph','Sulfate','Trihalomethanes']]).setStrategy('mean')

In [6]:
data = imputer.fit(data).transform(data)
data.show(5)

+-----------------+------------------+------------------+-----------------+------------------+------------------+------------------+------------------+------------------+----------+-----------------+------------------+-----------------------+
|               ph|          Hardness|            Solids|      Chloramines|           Sulfate|      Conductivity|    Organic_carbon|   Trihalomethanes|         Turbidity|Potability|       ph_imputed|   Sulfate_imputed|Trihalomethanes_imputed|
+-----------------+------------------+------------------+-----------------+------------------+------------------+------------------+------------------+------------------+----------+-----------------+------------------+-----------------------+
|             null| 204.8904554713363|20791.318980747026|7.300211873184757|368.51644134980336| 564.3086541722439|  10.3797830780847|  86.9909704615088|2.9631353806316407|         0|7.080794504276819|368.51644134980336|       86.9909704615088|
| 3.71608007538699|129.42292

In [7]:
data = data.drop('ph','Sulfate','Trihalomethanes')

In [8]:
data.show(5)

+------------------+------------------+-----------------+------------------+------------------+------------------+----------+-----------------+------------------+-----------------------+
|          Hardness|            Solids|      Chloramines|      Conductivity|    Organic_carbon|         Turbidity|Potability|       ph_imputed|   Sulfate_imputed|Trihalomethanes_imputed|
+------------------+------------------+-----------------+------------------+------------------+------------------+----------+-----------------+------------------+-----------------------+
| 204.8904554713363|20791.318980747026|7.300211873184757| 564.3086541722439|  10.3797830780847|2.9631353806316407|         0|7.080794504276819|368.51644134980336|       86.9909704615088|
|129.42292051494425|18630.057857970347|   6.635245883862| 592.8853591348523|15.180013116357259| 4.500656274942408|         0| 3.71608007538699| 333.7757766108134|      56.32907628451764|
|224.23625939355776|19909.541732292393|9.275883602694089| 418.606

In [9]:
len(data.columns)

10

In [10]:
cols = data.columns
cols = cols[:9]
cols

['Hardness',
 'Solids',
 'Chloramines',
 'Conductivity',
 'Organic_carbon',
 'Turbidity',
 'Potability',
 'ph_imputed',
 'Sulfate_imputed']

In [11]:
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols = cols,outputCol = 'Independent Features')
output = assembler.transform(data)
output.show(5)

+------------------+------------------+-----------------+------------------+------------------+------------------+----------+-----------------+------------------+-----------------------+--------------------+
|          Hardness|            Solids|      Chloramines|      Conductivity|    Organic_carbon|         Turbidity|Potability|       ph_imputed|   Sulfate_imputed|Trihalomethanes_imputed|Independent Features|
+------------------+------------------+-----------------+------------------+------------------+------------------+----------+-----------------+------------------+-----------------------+--------------------+
| 204.8904554713363|20791.318980747026|7.300211873184757| 564.3086541722439|  10.3797830780847|2.9631353806316407|         0|7.080794504276819|368.51644134980336|       86.9909704615088|[204.890455471336...|
|129.42292051494425|18630.057857970347|   6.635245883862| 592.8853591348523|15.180013116357259| 4.500656274942408|         0| 3.71608007538699| 333.7757766108134|      

In [12]:
finilized_data = output.select('Independent Features','Potability')
finilized_data.show(5)

+--------------------+----------+
|Independent Features|Potability|
+--------------------+----------+
|[204.890455471336...|         0|
|[129.422920514944...|         0|
|[224.236259393557...|         0|
|[214.373394085622...|         0|
|[181.101509236125...|         0|
+--------------------+----------+
only showing top 5 rows



In [13]:
#model
from pyspark.ml.classification import LogisticRegression

#train_test_split
train_data,test_data = finilized_data.randomSplit([0.75,0.25])
LR = LogisticRegression(labelCol = 'Potability',featuresCol = 'Independent Features')
LR  = LR.fit(train_data)
pred_test = LR.transform(test_data)
pred_test.show(10)

+--------------------+----------+--------------------+--------------------+----------+
|Independent Features|Potability|       rawPrediction|         probability|prediction|
+--------------------+----------+--------------------+--------------------+----------+
|[97.2809085978074...|         1|[-18.836847918773...|[6.59569993420787...|       1.0|
|[98.3679148956603...|         1|[-18.916950509913...|[6.08797382400400...|       1.0|
|[100.457615091583...|         0|[19.0665027162042...|[0.99999999475768...|       0.0|
|[105.859263571954...|         0|[19.0124790307928...|[0.99999999446668...|       0.0|
|[111.478581754181...|         1|[-18.873713516703...|[6.35697295694659...|       1.0|
|[113.831112174351...|         1|[-18.872493258878...|[6.3647348376744E...|       1.0|
|[114.463899842195...|         1|[-18.878861592010...|[6.32433087568579...|       1.0|
|[114.733544971534...|         1|[-18.885146200981...|[6.28470956201657...|       1.0|
|[116.905479298641...|         1|[-18.91800

In [14]:
pred_test.select('Potability','prediction').show(10)

+----------+----------+
|Potability|prediction|
+----------+----------+
|         1|       1.0|
|         1|       1.0|
|         0|       0.0|
|         0|       0.0|
|         1|       1.0|
|         1|       1.0|
|         1|       1.0|
|         1|       1.0|
|         1|       1.0|
|         0|       0.0|
+----------+----------+
only showing top 10 rows



In [15]:
lr_summary = LR.summary

In [16]:
#overall accuracy of classification model
print(lr_summary.accuracy*100)

100.0


In [17]:
print('areaUnderROC:',lr_summary.areaUnderROC*100)

areaUnderROC: 99.99996521838334


In [18]:
#precision of both classes
print('precision:',lr_summary.precisionByLabel)

precision: [1.0, 1.0]


In [19]:
#recall of both classes
print('recall:',lr_summary.recallByLabel)

recall: [1.0, 1.0]


In [20]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol = 'Potability',predictionCol = 'prediction',metricName = 'accuracy')
accuracy = evaluator.evaluate(pred_test)
print('accuracy:',accuracy*100)

accuracy: 100.0


In [30]:
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score
import pandas as pd

In [23]:
y_true = pred_test.select('potability')
y_true = y_true.toPandas()
y_pred = pred_test.select('prediction')
y_pred = y_pred.toPandas()

In [33]:
print('accuracy score:',accuracy_score(y_pred,y_true)*100)
print('*************************classification_report******************')
print(classification_report(y_pred,y_true))
print('**********************Confusion_matrix**************************')
pd.DataFrame(confusion_matrix(y_pred,y_true))

accuracy score: 100.0
*************************classification_report******************
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00       499
         1.0       1.00      1.00      1.00       319

    accuracy                           1.00       818
   macro avg       1.00      1.00      1.00       818
weighted avg       1.00      1.00      1.00       818

**********************Confusion_matrix**************************


Unnamed: 0,0,1
0,499,0
1,0,319
