# Importing useful packages


In [None]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import warnings 
warnings.filterwarnings('ignore')

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
%matplotlib inline
 
# machine learning
from sklearn.ensemble import RandomForestClassifier
# NOTE: plot_confusion_matrix was removed in scikit-learn 1.2; drop it from this
# import when running on a newer release (ConfusionMatrixDisplay replaces it).
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    accuracy_score,
    confusion_matrix,
    mean_absolute_error,
    plot_confusion_matrix,
)
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier

print('setup complete')

# Understanding Data and Data Cleaning

In [None]:
# Load the CSV at the relative path below into `df` and preview its first rows.
df = pd.read_csv("../input/water-potability/water_potability.csv")
df.head()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
df.describe()

In [None]:
# Number of missing values in each column (`isna` is the canonical name of `isnull`).
df.isna().sum()

Since the dataset is large, I am going to remove the rows with null values rather than replace them with the column means. Common sense tells us, for example, that water with a very low or very high pH is most likely not drinkable (we will check that in the EDA); therefore I don't want to fill in missing pH values with a mean of about 7, since that would change the predicted values in the ML section.

In [None]:
# Keep only the rows without any missing value, then confirm that the
# per-column null counts are all zero.
df = df.dropna()
df.isna().sum()

In [None]:
# Class balance of the target shown two ways side by side:
# a pie chart of the class shares (left) and a bar chart of the raw counts (right).
fig, ax = plt.subplots(1, 2, figsize=(10, 8))
pie_ax, bar_ax = ax

class_counts = df['Potability'].value_counts()
class_counts.plot.pie(ax=pie_ax)
sns.countplot(x='Potability', data=df, ax=bar_ax)
plt.show()

The data seems imbalanced; we will fix that:

In [None]:
# Boolean row masks, one per class: Potability[0] selects the rows labelled 0,
# Potability[1] the rows labelled 1.
Potability = [df['Potability'].eq(label) for label in (0, 1)]

# Size gap between the classes: rows labelled 0 minus rows labelled 1.
print(int(Potability[0].sum()) - int(Potability[1].sum()))

In [None]:
# Balance the two classes by randomly under-sampling the larger one.
#  * The number of rows to drop is computed from the data instead of being
#    hard-coded, so the cell stays correct if the dataset or the cleaning changes.
#  * Rows are removed by index label rather than with concat + drop_duplicates,
#    which would also discard any rows that are genuinely duplicated in `df`.
#    (Assumes the index labels of `df` are unique - TODO confirm if the index is
#    ever rebuilt upstream.)
#  * The sample is seeded so the same rows are dropped on every run.
class_counts = df['Potability'].value_counts()
surplus = int(class_counts.max() - class_counts.min())

# Defined even when nothing has to be removed, so the name always exists afterwards.
rows_to_remove = df.iloc[0:0]
if surplus > 0:
    majority_class = class_counts.idxmax()
    rows_to_remove = df[df['Potability'] == majority_class].sample(n=surplus, random_state=42)
    df = df.drop(index=rows_to_remove.index)

# Visual check: both bars should now have the same height.
sns.countplot(data=df, x='Potability')
plt.show()

# EDA (Exploratory Data Analysis)

## Distribution 
Checking whether the distribution of each feature is normal.

In [None]:
# One histogram per column except the last one (the last column is used as the
# target elsewhere in this notebook), laid out on a 3x3 grid.
# TODO (translated from the author's note): also draw box plots in this loop to
# spot outliers, and then remove them.
plt.figure(figsize=[10, 10])
feature_columns = df.columns[:-1]
for position, column in enumerate(feature_columns, start=1):
    plt.subplot(3, 3, position)
    sns.histplot(df[column])
    plt.title(column)
plt.tight_layout()

In [None]:
# One horizontal box plot per column except the last, on a 3x3 grid, to make
# outliers visible.
plt.figure(figsize=[10, 10])
for position, column in enumerate(df.columns[:-1], start=1):
    plt.subplot(3, 3, position)
    # Pass the values by keyword: a positional Series is deprecated in seaborn 0.11
    # and is interpreted as `data` (not `x`) from seaborn 0.12 on, which would
    # flip the orientation of the plot.
    sns.boxplot(x=df[column])
    plt.title(column)
plt.tight_layout()
plt.show()

## Correlation

In [None]:
# Pairwise correlation between all columns of `df`, with each coefficient
# printed inside its cell.
plt.figure(figsize=(10, 10))
sns.heatmap(df.corr(), annot=True)
plt.title('Correlation between the features and Potability')
# Ending on plt.show() keeps the Axes repr out of the cell output.
plt.show()
# TODO (translated from the author's note): state in markdown which feature has the
# strongest relation with Potability, make the notebook more presentable overall,
# and add an introduction and sub-titles.

In [None]:
# Author's note: Solids seems to have the largest correlation with Potability
# (0.046), so its distribution is compared between the two classes.
undrinkable = df.loc[df['Potability'] == 0, 'Solids']
drinkable = df.loc[df['Potability'] == 1, 'Solids']

plt.figure(figsize=(10, 10))

# Left panel: rows labelled 0.
plt.subplot(121)
plt.boxplot(x=undrinkable)
plt.title("Solids in unpotable water")

# Right panel: rows labelled 1.
plt.subplot(122)
plt.boxplot(x=drinkable)
plt.title("Solids in potable water")

## Mean of Solids:

In [None]:
# Mean Solids value of each class, printed one class per line.
for label, solids in (("Unpotable water:", undrinkable), ("Potable water:", drinkable)):
    print(label, solids.mean())


## Checking if highly acidic or basic water is potable

We will treat every pH value outside the range from Q1 (pH of 6) to Q3 (pH of 8) as highly acidic or highly basic. The goal is to understand whether an 'abnormal' pH level alone is enough to conclude that water is not potable. Water within the 'normal' pH range would still need further evaluation, but being able to say that water with an 'abnormal' pH is definitely not potable would speed up the process of checking whether water is drinkable, since measuring pH is much easier than measuring the other features in this dataset.

In [None]:
# Split the rows by pH: "normal" is strictly between 6 and 8, "abnormal" is every
# other row. One boolean mask and its complement replace the chained filter
# df[...][...] (which relies on pandas re-aligning a mask built on the unfiltered
# frame) and the concat + drop_duplicates anti-join (which would also drop rows
# that are genuinely duplicated in `df`).
ph_low, ph_high = 6, 8
ph_is_normal = (df['ph'] > ph_low) & (df['ph'] < ph_high)
normal = df[ph_is_normal]
abnormal = df[~ph_is_normal]

# Potability counts of each group, side by side.
fig, (normal_ax, abnormal_ax) = plt.subplots(1, 2, figsize=(10, 10))
sns.countplot(data=normal, x='Potability', ax=normal_ax)
normal_ax.set_title('Normal pH level')
sns.countplot(data=abnormal, x='Potability', ax=abnormal_ax)
abnormal_ax.set_title('Abnormal pH level')
plt.show()


Since this doesn't show much of a difference, I will widen the definition of abnormal pH levels to anything below 4 or above 10.

In [None]:
# Same comparison with a wider "normal" band: pH strictly between 4 and 10;
# every other row counts as "abnormal". One boolean mask and its complement
# replace the chained filter df[...][...] (which relies on pandas re-aligning a
# mask built on the unfiltered frame) and the concat + drop_duplicates anti-join
# (which would also drop rows that are genuinely duplicated in `df`).
ph_low, ph_high = 4, 10
ph_is_normal = (df['ph'] > ph_low) & (df['ph'] < ph_high)
normal = df[ph_is_normal]
abnormal = df[~ph_is_normal]

# Potability counts of each group, side by side.
fig, (normal_ax, abnormal_ax) = plt.subplots(1, 2, figsize=(10, 10))
sns.countplot(data=normal, x='Potability', ax=normal_ax)
normal_ax.set_title('Normal pH level')
sns.countplot(data=abnormal, x='Potability', ax=abnormal_ax)
abnormal_ax.set_title('Abnormal pH level')
plt.show()

Now it shows a significant difference, even though pH alone wouldn't be enough to fully classify the potability of water. Maybe using the mean pH to replace null values wouldn't be so bad after all; we could try that in another analysis.

# Machine Learning 

In [None]:
# Feature matrix and target.
# Select the features by name (every column except the target) rather than by
# position, so the target can never leak into X if the column order changes.
features = df.columns.drop('Potability').tolist()

y = df['Potability']
X = df[features]

# Hold out 33% of the rows for testing. The split is seeded so the reported
# scores are reproducible, and stratified so both splits keep the class ratio of y.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42, stratify=y
)

## Random Forest Classifier

In [None]:
# Random forest with default hyper-parameters. The seed makes the fitted forest,
# and therefore the reported scores, reproducible across runs.
clf1 = RandomForestClassifier(random_state=42)
clf1.fit(X_train, y_train)
y_predict1 = clf1.predict(X_test)

# Arguments follow scikit-learn's (y_true, y_pred) order; with 0/1 labels the MAE
# is the share of misclassified rows.
print('MAE:', mean_absolute_error(y_test, y_predict1))
print('Accuracy:', round(accuracy_score(y_test, y_predict1)*100), '%')

In [None]:
# k-nearest neighbours is distance based, so features measured on larger scales
# would dominate the distance. Standardise inside a pipeline so the scaler is
# fitted on the training split only. (The features look to be on very different
# scales - confirm with df.describe().)
clf2 = make_pipeline(StandardScaler(), KNeighborsClassifier())
clf2.fit(X_train, y_train)
y_predict2 = clf2.predict(X_test)

# Arguments follow scikit-learn's (y_true, y_pred) order; with 0/1 labels the MAE
# is the share of misclassified rows.
print('MAE:', mean_absolute_error(y_test, y_predict2))
print('Accuracy:', round(accuracy_score(y_test, y_predict2)*100), '%')

In [None]:
# Single decision tree with default hyper-parameters. The seed fixes the
# tie-breaking between equally good splits, so the scores are reproducible.
clf3 = DecisionTreeClassifier(random_state=42)
clf3.fit(X_train, y_train)
y_predict3 = clf3.predict(X_test)

# Arguments follow scikit-learn's (y_true, y_pred) order; with 0/1 labels the MAE
# is the share of misclassified rows.
print('MAE:', mean_absolute_error(y_test, y_predict3))
print('Accuracy:', round(accuracy_score(y_test, y_predict3)*100), '%')

In [None]:
# Gaussian naive Bayes: fit on the training split, predict the held-out split.
clf4 = GaussianNB().fit(X_train, y_train)
y_predict4 = clf4.predict(X_test)

# Report the mean absolute error (symmetric in its two arguments) and the
# accuracy as a rounded percentage.
mae4 = mean_absolute_error(y_test, y_predict4)
accuracy_pct4 = round(accuracy_score(y_test, y_predict4) * 100)
print('MAE:', mae4)
print('Accuracy:', accuracy_pct4, '%')

In [None]:
# A linear SVM is sensitive to feature scale; on unscaled inputs the solver may
# stop before converging, and the resulting warning is hidden by the global
# warnings filter set in the import cell. Standardise inside a pipeline (scaler
# fitted on the training split only) and seed the solver for reproducible scores.
clf5 = make_pipeline(StandardScaler(), LinearSVC(random_state=42))
clf5.fit(X_train, y_train)
y_predict5 = clf5.predict(X_test)

# Arguments follow scikit-learn's (y_true, y_pred) order; with 0/1 labels the MAE
# is the share of misclassified rows.
print('MAE:', mean_absolute_error(y_test, y_predict5))
print('Accuracy:', round(accuracy_score(y_test, y_predict5)*100), '%')

RandomForestClassifier performed the best out of all of them. In the next iteration we could try different algorithms to see whether they perform better, and we could use all of the data (without deleting a random sample to balance potability). We could also replace the null values with the mean of their feature.

In [None]:
# Confusion matrix of the random forest on the held-out split.
# plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2;
# building a ConfusionMatrixDisplay from an explicit confusion matrix works on
# both older and current releases.
# NOTE: the import cell still imports plot_confusion_matrix; that name must be
# dropped there as well before the notebook can run on scikit-learn >= 1.2.
rf_confusion = confusion_matrix(y_test, clf1.predict(X_test), labels=clf1.classes_)
ConfusionMatrixDisplay(rf_confusion, display_labels=clf1.classes_).plot()
plt.title('Confusion Matrix for Random Forest Classifier')
plt.show()

# Brief conclusion:
The accuracy is still far too low for this to be implemented in the real world; even compared with other people's predictions, the highest accuracy I found was 80%.