In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Load the water-potability dataset from the Kaggle input directory.
data = pd.read_csv('../input/water-potability/water_potability.csv')

# Column dtypes and non-null counts. The original run showed missing entries
# for ph, Sulfate and Trihalomethanes.
data.info()

# Visual overview of the gaps: every cell of the frame is coloured by whether
# it is null, so columns with missing data show up as streaked bands.
sns.set_style('whitegrid')
missing_mask = data.isnull()
sns.heatmap(missing_mask, yticklabels=False, cbar=False, cmap='coolwarm')

In [None]:
# Distribution of the observed (non-null) ph values: density histogram with a KDE overlay.
# sns.distplot is deprecated in seaborn; histplot(kde=True, stat='density') is its replacement.
sns.histplot(data['ph'].dropna(), bins=30, color='blue', kde=True, stat='density');

In [None]:
# Distribution of the observed (non-null) Sulfate values: density histogram with a KDE overlay.
# sns.distplot is deprecated in seaborn; histplot(kde=True, stat='density') is its replacement.
sns.histplot(data['Sulfate'].dropna(), bins=30, color='green', kde=True, stat='density');

In [None]:
# Distribution of the observed (non-null) Trihalomethanes values: density histogram with a KDE overlay.
# sns.distplot is deprecated in seaborn; histplot(kde=True, stat='density') is its replacement.
sns.histplot(data['Trihalomethanes'].dropna(), bins=30, color='red', kde=True, stat='density');

In [None]:
# The three histograms all look roughly normal (Gaussian). Before imputing from
# a normal distribution, check the 2-sigma rule: about 95% of a normally
# distributed sample lies within two standard deviations of its mean.
cols_with_missing = ['ph', 'Sulfate', 'Trihalomethanes']

# Only the last expression of a cell is displayed, so intermediate results are
# printed explicitly.
print(data[cols_with_missing].describe())

nd_mean = data[cols_with_missing].mean()
nd_std = data[cols_with_missing].std()

nd_lb = nd_mean - 2*nd_std
nd_ub = nd_mean + 2*nd_std

# Fraction of each column's observed (non-null) values that fall inside that
# column's own [mean - 2*std, mean + 2*std] band (bounds inclusive).
# The original author's run reported roughly 92% for Sulfate and 95% for
# Trihalomethanes, and judged ph to satisfy the rule as well.
within_2sigma = pd.Series({
    col: data[col].between(nd_lb[col], nd_ub[col]).sum() / data[col].count()
    for col in cols_with_missing
})
print(within_2sigma)

# Impute every missing entry with its OWN draw from N(mean, std) of its column.
# `size=` is essential: without it np.random.normal returns a single scalar and
# every gap in the column would receive the same value.
# A seeded generator keeps the imputed values identical across re-runs.
IMPUTE_SEED = 42
rng = np.random.default_rng(IMPUTE_SEED)
for col in cols_with_missing:
    missing = data[col].isna()
    data.loc[missing, col] = rng.normal(
        loc=nd_mean[col], scale=nd_std[col], size=missing.sum()
    )

# Caveats: each column is imputed independently, which neglects correlations
# between variables; and the mean/std come from the full dataset, i.e. they are
# computed before the train/test split performed later in the notebook.

# Sanity check: every column should now report zero missing values.
data.isnull().sum()

In [None]:
# Class balance of the target. The counts are computed once, printed (a bare
# expression in the middle of a cell is not displayed) and reused for the pie chart.
potability_counts = data['Potability'].value_counts()
print(potability_counts)

plt.figure(figsize = (5,5))
potability_counts.plot(kind ='pie')

In [None]:
# Pairwise correlation between the feature columns (target excluded),
# drawn as an annotated heatmap.
feature_corr = data.drop(columns='Potability').corr()
plt.figure(figsize=(10, 8))
sns.heatmap(feature_corr, annot=True)
# Observation from the original analysis: no immediately obvious correlations.

In [None]:
#We now have a full dataset we can apply machine learning algorithms to.
# we'll investigate which model is most accurate in predicting the potability of water.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.preprocessing import StandardScaler

from sklearn import metrics
from sklearn.metrics import classification_report

# Separate the target column from the feature columns.
y = data['Potability']
X = data.drop(columns=['Potability'])

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=7
)

# Standardise the features. The scaler is fitted on the training rows only and
# the same transformation is then applied to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [None]:
# We start with a logistic regression on the standardised features.
LR = LogisticRegression()
LR.fit(X_train,y_train)
Results=LR.predict(X_test)
# classification_report returns a formatted string; print it so the table is
# rendered line by line instead of as a repr with literal '\n' characters.
print(classification_report(y_test,Results))

In [None]:
# Decision tree.
# A fixed random_state (same seed as the train/test split) makes the fitted
# tree, and therefore the reported scores, identical on every run.
dtree = DecisionTreeClassifier(random_state=7)
dtree.fit(X_train,y_train)
pred = dtree.predict(X_test)
# classification_report returns a formatted string; print it so the table is
# rendered line by line instead of as a repr with literal '\n' characters.
print(classification_report(y_test,pred))

In [None]:
# Random forest with 200 trees.
# A fixed random_state (same seed as the train/test split) makes the fitted
# forest, and therefore the scores and importances, identical on every run.
rfc = RandomForestClassifier(n_estimators=200, random_state=7)
rfc.fit(X_train,y_train)
prediction= rfc.predict(X_test)
# The report sits in the middle of the cell, so it must be printed explicitly;
# as a bare expression it would never be displayed.
print(classification_report(y_test,prediction))

# Feature importances, labelled with the column names of X. X_train is a plain
# array after scaling, but its columns are in the same order as X.columns.
importance = rfc.feature_importances_
for name, score in zip(X.columns, importance):
    print('Feature: %s, Score: %.5f' % (name, score))

# Plot feature importance.
plt.bar(X.columns, importance)
plt.xticks(rotation=45, ha='right')
plt.ylabel('Importance')
plt.show()
# Observation from the original analysis: no single feature appears to be
# markedly more important than the others.

In [None]:
# K nearest neighbours: sweep the number of neighbours and record the
# misclassification rate on the test set for each value.
k_values = range(1, 40)
error_rate = []
for i in k_values:
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train, y_train)
    Pred_i = knn.predict(X_test)
    error_rate.append(np.mean(Pred_i != y_test))

# Create a single figure AFTER the loop; creating it inside the loop opened one
# empty figure per iteration.
plt.figure(figsize=(10, 6))
plt.plot(k_values, error_rate)
plt.xlabel('n_neighbors')
plt.ylabel('Error rate')
plt.show()

# Position of the smallest error is zero-based, so map it back to the actual k
# instead of hard-coding a value read off an earlier run.
best_k = k_values[int(np.argmin(error_rate))]
print('Best n_neighbors:', best_k)

# NOTE(review): k is selected using the test set, so the score reported below
# is optimistic; choosing k by cross-validation on the training data would
# give an unbiased estimate.
knn = KNeighborsClassifier(n_neighbors=best_k)
knn.fit(X_train, y_train)
Pred = knn.predict(X_test)
# classification_report returns a formatted string; print it so the table is
# rendered line by line instead of as a repr with literal '\n' characters.
print(classification_report(y_test, Pred))

In [None]:
# TO CONCLUDE:
#Random Forest was the best in this case with 65% accuracy; K nearest neighbours was very close with 63%.
# The decision tree was 59% accurate, and the logistic regression was by far the least effective here with a measly 35%.


# Imported data and relevant libraries.
# Checked for missing values.
# Checked whether the variables with missing values were appropriate to model with a normal distribution (2-sigma rule).
# Filled missing values with a random number from the appropriate normal distribution.
# Checked for correlations between variables; none were obvious.
# 