In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split as tts
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost.sklearn import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the water-potability CSV from the Kaggle input directory and preview it.
DATA_PATH = '../input/water-potability/water_potability.csv'
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Annotated heatmap of the pairwise correlations between all columns of df.
plt.figure(figsize=(10, 5))
corr_matrix = df.corr()
sns.heatmap(corr_matrix, linewidths=.8, fmt='.3f', cmap="RdPu", annot=True)

In [None]:
# Count rows that are exact duplicates of an earlier row.
duplicate_mask = df.duplicated()
dup = duplicate_mask.sum()
print('Any Duplicate Value:', dup)

In [None]:
# Per-column count of missing values before imputation.
df.isna().sum()

In [None]:
# Fill missing values in the three affected columns with that column's mean.
# Assign the result back to the column instead of calling
# `df[col].fillna(..., inplace=True)`: the in-place form operates on an
# intermediate Series, emits a FutureWarning in pandas 2.x and silently does
# nothing once Copy-on-Write is enabled.
# NOTE(review): the mean is computed on the full dataset, i.e. before the
# train/test split performed later in the notebook, so test rows influence
# the imputed values.
for column in ("ph", "Sulfate", "Trihalomethanes"):
    df[column] = df[column].fillna(df[column].mean())

In [None]:
# Re-check the per-column missing-value counts after imputation.
df.isna().sum()

In [None]:
# Preview the first five rows of the imputed frame.
df.head(n=5)

In [None]:
# Default figure size for this and all later plots.
plt.rcParams['figure.figsize'] = [7, 5]
# `sns.distplot` is deprecated, and a density/KDE curve is not meaningful for
# a two-valued class label; show the number of rows per class instead.
sns.countplot(x='Potability', data=df)

In [None]:
# Scatter-matrix of every pair of columns in df.
sns.pairplot(df)

In [None]:
# Feature matrix: every column except the target.
# `columns=` states the intent explicitly; the previous `axis=True` only
# worked because True is treated as 1.
x = df.drop(columns=['Potability'])
# Target vector: the Potability label.
y = df['Potability']

# **Split and Train the model**

In [None]:
# Hold out 32% of the rows for testing; the fixed seed keeps the split reproducible.
split_parts = tts(x, y, random_state=50, test_size=0.32)
x_train, x_test, y_train, y_test = split_parts

In [None]:
# Summary statistics of the features, followed by those of the target.
feature_summary = x.describe()
target_summary = y.describe()
print(feature_summary, "\n", "\n", target_summary)

# **Logistic Regression**

In [None]:
# Fit a logistic-regression classifier on the training split and predict
# labels for the held-out test split.
logi = LogisticRegression(
    n_jobs=20,
    random_state=0,
    max_iter=120,
).fit(x_train, y_train)
predlogi_y = logi.predict(x_test)

In [None]:
# Test-set accuracy of the logistic-regression model.
# Arguments are passed as (y_true, y_pred), matching the scikit-learn
# signature and the XGBoost / KNeighbours cells; the score itself is the same
# either way because accuracy is symmetric.
Acc = accuracy_score(y_test, predlogi_y)
print(Acc)

In [None]:
# Per-class precision / recall / F1 for the logistic-regression predictions.
print(classification_report(y_true=y_test, y_pred=predlogi_y))

# **Random Forest Classifier**

In [None]:
# Fit a random forest on the training split and predict the test split.
# A fixed random_state makes the forest, and therefore the reported accuracy,
# reproducible across Restart & Run All (same seed as the other models).
RFC = RandomForestClassifier(random_state=0)
RFC.fit(x_train, y_train)
y_RFC = RFC.predict(x_test)

In [None]:
# Test-set accuracy of the random forest.
# Arguments are passed as (y_true, y_pred), matching the scikit-learn
# signature; the score is unchanged because accuracy is symmetric.
Acc_rfc = accuracy_score(y_test, y_RFC)
print(Acc_rfc)

In [None]:
# Per-class precision / recall / F1 for the random-forest predictions.
# classification_report expects (y_true, y_pred); with the arguments reversed
# precision and recall are swapped and "support" counts predictions instead of
# true labels.
print(classification_report(y_test, y_RFC))

In [None]:
# Confusion matrix of the random forest, shown as a share of all test rows.
cmr = confusion_matrix(y_true=y_test, y_pred=y_RFC)
cmr_share = cmr / cmr.sum()
sns.heatmap(cmr_share, cmap='coolwarm', fmt='0.2%', annot=True)

# **Decision Tree**

In [None]:
# Fit a decision tree on the training split and predict the test split.
# Potability is a class label, so a classifier is required: a regressor
# returns float leaf averages, which are not class predictions and can make
# the classification metrics below fail if any leaf value is fractional.
# random_state is fixed so the tree is reproducible.
# The variable name DTR is kept because later cells refer to it.
DTR = DecisionTreeClassifier(random_state=0)
DTR.fit(x_train, y_train)
y_pred = DTR.predict(x_test)

In [None]:
# Test-set accuracy of the decision tree.
# Arguments are passed as (y_true, y_pred), matching the scikit-learn
# signature; the score is unchanged because accuracy is symmetric.
Acc_dt = accuracy_score(y_test, y_pred)
print(Acc_dt)

In [None]:
# Per-class precision / recall / F1 for the decision-tree predictions.
# classification_report expects (y_true, y_pred); the reversed order swapped
# precision and recall. The report was also printed by two identical cells,
# which are collapsed into this one.
print(classification_report(y_test, y_pred))

In [None]:
# Confusion matrix of the decision tree, shown as a share of all test rows.
cmd = confusion_matrix(y_true=y_test, y_pred=y_pred)
cmd_share = cmd / cmd.sum()
sns.heatmap(cmd_share, cmap='twilight_shifted', fmt='0.2%', annot=True)

# **XGBoost Classifier**

In [None]:
# Fit a gradient-boosted tree classifier on the training split and predict
# the test split.
xgb_params = {
    'n_estimators': 125,
    'max_depth': 8,
    'learning_rate': 0.03,
    'random_state': 0,
    'n_jobs': 5,
}
Xgb = XGBClassifier(**xgb_params)
Xgb.fit(x_train, y_train)
pred_Xgb = Xgb.predict(x_test)

In [None]:
# Test-set accuracy of the XGBoost model.
Xgb_ACC = accuracy_score(y_true=y_test, y_pred=pred_Xgb)
print(Xgb_ACC)

In [None]:
# Per-class precision / recall / F1 for the XGBoost predictions.
print(classification_report(y_true=y_test, y_pred=pred_Xgb))

In [None]:
# Confusion matrix of the XGBoost model, shown as a share of all test rows.
cmx = confusion_matrix(y_true=y_test, y_pred=pred_Xgb)
cmx_share = cmx / cmx.sum()
sns.heatmap(cmx_share, cmap='Reds', fmt='0.2%', annot=True)

# **KNeighbours**

In [None]:
# Fit a 9-nearest-neighbours classifier on the training split and predict
# the test split.
# NOTE(review): the features are passed in unscaled; KNN is distance-based, so
# consider standardising them first — confirm against the column ranges.
Kn = KNeighborsClassifier(leaf_size=20, n_neighbors=9).fit(x_train, y_train)
pred_Kn = Kn.predict(x_test)

In [None]:
# Test-set accuracy of the KNeighbours model.
Kn_ACC = accuracy_score(y_true=y_test, y_pred=pred_Kn)
print(Kn_ACC)

In [None]:
# Per-class precision / recall / F1 for the KNeighbours predictions.
print(classification_report(y_true=y_test, y_pred=pred_Kn))

In [None]:
# Confusion matrix of the KNeighbours model, shown as a share of all test rows.
cmk = confusion_matrix(y_true=y_test, y_pred=pred_Kn)
cmk_share = cmk / cmk.sum()
sns.heatmap(cmk_share, cmap='Blues', fmt='0.2%', annot=True)

# **Final Report**

In [None]:
# Collect every model's test accuracy into one comparison table.
model_names = ['Logistic', 'Random Forest', 'Decision Tree', 'XGBoost', 'KNeighbours']
model_scores = [Acc, Acc_rfc, Acc_dt, Xgb_ACC, Kn_ACC]
models = pd.DataFrame({"Model": model_names, "Accuracy": model_scores})
# Display the table ordered from highest to lowest accuracy.
models.sort_values(by='Accuracy', ascending=False)

In [None]:
# Build the model/accuracy table and save it to the working directory.
output = pd.DataFrame({"Model": ['Logistic', 'Random Forest', 'Decision Tree', 'XGBoost', 'KNeighbours'],
                       "Accuracy": [Acc, Acc_rfc, Acc_dt, Xgb_ACC, Kn_ACC]})
# The file was previously written without an extension; add ".csv" so it is
# recognised as a CSV file by downstream tools.
output.to_csv('Water Quality EDA & Prediction.csv', index=False)

In [None]:
# Preview the saved results table.
output.head(n=5)

In [None]:
# Horizontal bar chart comparing the accuracy of each model.
sns.barplot(data=models, y='Model', x='Accuracy')

# **--------------------------Conclusion----------------------------------**

#  * **Here, Random Forest achieved the highest accuracy: about 0.66 (i.e. 66%)**
