# Water Quality Exploratory Data Analysis

## Notebook content
1. [Python Libraries](#1)
2. [Read Data](#2)
3. [Feature Analysis](#3)
4. [Missing Value](#4)
5. [Train Test Split and Normalization](#5)
6. [Modeling](#6)

### Python Libraries
<a id="1"></a>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import seaborn as sns
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for entry in files:
        print(os.path.join(root, entry))
        
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
import missingno as msno
import warnings
warnings.filterwarnings("ignore")
from sklearn.metrics import confusion_matrix,precision_score
from sklearn import tree

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Read Data
<a id="2"></a>

In [None]:
# Load the water-potability CSV into the DataFrame used by every later cell.
csv_path = "/kaggle/input/water-potability/water_potability.csv"
df = pd.read_csv(csv_path)

In [None]:
# Print a concise summary of df (pandas DataFrame.info).
df.info()

In [None]:
# Descriptive statistics of df (pandas DataFrame.describe); displayed as the cell output.
df.describe()

In [None]:
# Annotated heatmap of the pairwise correlations between all columns of df.
corr_matrix = df.corr()
plt.figure(figsize=(10, 10))
sns.heatmap(corr_matrix, annot=True, fmt=".2g")
plt.show()


### Feature Analysis
<a id="3"></a>

In [None]:
# Names of the first nine columns, used below as the feature list.
# NOTE(review): assumes the target column "Potability" is not among the first nine — confirm against the CSV.
list_features=df.columns[:9]

In [None]:
# Wrap the feature names in a DataFrame; the plotting loop below only uses its length.
df_features=pd.DataFrame(list_features)

In [None]:
# Split the rows by target class: Potability == 0 versus Potability == 1.
df_non_Potability = df[df["Potability"] == 0]
df_Potability = df[df["Potability"] == 1]

In [None]:
# Overlay the per-class density of every feature on a 3x3 grid so the two
# Potability groups can be compared feature by feature.
plt.figure(figsize=(15, 15))
# Iterate over the feature names directly; subplot positions are 1-based.
for position, feature in enumerate(list_features, start=1):
    plt.subplot(3, 3, position)
    # Only `fill` is passed: `shade` is its deprecated alias and passing both
    # is redundant (seaborn announces it will become an error).
    sns.kdeplot(df_non_Potability[feature], fill=False, label="NoN Potability")
    sns.kdeplot(df_Potability[feature], fill=False, label="Potability")
    plt.title(feature)
    plt.legend()
plt.tight_layout()
plt.show()

### Missing Value
<a id="4"></a>

In [None]:
# Draw the missingno matrix plot of df to show where values are missing.
msno.matrix(df)
plt.show()

In [None]:
# Show the column names (cell output) before choosing which columns to impute.
df.columns

In [None]:
# Replace the missing values of each listed column with that column's mean.
# The result is assigned back to df: calling fillna(..., inplace=True) on a
# selected column acts on an intermediate object and is not guaranteed to
# update df itself (it does not under pandas Copy-on-Write).
for column in ("ph", "Sulfate", "Trihalomethanes"):
    df[column] = df[column].fillna(df[column].mean())

In [None]:
# Draw the missingno matrix again, after the mean imputation above, to check the result.
msno.matrix(df)
plt.show()

### Train Test Split and Normalization
<a id="5"></a>

In [None]:
# Separate the target from the features.
# y stays a single-column DataFrame; X is every column except "Potability".
y = df.loc[:, ["Potability"]]
X = df.drop(columns=["Potability"])

In [None]:
# Scaler instance; it is fitted on the training features after the train/test split below.
scaler=MinMaxScaler()


In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
# Report the shape of every partition.
for label, part in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(label, part.shape)

In [None]:
# Fit the scaler on the training features only, then apply the same fitted
# transformation to the test features (the test set is never used for fitting).
X_train=scaler.fit_transform(X_train)
X_test=scaler.transform(X_test)

### Modeling
<a id="6"></a>

In [None]:
# Candidate classifiers as (short name, estimator) pairs.
# A fixed random_state makes both models reproducible from run to run,
# in line with the seeded train/test split.
models = [
    ("dst", DecisionTreeClassifier(max_depth=3, random_state=42)),
    ("rf", RandomForestClassifier(random_state=42)),
]

In [None]:
# Train every candidate on the training split, then evaluate it on the test
# split. Precision and the confusion matrix are stored per model name.
final, cmt_list = [], []
for name, model in models:
    model.fit(X_train, y_train)
    model_predict = model.predict(X_test)
    score = precision_score(y_test, model_predict)
    cmt = confusion_matrix(y_test, model_predict)
    final.append((name, score))
    cmt_list.append((name, cmt))
print(final)
    
  

In [None]:
# Draw one annotated heatmap per model's confusion matrix.
for name, matrix in cmt_list:
    plt.figure()
    # The cells are integer counts, so format them as integers.
    sns.heatmap(matrix, annot=True, fmt="d", linewidths=.8)
    # Label each figure with the model's short name so the plots can be told apart.
    plt.title(name)
    plt.show()
    

In [None]:
# Estimator of the first (name, estimator) pair in models — the "dst" decision tree.
dt_clf=models[0][1]


In [None]:
# Visualize the decision tree, colouring nodes by majority class.
plt.figure(figsize=(18, 12))
# plot_tree documents feature_names as a list of str, so the pandas Index is
# converted to a plain list rather than passed as-is.
feature_names = list(df.columns[:9])
tree.plot_tree(dt_clf, filled=True, feature_names=feature_names)
plt.show()