In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import resample
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler

# Importing The Dataset

### Feature Description

* ph: pH of the water (0 to 14).

* Hardness: Capacity of water to precipitate soap in mg/L.

* Solids: Total dissolved solids in ppm.

* Chloramines: Amount of Chloramines in ppm.

* Sulfate: Amount of Sulfates dissolved in mg/L.

* Conductivity: Electrical conductivity of water in μS/cm.

* Organic_carbon: Amount of organic carbon in ppm.

* Trihalomethanes: Amount of Trihalomethanes in μg/L.

* Turbidity: Measure of the light-emitting property of water in NTU.

* Potability: Indicates if water is safe for human consumption. Potable - 1 and Not potable - 0

### Importing 

In [None]:
# Load the water-potability CSV into a DataFrame.
# NOTE(review): the relative '../input/...' path looks like a Kaggle input directory — adjust when running elsewhere.
df = pd.read_csv('../input/water-potability/water_potability.csv')

In [None]:
# Preview the first rows of the raw data.
df.head()

In [None]:
# Column dtypes and non-null counts of the raw data.
df.info()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# (rows, columns) before any cleaning.
df.shape

In [None]:
# Class counts of the target column before any cleaning.
df.Potability.value_counts()

In [None]:
# Drop every row that contains at least one missing value.
# Note: this rebinds `df`, so the raw frame is no longer reachable after this cell.
df = df.dropna()

In [None]:
# Preview the first rows after dropping incomplete rows.
df.head()

In [None]:
# (rows, columns) after dropping incomplete rows.
df.shape

In [None]:
# Class counts of the target after dropping incomplete rows.
df.Potability.value_counts()

In [None]:
# Separate the rows by target class so each class can be handled on its own.
df_notpotable = df.loc[df['Potability'].eq(0)]
df_potable = df.loc[df['Potability'].eq(1)]

In [None]:
# Preview the first rows with Potability == 0.
df_notpotable.head()

In [None]:
# Preview the first rows with Potability == 1.
df_potable.head()

In [None]:
# Upsample the potable rows with replacement (seeded) so that they match the
# not-potable rows in number. The target size is taken from len(df_notpotable)
# rather than a hard-coded 1200, so the classes stay balanced if the input data
# changes (presumably 1200 was the not-potable row count — verify).
# NOTE(review): this upsampling runs before the train/test split further down,
# so copies of the same row can end up in both the training and the test set,
# which likely inflates the reported accuracy. Consider splitting first and
# resampling only the training portion.
df_potable_resample = resample(
    df_potable,
    replace=True,
    n_samples=len(df_notpotable),
    random_state=0,
)

In [None]:
# (rows, columns) of the upsampled potable rows.
df_potable_resample.shape

In [None]:
# Stack the not-potable rows and the upsampled potable rows into one frame.
# Original index labels are kept (no ignore_index), so resampled rows share index values.
df = pd.concat([df_notpotable, df_potable_resample])

In [None]:
# (rows, columns) of the recombined frame.
df.shape

In [None]:
# Class counts of the target after upsampling.
df.Potability.value_counts()

In [None]:
# Shuffle the row order (seeded); after the concat the two classes sit in two contiguous runs.
df = shuffle(df, random_state=0) 

# EDA

In [None]:
# One 10-bin histogram per column of the balanced frame.
df.hist(bins=10, figsize=(20,15), color = 'teal')

In [None]:
# Overlaid per-class histograms (Potability 0 vs 1) for pH, Hardness, Solids and
# Chloramines, one panel each in a 2x2 grid. The panels share one loop instead of
# four copy-pasted stanzas, so bins, transparency, axis labels and legend cannot
# drift apart between panels.
fig = plt.figure(figsize=(25,10))

labels = ["0", "1"]  # legend entries, in the order the two histograms are drawn

# (column in df, text used for both the panel title and the x-axis label)
panels = [
    ('ph', 'pH'),
    ('Hardness', 'Hardness'),
    ('Solids', 'Solids'),
    ('Chloramines', 'Chloramines'),
]

for position, (column, title) in enumerate(panels, start=1):
    p1 = fig.add_subplot(2, 2, position)
    for target_class in (0, 1):
        p1.hist(df.loc[df.Potability == target_class, column], bins=20, alpha=.4)
    p1.set_title(title)
    p1.set_xlabel(title)
    p1.set_ylabel('Count')
    p1.legend(labels)

fig.subplots_adjust(wspace=.1, hspace=.3)
plt.show()

In [None]:
# Overlaid per-class histograms (Potability 0 vs 1) for Sulfate, Conductivity,
# Organic_carbon and Trihalomethanes, one panel each in a 2x2 grid. A single loop
# replaces four copy-pasted stanzas so every panel is drawn with the same settings.
fig = plt.figure(figsize=(25,10))

labels = ["0", "1"]  # legend entries, in the order the two histograms are drawn

# Each name is the df column and also the panel title / x-axis label.
columns = ['Sulfate', 'Conductivity', 'Organic_carbon', 'Trihalomethanes']

for position, column in enumerate(columns, start=1):
    p1 = fig.add_subplot(2, 2, position)
    for target_class in (0, 1):
        p1.hist(df.loc[df.Potability == target_class, column], bins=20, alpha=.4)
    p1.set_title(column)
    p1.set_xlabel(column)
    p1.set_ylabel('Count')
    p1.legend(labels)

fig.subplots_adjust(wspace=.1, hspace=.3)
plt.show()

In [None]:
# Overlaid per-class histograms (Potability 0 vs 1) for Turbidity, drawn in the
# first cell of a 2x2 grid.
fig = plt.figure(figsize=(25,10))

p1 = fig.add_subplot(2,2,1)
for target_class in (0, 1):
    p1.hist(df.Turbidity[df.Potability == target_class], bins=20, alpha = .4)
p1.set_title('Turbidity')
p1.set_xlabel('Turbidity')
p1.set_ylabel('Count')
labels = ["0", "1"]
p1.legend(labels)

fig.subplots_adjust(wspace=.1, hspace=.3)
plt.show()

In [None]:
from pandas.plotting import scatter_matrix
# Pairwise scatter plots of every column, with histograms on the diagonal.
# The returned axes are bound to their own name instead of `x`, which the
# modelling section uses for the feature matrix.
scatter_axes = scatter_matrix(df, alpha=1, figsize=(40, 20), diagonal='hist')

In [None]:
# Heatmap of the pairwise correlations between all columns, annotated with the coefficients.
plt.figure(figsize = (15,9))
sns.heatmap(df.corr(), annot = True)

In [None]:
# Correlation of every column with the target, strongest positive first.
df_corr = df.corr()
df_corr["Potability"].sort_values(ascending=False)

# Modelling

In [None]:
# Feature matrix: every column except the target. Target vector: the Potability column.
y = df['Potability']
x = df.drop(columns=['Potability'])

In [None]:
# Standardize all feature columns of x in place with a StandardScaler.
st = StandardScaler()
x_columns= x.columns
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split below, so test rows contribute to the scaling statistics. Fitting on the
# training split only would avoid that.
x[x_columns] = st.fit_transform(x[x_columns])

In [None]:
# Preview the first rows of the scaled features.
x.head()

In [None]:
# Summary statistics of the scaled features.
x.describe()

In [None]:
# Imports used by the modelling section.
from sklearn.model_selection import train_test_split
# NOTE(review): this experimental opt-in is only required on scikit-learn < 1.0;
# newer releases ship the HistGradientBoosting estimators as stable and warn on
# this import — confirm the target scikit-learn version.
from sklearn.experimental import enable_hist_gradient_boosting
# NOTE(review): HistGradientBoostingRegressor, RandomizedSearchCV and GridSearchCV
# are not used in the visible cells.
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble  import HistGradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score

In [None]:
# Hold out 10% of the rows as the test set (seeded split).
# NOTE(review): the data was upsampled with replacement before this split, so
# identical rows may appear on both sides of it.
X_train, X_test, Y_train, Y_test = train_test_split(x,y, test_size = 0.1, random_state = 0)

In [None]:
# The two classifiers to compare, both seeded; hyperparameters are fixed by hand
# (no search is run in the visible cells).
hgbc = HistGradientBoostingClassifier(random_state=0, max_leaf_nodes=None, min_samples_leaf=2, max_iter=1000)
rf = RandomForestClassifier(min_samples_leaf = 2, n_estimators = 1000, random_state = 0)

In [None]:
# Fit both classifiers on the training split.
hgbc.fit(X_train, Y_train)
rf.fit(X_train, Y_train)

In [None]:
# Predict the held-out test rows with each classifier.
hgbc_pred = hgbc.predict(X_test)
rf_pred = rf.predict(X_test)

In [None]:
# Test-set accuracy of each classifier.
# accuracy_score is documented as accuracy_score(y_true, y_pred), so the ground
# truth goes first. The printed value is the same either way for accuracy, but
# the conventional order keeps this cell correct if the metric is ever swapped
# for an asymmetric one (precision, recall, ...).
print('GradientBoosting : ' + str(accuracy_score(Y_test, hgbc_pred)))
print('RandomForest : ' + str(accuracy_score(Y_test, rf_pred)))

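### Best model: RandomForest, with a test accuracy of about 85%

Note: the potable class was upsampled with replacement before the train/test split, so copies of the same row may appear in both sets and this figure may be optimistic.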

# Feature Importance

In [None]:
# Feature names in the exact column order the models were fitted on, so each name
# lines up with the matching entry of feature_importances_. Taking them from
# X_train avoids a hand-typed list drifting out of sync with the data.
label = list(X_train.columns)

In [None]:
# Importances reported by the fitted random forest, one value per feature.
feature = rf.feature_importances_

# Print one aligned "name : importance" row per entry of `label`.
row_template = '\t{0:20s} : {1:>.6f}'
print('Feature Importances:')
for position, name in enumerate(label):
    print(row_template.format(name, feature[position]))

In [None]:
# Feature importances (important): horizontal bar chart, one bar per feature name in `label`.
plt.barh(label, feature)
plt.title('feature importances')