In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Location of the water-potability CSV, relative to the notebook.
DATA_PATH = '../input/water-potability/water_potability.csv'

# Load the raw dataset into a DataFrame.
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# (rows, columns) of the frame as loaded, before any rows are dropped.
df.shape

### Check Null Values

In [None]:
# Number of null entries per column ...
null_counts = df.isnull().sum()

# ... expressed as a percentage of the total number of rows.
null_counts / len(df) * 100

**14.99** % of **'ph'** values are Null values.

**23.84** % of **'Sulfate'** values are Null values.

**4.95** % of **'Trihalomethanes'** values are Null values.

In [None]:
# Drop every row that contains at least one null value.
# `df` is rebound to the filtered copy instead of being mutated with
# inplace=True: the result is the same frame, but the cell no longer
# alters an object that earlier cells have already displayed.
df = df.dropna()

The rows with null values are dropped above; next, check the shape of the data frame after the drop.

In [None]:
# (rows, columns) of the frame after the rows with null values were dropped.
df.shape

In [None]:
# Summary statistics of the columns of the cleaned frame.
df.describe()

# EDA

In [None]:
# Shared styling for every figure in the notebook.
# All colours are '#RRGGBB' hex strings; the last palette entry previously
# lacked its leading '#', which is not a valid matplotlib colour spec.
background_color = '#F8EDF4'
color_palette = ['#F3AA51', '#FCF695', '#CEE5D5', '#B7D3E9', '#567ACE',
                 '#BBB0DC', '#DB706C', '#F1C3AA', '#A7E0E1', '#D9598C']

In [None]:
# Two-panel figure: a text-only title panel on the left and a pie chart of
# the Potability class shares on the right.
fig = plt.figure(figsize=(14, 8))
gs = fig.add_gridspec(1, 2)

ax0 = fig.add_subplot(gs[0, 0])
ax1 = fig.add_subplot(gs[0, 1])
axes = [ax0, ax1]

fig.patch.set_facecolor(background_color)


# Title
ax0.set_facecolor(background_color)
ax0.text(0.5, 0.5, 'Countplot of the Potability\n _______________',
         horizontalalignment='center',
         verticalalignment='center',
         fontsize=18, fontfamily='serif', fontweight='bold')
ax0.set_xticklabels([])
ax0.set_yticklabels([])
ax0.tick_params(left=False, bottom=False)
# The title panel is pure text, so every spine is hidden.
for s in ['top', 'right', 'bottom', 'left']:
    ax0.spines[s].set_visible(False)


# Graph
# value_counts() orders the classes by frequency, not by label, so the
# legend labels are taken from its index rather than hard-coded as [0, 1].
potability_counts = df['Potability'].value_counts()
g = ax1.pie(potability_counts,
            autopct='%.2f%%',
            colors=color_palette,
            explode=(0, 0.05),
            shadow=True,
            textprops={'fontsize': 18})
# g[0] holds the wedge patches, in the same order as potability_counts.
ax1.legend(g[0], [str(label) for label in potability_counts.index],
           loc='upper right', fontsize='large')

In [None]:
# 3x3 grid of boxenplots, one panel for each of the first nine columns of df.
fig = plt.figure(figsize=(18, 15))
fig.patch.set_facecolor(background_color)

gs = fig.add_gridspec(3, 3)
gs.update(wspace=0.3, hspace=0.3)

# Panels are created row by row, matching the column order of df.
axes = [fig.add_subplot(gs[row, col]) for row in range(3) for col in range(3)]
ax0, ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8 = axes

# Title: positioned in the data coordinates of the top-centre panel.
ax1.text(0, 400, 'Boxenplot of Continuous Feature',
         horizontalalignment='center',
         verticalalignment='center',
         fontsize=18, fontfamily='serif', fontweight='bold')

# Graphs
hidden_spines = ('top', 'right', 'left')
for idx, panel in enumerate(axes):
    feature = df.columns[idx]

    panel.set_facecolor(background_color)
    panel.set_title(feature, fontsize=14, fontfamily='serif', fontweight='bold')
    panel.grid(color='#000000', linestyle=':', axis='y', zorder=0, dashes=(1,5))

    # Each panel gets its own colour from the shared palette.
    sns.boxenplot(y=feature, data=df, ax=panel, palette=[color_palette[idx]], width=0.5)
    panel.set_xlabel('')
    panel.set_ylabel('')

    for side in hidden_spines:
        panel.spines[side].set_visible(False)

In [None]:
# 3x3 grid of density plots, one panel for each of the first nine columns of
# df, split by the Potability class.
kde_palette = ['#F3AA51', '#567ACE']

fig = plt.figure(figsize=(18, 15))
fig.patch.set_facecolor(background_color)

gs = fig.add_gridspec(3, 3)
gs.update(wspace=0.3, hspace=0.3)

# Panels are created row by row, matching the column order of df.
axes = [fig.add_subplot(gs[row, col]) for row in range(3) for col in range(3)]
ax0, ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8 = axes

# Title: positioned in the data coordinates of the top-centre panel.
ax1.text(200, 0.011, 'Distribution of Continuous Feature by Potability',
         horizontalalignment='center',
         verticalalignment='center',
         fontsize=18, fontfamily='serif', fontweight='bold')

# Graphs
hidden_spines = ('top', 'right', 'left')
for idx, panel in enumerate(axes):
    feature = df.columns[idx]
    # Only the right-most panel of each row (indices 2, 5, 8) shows a legend.
    show_legend = idx % 3 == 2

    panel.set_facecolor(background_color)
    panel.set_title(feature, fontsize=14, fontfamily='serif', fontweight='bold')
    panel.grid(color='#000000', linestyle=':', axis='y', zorder=0, dashes=(1,5))

    sns.kdeplot(x=feature, hue='Potability', data=df, fill=True, ax=panel,
                palette=kde_palette, legend=show_legend)
    panel.set_xlabel('')
    panel.set_ylabel('')

    for side in hidden_spines:
        panel.spines[side].set_visible(False)

## Correlation Matrix

In [None]:
# Pairwise correlation between the columns of df (pandas' default method).
df.corr()

In [None]:
# Lower-triangle heatmap of the pairwise column correlations.
# The correlation matrix is computed once and reused for mask and plot.
corr = df.corr()

# Boolean mask that hides the upper triangle (diagonal included), since the
# matrix is symmetric and those cells carry no extra information.
mask = np.triu(np.ones_like(corr, dtype=bool))

f, ax = plt.subplots(1, 1, figsize=(10, 10))
ax.text(2.5, -0.1, 'Correlation Matrix', fontsize=18, fontweight='bold', fontfamily='serif')
# `linewidths` is seaborn's documented parameter for the cell separator width.
sns.heatmap(corr, annot=True, fmt='.2f', cmap='RdBu', square=True, mask=mask, linewidths=0.7, ax=ax)

# Modeling

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score

In [None]:
# Separate the features from the target column.
X = df.drop('Potability', axis=1)
y = df['Potability']

# Split BEFORE scaling so the test rows never influence the scaler.
# random_state makes the split (and every score below) reproducible;
# stratify keeps the class ratio of `y` the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y)

# Learn the min/max of each feature from the training rows only, then apply
# that same transformation to the held-out rows.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Keep `X` as a scaled array, as before, using the training-set statistics.
X = scaler.transform(X)

In [None]:
# Fit a logistic regression on the training split and score it on the
# held-out split.
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
# accuracy_score takes (y_true, y_pred) in that order.
score_logreg = accuracy_score(y_test, y_pred)

print('Accuracy Score of Logistic Regression :', score_logreg)

In [None]:
# Fit a 1000-tree random forest on the training split and score it on the
# held-out split. random_state fixes the bootstrap/feature sampling so the
# reported score is reproducible across runs.
rf = RandomForestClassifier(n_estimators=1000, random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
# accuracy_score takes (y_true, y_pred) in that order.
score_rf = accuracy_score(y_test, y_pred)

print('Accuracy Score of Random Forest Classifier :', score_rf)

In [None]:
# Fit a gradient boosting classifier on the training split and score it on
# the held-out split. random_state makes the fitted model reproducible.
gbc = GradientBoostingClassifier(random_state=42)
gbc.fit(X_train, y_train)
y_pred = gbc.predict(X_test)
# accuracy_score takes (y_true, y_pred) in that order.
score_gbc = accuracy_score(y_test, y_pred)

print('Accuracy Score of Gradient Boosting Classifier :', score_gbc)

In [None]:
# Fit a decision tree on the training split and score it on the held-out
# split. random_state fixes the tie-breaking between equally good splits so
# the reported score is reproducible.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)
# accuracy_score takes (y_true, y_pred) in that order.
score_dt = accuracy_score(y_test, y_pred)

print('Accuracy Score of Decision Tree Classifier :', score_dt)

In [None]:
# Find n_neighbors for best score
# NOTE(review): the best k is chosen by its accuracy on the test split, so
# the reported KNN score is optimistically biased compared with the other
# models; consider selecting k by cross-validation on the training split.
neighbor_range = range(1, 11)
accuracy = []

for i in neighbor_range:
    model = KNeighborsClassifier(n_neighbors=i)
    model.fit(X_train, y_train)
    # Predict once and reuse the predictions for the metric.
    y_pred = model.predict(X_test)
    accuracy.append(accuracy_score(y_test, y_pred))

score_knn = max(accuracy)
# First k that reaches the best accuracy.
best_k = neighbor_range[accuracy.index(score_knn)]
print('Accuracy Score of K-Nearest Neighbors Classifier : ', score_knn)
print('Best n_neighbors :', best_k)

# Accuracy as a function of n_neighbors.
plt.figure(figsize=(7, 4))
plt.plot(neighbor_range, accuracy, linestyle='dashed', marker='o', color='blue',
         markersize=7, markerfacecolor='red')
plt.xlabel('n_neighbors')
plt.ylabel('Accuracy')
plt.show()

In [None]:
# Fit a support vector classifier (default settings) on the training split
# and score it on the held-out split.
svm = SVC()
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)
# accuracy_score takes (y_true, y_pred) in that order.
score_svm = accuracy_score(y_test, y_pred)

print('Accuracy Score of Support Vector Classifier :', score_svm)

# Result

In [None]:
# Collect the accuracy of every fitted model, keyed by its display name.
model_scores = {
    'Logistic Regression': score_logreg,
    'RandomForest': score_rf,
    'Gradient Boosting': score_gbc,
    'Decision Tree': score_dt,
    'K-Nearest Neighbors': score_knn,
    'Support Vector': score_svm,
}

# One row per model, ordered from the highest score to the lowest.
df_result = pd.DataFrame({'Model': list(model_scores.keys()),
                          'Score': list(model_scores.values())})
df_result = df_result.sort_values(by='Score', ascending=False)

In [None]:
# Two-panel figure: a text-only title panel on the left and a horizontal bar
# chart of the model scores on the right.
fig = plt.figure(figsize=(15, 6))
fig.patch.set_facecolor(background_color)

gs = fig.add_gridspec(1, 2)
gs.update(wspace=0.25)

ax0, ax1 = (fig.add_subplot(gs[0, col]) for col in range(2))
axes = [ax0, ax1]

# Styling shared by both panels: background colour, no top/right/bottom spine.
for panel in axes:
    panel.set_facecolor(background_color)
    for side in ('top', 'right', 'bottom'):
        panel.spines[side].set_visible(False)

# Title
ax0.text(0.5, 0.5, 'Accuracy Score of Model\n ___________________',
         fontsize=18, fontfamily='serif', fontweight='bold',
         horizontalalignment='center',
         verticalalignment='center')
ax0.set_xticklabels([])
ax0.set_yticklabels([])
ax0.tick_params(left=False, bottom=False)
# The title panel additionally loses its left spine, leaving it frameless.
ax0.spines['left'].set_visible(False)

# Graph
ax1.grid(color='#000000', linestyle=':', axis='y', zorder=0, dashes=(1,5))
sns.barplot(x='Score', y='Model', data=df_result, palette=color_palette, ax=ax1)
ax1.set_xlabel('')
ax1.set_ylabel('')

## Please upvote if you like my notebook.
## Thank you!