In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Importing the required libraries

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import MinMaxScaler

## Importing the dataset

In [None]:
# Read the water potability CSV into a DataFrame and preview its first five rows
df = pd.read_csv('../input/water-potability/water_potability.csv')
df.head()

In [None]:
# Summary statistics for the columns of df, displayed as the cell output
df.describe()

# Data Analysis and Visualization

In [None]:
# Draw one boxplot per column on a 3x3 grid.
# The nine hand-written sns.boxplot calls are replaced by a single loop:
# axes.flat walks the grid row by row, so panel k shows column[k], and the
# four colours repeat in order across the panels.
colors = ['#ff0000','#fff000','#18fff9','#8f139f']
fig, axes = plt.subplots(3, 3, figsize=(20, 12))
column = df.columns
fig.suptitle('Boxplots of each variable')
for idx, ax in enumerate(axes.flat):
    sns.boxplot(ax=ax, x=column[idx], data=df, color=colors[idx % len(colors)])
plt.show()

## Checking for Null values

In [None]:
# Number of missing values in each column
df.isnull().sum()

## Imputing the missing values

In [None]:
# Fill the missing values column by column.
# The filled Series is assigned back to df rather than calling
# fillna(..., inplace=True) on a column selection: that chained in-place form
# emits a FutureWarning on pandas 2.x and leaves df unchanged when
# Copy-on-Write is enabled.
# NOTE(review): the fill statistics are computed on the full dataset, before
# the train/test split, so test rows influence the imputed values.

# ph values are evenly distributed, so the mean is used
df['ph'] = df['ph'].fillna(df['ph'].mean())
# Sulfate values lean slightly to the right and contain outliers that may distort the mean, so the median is used
df['Sulfate'] = df['Sulfate'].fillna(df['Sulfate'].median())
# Trihalomethanes values are evenly distributed, so the mean is used
df['Trihalomethanes'] = df['Trihalomethanes'].fillna(df['Trihalomethanes'].mean())

In [None]:
# Re-count missing values per column to confirm the imputation left none
df.isnull().sum()

## Pairplot gives a fair understanding about data distribution

In [None]:
# Pairwise plots of the columns of df, coloured by the Potability label
sns.pairplot(data=df,hue='Potability')

### Our dataset has fewer samples of class 1

In [None]:
# Number of rows for each value of the Potability label
df['Potability'].value_counts()

# Data Preparation

- ## Splitting the Dataframe

In [None]:
# Target label is the Potability column; every other column is a feature
y = df['Potability']
X = df.drop(columns=['Potability'])

- ## Generating Train and Test sets

In [None]:
# Hold out 20% of the rows as the test set; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=17,
)

- ### We can see the imbalance in classes

In [None]:
# Bar chart of how many training samples fall in each class
sns.countplot(x=y_train)

- ### We will under-sample the majority class and over-sample the minority class; this combination gives us the best results

In [None]:
# Randomly drop majority-class rows from the training set (sampling_strategy=0.75).
# random_state is fixed (same seed as the train/test split) so the resampled
# training set is identical on every Restart & Run All.
rus = RandomUnderSampler(sampling_strategy=0.75, random_state=17)
X_train, y_train = rus.fit_resample(X_train, y_train)

In [None]:
# Class counts in the training set after under-sampling
sns.countplot(x=y_train)

In [None]:
# Over-sample the minority class of the training set with SMOTE.
# random_state is fixed (same seed as the train/test split) so the synthetic
# samples are identical on every Restart & Run All.
smote = SMOTE(sampling_strategy='minority', random_state=17)
X_train, y_train = smote.fit_resample(X_train, y_train)

- ### After Sampling is completed

In [None]:
# Class counts in the training set after both resampling steps
sns.countplot(x=y_train)

- ### We will use MinMaxScaler from sklearn library to scale the data in the range of 0 to 1

In [None]:
# Learn the per-feature min/max from the training features only, then apply
# the scaling to those same features; the scaled array is shown as the cell output
scaler = MinMaxScaler()
scaler.fit(X_train)
X_scaled = scaler.transform(X_train)
X_scaled

# Model Selection

- ### Importing the models from sklearn

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report

- ### Function to evaluate base models

In [None]:
# Baseline classifiers, keyed by the label printed above each report
models = {
    'Logistic Regression': LogisticRegression,
    'Random Forest': RandomForestClassifier,
    'KNN': KNeighborsClassifier,
    'Support Vector': SVC,
    'Naive bayes gaussian': GaussianNB,
}
# Constructor arguments per model. Random Forest is seeded (same seed as the
# train/test split) so its report is repeatable; models without an entry are
# built with their defaults.
model_kwargs = {'Random Forest': {'random_state': 17}}

# Scale the test features with the scaler already fitted on the training data
X_test_scaled = scaler.transform(X_test)

# Fit each model on the scaled training data and report its test-set metrics
for name, model_cls in models.items():
    clf = model_cls(**model_kwargs.get(name, {}))
    clf.fit(X_scaled, y_train)
    print(name)
    print(classification_report(y_test, clf.predict(X_test_scaled)))

#### Random Forest seems to perform better than the other models on both classes

# Hyperparameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV

# Search grid for the random forest.
# 'auto' is no longer listed under max_features: for classifiers it was an
# alias of 'sqrt', was deprecated in scikit-learn 1.1 and removed in 1.3, so
# keeping it makes those grid fits fail on current versions without adding
# any distinct candidate.
params = [{
    'n_estimators': [150, 200, 250, 300],
    'criterion': ['gini', 'entropy'],
    'max_features': ['sqrt', 'log2'],
}]
rf_clf = RandomForestClassifier(random_state=17)

# Cross-validated grid search on the scaled training data; predict() on the
# fitted search uses the best estimator found
final_clf = GridSearchCV(rf_clf, params)
final_clf.fit(X_scaled, y_train)
print(classification_report(y_test, final_clf.predict(X_test_scaled)))

### We managed to get a small improvement in the F1 score for class '1'

# The End
`If you liked the notebook, don't forget to upvote. Suggestions are always welcome.`
`Follow me on Linkedin :` __[Atharva_Dumbre](https://www.linkedin.com/in/atharva-dumbre-208b5716b)__