In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input tree top-down; the sub-directory list (second tuple item) is not needed.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Print the full path of each file.
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the water-potability CSV from the input directory into a DataFrame.
df = pd.read_csv('../input/water-potability/water_potability.csv')

In [None]:
# Preview the first rows.
df.head()

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics for the numeric columns.
df.describe()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Row count for each Potability value.
df.Potability.value_counts()

In [None]:
 # Discard only the rows in which ph, Sulfate and Trihalomethanes are all missing together.
 df = df.dropna(subset=['ph','Sulfate','Trihalomethanes'], how='all')

In [None]:
df['ph'].fillna(df.ph.mean(),inplace= True)

In [None]:
df['Sulfate'].fillna(df.Sulfate.mean(),inplace= True)
df['Trihalomethanes'].fillna(df.Trihalomethanes.mean(),inplace= True)

In [None]:
# Re-count missing values after the row drop and mean imputation.
df.isnull().sum()

In [None]:
# Preview the cleaned frame.
df.head()

In [None]:
# Column names of the cleaned frame.
df.columns

# **Data Visualization**

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import plotly.express as px

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
plt.figure(figsize=(10,10))
sns.heatmap(df.corr(),annot=True, cmap='YlGnBu')

In [None]:

# Pairwise plot grid over the columns, coloured by Potability.
sns.pairplot(df, hue='Potability',palette='mako')

In [None]:
# Number of rows per Potability value.
sns.countplot(x='Potability',data =df)

In [None]:
# Interactive scatter matrix over four of the measurements, coloured by Potability.
fig = px.scatter_matrix(df,
    dimensions=['ph', 'Hardness', 'Solids','Turbidity'],
    color="Potability")
fig.show()

In [None]:
# ph against Sulfate, coloured by Potability.
fig = px.scatter(df, x="ph", y="Sulfate", color="Potability")
fig.show()

In [None]:
# Chloramines histogram split by Potability, with a violin plot in the margin.
fig = px.histogram(df, x="Chloramines", color="Potability", marginal="violin", # can be `box`, `violin`
                         hover_data=df.columns)
fig.show()


In [None]:
# Density contours of Conductivity vs Solids, with a rug (x) and a histogram (y) in the margins.
fig = px.density_contour(df, x="Conductivity", y="Solids",color="Potability", marginal_x="rug", marginal_y="histogram")
fig.show()

In [None]:
# Ternary scatter of Chloramines, Turbidity and Conductivity, coloured by Potability.
fig = px.scatter_ternary(df, a="Chloramines", b="Turbidity", c="Conductivity",color='Potability')
fig.show()

# **Feature Selection**

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X = df.drop('Potability',axis=1)
y = df.Potability

In [None]:
 X_train, X_test, y_train, y_test = train_test_split(
     X, y, 
     test_size=0.3,
     random_state=0)

## 1. **Using Pearson Correlation**

In [None]:
# Shapes of the training features and training labels.
X_train.shape, y_train.shape

In [None]:
# Correlation heatmap restricted to the training features.
plt.figure(figsize=(10,10))
sns.heatmap(X_train.corr(),annot=True, cmap='YlGnBu')

In [None]:
def correlation(dataset,threshold):
    """Return the names of columns that are strongly correlated with an earlier column.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose pairwise column correlations (``dataset.corr()``) are inspected.
    threshold : float
        A column is flagged when the absolute value of its correlation with any
        column that precedes it is strictly greater than this value.

    Returns
    -------
    set
        Names of the flagged columns. For each correlated pair only the later
        column is reported, so the earlier one can be kept.
    """
    # Use the magnitude of the coefficient: a strong negative correlation makes a
    # column just as redundant as a strong positive one.
    corr_matrix = dataset.corr().abs()
    col_corr = set()
    for i in range(len(corr_matrix.columns)):
        # Only the lower triangle (j < i) is scanned, so every pair is seen once.
        for j in range(i):
            if corr_matrix.iloc[i, j] > threshold:
                col_corr.add(corr_matrix.columns[i])
                break  # one strong partner is enough to flag this column
    return col_corr

In [None]:
corr_features = correlation(X,0.85)
len(set(corr_features))

In [None]:
corr_features

## 2. **Using ExtraTreesClassifier**

In [None]:
## Feature Importance
from sklearn.ensemble import ExtraTreesClassifier
model = ExtraTreesClassifier()

In [None]:
model.fit(X,y)

In [None]:
print(model.feature_importances_)

In [None]:
## plot graph of feature importance for better visualization
feat_importances = pd.Series(model.feature_importances_, index=X.columns)
feat_importances.nlargest(5).plot(kind='barh')
plt.show()

## 3. **Using Mutual Information Gain** 

In [None]:
from sklearn.feature_selection import mutual_info_classif

from sklearn.feature_selection import SelectKBest

In [None]:
# Mutual information between each training feature and the target.
mutual_info = mutual_info_classif(X_train,y_train)
mutual_info

In [None]:
# Label the scores with the feature names and list them from highest to lowest.
mutual_info = pd.Series(mutual_info)
mutual_info.index = X_train.columns
mutual_info.sort_values(ascending=False)

In [None]:
# The same ranking as a bar chart.
mutual_info.sort_values(ascending=False).plot.bar(figsize=(20,13))

In [None]:
# SelectKBest with k=5, scored by mutual information on the training split.
select_5_best = SelectKBest(mutual_info_classif,k=5)
select_5_best.fit(X_train,y_train)
# Names of the columns picked by the selector's support mask.
cols = X_train.columns[select_5_best.get_support()]

In [None]:
cols

# **Feature Scaling**

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Learn the per-feature mean and standard deviation from the training rows only ...
X_train = scaler.fit_transform(X_train)
# ... and reuse those statistics for the test rows. Re-fitting on X_test would scale it
# with its own statistics, so train and test would no longer share one feature space
# and information about the test set would enter the preprocessing.
X_test = scaler.transform(X_test)

# **Prediction and Evaluation**

## 1. Using RandomForestClassifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()

In [None]:
# RANDOMIZED SEARCH CV
# HYPERPARAMETERS

## no.of trees in the radnom forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1200, num = 12)]

## criterion
criterion = ["gini", "entropy"]

## no.of features to consider at every split
max_features = ['auto','sqrt']

## max no of levels of trees
max_depth = [int(x) for x in np.linspace(5,30,num=6)]

## min no of samples required to split a node
min_samples_split = [2,5,10,15,100]

## min no. of samples required at each leaf node
min_samples_leaf = [1,2,3,10]

In [None]:
from sklearn.model_selection import RandomizedSearchCV , GridSearchCV

In [None]:
# Bundle the candidate values into the search space passed to RandomizedSearchCV
random_grid = dict(
    n_estimators=n_estimators,
    criterion=criterion,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
)
print(random_grid)

In [None]:
# Randomized search over random_grid: 10 sampled settings, each evaluated with 5-fold CV.
# Candidates are scored by accuracy: rfc is a classifier, so a regression loss such as
# neg_mean_squared_error is the wrong metric to select on and to report as best_score_.
rfc_random = RandomizedSearchCV(estimator = rfc,
                               param_distributions=random_grid,
                               scoring='accuracy',
                               n_iter = 10,
                               cv = 5,
                               verbose = 2,
                               random_state = 42,
                               n_jobs = 1)

In [None]:
# Run the randomized search on the training data.
rfc_random.fit(X_train,y_train)

In [None]:
# Predict the held-out rows with the fitted search object.
prediction = rfc_random.predict(X_test)

In [None]:
prediction

In [None]:
from sklearn.metrics import classification_report,confusion_matrix

In [None]:
# Classification report for the test predictions.
print(classification_report(y_test,prediction))

In [None]:
# Confusion matrix of the true test labels against the predictions.
print(confusion_matrix(y_test,prediction))

## 2. **Using STRATIFIED K_FOLD**

In [None]:
from sklearn.linear_model import SGDClassifier
# SGD classifier with default settings.
sgd_cal = SGDClassifier()

In [None]:
from sklearn.model_selection import cross_val_score
# One score per fold from 10-fold cross-validation on the training data.
score = cross_val_score(sgd_cal,X_train,y_train,cv=10)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
score

In [None]:
# Average of the fold scores.
score.mean()

In [None]:
from sklearn.model_selection import StratifiedKFold

In [None]:
accuracy = []
skfl = StratifiedKFold(n_splits=10,random_state=None)
skfl.get_n_splits(X_train,y_train)
for train_index,test_index in skfl.split(X_train,y_train):
    print('train:',train_index,'validation:',test_index)
    X1_train , X1_test = X_train[train_index] , X_train[test_index]
    y1_train , y1_test = y_train.iloc[train_index] , y_train.iloc[test_index]
    
    sgd_cal.fit(X1_train,y1_train)
    prediction = sgd_cal.predict(X1_test)
    score = accuracy_score(prediction,y1_test)
    accuracy.append(score)

In [None]:
np.array(accuracy).mean()

## 3. **Using Logistic Regression**

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression with default settings, fitted on the training data.
lr = LogisticRegression()
lr.fit(X_train,y_train)

In [None]:
# Predict the held-out rows.
prediction = lr.predict(X_test)

In [None]:
# Classification report for the test predictions.
print(classification_report(y_test,prediction))

In [None]:
# Confusion matrix drawn as an annotated heatmap.
sns.heatmap(confusion_matrix(y_test,prediction),annot=True)

In [None]:
# Accuracy on the test rows.
print(accuracy_score(y_test,prediction))

## 4. **Using AdaBoost Classifier**

In [None]:
from sklearn.ensemble import AdaBoostClassifier


In [None]:
ada = AdaBoostClassifier()


In [None]:

params_ada = {'n_estimators': [50,100,250,400,500,600], 'learning_rate': [0.2,0.5,0.8,1]}
grid_ada =  RandomizedSearchCV(ada, param_distributions = params_ada, cv=5)

In [None]:
grid_ada.fit(X_train,y_train)

In [None]:
y_pred = grid_ada.predict(X_test)
accuracy = accuracy_score(y_test,y_pred)
print(accuracy)

## 5. **Using SGDClassifier**

In [None]:
from sklearn.linear_model import SGDClassifier

In [None]:
# Fresh SGD classifier with default settings (replaces the instance used in the fold loop).
sgd_cal = SGDClassifier()

In [None]:
# Fit on the full training split.
sgd_cal.fit(X_train,y_train)

In [None]:
# Predict the held-out rows.
predictions = sgd_cal.predict(X_test)

In [None]:
# Classification report for the test predictions.
print(classification_report(y_test,predictions))

In [None]:
# Confusion matrix drawn as an annotated heatmap.
sns.heatmap(confusion_matrix(y_test,predictions),annot=True,cmap='YlGnBu')

In [None]:
# Accuracy on the test rows.
print(accuracy_score(y_test,predictions))