In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _, file_names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, file_name) for file_name in file_names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Loading Data From a CSV File**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# Silence every warning category for the rest of the session so library
# notices do not clutter the cell output.
# NOTE(review): this also hides deprecation warnings; consider a narrower filter.
warnings.simplefilter("ignore")

In [None]:
# Read the CSV file from the working directory into the main DataFrame.
dataset = pd.read_csv(filepath_or_buffer='data.csv')

Checking for missing values, statistical summary, type of data and normalization.

In [None]:
# Show the freshly loaded DataFrame as the cell output for a first look.
dataset

In [None]:
# Column names, dtypes and non-null counts (reveals which columns have gaps).
dataset.info()

In [None]:
# Summary statistics of the columns before any missing values are filled.
dataset.describe()

Filling missing values in each column with that column's mean.

In [None]:
# Replace missing Sulfate readings with the column mean.
# `fillna` is used instead of `replace(np.NaN, ...)`: the `np.NaN` alias no
# longer exists in NumPy 2.x, and `fillna` is the dedicated pandas call.
dataset["Sulfate"] = dataset["Sulfate"].fillna(dataset["Sulfate"].mean())

In [None]:
# Replace missing Trihalomethanes readings with the column mean.
# `fillna` is used instead of `replace(np.NaN, ...)`: the `np.NaN` alias no
# longer exists in NumPy 2.x, and `fillna` is the dedicated pandas call.
dataset["Trihalomethanes"] = dataset["Trihalomethanes"].fillna(dataset["Trihalomethanes"].mean())

In [None]:
# Replace missing ph readings with the column mean.
# `fillna` is used instead of `replace(np.NaN, ...)`: the `np.NaN` alias no
# longer exists in NumPy 2.x, and `fillna` is the dedicated pandas call.
dataset["ph"] = dataset["ph"].fillna(dataset["ph"].mean())

This can also be done with scikit-learn's built-in **SimpleImputer** using `strategy='mean'`.


In [None]:
# Re-check the non-null counts after the mean imputation above.
dataset.info()

In [None]:
# Number of missing values remaining in each column.
dataset.isnull().sum()

In [None]:
# Keep a handle on the column index of the DataFrame.
col=dataset.columns


In [None]:
# Display the column labels captured in `col`.
col

In [None]:
# Count the distinct ph values. The call parentheses are required: without
# them the cell only shows the bound method object, not the count.
dataset["ph"].nunique()

In [None]:
# Print the first twenty Sulfate values to eyeball the imputed column.
print(dataset["Sulfate"].head(20))

In [None]:
# Display the DataFrame again now that the missing values have been filled.
dataset

In [None]:
# Summary statistics after imputation, for comparison with the earlier summary.
dataset.describe()

In [None]:
# Pairwise correlation matrix of the columns.
dataset.corr()

**Exploratory Data Analysis**

In [None]:
!pip install plotly matplotlib seaborn --quiet

In [None]:
import plotly.express as px
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Bar chart of the class balance. The column is passed by keyword because
# newer seaborn versions bind the first positional argument to `data`, not `x`.
sns.countplot(x='Potability', data=dataset)
# Exact number of rows in each class, shown as the cell output.
dataset['Potability'].value_counts()

In [None]:
# Annotated heatmap of the pairwise column correlations.
sns.heatmap(
    data=dataset.corr(),
    annot=True,
    cmap='Reds',
)

Info: The correlations between the attributes are very low, which suggests there is little linear dependence between them.

In [None]:
# Flatten the correlation matrix, sort from the largest value downwards,
# collapse repeated values and show the top eight.
(
    dataset.corr()
    .unstack()
    .sort_values(ascending=False)
    .drop_duplicates()
    .head(8)
)

In [None]:
# Global plotting defaults: seaborn dark grid, larger font, 8x6 figures and a
# fully transparent figure background.
sns.set_style('darkgrid')
matplotlib.rcParams.update({
    'font.size': 14,
    'figure.figsize': (8, 6),
    'figure.facecolor': '#00000000',
})

In [None]:
# Let us check the distribution of the attributes.
# One histogram (with a KDE overlay) per feature on a 3x3 grid. The nine
# copy-pasted stanzas are replaced by a loop, and the deprecated
# `sns.distplot` by `sns.histplot(..., kde=True, stat='density')`.
feature_columns = [
    'ph', 'Hardness', 'Solids',
    'Chloramines', 'Sulfate', 'Conductivity',
    'Organic_carbon', 'Trihalomethanes', 'Turbidity',
]

fig, axes = plt.subplots(3, 3, figsize=(22, 11))
fig.subplots_adjust(hspace=0.4, wspace=0.4)
for ax, column in zip(axes.flat, feature_columns):
    sns.histplot(
        dataset[column],
        bins=8,
        kde=True,
        stat='density',   # same y-axis scale as the old distplot output
        edgecolor='k',
        linewidth=1,
        ax=ax,
    )
plt.show()

We looked for insights in the graphs and plots above, but the values of each attribute are fairly evenly distributed, so no single feature stands out.

**Pairplot**

In [None]:
# Scatter-matrix of every pair of columns, coloured by the Potability class.
sns.pairplot(dataset, hue='Potability')

In [None]:
# Interactive histogram of ph (40 bins) with a box plot in the margin and a
# small gap between the bars.
fig = px.histogram(
    dataset, x='ph', nbins=40, marginal='box', title='ph'
).update_layout(bargap=0.1)
fig.show()

In [None]:
# Interactive histogram of Hardness (40 bins) with a box plot in the margin
# and a small gap between the bars.
fig = px.histogram(
    dataset, x='Hardness', nbins=40, marginal='box', title='hardness'
).update_layout(bargap=0.1)
fig.show()

In [None]:
# Interactive histogram of Chloramines (40 bins) with a box plot in the
# margin and a small gap between the bars.
fig = px.histogram(
    dataset, x='Chloramines', nbins=40, marginal='box', title='Chloramines'
).update_layout(bargap=0.1)
fig.show()

In [None]:
# Hardness against ph, one small marker per sample, coloured by Potability.
fig = px.scatter(
    dataset,
    x='Hardness',
    y='ph',
    color='Potability',
    opacity=0.8,
    title='Comparisions',
).update_traces(marker_size=5)
fig.show()

**Training, Validation, Testing, Scaling, One-hot Encoder**

In [None]:
!pip install scikit-learn --upgrade 

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for testing. The fixed random_state makes the
# split, and therefore every score reported below, reproducible across runs.
train_df, test_df = train_test_split(dataset, test_size=0.3, random_state=101)

In [None]:
# (rows, columns) of the held-out test split.
test_df.shape

In [None]:
# (rows, columns) of the training split.
train_df.shape

**Identify Input and Output Columns**

In [None]:
# Display the DataFrame to see which columns are inputs and which is the target.
dataset

In [None]:
# Every column except the last one is used as a model input.
# NOTE(review): this assumes the target column is the last column of the frame.
input_cols = train_df.columns[:-1].tolist()

In [None]:
# Name of the column the model is trained to predict.
target_col='Potability' 

In [None]:
# Independent copies of the training features and labels, so later scaling
# does not modify train_df.
train_input, train_target = (
    train_df[input_cols].copy(),
    train_df[target_col].copy(),
)

**Scaling** the values is helpful because it can increase the accuracy of the model and reduce its error.

In [None]:
# Independent copies of the test features and labels, so later scaling does
# not modify test_df.
test_input = test_df.loc[:, input_cols].copy()
test_target = test_df.loc[:, target_col].copy()

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Min-max scaler created with its default settings.
scaler = MinMaxScaler()

In [None]:
# Fit the scaler on the training features only; the test split is not used here.
scaler.fit(train_df[input_cols])

In [None]:
# Per-feature minimum values recorded by the fitted scaler.
list(scaler.data_min_)


In [None]:
# Apply the fitted scaler to both feature frames, overwriting the raw values.
# NOTE: re-running this cell scales the already-scaled values again.
for frame in (train_input, test_input):
    frame[input_cols] = scaler.transform(frame[input_cols])


In [None]:
# First twenty rows of the scaled training features.
train_input.head(20)

In [None]:
# Scatter of Hardness against ph, coloured by Potability.
# NOTE(review): this plots `dataset` (the unscaled frame), not the scaled
# train_input/test_input, and repeats an earlier scatter cell unchanged.
fig = px.scatter(dataset, 
                 x='Hardness', 
                 y='ph',
                 color = 'Potability',
                 opacity=0.8, 
                 title='Comparisions')
# Shrink the markers so individual points stay distinguishable.
fig.update_traces(marker_size=5)
fig.show()

**BUILDING A MACHINE LEARNING MODEL**

**Random Forest Classifier**

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, f1_score, accuracy_score

In [None]:
# IPython help lookup: shows the RandomForestClassifier documentation.
# NOTE(review): leftover interactive introspection; not needed by later cells.
?RandomForestClassifier

In [None]:
# Hyper-parameter grid for the random forest.
# 'auto' was removed from max_features: for classifiers it was only an alias
# of 'sqrt' (so it doubled the search for identical models) and recent
# scikit-learn releases reject it as an invalid value.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_features': ['sqrt'],
    'bootstrap': [True, False],
    'criterion': ['entropy', 'gini'],
}

# Exhaustive 10-fold cross-validated search. n_jobs=-2 uses all CPU cores but
# one; verbose=1 keeps a progress summary without printing a line per fit.
rfcgrid = GridSearchCV(
    RandomForestClassifier(random_state=101),
    param_grid,
    verbose=1,
    cv=10,
    n_jobs=-2,
)
rfcgrid.fit(train_input, train_target)

In [None]:
# Parameter combination selected by the grid search.
rfcgrid.best_params_

In [None]:
# Predict the labels of the scaled test features with the fitted grid search object.
rfcpredictions = rfcgrid.predict(test_input)


In [None]:
# Report the performance of the tuned random forest on the held-out test split.
# The first heading no longer says "Using Entropy Index": the split criterion
# is whichever one the grid search selected, which may be gini.
print("Confusion Matrix - Random Forest")
print(confusion_matrix(test_target, rfcpredictions))
print("\n")
print("Accuracy Score - Random Forest")
print(accuracy_score(test_target, rfcpredictions))
print("\n")
print("Classification Report - Random Forest")
print(classification_report(test_target, rfcpredictions))
print("\n")
print("F1 Score - Random Forest")
print(f1_score(test_target, rfcpredictions))

THANK YOU! THIS IS MY FIRST PROJECT. ACCURACY - 69.78%