# Introduction

This notebook is based on the approach in: https://www.kaggle.com/gunesevitan/titanic-advanced-feature-engineering-tutorial

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(style="darkgrid")

import string
import warnings
warnings.filterwarnings('ignore')

SEED = 42

* The data set has 3276 rows
* The data set has 10 columns: 9 features plus the target
* `Potability` is the target variable

In [None]:
# Read the water-potability CSV and tag the frame with a display name that is
# printed later in the missing-values report.
df_all = pd.read_csv('../input/water-potability/water_potability.csv')
df_all.name = 'All Set'

# Size summary: row count, full shape, length of the target column, and the
# column index.
print('Number of Data Examples = {}'.format(len(df_all)))
print('Data X Shape = {}'.format(df_all.shape))
print('Data y Shape = {}\n'.format(len(df_all['Potability'])))
print(df_all.columns)


# Exploratory Data Analysis

# Overview

* Potability is the target variable we are trying to predict (0 or 1):
  * 1 = Potable
  * 0 = Not Potable
* ph: pH of the water (0 to 14).
* Hardness: Capacity of water to precipitate soap in mg/L.
* Solids: Total dissolved solids in ppm.
* Chloramines: Amount of Chloramines in ppm.
* Sulfate: Amount of Sulfates dissolved in mg/L.
* Conductivity: Electrical conductivity of water in μS/cm.
* Organic_carbon: Amount of organic carbon in ppm.
* Trihalomethanes: Amount of Trihalomethanes in μg/L.
* Turbidity: Measure of the light-emitting property of water in NTU.

In [None]:
# Preview the first five rows (five is the default for head()).
df_all.head()

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit an extra "None" line).
df_all.info()

In [None]:
# Summary statistics for the columns of df_all, rendered as the cell output.
df_all.describe()

### Missing Values

In [None]:
def display_missing(df):
    """Print the number of missing values in each column of ``df``.

    One line is printed per column, in column order, followed by a blank
    separator. Nothing is returned.
    """
    template = '{} column missing values: {}'
    for column_name in df.columns:
        n_missing = df[column_name].isnull().sum()
        print(template.format(column_name, n_missing))
    print('\n')
    
# Head the report with the frame's display name, then list the per-column
# missing-value counts.
print(df_all.name)
display_missing(df_all)

# Target Distribution

* 39.01% (1278/3276) of data set is Class 1
* 60.99% (1998/3276) of data set is Class 0

In [None]:
# Count each class once; value_counts() is indexed by the class label (0 / 1).
class_counts = df_all['Potability'].value_counts()
Potable = class_counts[1]
not_Potable = class_counts[0]
Potable_per = Potable / df_all.shape[0] * 100
not_Potable_per = not_Potable / df_all.shape[0] * 100

print('{} of {} Potable and it is the {:.2f}% .'.format(Potable, df_all.shape[0], Potable_per))
print('{} of {} not Potable and it is the {:.2f}% .'.format(not_Potable, df_all.shape[0], not_Potable_per))

plt.figure(figsize=(10, 8))
# Name the column via x=/data=: seaborn >= 0.12 binds the first positional
# argument to `data`, so a bare Series is no longer read as the x variable.
sns.countplot(x='Potability', data=df_all)

plt.xlabel('Potability', size=15, labelpad=15)
plt.ylabel('Count', size=15, labelpad=15)
# Replace the raw 0/1 tick labels with class names and their share of the data.
plt.xticks((0, 1), ['Not Potable ({0:.2f}%)'.format(not_Potable_per), 'Potable ({0:.2f}%)'.format(Potable_per)])
plt.tick_params(axis='x', labelsize=13)
plt.tick_params(axis='y', labelsize=13)

plt.title('Data Set Potability Distribution', size=15, y=1.05)

plt.show()

# Correlations

The features are only weakly correlated with each other, so there is no strong linear dependence between them.

In [None]:
# Heatmap of the pairwise column correlations, annotated with the coefficients.
plt.figure(figsize=(15, 9))
sns.heatmap(data=df_all.corr(), cmap='coolwarm', annot=True)

# Target Distribution in Features

## Continuous Features

In [None]:
# Measurement columns whose distributions are compared between the two classes.
features = ['ph', 'Trihalomethanes', 'Sulfate', 'Solids', 'Hardness', 'Organic_carbon', 'Chloramines', 'Conductivity', 'Turbidity']
# Boolean mask selecting the potable samples (Potability == 1).
surv = df_all['Potability'] == 1

fig, axs = plt.subplots(ncols=2, nrows=5, figsize=(20, 20))
plt.subplots_adjust(right=1.5, top=2)

# Draw on the axes created above instead of re-selecting them through
# plt.subplot(), one panel per feature.
panels = axs.flatten()
for ax, feature in zip(panels, features):
    # NOTE(review): distplot is deprecated in recent seaborn releases; consider
    # histplot(..., kde=True, stat='density') once the target version is known.
    sns.distplot(df_all.loc[~surv, feature], label='Not Potable', hist=True, color='#e74c3c', ax=ax)
    sns.distplot(df_all.loc[surv, feature], label='Potable', hist=True, color='#2ecc71', ax=ax)
    ax.set_xlabel('{}'.format(feature), size=15, labelpad=5)
    ax.set_ylabel('', size=10, labelpad=5)
    ax.legend(['Not Potable', 'Potable'], loc='upper right', prop={'size': 18})
    ax.set_title('Count of Potability in {} Feature'.format(feature), size=20, y=1.0)

# The 5x2 grid has more panels than there are features; hide the unused ones
# so no empty axes are rendered.
for ax in panels[len(features):]:
    ax.set_visible(False)

# Binning Continuous Features

In [None]:
# Work on a copy: a plain `df = df_all` would only alias the same object, so
# assigning binned columns to `df` would also overwrite the raw numeric
# values in `df_all`.
df = df_all.copy()

## ph

In [None]:
# Split ph into 10 quantile bins, reading from df_all like the other binning
# cells in this section.
df['ph'] = pd.qcut(df_all['ph'], 10)

In [None]:
# Class counts per ph bin, drawn through the explicit Axes object.
fig, axs = plt.subplots(figsize=(22, 9))
sns.countplot(data=df, x='ph', hue='Potability', ax=axs)

axs.set_xlabel('ph', size=15, labelpad=20)
axs.set_ylabel('Count', size=15, labelpad=20)
axs.tick_params(axis='x', labelsize=10)
axs.tick_params(axis='y', labelsize=15)

# Relabel the hue legend (0 / 1) with readable class names.
axs.legend(['Not Potable', 'Potable'], loc='upper right', prop={'size': 15})
axs.set_title('Count of {} Feature'.format('ph'), size=15, y=1)

plt.show()

## Trihalomethanes

In [None]:
# Split Trihalomethanes into 10 quantile bins.
df['Trihalomethanes'] = pd.qcut(df_all['Trihalomethanes'], q=10)

In [None]:
# Class counts per Trihalomethanes bin, drawn through the explicit Axes object.
fig, axs = plt.subplots(figsize=(22, 9))
sns.countplot(data=df, x='Trihalomethanes', hue='Potability', ax=axs)

axs.set_xlabel('Trihalomethanes', size=15, labelpad=20)
axs.set_ylabel('Count', size=15, labelpad=20)
axs.tick_params(axis='x', labelsize=10)
axs.tick_params(axis='y', labelsize=15)

# Relabel the hue legend (0 / 1) with readable class names.
axs.legend(['Not Potable', 'Potable'], loc='upper right', prop={'size': 15})
axs.set_title('Count of {} Feature'.format('Trihalomethanes'), size=15, y=1)

plt.show()

## Sulfate

In [None]:
# Split Sulfate into 10 quantile bins.
df['Sulfate'] = pd.qcut(df_all['Sulfate'], q=10)

In [None]:
# Class counts per Sulfate bin, drawn through the explicit Axes object.
fig, axs = plt.subplots(figsize=(22, 9))
sns.countplot(data=df, x='Sulfate', hue='Potability', ax=axs)

axs.set_xlabel('Sulfate', size=15, labelpad=20)
axs.set_ylabel('Count', size=15, labelpad=20)
axs.tick_params(axis='x', labelsize=10)
axs.tick_params(axis='y', labelsize=15)

# Relabel the hue legend (0 / 1) with readable class names.
axs.legend(['Not Potable', 'Potable'], loc='upper right', prop={'size': 15})
axs.set_title('Count of {} Feature'.format('Sulfate'), size=15, y=1)

plt.show()

## Solids

In [None]:
# Split Solids into 10 quantile bins.
df['Solids'] = pd.qcut(df_all['Solids'], q=10)

In [None]:
# Class counts per Solids bin, drawn through the explicit Axes object.
fig, axs = plt.subplots(figsize=(22, 9))
sns.countplot(data=df, x='Solids', hue='Potability', ax=axs)

axs.set_xlabel('Solids', size=15, labelpad=20)
axs.set_ylabel('Count', size=15, labelpad=20)
axs.tick_params(axis='x', labelsize=10)
axs.tick_params(axis='y', labelsize=15)

# Relabel the hue legend (0 / 1) with readable class names.
axs.legend(['Not Potable', 'Potable'], loc='upper right', prop={'size': 15})
axs.set_title('Count of {} Feature'.format('Solids'), size=15, y=1)

plt.show()

## Hardness

In [None]:
# Split Hardness into 10 quantile bins.
df['Hardness'] = pd.qcut(df_all['Hardness'], q=10)

In [None]:
# Class counts per Hardness bin, drawn through the explicit Axes object.
fig, axs = plt.subplots(figsize=(22, 9))
sns.countplot(data=df, x='Hardness', hue='Potability', ax=axs)

axs.set_xlabel('Hardness', size=15, labelpad=20)
axs.set_ylabel('Count', size=15, labelpad=20)
axs.tick_params(axis='x', labelsize=10)
axs.tick_params(axis='y', labelsize=15)

# Relabel the hue legend (0 / 1) with readable class names.
axs.legend(['Not Potable', 'Potable'], loc='upper right', prop={'size': 15})
axs.set_title('Count of {} Feature'.format('Hardness'), size=15, y=1)

plt.show()

## Organic_carbon

In [None]:
# Split Organic_carbon into 10 quantile bins.
df['Organic_carbon'] = pd.qcut(df_all['Organic_carbon'], q=10)

In [None]:
# Class counts per Organic_carbon bin, drawn through the explicit Axes object.
fig, axs = plt.subplots(figsize=(22, 9))
sns.countplot(data=df, x='Organic_carbon', hue='Potability', ax=axs)

axs.set_xlabel('Organic_carbon', size=15, labelpad=20)
axs.set_ylabel('Count', size=15, labelpad=20)
axs.tick_params(axis='x', labelsize=10)
axs.tick_params(axis='y', labelsize=15)

# Relabel the hue legend (0 / 1) with readable class names.
axs.legend(['Not Potable', 'Potable'], loc='upper right', prop={'size': 15})
axs.set_title('Count of {} Feature'.format('Organic_carbon'), size=15, y=1)

plt.show()

## Chloramines

In [None]:
# Split Chloramines into 10 quantile bins.
df['Chloramines'] = pd.qcut(df_all['Chloramines'], q=10)

In [None]:
# Class counts per Chloramines bin, drawn through the explicit Axes object.
fig, axs = plt.subplots(figsize=(22, 9))
sns.countplot(data=df, x='Chloramines', hue='Potability', ax=axs)

axs.set_xlabel('Chloramines', size=15, labelpad=20)
axs.set_ylabel('Count', size=15, labelpad=20)
axs.tick_params(axis='x', labelsize=10)
axs.tick_params(axis='y', labelsize=15)

# Relabel the hue legend (0 / 1) with readable class names.
axs.legend(['Not Potable', 'Potable'], loc='upper right', prop={'size': 15})
axs.set_title('Count of {} Feature'.format('Chloramines'), size=15, y=1)

plt.show()

## Conductivity

In [None]:
# Split Conductivity into 10 quantile bins.
df['Conductivity'] = pd.qcut(df_all['Conductivity'], q=10)

In [None]:
# Class counts per Conductivity bin, drawn through the explicit Axes object.
fig, axs = plt.subplots(figsize=(22, 9))
sns.countplot(data=df, x='Conductivity', hue='Potability', ax=axs)

axs.set_xlabel('Conductivity', size=15, labelpad=20)
axs.set_ylabel('Count', size=15, labelpad=20)
axs.tick_params(axis='x', labelsize=10)
axs.tick_params(axis='y', labelsize=15)

# Relabel the hue legend (0 / 1) with readable class names.
axs.legend(['Not Potable', 'Potable'], loc='upper right', prop={'size': 15})
axs.set_title('Count of {} Feature'.format('Conductivity'), size=15, y=1)

plt.show()

## Turbidity

In [None]:
# Split Turbidity into 10 quantile bins.
df['Turbidity'] = pd.qcut(df_all['Turbidity'], q=10)

In [None]:
# Class counts per Turbidity bin, drawn through the explicit Axes object.
fig, axs = plt.subplots(figsize=(22, 9))
sns.countplot(data=df, x='Turbidity', hue='Potability', ax=axs)

axs.set_xlabel('Turbidity', size=15, labelpad=20)
axs.set_ylabel('Count', size=15, labelpad=20)
axs.tick_params(axis='x', labelsize=10)
axs.tick_params(axis='y', labelsize=15)

# Relabel the hue legend (0 / 1) with readable class names.
axs.legend(['Not Potable', 'Potable'], loc='upper right', prop={'size': 15})
axs.set_title('Count of {} Feature'.format('Turbidity'), size=15, y=1)

plt.show()

# Box plot

In [None]:
# Re-read the CSV so the box plots below work on freshly loaded numeric
# columns rather than on any binned versions produced in the previous section.
# The reloaded frame does not carry the `name` attribute set on the first load.
df_all = pd.read_csv('../input/water-potability/water_potability.csv')

In [None]:
# Box plot of ph. The call is made for the figure it draws, so its Axes return
# value is not printed.
df_all['ph'].plot.box()
plt.show()

In [None]:
# Box plot of Trihalomethanes. The call is made for the figure it draws, so its
# Axes return value is not printed.
df_all['Trihalomethanes'].plot.box()
plt.show()

In [None]:
# Box plot of Sulfate. The call is made for the figure it draws, so its Axes
# return value is not printed.
df_all['Sulfate'].plot.box()
plt.show()

In [None]:
# Box plot of Solids. The call is made for the figure it draws, so its Axes
# return value is not printed.
df_all['Solids'].plot.box()
plt.show()

In [None]:
# Box plot of Hardness. The call is made for the figure it draws, so its Axes
# return value is not printed.
df_all['Hardness'].plot.box()
plt.show()

In [None]:
# Box plot of Organic_carbon. The call is made for the figure it draws, so its
# Axes return value is not printed.
df_all['Organic_carbon'].plot.box()
plt.show()

In [None]:
# Box plot of Chloramines. The call is made for the figure it draws, so its
# Axes return value is not printed.
df_all['Chloramines'].plot.box()
plt.show()

In [None]:
# Box plot of Conductivity. The call is made for the figure it draws, so its
# Axes return value is not printed.
df_all['Conductivity'].plot.box()
plt.show()

In [None]:
# Box plot of Turbidity. The call is made for the figure it draws, so its Axes
# return value is not printed.
df_all['Turbidity'].plot.box()
plt.show()

# Conclusion

I hope these are useful for data analysis. Thank you for reading.