In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

from statistics import mean, variance

import matplotlib.pyplot as plt
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import plotly.graph_objs as go
import seaborn as sns
import torch
import torch.nn as nn
import torch.optim as optim
from plotly.offline import iplot
from scipy.stats import bartlett, ttest_ind
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from torch.utils.data import Dataset, DataLoader
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### About this notebook

This notebook is divided into two parts:
1. Statistical study of the variables: in this part we study the target variable (Potability) and how each feature is distributed in the potable and non-potable groups, checking whether the distributions differ between the two groups.
2. Predictive models: in this part we train several machine learning classifiers to predict whether a water sample is drinkable based on its characteristics (pH, sulfate, etc.). We also build a neural network with PyTorch.

In [None]:
# Load the water-potability CSV from the Kaggle input directory and preview the first rows.
data = pd.read_csv('/kaggle/input/water-potability/water_potability.csv')
data.head()

As we see in the last cell, there are some NaN values. We'll see more details in the next cell.


In [None]:
# Count the missing values in each column.
data.isna().sum()

There are a lot of NaN values.
To fix this issue we'll drop every row that has at least one NaN value.

In [None]:
# Keep only complete rows: any row containing at least one NaN is discarded.
data = data.dropna()

In [None]:
data.isna().sum()  # Check the per-column NaN counts again after dropping incomplete rows

### Part One: Statistical Study

In [None]:
# Take labels and values from the SAME value_counts() result so each slice is
# guaranteed to carry its own label. (unique() returns classes in order of
# appearance while value_counts() sorts by frequency, so mixing the two can
# swap the labels.)
potability_counts = data["Potability"].value_counts()
labels = potability_counts.index.tolist()
values = potability_counts.tolist()

dades = [go.Pie(labels=labels, values=values, textinfo='label+percent', hole=0.3, marker_colors=['blue', 'purple'])]

figure = go.Figure(dades)
figure.update_layout(title="Potability proportion", width=900, height=400)

iplot(figure)

We will study the two groups separately: the potable-water group (Potability = 1) and the non-potable-water group (Potability = 0).

In [None]:
# Split the samples by class: Potability == 1 versus Potability == 0.
potability_group = data[data["Potability"] == 1]
non_potability_group = data[data["Potability"] == 0]

In [None]:
# Histogram of ph for the potability group. The title names the group so this
# figure can be told apart from the non-potability one that follows.
fig, ax = plt.subplots()
n, bins, patches = ax.hist(potability_group["ph"], bins='auto', color='#0504aa',
                           alpha=0.7, rwidth=0.85)
ax.grid(axis='y', alpha=0.75)
ax.set_xlabel('Value')
ax.set_ylabel('Frequency')
ax.set_title('ph potability distribution')

In [None]:
# Histogram of ph for the non-potability group. The title names the group so
# this figure can be told apart from the potability one.
fig, ax = plt.subplots()
n, bins, patches = ax.hist(non_potability_group["ph"], bins='auto', color='#0504aa',
                           alpha=0.7, rwidth=0.85)
ax.grid(axis='y', alpha=0.75)
ax.set_xlabel('Value')
ax.set_ylabel('Frequency')
ax.set_title('ph non-potability distribution')

The pH distributions of the two groups look very similar. The means appear to be the same; the main difference is that the non-potability group seems to reach larger pH values.
Given this, we are going to study two things:
1. Are the means really equal in both groups? --> For this question we'll run a hypothesis test on the mean.
2. Are the variances equal? --> For this question we'll run a hypothesis test on the variance.

In [None]:
# Report the sample mean of ph for each group.
for caption, sample in (
    ("Sample mean for ph of potability group", potability_group["ph"]),
    ("Sample mean for ph of non-potability group", non_potability_group["ph"]),
):
    print(caption, mean(sample))

In [None]:
# H0: population mean of ph is equal in the potability and non-potability groups.
# H1: the population means differ.
#
# Parametric hypothesis test (two-sample t-test), 5% significance level.
# NOTE(review): ttest_ind assumes equal variances by default; consider
# equal_var=False (Welch) where the variance test rejects equality.
stat, p = ttest_ind(potability_group["ph"], non_potability_group["ph"])
print('stat=%.3f, p=%.3f' % (stat, p))
# The t-test compares means only, so the verdict is worded in terms of means.
print('Probably the same mean' if p > 0.05 else 'Probably different means')

We can't reject the null hypothesis, so we'll treat the means of both populations as equal. Next, we'll study whether the same holds for the variance.

In [None]:

# Report the sample variance of ph for each group.
for caption, sample in (
    ("Sample variance for ph of potability group", potability_group["ph"]),
    ("Sample variance for ph of non-potability group", non_potability_group["ph"]),
):
    print(caption, variance(sample))

In [None]:
# Bartlett's test for equal variances of ph between the two groups
# (H0: equal variances), 5% significance level.
# `bartlett` is imported with the other dependencies at the top of the notebook.
stat, p = bartlett(potability_group["ph"], non_potability_group["ph"])

print('stat=%.3f, p=%.3f' % (stat, p))
# Bartlett's test compares variances only, so the verdict is worded accordingly.
print('Probably the same variance' if p > 0.05 else 'Probably different variances')


We conclude that the variance of pH in potable water is not the same as the variance of pH in non-potable water.

In the next cells we'll repeat the same process that we followed for the ph variable.

In [None]:
# Histogram of Hardness for the potability group. The title names the group so
# this figure can be told apart from the non-potability one that follows.
fig, ax = plt.subplots()
n, bins, patches = ax.hist(potability_group["Hardness"], bins='auto', color='#0504aa',
                           alpha=0.7, rwidth=0.85)
ax.grid(axis='y', alpha=0.75)
ax.set_xlabel('Value')
ax.set_ylabel('Frequency')
ax.set_title('Hardness potability distribution')

In [None]:
# Histogram of Hardness for the non-potability group. The title names the group
# so this figure can be told apart from the potability one.
fig, ax = plt.subplots()
n, bins, patches = ax.hist(non_potability_group["Hardness"], bins='auto', color='#0504aa',
                           alpha=0.7, rwidth=0.85)
ax.grid(axis='y', alpha=0.75)
ax.set_xlabel('Value')
ax.set_ylabel('Frequency')
ax.set_title('Hardness non-potability distribution')

In [None]:
# Report the sample mean and sample variance of Hardness for each group.
for caption, summarise, sample in (
    ("Sample mean for Hardness of potability group", mean, potability_group["Hardness"]),
    ("Sample mean for Hardness of non-potability group", mean, non_potability_group["Hardness"]),
    ("Sample variance for Hardness of potability group", variance, potability_group["Hardness"]),
    ("Sample variance for Hardness of non-potability group", variance, non_potability_group["Hardness"]),
):
    print(caption, summarise(sample))

In [None]:
# Two-sample t-test on the Hardness means of the two groups (H0: equal means),
# 5% significance level.
stat, p = ttest_ind(potability_group["Hardness"], non_potability_group["Hardness"])
print('stat=%.3f, p=%.3f' % (stat, p))
# The t-test compares means only, so the verdict is worded in terms of means.
print('Probably the same mean' if p > 0.05 else 'Probably different means')

In [None]:
# Bartlett's test for equal variances of Hardness between the two groups
# (H0: equal variances), 5% significance level.
stat, p = bartlett(potability_group["Hardness"], non_potability_group["Hardness"])

print('stat=%.3f, p=%.3f' % (stat, p))
# Bartlett's test compares variances only, so the verdict is worded accordingly.
print('Probably the same variance' if p > 0.05 else 'Probably different variances')


Same mean, different variance.

In [None]:
# Histogram of Solids for the potability group.
fig, ax = plt.subplots()
n, bins, patches = ax.hist(potability_group["Solids"], bins='auto', color='#0504aa',
                           alpha=0.7, rwidth=0.85)
ax.grid(axis='y', alpha=0.75)
ax.set_xlabel('Value')
ax.set_ylabel('Frequency')
ax.set_title('Solids potability distribution')

In [None]:
# Histogram of Solids for the non-potability group.
fig, ax = plt.subplots()
n, bins, patches = ax.hist(non_potability_group["Solids"], bins='auto', color='#0504aa',
                           alpha=0.7, rwidth=0.85)
ax.grid(axis='y', alpha=0.75)
ax.set_xlabel('Value')
ax.set_ylabel('Frequency')
ax.set_title('Solids non_potability distribution')

In [None]:
# Report the sample mean and sample variance of Solids for each group.
for caption, summarise, sample in (
    ("Sample mean for Solids of potability group", mean, potability_group["Solids"]),
    ("Sample mean for Solids of non-potability group", mean, non_potability_group["Solids"]),
    ("Sample variance for Solids of potability group", variance, potability_group["Solids"]),
    ("Sample variance for Solids of non-potability group", variance, non_potability_group["Solids"]),
):
    print(caption, summarise(sample))

In [None]:
# Two-sample t-test on the Solids means of the two groups (H0: equal means),
# 5% significance level.
stat, p = ttest_ind(potability_group["Solids"], non_potability_group["Solids"])
print('stat=%.3f, p=%.3f' % (stat, p))
# The t-test compares means only, so the verdict is worded in terms of means.
print('Probably the same mean' if p > 0.05 else 'Probably different means')

In [None]:
# Bartlett's test for equal variances of Solids between the two groups
# (H0: equal variances), 5% significance level.
stat, p = bartlett(potability_group["Solids"], non_potability_group["Solids"])

print('stat=%.3f, p=%.3f' % (stat, p))
# Bartlett's test compares variances only, so the verdict is worded accordingly.
print('Probably the same variance' if p > 0.05 else 'Probably different variances')

For this variable (Solids), the distribution seems to be the same in both populations.

In [None]:
# Histogram of Chloramines for the potability group.
fig, ax = plt.subplots()
n, bins, patches = ax.hist(potability_group["Chloramines"], bins='auto', color='#0504aa',
                           alpha=0.7, rwidth=0.85)
ax.grid(axis='y', alpha=0.75)
ax.set_xlabel('Value')
ax.set_ylabel('Frequency')
ax.set_title('Chloramines potability distribution')

In [None]:
# Histogram of Chloramines for the non-potability group.
# The title previously said "potability" although the data plotted here is the
# non-potability group.
fig, ax = plt.subplots()
n, bins, patches = ax.hist(non_potability_group["Chloramines"], bins='auto', color='#0504aa',
                           alpha=0.7, rwidth=0.85)
ax.grid(axis='y', alpha=0.75)
ax.set_xlabel('Value')
ax.set_ylabel('Frequency')
ax.set_title('Chloramines non-potability distribution')

In [None]:
# Report the sample mean and sample variance of Chloramines for each group.
for caption, summarise, sample in (
    ("Sample mean for Chloramines of potability group", mean, potability_group["Chloramines"]),
    ("Sample mean for Chloramines of non-potability group", mean, non_potability_group["Chloramines"]),
    ("Sample variance for Chloramines of potability group", variance, potability_group["Chloramines"]),
    ("Sample variance for Chloramines of non-potability group", variance, non_potability_group["Chloramines"]),
):
    print(caption, summarise(sample))

In [None]:
# Two-sample t-test on the Chloramines means of the two groups (H0: equal means),
# 5% significance level.
stat, p = ttest_ind(potability_group["Chloramines"], non_potability_group["Chloramines"])
print('stat=%.3f, p=%.3f' % (stat, p))
# The t-test compares means only, so the verdict is worded in terms of means.
print('Probably the same mean' if p > 0.05 else 'Probably different means')

In [None]:
# Bartlett's test for equal variances of Chloramines between the two groups
# (H0: equal variances), 5% significance level.
stat, p = bartlett(potability_group["Chloramines"], non_potability_group["Chloramines"])

print('stat=%.3f, p=%.3f' % (stat, p))
# Bartlett's test compares variances only, so the verdict is worded accordingly.
print('Probably the same variance' if p > 0.05 else 'Probably different variances')

In [None]:

# Histogram of Sulfate for the potability group.
fig, ax = plt.subplots()
n, bins, patches = ax.hist(potability_group["Sulfate"], bins='auto', color='#0504aa',
                           alpha=0.7, rwidth=0.85)
ax.grid(axis='y', alpha=0.75)
ax.set_xlabel('Value')
ax.set_ylabel('Frequency')
ax.set_title('Sulfate potability distribution')


In [None]:
# Histogram of Sulfate for the non-potability group.
# The title previously said "potability" although the data plotted here is the
# non-potability group.
fig, ax = plt.subplots()
n, bins, patches = ax.hist(non_potability_group["Sulfate"], bins='auto', color='#0504aa',
                           alpha=0.7, rwidth=0.85)
ax.grid(axis='y', alpha=0.75)
ax.set_xlabel('Value')
ax.set_ylabel('Frequency')
ax.set_title('Sulfate non-potability distribution')

In [None]:
# Report the sample mean and sample variance of Sulfate for each group.
for caption, summarise, sample in (
    ("Sample mean for Sulfate of potability group", mean, potability_group["Sulfate"]),
    ("Sample mean for Sulfate of non-potability group", mean, non_potability_group["Sulfate"]),
    ("Sample variance for Sulfate of potability group", variance, potability_group["Sulfate"]),
    ("Sample variance for Sulfate of non-potability group", variance, non_potability_group["Sulfate"]),
):
    print(caption, summarise(sample))


In [None]:
# Two-sample t-test on the Sulfate means of the two groups (H0: equal means),
# 5% significance level.
stat, p = ttest_ind(potability_group["Sulfate"], non_potability_group["Sulfate"])
print('stat=%.3f, p=%.3f' % (stat, p))
# The t-test compares means only, so the verdict is worded in terms of means.
print('Probably the same mean' if p > 0.05 else 'Probably different means')

In [None]:
# Bartlett's test for equal variances of Sulfate between the two groups
# (H0: equal variances), 5% significance level.
stat, p = bartlett(potability_group["Sulfate"], non_potability_group["Sulfate"])
print('stat=%.3f, p=%.3f' % (stat, p))
# Bartlett's test compares variances only, so the verdict is worded accordingly.
print('Probably the same variance' if p > 0.05 else 'Probably different variances')

In [None]:
# Histogram of Conductivity for the potability group.
fig, ax = plt.subplots()
n, bins, patches = ax.hist(potability_group["Conductivity"], bins='auto', color='#0504aa',
                           alpha=0.7, rwidth=0.85)
ax.grid(axis='y', alpha=0.75)
ax.set_xlabel('Value')
ax.set_ylabel('Frequency')
ax.set_title('Conductivity potability distribution')

In [None]:
# Histogram of Conductivity for the non-potability group.
fig, ax = plt.subplots()
n, bins, patches = ax.hist(non_potability_group["Conductivity"], bins='auto', color='#0504aa',
                           alpha=0.7, rwidth=0.85)
ax.grid(axis='y', alpha=0.75)
ax.set_xlabel('Value')
ax.set_ylabel('Frequency')
ax.set_title('Conductivity non-potability distribution')

In [None]:
# Report the sample mean and sample variance of Conductivity for each group.
for caption, summarise, sample in (
    ("Sample mean for Conductivity of potability group", mean, potability_group["Conductivity"]),
    ("Sample mean for Conductivity of non-potability group", mean, non_potability_group["Conductivity"]),
    ("Sample variance for Conductivity of potability group", variance, potability_group["Conductivity"]),
    ("Sample variance for Conductivity of non-potability group", variance, non_potability_group["Conductivity"]),
):
    print(caption, summarise(sample))


In [None]:
# Two-sample t-test on the Conductivity means of the two groups (H0: equal means),
# 5% significance level.
stat, p = ttest_ind(potability_group["Conductivity"], non_potability_group["Conductivity"])
print('stat=%.3f, p=%.3f' % (stat, p))
# The t-test compares means only, so the verdict is worded in terms of means.
print('Probably the same mean' if p > 0.05 else 'Probably different means')

In [None]:
# Bartlett's test for equal variances of Conductivity between the two groups
# (H0: equal variances), 5% significance level.
stat, p = bartlett(potability_group["Conductivity"], non_potability_group["Conductivity"])
print('stat=%.3f, p=%.3f' % (stat, p))
# Bartlett's test compares variances only, so the verdict is worded accordingly.
print('Probably the same variance' if p > 0.05 else 'Probably different variances')

In [None]:
# Histogram of Organic_carbon for the potability group.
fig, ax = plt.subplots()
n, bins, patches = ax.hist(potability_group["Organic_carbon"], bins='auto', color='#0504aa',
                           alpha=0.7, rwidth=0.85)
ax.grid(axis='y', alpha=0.75)
ax.set_xlabel('Value')
ax.set_ylabel('Frequency')
ax.set_title('Organic_carbon potability distribution')

In [None]:
# Histogram of Organic_carbon for the non-potability group.
# The title previously said "potability" although the data plotted here is the
# non-potability group.
fig, ax = plt.subplots()
n, bins, patches = ax.hist(non_potability_group["Organic_carbon"], bins='auto', color='#0504aa',
                           alpha=0.7, rwidth=0.85)
ax.grid(axis='y', alpha=0.75)
ax.set_xlabel('Value')
ax.set_ylabel('Frequency')
ax.set_title('Organic_carbon non-potability distribution')

In [None]:
# Report the sample mean and sample variance of Organic_carbon for each group.
for caption, summarise, sample in (
    ("Sample mean for Organic_carbon of potability group", mean, potability_group["Organic_carbon"]),
    ("Sample mean for Organic_carbon of non-potability group", mean, non_potability_group["Organic_carbon"]),
    ("Sample variance for Organic_carbon of potability group", variance, potability_group["Organic_carbon"]),
    ("Sample variance for Organic_carbon of non-potability group", variance, non_potability_group["Organic_carbon"]),
):
    print(caption, summarise(sample))


In [None]:
# Two-sample t-test on the Organic_carbon means of the two groups (H0: equal means),
# 5% significance level.
stat, p = ttest_ind(potability_group["Organic_carbon"], non_potability_group["Organic_carbon"])
print('stat=%.3f, p=%.3f' % (stat, p))
# The t-test compares means only, so the verdict is worded in terms of means.
print('Probably the same mean' if p > 0.05 else 'Probably different means')

In [None]:
# Bartlett's test for equal variances of Organic_carbon between the two groups
# (H0: equal variances), 5% significance level.
stat, p = bartlett(potability_group["Organic_carbon"], non_potability_group["Organic_carbon"])
print('stat=%.3f, p=%.3f' % (stat, p))
# Bartlett's test compares variances only, so the verdict is worded accordingly.
print('Probably the same variance' if p > 0.05 else 'Probably different variances')

Same mean and variance

In [None]:
# Histogram of Trihalomethanes for the potability group.
fig, ax = plt.subplots()
n, bins, patches = ax.hist(potability_group["Trihalomethanes"], bins='auto', color='#0504aa',
                           alpha=0.7, rwidth=0.85)
ax.grid(axis='y', alpha=0.75)
ax.set_xlabel('Value')
ax.set_ylabel('Frequency')
ax.set_title('Trihalomethanes potability distribution')



In [None]:
# Histogram of Trihalomethanes for the non-potability group.
# The title previously said "potability" although the data plotted here is the
# non-potability group.
fig, ax = plt.subplots()
n, bins, patches = ax.hist(non_potability_group["Trihalomethanes"], bins='auto', color='#0504aa',
                           alpha=0.7, rwidth=0.85)
ax.grid(axis='y', alpha=0.75)
ax.set_xlabel('Value')
ax.set_ylabel('Frequency')
ax.set_title('Trihalomethanes non-potability distribution')


In [None]:
# Report the sample mean and sample variance of Trihalomethanes for each group.
for caption, summarise, sample in (
    ("Sample mean for Trihalomethanes of potability group", mean, potability_group["Trihalomethanes"]),
    ("Sample mean for Trihalomethanes of non-potability group", mean, non_potability_group["Trihalomethanes"]),
    ("Sample variance for Trihalomethanes of potability group", variance, potability_group["Trihalomethanes"]),
    ("Sample variance for Trihalomethanes of non-potability group", variance, non_potability_group["Trihalomethanes"]),
):
    print(caption, summarise(sample))


In [None]:
# Two-sample t-test on the Trihalomethanes means of the two groups (H0: equal means),
# 5% significance level.
stat, p = ttest_ind(potability_group["Trihalomethanes"], non_potability_group["Trihalomethanes"])
print('stat=%.3f, p=%.3f' % (stat, p))
# The t-test compares means only, so the verdict is worded in terms of means.
print('Probably the same mean' if p > 0.05 else 'Probably different means')

In [None]:
# Bartlett's test for equal variances of Trihalomethanes between the two groups
# (H0: equal variances), 5% significance level.
stat, p = bartlett(potability_group["Trihalomethanes"], non_potability_group["Trihalomethanes"])
print('stat=%.3f, p=%.3f' % (stat, p))
# Bartlett's test compares variances only, so the verdict is worded accordingly.
print('Probably the same variance' if p > 0.05 else 'Probably different variances')

Same mean and variance

In [None]:
# Histogram of Turbidity for the potability group.
fig, ax = plt.subplots()
n, bins, patches = ax.hist(potability_group["Turbidity"], bins='auto', color='#0504aa',
                           alpha=0.7, rwidth=0.85)
ax.grid(axis='y', alpha=0.75)
ax.set_xlabel('Value')
ax.set_ylabel('Frequency')
ax.set_title('Turbidity potability distribution')

In [None]:
# Histogram of Turbidity for the non-potability group.
# The title previously said "potability" although the data plotted here is the
# non-potability group.
fig, ax = plt.subplots()
n, bins, patches = ax.hist(non_potability_group["Turbidity"], bins='auto', color='#0504aa',
                           alpha=0.7, rwidth=0.85)
ax.grid(axis='y', alpha=0.75)
ax.set_xlabel('Value')
ax.set_ylabel('Frequency')
ax.set_title('Turbidity non-potability distribution')

In [None]:
# Report the sample mean and sample variance of Turbidity for each group.
for caption, summarise, sample in (
    ("Sample mean for Turbidity of potability group", mean, potability_group["Turbidity"]),
    ("Sample mean for Turbidity of non-potability group", mean, non_potability_group["Turbidity"]),
    ("Sample variance for Turbidity of potability group", variance, potability_group["Turbidity"]),
    ("Sample variance for Turbidity of non-potability group", variance, non_potability_group["Turbidity"]),
):
    print(caption, summarise(sample))


In [None]:
# Two-sample t-test on the Turbidity means of the two groups (H0: equal means),
# 5% significance level.
stat, p = ttest_ind(potability_group["Turbidity"], non_potability_group["Turbidity"])
print('stat=%.3f, p=%.3f' % (stat, p))
# The t-test compares means only, so the verdict is worded in terms of means.
print('Probably the same mean' if p > 0.05 else 'Probably different means')

In [None]:
# Bartlett's test for equal variances of Turbidity between the two groups
# (H0: equal variances), 5% significance level.
stat, p = bartlett(potability_group["Turbidity"], non_potability_group["Turbidity"])
print('stat=%.3f, p=%.3f' % (stat, p))
# Bartlett's test compares variances only, so the verdict is worded accordingly.
print('Probably the same variance' if p > 0.05 else 'Probably different variances')

Same mean and different variance

SUMMARY:
- ph: Same mean and different variance between the two groups
- Hardness: Same mean and different variance between the two groups
- Solids: Same mean and variance
- Chloramines: Same mean and different variance
- Sulfate: Same mean and different variance
- Conductivity: Same mean and variance
- Organic_carbon: Same mean and variance
- Trihalomethanes: Same mean and variance
- Turbidity: Same mean and different variance

### Part 2: Machine Learning Models

In [None]:
# Pairwise correlation matrix of all columns, drawn as an annotated heatmap.
correlation = data.corr()

fig, ax = plt.subplots(figsize=(35, 35))
ax = sns.heatmap(correlation, annot=True, linewidths=.5, ax=ax)

In [None]:
# Correlation of every column with Potability, sorted in ascending order.
data.corr()['Potability'].sort_values()

We can see that no variable has a significant correlation with the target variable.

Predictive models: predicting the potability of the water

In [None]:

# Feature names. Only the normalisation experiment that used to sit in this
# cell referenced this list; it is kept in case other code still relies on it.
var = ["Organic_carbon","Conductivity","Sulfate","Hardness","Trihalomethanes","ph","Chloramines",
"Turbidity","Solids"]

# Fixed seed so the split, and every accuracy reported from it, is reproducible
# under Restart & Run All.
SEED = 42

# Select features/target by name rather than by position.
X = data.drop(columns="Potability")
Y = data["Potability"]

# Stratify on the target so both splits keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.33, random_state=SEED, stratify=Y
)
# The disabled normalisation code (a bare triple-quoted string) was removed:
# as the last expression of the cell it was echoed as cell output.

Models to test: Logistic Regression, SVM, KNN

In [None]:
# Logistic regression baseline. The features are standardised inside the
# pipeline so the solver is not dominated by the columns with the largest raw
# scale; the scaler is fitted on the training split only.
model = make_pipeline(StandardScaler(), LogisticRegression())
model.fit(X_train, y_train)
acc = accuracy_score(y_test, model.predict(X_test))
print(acc)

SVM with three different kernels: linear, rbf, and poly (with degree = 2)

In [None]:
# Linear-kernel SVM. SVMs are sensitive to feature scale, so the features are
# standardised inside the pipeline (scaler fitted on the training split only).
model = make_pipeline(StandardScaler(), SVC(kernel='linear'))
model.fit(X_train, y_train)
acc = accuracy_score(y_test, model.predict(X_test))
print(acc)

In [None]:
# RBF-kernel SVM. SVMs are sensitive to feature scale, so the features are
# standardised inside the pipeline (scaler fitted on the training split only).
model = make_pipeline(StandardScaler(), SVC(kernel='rbf'))
model.fit(X_train, y_train)
acc = accuracy_score(y_test, model.predict(X_test))
print(acc)

In [None]:
# Degree-2 polynomial-kernel SVM. SVMs are sensitive to feature scale, so the
# features are standardised inside the pipeline (scaler fitted on the training
# split only).
model = make_pipeline(StandardScaler(), SVC(kernel='poly', degree=2))
model.fit(X_train, y_train)
acc = accuracy_score(y_test, model.predict(X_test))
print(acc)

For the KNN models we're going to try different values of the n_neighbors parameter.

In [None]:
# KNN with 3 neighbours. Distances are computed on standardised features so
# that no single large-scale column dominates the neighbour search.
model = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=3))
model.fit(X_train, y_train)
acc = accuracy_score(y_test, model.predict(X_test))
print(acc)

In [None]:
# KNN with 5 neighbours. Distances are computed on standardised features so
# that no single large-scale column dominates the neighbour search.
model = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=5))
model.fit(X_train, y_train)
acc = accuracy_score(y_test, model.predict(X_test))
print(acc)

In [None]:
# KNN with 7 neighbours. Distances are computed on standardised features so
# that no single large-scale column dominates the neighbour search.
model = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=7))
model.fit(X_train, y_train)
acc = accuracy_score(y_test, model.predict(X_test))
print(acc)

Finally, we'll implement a neural network with PyTorch for this classification problem.

### Neural Network

In [None]:
# Prefer the first CUDA device when one is available, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print("Using: ",device)

In [None]:
# Data standardisation: the scaler learns its statistics from the training
# split only and the same transformation is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# Number of samples per mini-batch.
batch_size = 64

In [None]:
# Turn the target Series into single-column DataFrames so `.values` yields a
# 2-D (n_samples, 1) array. The isinstance guards make the cell safe to re-run:
# a DataFrame has no `to_frame`, so an unguarded second run would raise.
if isinstance(y_train, pd.Series):
    y_train = y_train.to_frame()
if isinstance(y_test, pd.Series):
    y_test = y_test.to_frame()

In [None]:
class trainData(Dataset):
    """Map-style dataset that pairs each feature row with its target.

    Args:
        X_data: Indexable collection of feature rows.
        y_data: Indexable collection of targets, aligned row-by-row with ``X_data``.

    Raises:
        ValueError: If ``X_data`` and ``y_data`` do not have the same length.
    """

    def __init__(self, X_data, y_data):
        # Fail fast: with mismatched lengths the dataset would silently pair
        # rows with the wrong targets or ignore the surplus targets.
        if len(X_data) != len(y_data):
            raise ValueError(
                f"X_data and y_data must have the same length, "
                f"got {len(X_data)} and {len(y_data)}"
            )
        self.X_data = X_data
        self.y_data = y_data

    def __getitem__(self, index):
        """Return the ``(features, target)`` pair stored at ``index``."""
        return self.X_data[index], self.y_data[index]

    def __len__ (self):
        """Return the number of samples."""
        return len(self.X_data)


# Wrap the training features and targets as float32 tensors.
train_data = trainData(
    torch.tensor(X_train, dtype=torch.float32),
    torch.tensor(y_train.to_numpy(), dtype=torch.float32),
)
## test data    
class testData(Dataset):
    """Map-style dataset that serves feature rows only (no targets)."""

    def __init__(self, X_data):
        """Store the indexable collection of feature rows."""
        self.X_data = X_data

    def __len__ (self):
        """Return the number of stored rows."""
        return len(self.X_data)

    def __getitem__(self, index):
        """Return the feature row stored at ``index``."""
        row = self.X_data[index]
        return row
    

# Wrap the test features as a float32 tensor.
test_data = testData(torch.tensor(X_test, dtype=torch.float32))

In [None]:
# Training batches are reshuffled every epoch; the test set is served one
# sample per batch without shuffling.
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_data, batch_size=1)

In [None]:
class SimpleMLP(nn.Module):
    """Fully connected network with two hidden ReLU layers and a linear output.

    Args:
        inp_dim: Number of input features.
        layer1_dim: Width of the first hidden layer.
        layer2_dim: Width of the second hidden layer.
        output_dim: Number of output units (raw logits).
    """

    def __init__(self,inp_dim,layer1_dim, layer2_dim,output_dim):
        super().__init__()
        self.fc1 = nn.Linear(inp_dim, layer1_dim)
        self.fc2 = nn.Linear(layer1_dim,layer2_dim)
        self.fc3 = nn.Linear(layer2_dim, output_dim)

    def inicializate(self):
        """Re-initialise every layer: Xavier-uniform weights and zero biases.

        This is opt-in; the constructor does not call it.
        """
        for layer in (self.fc1, self.fc2, self.fc3):
            nn.init.xavier_uniform_(layer.weight)
            nn.init.zeros_(layer.bias)

    def forward(self, x):
        """Return the raw (unbounded) output logits for ``x``.

        No activation is applied to the last layer. A loss such as
        ``BCEWithLogitsLoss`` applies the sigmoid itself, and a ReLU here would
        clamp every logit to >= 0, so the sigmoid could never drop below 0.5.
        """
        out = torch.relu(self.fc1(x))
        out = torch.relu(self.fc2(out))
        return self.fc3(out)

In [None]:
# Seed PyTorch so weight initialisation (and later batch shuffling) is
# reproducible under Restart & Run All.
torch.manual_seed(42)

input_size = 9 #9 entry variables
layer1_dim, layer2_dim = 256,128
output_size = 1
# Move the network to `device`: the training and evaluation loops send every
# batch there, so a model left on the CPU would fail on a GPU machine.
model = SimpleMLP(input_size,layer1_dim, layer2_dim, output_size).to(device)
print(model)

In [None]:
# Optimisation setup: Adam on all model parameters, with a binary
# cross-entropy loss that takes raw logits.
learning_rate = 1e-5
epochs = 2500
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)

In [None]:
def binary_acc(y_pred, y_test):
    """Return the batch accuracy as a percentage rounded to the nearest integer.

    Args:
        y_pred: Raw logits; they are passed through a sigmoid and rounded to 0/1.
        y_test: Ground-truth labels with the same shape as ``y_pred``.

    Returns:
        A scalar tensor holding the rounded percentage of matching predictions.
    """
    predicted_labels = torch.sigmoid(y_pred).round()
    n_correct = predicted_labels.eq(y_test).sum().float()
    return torch.round(n_correct / y_test.shape[0] * 100)

In [None]:
# Mini-batch training: one optimiser step per batch; the mean batch loss and
# mean batch accuracy are printed after every epoch.
model.train()
for e in range(1, epochs+1):
    epoch_loss, epoch_acc = 0, 0
    for features, targets in train_loader:
        features = features.to(device)
        targets = targets.to(device)

        optimizer.zero_grad()
        logits = model(features)
        loss = criterion(logits, targets)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += binary_acc(logits, targets).item()

    print(f'Epoch {e+0:03}: | Loss: {epoch_loss/len(train_loader):.5f} | Acc: {epoch_acc/len(train_loader):.3f}')

In [None]:
# Predict a 0/1 label for every batch of the test loader: sigmoid of the model
# output, rounded, collected on the CPU and finally squeezed into plain Python
# values.
model.eval()
batch_predictions = []
with torch.no_grad():
    for features in test_loader:
        probabilities = torch.sigmoid(model(features.to(device)))
        batch_predictions.append(torch.round(probabilities).cpu().numpy())

y_pred_list = [prediction.squeeze().tolist() for prediction in batch_predictions]

In [None]:
# Test accuracy: fraction of rows whose "Potability" label equals the
# prediction stored at the same position in y_pred_list.
size = len(y_test)
true_labels = y_test["Potability"].to_numpy()
s = int((true_labels == np.asarray(y_pred_list)).sum())

test_accuracy = s/size
print(test_accuracy)
