In [None]:
# Open the Cell menu and choose "Run All" to execute every cell and train the model.


from __future__ import absolute_import, division, print_function, unicode_literals

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import clear_output
from six.moves import urllib

import tensorflow.compat.v2.feature_column as fc

import tensorflow as tf

In [None]:
# Training split: read the CSV, then detach the 'Potability' column as the label series.
dftrain = pd.read_csv('water_train.csv')
y_train = dftrain.pop('Potability')

# Evaluation (test) split: same treatment.
dfeval = pd.read_csv('water_test.csv')
y_eval = dfeval.pop('Potability')

# Show the extracted label series: training labels first, then evaluation labels.
print(y_train, y_eval, sep='\n')

In [None]:
# Replace missing (NaN) measurements in both splits.
#
# Filling with a constant 0 injects physically implausible readings (e.g. a pH
# of 0) that distort the feature distributions the model learns from. Instead,
# use each column's median computed on the TRAINING split only, and apply the
# same values to the evaluation split so no evaluation statistics leak into
# preprocessing.
train_medians = dftrain.median(numeric_only=True)
dftrain.fillna(train_medians, inplace=True)
dfeval.fillna(train_medians, inplace=True)

# Fallback: anything still missing (a column with no usable training median)
# is filled with 0, so the frames are guaranteed NaN-free as before.
dftrain.fillna(0, inplace=True)
dfeval.fillna(0, inplace=True)

In [None]:
dftrain.head() # preview the first few rows (not columns) of the training features

In [None]:
dftrain.describe()  # summary statistics for the columns of the training features

In [None]:
dftrain.shape # (rows, columns) of the training feature frame

In [None]:
# Histogram (20 bins) of the pH readings across the training samples.
dftrain["ph"].hist(bins=20)

In [None]:
# Names of the numeric measurement columns that are fed to the model.
NUMERIC_COLUMNS = [
    "ph",
    "Hardness",
    "Solids",
    "Chloramines",
    "Sulfate",
    "Conductivity",
    "Organic_carbon",
    "Trihalomethanes",
    "Turbidity",
]

# One float32 numeric feature column per measurement, in the order listed above.
feature_columns = [
    tf.feature_column.numeric_column(column_name, dtype=tf.float32)
    for column_name in NUMERIC_COLUMNS
]


In [None]:
# Inspect the feature-column objects that were built for the model.
print(feature_columns)

In [None]:
def make_input_fn(data_df, label_df, num_epochs=250, shuffle=True, batch_size=128):
  """Build a zero-argument input function that yields a tf.data.Dataset.

  Args:
    data_df: feature table; it is converted with dict(...) so that each column
      becomes one named feature.
    label_df: labels, one per row of data_df.
    num_epochs: how many times the batched dataset is repeated.
    shuffle: when True, the rows are shuffled before batching.
    batch_size: number of rows per batch.

  Returns:
    A function taking no arguments that builds and returns the dataset
    (shuffled if requested, then batched, then repeated num_epochs times).
  """
  def input_function():  # inner function, this will be returned
    # Pair the feature dict with its labels, one dataset element per row.
    ds = tf.data.Dataset.from_tensor_slices((dict(data_df), label_df))
    if shuffle:
      # A shuffle buffer smaller than the dataset only mixes rows locally, so
      # size it to cover every row; never go below the previous fixed 1000.
      ds = ds.shuffle(max(len(data_df), 1000))
    # Split into batches of batch_size rows and repeat the pass num_epochs times.
    ds = ds.batch(batch_size).repeat(num_epochs)
    return ds  # the complete batched, repeated dataset
  return input_function  # return a function object for use

# Training input: built with the factory's default epoch, shuffle and batch settings.
train_input_fn = make_input_fn(data_df=dftrain, label_df=y_train)
# Evaluation input: a single unshuffled pass, so outputs stay in dfeval's row order.
eval_input_fn = make_input_fn(data_df=dfeval, label_df=y_eval, num_epochs=1, shuffle=False)


In [None]:
# Linear classifier estimator built over the numeric feature columns.
# NOTE(review): tf.estimator / tf.feature_column appear to be deprecated in
# recent TensorFlow releases - confirm the installed version still ships them.
linearest = tf.estimator.LinearClassifier(
    feature_columns=feature_columns,
)

In [None]:
# Fit the classifier on the training input, then score it on the evaluation input.
linearest.train(input_fn=train_input_fn)
result = linearest.evaluate(input_fn=eval_input_fn)  # metrics computed on the test data

# Wipe the accumulated cell output before reporting the headline number.
clear_output()
print(100 * result['accuracy'], "%")  # accuracy expressed as a percentage

In [None]:
print(result) # every evaluation metric returned by linearest.evaluate, not just accuracy

In [None]:
# Collect the evaluation-set predictions into a list so a sample can be looked up by position.
reslt = list(linearest.predict(input_fn=eval_input_fn))

In [None]:
# Ask which evaluation sample to inspect. The code is the sample's row position
# in the evaluation set, so valid values run from 0 to len(dfeval) - 1.
x = int(input("Enter the code value of the sample : "))  # You can interact with the model here

# Reject out-of-range codes immediately. Otherwise a negative or too-large
# value only fails (or silently picks mismatched rows) in the next cell.
if not 0 <= x < len(dfeval):
  raise ValueError(
      "Sample code must be between 0 and {}, got {}".format(len(dfeval) - 1, x))

# Press Enter after typing the value.

In [None]:
# Run this cell after entering a sample code in the previous cell.

# Predicted probability of class 1 for the chosen sample, shown as a percentage.
print("The predicted potability of the sample is :")
print(reslt[x]["probabilities"][1] * 100, "%")

print("The data assosciated with the sample : ")

# `reslt` is a list indexed by position, so look the sample up by position
# (.iloc) as well; a label lookup (.loc) only agrees while the frame keeps its
# default 0..n-1 index.
print(dfeval.iloc[x])

print("The actual potability(0(Which means the sample is unsuitable for drinking or 1(The sample is suitable for drinking)): ")
print(y_eval.iloc[x])

In [None]:
# Note: the model is still at an early stage, so its predictions may be inaccurate.