In [None]:
import pandas as pd
import numpy as np
# Load the water-potability CSV into a DataFrame.
# NOTE(review): the relative '../input/...' path looks like a Kaggle input layout — adjust when running elsewhere.
water_df = pd.read_csv('../input/water-potability/water_potability.csv')

In [None]:
import warnings
warnings.filterwarnings("ignore") # ignoring some non harmful warnings
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import random
from pprint import pprint
from sklearn.tree import DecisionTreeClassifier
import plotly.express as px



In [None]:
# Preview the first rows of the freshly loaded data (before any cleaning).
water_df.head()

In [None]:
# Count the missing values in every column and print only the columns that have at least one.

m_value = water_df.isnull().sum()
print(m_value[m_value > 0])

In [None]:
# Bar chart of how many rows fall into each "Potability" class (a quick class-balance check).
sns.countplot(x="Potability", data=water_df)

In [None]:
# Pairwise scatter plots of four features, coloured by Potability, to see
# whether any pair of features visually separates the two classes.
fig = px.scatter_matrix(water_df,
                       dimensions=['ph', "Hardness", "Solids", "Turbidity"],
                       color="Potability")
fig.show()

In [None]:
# Histogram over Hardness bins, coloured by Potability, with a rug plot in the margin.
# NOTE(review): because y="Turbidity" is passed, the bar heights aggregate Turbidity per
# Hardness bin instead of counting rows (plotly's default aggregation with y set is
# believed to be a sum — confirm against the plotly.express.histogram `histfunc` docs).
fig = px.histogram(water_df, x="Hardness", y="Turbidity", color="Potability", marginal="rug",
                   hover_data=water_df.columns)
fig.show()

In [None]:
# Drop every ROW (axis=0) that has a missing value in any column (how='any').
# NOTE: inplace=True mutates water_df, so the unfiltered frame is gone after this cell runs.

water_df.dropna(axis=0, how='any', inplace=True)

In [None]:
# Re-run the missing-value count after dropna; an empty Series here means no nulls remain.
m_value = water_df.isnull().sum()
print(m_value[m_value >0])

In [None]:
# Preview the first rows again, now that rows with missing values have been dropped.
water_df.head()

In [None]:
# (rows, columns) remaining after the rows with missing values were dropped.
water_df.shape

In [None]:
# Giving each class a meaningful name: 0 -> 'No', 1 -> 'Yes' (potable).
# The result is assigned back to the column instead of calling
# water_df['Potability'].replace(..., inplace=True): that chained in-place call
# operates on an intermediate Series, raises a FutureWarning in pandas 2.x and
# leaves water_df unchanged once Copy-on-Write is active (default in pandas 3).
# Re-running this cell is harmless because 'No'/'Yes' are not keys of the mapping.
water_df['Potability'] = water_df['Potability'].replace({0.0: 'No', 1.0: 'Yes'})

In [None]:
# define our train test split data

def train_test_split(water_df, test_size):
    """Randomly partition a DataFrame into a training part and a test part.

    Parameters
    ----------
    water_df : pandas.DataFrame
        Frame to split; rows are selected by index label.
    test_size : float or int
        A float is read as the fraction of rows to hold out and is converted
        to a whole row count with ``round``; any other value is handed to
        ``random.sample`` unchanged as the number of rows to draw.

    Returns
    -------
    (train_df, test_df)
        ``test_df`` holds the sampled rows and ``train_df`` is ``water_df``
        with those index labels dropped.

    Sampling uses the module-level ``random`` generator, so call
    ``random.seed`` beforehand to get a reproducible split.
    """
    n_test_rows = test_size
    if isinstance(n_test_rows, float):
        # a proportion was given: turn it into a whole number of rows
        n_test_rows = round(n_test_rows * len(water_df))

    # draw the held-out index labels without replacement
    held_out_labels = random.sample(water_df.index.tolist(), n_test_rows)

    held_out_rows = water_df.loc[held_out_labels]
    remaining_rows = water_df.drop(held_out_labels)
    return remaining_rows, held_out_rows

# Seed the stdlib RNG so random.sample inside train_test_split draws the same rows on every run.
random.seed(0)

# Hold out 20% of the cleaned rows as the test set.
train_df, test_df = train_test_split(water_df, test_size=0.2)
        

In [None]:
# NumPy array form of the training frame. The helper functions below index it
# positionally and read the label from the last column. `data` is reused by later cells.
data = train_df.values
data[:5]

In [None]:
# checking for impurity

def check_purity(data):
    """Tell whether every row of ``data`` carries the same label.

    The label is read from the last column of the 2-D array ``data``.
    Returns True when exactly one distinct label is present and False
    otherwise (including when ``data`` has no rows).
    """
    distinct_labels = np.unique(data[:, -1])
    return len(distinct_labels) == 1

In [None]:
# Try check_purity on the training rows with Hardness < 188: prints True only if
# all of those rows share one Potability label.
print("Your test result is ", check_purity(train_df[train_df.Hardness < 188].values))

In [None]:
# classifying our data
def classify_data(data):
    """Return the majority label of ``data``.

    Labels are taken from the last column of the 2-D array. When several
    labels share the highest count, the first of them in ``np.unique``'s
    sorted order is returned (``argmax`` picks the first maximum).
    """
    labels, label_counts = np.unique(data[:, -1], return_counts=True)
    most_common_position = label_counts.argmax()
    return labels[most_common_position]
    

In [None]:
# Try classify_data on a subset of the training rows (Hardness > 188 and Turbidity < 6):
# prints the most frequent Potability label within that subset.
print("Your classified class is ", classify_data(train_df[(train_df.Hardness > 188) & (train_df.Turbidity < 6)].values))

In [None]:
# defining our best splits
def get_potential_splits(data):
    """Collect candidate split thresholds for every feature column.

    Every column except the last one (the label) is treated as a feature.
    For each feature the distinct values are sorted with ``np.unique`` and
    the midpoint between each pair of neighbouring values becomes a
    candidate threshold.

    Returns a dict mapping column index -> list of midpoints; the list is
    empty for a column with fewer than two distinct values.
    """
    _, total_columns = data.shape  # only the column count is needed

    potential_splits = {}
    for column_index in range(total_columns - 1):  # skip the label column
        sorted_values = np.unique(data[:, column_index])
        # pair each value with its predecessor and take their midpoint
        potential_splits[column_index] = [
            (upper + lower) / 2
            for lower, upper in zip(sorted_values[:-1], sorted_values[1:])
        ]
    return potential_splits

In [None]:
# Candidate thresholds for every feature of the full training set (dict: column index -> list of midpoints).
potential_splits = get_potential_splits(train_df.values)

In [None]:
# pprint(potential_splits)

In [None]:
# Hardness vs. Turbidity for the training rows, one colour (and one fitted regression line) per Potability class.
sns.lmplot(data=train_df, x="Hardness", y= "Turbidity", hue="Potability", height=8
           , aspect=2)

In [None]:
# Draw one horizontal line per candidate threshold of feature column 7, i.e. every
# midpoint between two neighbouring distinct values of that feature.
# NOTE(review): the column index 7 and the 0-15 x-range are hard-coded; confirm which
# feature index 7 refers to in train_df and whether the x-range is meaningful.
plt.figure(figsize=(20,15))
plt.hlines(y=potential_splits[7], xmin=0, xmax=15)

plt.show()

In [None]:
def split_data(data, split_column, split_value):
    """Partition the rows of ``data`` around a threshold on one column.

    ``split_column`` is the index of the column to split on (it decides
    whether we split vertically or horizontally in a two-feature plot) and
    ``split_value`` is the point (value) at which we split.

    Returns ``(data_below, data_above)``: the rows whose value in that column
    is ``<= split_value`` and the rows whose value is ``> split_value``.
    """
    column_values = data[:, split_column]

    # two explicit masks, one for each side of the threshold
    at_or_below = column_values <= split_value
    strictly_above = column_values > split_value

    return data[at_or_below], data[strictly_above]

# Try split_data on the training array: split on column index 7 at the value 40.
# data_below / data_above created here are reused by the entropy cells further down.

split_column  = 7
split_value = 40
data_below, data_above = split_data(data, split_column, split_value)

# Pretty-print both halves (this repeats the same split and prints the full arrays).

pprint(split_data(data, split_column, split_value))



In [None]:
# computing the entropy
def calculate_entropy(data):
    """Shannon entropy (base 2) of the labels in ``data``.

    Labels are read from the last column of the 2-D array. Each distinct
    label's probability is its count divided by the total number of rows,
    and the entropy is the sum of ``p * -log2(p)`` over the labels.
    """
    # only the per-label counts matter, not the label values themselves
    class_counts = np.unique(data[:, -1], return_counts=True)[1]

    class_probs = class_counts / class_counts.sum()
    surprisal = -np.log2(class_probs)

    return sum(class_probs * surprisal)
# Entropy of the labels on the "<= split_value" side of the demo split above.
calculate_entropy(data_below)
    

In [None]:
# Entropy of the labels on the "> split_value" side of the demo split above.
calculate_entropy(data_above)

In [None]:
# creating a function to calculate the overall entropy
def calculate_overall_entropy(data_below, data_above):
    """Weighted entropy of a split.

    Each side's entropy (from ``calculate_entropy``) is weighted by the share
    of rows that landed on that side, and the two weighted values are added.
    """
    n_below = len(data_below)
    n_above = len(data_above)
    n_total = n_below + n_above

    # contribution of each side = (fraction of rows) * (entropy of that side)
    weighted_below = (n_below / n_total) * calculate_entropy(data_below)
    weighted_above = (n_above / n_total) * calculate_entropy(data_above)

    return weighted_below + weighted_above

# Weighted entropy of the demo split (data_below / data_above from the split_data cell).

calculate_overall_entropy(data_below, data_above)

In [None]:
# computing the best split

def best_split(data, potential_split):
    """Find the (column, threshold) pair that gives the lowest weighted entropy.

    Parameters
    ----------
    data : 2-D array
        Rows to split; the last column holds the label.
    potential_split : any
        Kept for backward compatibility and ignored. The candidates are always
        recomputed from ``data`` with ``get_potential_splits`` so that every
        threshold lies between two values actually present in ``data``.

    Returns
    -------
    (best_split_column, best_split_value)
        Column index and threshold of the best split. On ties the candidate
        visited last wins (the comparison is ``<=``).

    Raises
    ------
    ValueError
        If ``data`` offers no candidate threshold at all, i.e. every feature
        column has fewer than two distinct values.
    """
    potential_splits = get_potential_splits(data)

    best_split_column = None
    best_split_value = None
    # start at +infinity so the first candidate always becomes the current best
    overall_entropy = float("inf")

    for column_index, candidate_values in potential_splits.items():
        for value in candidate_values:
            # weighted entropy that would result from splitting here
            data_below, data_above = split_data(data, split_column=column_index,
                                                split_value=value)
            current_overall_entropy = calculate_overall_entropy(data_below, data_above)

            if current_overall_entropy <= overall_entropy:
                overall_entropy = current_overall_entropy
                best_split_column = column_index
                best_split_value = value

    if best_split_column is None:
        # previously this surfaced as an UnboundLocalError on the return line
        raise ValueError("no candidate split: every feature column has fewer than two distinct values")

    return best_split_column, best_split_value

# Best (column index, threshold) for the full training array `data`; best_split recomputes
# the candidate thresholds from its first argument.
print(f'One of your best split at column, value {best_split(data,potential_splits)}')

In [None]:
# now let's build our tree

def decision_tree_algorithm(df, counter=0, min_samples=5, max_depth=10):
    """Recursively grow a decision tree as nested dicts.

    Parameters
    ----------
    df : pandas.DataFrame or 2-D array
        On the initial call (``counter == 0``) a DataFrame whose last column
        is the label; on recursive calls the NumPy array of the current node.
    counter : int
        Current depth. Callers should leave it at 0; recursion increments it.
    min_samples : int
        A node with fewer rows than this becomes a leaf.
    max_depth : int or None
        Maximum number of nested questions. ``None`` disables the limit.

    Returns
    -------
    A class label (leaf) or a dict ``{question: [yes_answer, no_answer]}``
    where ``question`` is ``"When <feature> <= value <threshold>"`` and each
    answer is again a label or a nested dict.

    Side effect: the initial call stores ``df.columns`` in the module-level
    ``COLUMN_HEADERS`` so recursive calls can turn column indices into names.
    """
    global COLUMN_HEADERS  # shared with the recursive calls, which only receive arrays

    # data prep
    if counter == 0:
        COLUMN_HEADERS = df.columns
        data = df.values
    else:
        data = df

    # base cases: pure node, too few rows, or depth limit reached
    # (counter == None is never true, so max_depth=None means "no limit")
    if check_purity(data) or (len(data) < min_samples) or (counter == max_depth):
        return classify_data(data)

    # candidate thresholds of THIS node's rows (not of the whole training set)
    potential_splits = get_potential_splits(data)
    if not any(potential_splits.values()):
        # all rows have identical features but mixed labels: nothing left to split on
        return classify_data(data)

    counter += 1

    # determine the best split and partition the rows accordingly
    split_column, split_value = best_split(data, potential_splits)
    data_below, data_above = split_data(data, split_column, split_value)
    if len(data_below) == 0 or len(data_above) == 0:
        # degenerate split (e.g. midpoint rounded onto an extreme value): stop here
        return classify_data(data)

    feature_name = COLUMN_HEADERS[split_column]
    question = "When {} <= value {}".format(feature_name, split_value)

    # find answers (recursive part)
    yes_answer = decision_tree_algorithm(data_below, counter, min_samples, max_depth)
    no_answer = decision_tree_algorithm(data_above, counter, min_samples, max_depth)

    # both branches agree, so the question adds nothing: collapse it
    if yes_answer == no_answer:
        return yes_answer

    return {question: [yes_answer, no_answer]}
        

In [None]:
# Sanity check on rows whose Potability is not "Yes": if only one label remains, the data is
# pure at the root and the algorithm returns a bare class label instead of a dict.
my_tree = decision_tree_algorithm(train_df[train_df.Potability != "Yes"])
pprint(my_tree)

In [None]:
# Grow a tree on the whole training set (min_samples=2, max_depth=None) and pretty-print
# the nested {question: [yes_answer, no_answer]} structure. This overwrites my_tree.
my_tree = decision_tree_algorithm(train_df, min_samples=2, max_depth=None)
pprint(my_tree)

In [None]:
# Classification
# sub_tree = {question: [yes_answer, no_answer]}
# Pick a single test row (position 30) to walk through the tree by hand.

example = test_df.iloc[30]
example

In [None]:
# The root question is the first key of the tree dict. Splitting it on whitespace gives
# the tokens ['When', <feature>, '<=', 'value', <threshold>] that classify_example unpacks.
# NOTE: this cell assumes my_tree is a dict; a tree that collapsed to a bare label has no keys().
question = list(my_tree.keys())[0]
question.split()

In [None]:
def classify_example(example, my_tree):
    """Predict the class of one example by walking the decision tree.

    Parameters
    ----------
    example : mapping or pandas.Series
        Must support ``example[feature_name]`` for every feature the tree asks about.
    my_tree : dict or label
        Either a nested ``{question: [yes_answer, no_answer]}`` dict whose
        questions look like ``"When <feature> <= value <threshold>"``, or a
        bare class label (a tree that consists of a single leaf).

    Returns
    -------
    The class label stored in the leaf the example ends up in.

    Raises
    ------
    ValueError
        If a question does not follow the expected format.
    """
    node = my_tree

    # anything that is not a dict is a leaf, including a tree that is just a label
    while isinstance(node, dict):
        question = list(node.keys())[0]

        # split on the fixed separator (from the right) instead of on whitespace,
        # so feature names that contain spaces are parsed correctly
        feature_part, separator, value = question.rpartition(" <= value ")
        if not separator or not feature_part.startswith("When "):
            raise ValueError("unexpected question format: {!r}".format(question))
        feature_name = feature_part[len("When "):]

        # ask the question: index 0 is the "yes" branch, index 1 the "no" branch
        if example[feature_name] <= float(value):
            node = node[question][0]
        else:
            node = node[question][1]

    return node

In [None]:
# calculate accuracy score

def calculate_accuracy(df, my_tree):
    """Share of rows in ``df`` whose tree prediction equals ``Potability``.

    Every row is classified with ``classify_example``. Two columns are written
    to ``df`` itself (the caller's frame is modified):
    ``classification`` with the predicted label, and
    ``classification_correct`` with True/False for whether it matches
    the ``Potability`` column. The mean of the latter is returned.
    """
    predicted_labels = df.apply(classify_example, axis=1, args=(my_tree,))
    df['classification'] = predicted_labels

    # True where the prediction agrees with the actual label
    df['classification_correct'] = df['classification'] == df['Potability']

    return df['classification_correct'].mean()

In [None]:
# Accuracy of the current my_tree on the held-out rows. This also adds the
# 'classification' and 'classification_correct' columns to test_df.
calculate_accuracy(test_df, my_tree)

In [None]:
# Final experiment: re-split the data with a 30% test share, retrain and score.
# NOTE: the RNG is not re-seeded here, so this split continues from whatever state the
# earlier random.sample call left behind (random.seed(0) is only called before the first split).
train_df, test_df = train_test_split(water_df, test_size=0.3) # author's notes from earlier tries: 0.33 and 0.25 -> ~57% accuracy, 0.30 -> ~61%
# grow the tree on the new training part
my_tree = decision_tree_algorithm(train_df, max_depth=10)
# score it on the new test part (this also adds the prediction columns to test_df)
accuracy = calculate_accuracy(test_df, my_tree)

# pprint(my_tree)
print(f'Your accuracy score is {accuracy}')
# prints the whole test frame, including the 'classification' and 'classification_correct' columns
print(test_df)