In [None]:
#
# creation date  2022_04apr_20
# last change    2022_04apr_24
# author         artur
# comment        The algorithm itself is only a small part of this notebook. It was
#                written almost entirely with standard Python 3 tools.
#



In [None]:
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
import sklearn.model_selection

In [None]:
%xmode Plain

Exception reporting mode: Plain


In [None]:
# Load seaborn's "iris" example dataset and print its first and last rows.
full = sns.load_dataset("iris")
print(full.head())
print(full.tail())

   sepal_length  sepal_width  petal_length  petal_width species
0           5.1          3.5           1.4          0.2  setosa
1           4.9          3.0           1.4          0.2  setosa
2           4.7          3.2           1.3          0.2  setosa
3           4.6          3.1           1.5          0.2  setosa
4           5.0          3.6           1.4          0.2  setosa
     sepal_length  sepal_width  petal_length  petal_width    species
145           6.7          3.0           5.2          2.3  virginica
146           6.3          2.5           5.0          1.9  virginica
147           6.5          3.0           5.2          2.0  virginica
148           6.2          3.4           5.4          2.3  virginica
149           5.9          3.0           5.1          1.8  virginica


In [None]:
# Features are every column except the last one; the last column is the target.
X_full, y_full = full.iloc[:, :-1], full.iloc[:, -1]

In [None]:
# Pin the shuffle seed so the split -- and every accuracy figure computed from
# it further down -- is the same on each Restart & Run All.
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    X_full, y_full, random_state=42
)

In [None]:
# Class labels present in the training split.
y_train.unique()

array(['virginica', 'versicolor', 'setosa'], dtype=object)

----------------------------------

In [None]:
def learn_averages(X_train, y_train):
    """Compute the per-class mean feature vector.

    Parameters
    ----------
    X_train : array-like, one row of numeric features per sample
        Anything ``np.asarray`` can turn into a float array (ndarray, nested
        list, DataFrame). Integer input is accepted; the averages are always
        computed in float64.
    y_train : array-like
        One class label per row of ``X_train``.

    Returns
    -------
    dict
        Maps each label (in order of first appearance) to the mean of the
        rows carrying that label.

    Raises
    ------
    ValueError
        If ``X_train`` and ``y_train`` have a different number of rows.
    """
    # Converting up front gives positional row access for any container and
    # makes the accumulation float64, so integer features no longer break the
    # final division.
    features = np.asarray(X_train, dtype="float64")
    targets = np.asarray(y_train)
    if len(features) != len(targets):
        raise ValueError("X_train and y_train must have the same number of rows")

    sums = {}
    counts = {}
    for row, target in zip(features, targets):
        # `0.0 + row` / `total + row` always builds a new array, so the
        # caller's data is never modified.
        sums[target] = sums.get(target, 0.0) + row
        counts[target] = counts.get(target, 0) + 1

    return {target: sums[target] / counts[target] for target in sums}

In [None]:
def probabilities_calc(row, learned_averages):
    """Score how far ``row`` is from each class's average vector.

    Despite the name, the values are not probabilities: each one is the mean
    relative deviation ``|row - avg| / |avg|`` over all features, so a value
    closer to 0.0 means a better match.

    Parameters
    ----------
    row : array-like of numbers
        Feature vector of one sample.
    learned_averages : dict
        Maps each class label to its average feature vector (same length as
        ``row``), e.g. the result of ``learn_averages``.

    Returns
    -------
    dict
        Maps each class label to its score.

    Notes
    -----
    A class average containing 0 makes the corresponding term infinite (or
    NaN when the row value is 0 as well); numpy emits a RuntimeWarning then.
    """
    row = np.asarray(row, dtype="float64")
    probabilities = {}
    for target, averages in learned_averages.items():
        # Both operands are vectors here; the arithmetic is element-wise.
        averages = np.asarray(averages, dtype="float64")
        # Divide by the magnitude of the average: with a signed divisor,
        # negative averages produce negative terms that cancel positive ones
        # and can make a distant class look like a perfect match.
        relative_deviation = abs(row - averages) / abs(averages)
        # Built-in sum keeps the left-to-right float summation order.
        probabilities[target] = sum(relative_deviation) / len(row)
    return probabilities

In [None]:
def predict(row, learned_averages):
    """Return the class label whose average vector is closest to ``row``.

    The scores come from ``probabilities_calc``; the label whose score has
    the smallest magnitude wins, and the first such label wins a tie.

    Parameters
    ----------
    row : array-like of numbers
        Feature vector of one sample.
    learned_averages : dict
        Maps each class label to its average feature vector.

    Returns
    -------
    The winning label, or ``""`` when there is nothing to choose from
    (``learned_averages`` is empty or every score is NaN).
    """
    scores = probabilities_calc(row, learned_averages)
    best_target = ""
    best_magnitude = None  # no candidate seen yet; replaces a fixed numeric sentinel
    for target, score in scores.items():
        magnitude = abs(score)
        if magnitude != magnitude:
            # NaN never compares as smaller, so it cannot be ranked; skip it.
            continue
        if best_magnitude is None or magnitude < best_magnitude:
            best_magnitude = magnitude
            best_target = target
    return best_target

In [None]:
def my_classifier(X_train, y_train, X_test):
    """Learn the class averages from the training data and label each test row.

    ``X_test`` is iterated row by row; the result is a list with one
    predicted label per row, in the same order.
    """
    class_averages = learn_averages(X_train, y_train)
    return [predict(sample, class_averages) for sample in X_test]

----------------------------------

In [None]:
def _report(name, passed, got):
    """Print ``<name>: Passed``, or ``<name>: Not passed. Got:`` followed by ``got``."""
    print(name + ": ", end="")
    if passed:
        print("Passed")
    else:
        print("Not passed. Got:")
        print(got)


def _same_vectors(result, expected):
    """True when both dicts have the same keys and numerically equal values."""
    return set(result) == set(expected) and all(
        np.allclose(result[key], expected[key]) for key in expected
    )


# ---- fixtures ---------------------------------------------------------------
test_learned_averages = {"a": [2, 2, 4, 1], "b": [2, 1, 1, 3], "c": [3, 5, 6, 5]}
# Aliases of the real training split; not used by the checks in this cell.
test_X_train = X_train
test_y_train = y_train
my_X_train = np.array(
    [[8, 6, 7, 9],
     [16, 13, 15, 15],
     [16, 17, 17, 20],
     [9, 5, 8, 7]], dtype="float64")
my_y_train = np.array(["lower",
                       "middle",
                       "middleup",
                       "lower"])


# ---- probabilities_calc -----------------------------------------------------
test_row = np.array([2.5, 1.7, 3, 8])
test_result = probabilities_calc(test_row, test_learned_averages)
test_right_answer = { 'a': 1.9125, 'b': 1.1541666666666668, 'c': 0.4816666666666667 }
# Compare with a tolerance: these are floats, exact equality is fragile.
_report("probabilities_calc()", _same_vectors(test_result, test_right_answer), test_result)


# ---- predict ----------------------------------------------------------------
test_result = predict(test_row, test_learned_averages)
test_right_answer = "c"
_report("predict()", test_result == test_right_answer, test_result)

test_row = np.array([20, 19, 16, 15])
my_averages = learn_averages(my_X_train, my_y_train)
test_result = predict(test_row, my_averages)
test_right_answer = "middleup"
_report("predict()", test_result == test_right_answer, test_result)


# ---- learn_averages ---------------------------------------------------------
test_result = learn_averages(my_X_train, my_y_train)
test_right_answer = { "lower": [8.5, 5.5, 7.5, 8],
                      "middle": [16, 13, 15, 15],
                      "middleup": [16, 17, 17, 20]}
# Every class is compared, including "middleup".
_report("learn_averages()", _same_vectors(test_result, test_right_answer), test_result)

probabilities_calc(): Passed
predict(): Passed
predict(): Passed
learn_averages(): Passed


----------------------------------
My model test

In [None]:
# Accuracy of my classifier: correctly labelled test rows / all test rows.
model_y = my_classifier(X_train.values, y_train.values, X_test.values)
(model_y == y_test).sum() / len(y_test)

0.9736842105263158

----------------------------------
Comparing with sklearn model

In [None]:
import sklearn.naive_bayes

In [None]:
# Reference model: scikit-learn's Gaussian naive Bayes, fitted on the same
# training split and asked to label the same test rows.
from_teacher_model = sklearn.naive_bayes.GaussianNB()
from_teacher_model.fit(X_train, y_train)
his_predicts = from_teacher_model.predict(X_test)

In [None]:
# Accuracy of the scikit-learn model: correctly labelled test rows / all test rows.
(his_predicts == y_test).sum() / len(y_test)

1.0