In [None]:
from abc import ABC, abstractmethod

import numpy as np
import pandas as pd

from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import Perceptron as sk_Perceptron
from sklearn.model_selection import LeaveOneOut as sk_LeaveOneOut
from sklearn.model_selection import KFold as sk_KFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [None]:
class Classifier(ABC):
    """Abstract contract shared by every classification algorithm.

    Concrete subclasses must provide both `fit` and `predict`.
    """

    @abstractmethod
    def fit(self, X, y):
        """Train the classifier on the samples in X and their targets y.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training vectors; one row per sample, one column per feature.

        y : array-like of shape (n_samples,)
            Target value of each training sample.

        Returns
        -------
        self : object
            The fitted instance, so that calls can be chained.
        """

    @abstractmethod
    def predict(self, X):
        """Classify every sample in X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Samples to classify.

        Returns
        -------
        C : ndarray of shape (n_samples,)
            Predicted target value for each row of X.
        """

class CrossValidation(ABC):
    """Interface for cross validation schemes."""

    @abstractmethod
    def split(self, X, y=None):
        """Generate indices to split data into training and test sets.

        Any randomisation (for example a shuffling seed) is not an argument
        of this method; implementations that need it must take it elsewhere,
        e.g. in their constructor.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training vectors, where `n_samples` is the number of samples
            and `n_features` is the number of features.

        y : array-like of shape (n_samples,), default=None
            Target values. Optional; implementations may ignore it.

        Yields
        ------
        train : ndarray
            The training set indices for that split.

        test : ndarray
            The testing set indices for that split.
        """

class Evaluator(ABC):
    """Abstract contract for metrics that rate prediction quality."""

    @abstractmethod
    def score(self, y_true, y_pred):
        """Compare predicted labels against the ground truth.

        Parameters
        ----------
        y_true : 1d array-like, or label indicator array / sparse matrix
            The correct (ground truth) labels.

        y_pred : 1d array-like, or label indicator array / sparse matrix
            The labels produced by a classifier.

        Returns
        -------
        score : float
            The resulting prediction score.
        """

class NaiveBayes(Classifier):
    """Naive Bayes classifier delegating to scikit-learn's ``GaussianNB``."""

    def __init__(self):
        # The wrapped estimator is created with its default settings.
        self.__estimator = GaussianNB()

    def fit(self, X, y):
        """Fit the wrapped estimator on (X, y) and return this wrapper."""
        self.__estimator.fit(X, y)
        return self

    def predict(self, X):
        """Return the wrapped estimator's predictions for X."""
        predictions = self.__estimator.predict(X)
        return predictions

class DecisionTrees(Classifier):
    """Decision tree classifier delegating to ``DecisionTreeClassifier``."""

    def __init__(self, random_state: int = None):
        # `random_state` is forwarded unchanged to the wrapped estimator.
        self.__estimator = DecisionTreeClassifier(random_state=random_state)

    def fit(self, X, y):
        """Fit the wrapped estimator on (X, y) and return this wrapper."""
        self.__estimator.fit(X, y)
        return self

    def predict(self, X):
        """Return the wrapped estimator's predictions for X."""
        predictions = self.__estimator.predict(X)
        return predictions

class KNearestNeighbors(Classifier):
    """K-nearest neighbors classifier delegating to ``KNeighborsClassifier``."""

    def __init__(self):
        # The wrapped estimator is created with its default settings.
        self.__estimator = KNeighborsClassifier()

    def fit(self, X, y):
        """Fit the wrapped estimator on (X, y) and return this wrapper."""
        self.__estimator.fit(X, y)
        return self

    def predict(self, X):
        """Return the wrapped estimator's predictions for X."""
        predictions = self.__estimator.predict(X)
        return predictions

class SVM(Classifier):
    """Support vector machine classifier delegating to scikit-learn's ``SVC``."""

    def __init__(self, random_state: int = None):
        # `random_state` is forwarded unchanged to the wrapped estimator.
        self.__estimator = SVC(random_state=random_state)

    def fit(self, X, y):
        """Fit the wrapped estimator on (X, y) and return this wrapper."""
        self.__estimator.fit(X, y)
        return self

    def predict(self, X):
        """Return the wrapped estimator's predictions for X."""
        predictions = self.__estimator.predict(X)
        return predictions

class Perceptron(Classifier):
    """Linear perceptron classifier delegating to scikit-learn's ``Perceptron``."""

    def __init__(self, random_state: int = None):
        # `random_state` is forwarded unchanged to the wrapped estimator.
        self.__estimator = sk_Perceptron(random_state=random_state)

    def fit(self, X, y):
        """Fit the wrapped estimator on (X, y) and return this wrapper."""
        self.__estimator.fit(X, y)
        return self

    def predict(self, X):
        """Return the wrapped estimator's predictions for X."""
        predictions = self.__estimator.predict(X)
        return predictions

class TestSet(CrossValidation):
    """Hold-out validation: one random train/test partition of the samples.

    Parameters
    ----------
    test_set_percentage : float
        Fraction of the samples assigned to the test set. The test set
        holds ``round(n_samples * test_set_percentage)`` samples.

    random_state : int, default=None
        Seed passed to ``np.random.default_rng`` for shuffling the indices.
        A fresh generator is built on every `split` call, so a fixed seed
        yields the same partition each time.
    """

    def __init__(self, test_set_percentage: float, random_state: int = None):
        self.__test_set_percentage = test_set_percentage
        self.__random_state = random_state

    def split(self, X, y = None):
        """Yield a single ``(train_index, test_index)`` pair.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Data to split; only ``len(X)`` is used.

        y : array-like of shape (n_samples,), default=None
            Ignored; accepted for interface compatibility.

        Yields
        ------
        train_index : ndarray
            Indices of the training samples.

        test_index : ndarray
            Indices of the test samples (disjoint from `train_index`).
        """
        n_samples = len(X)
        indices = np.arange(n_samples)
        np.random.default_rng(self.__random_state).shuffle(indices)
        n_test = round(n_samples * self.__test_set_percentage)

        # Flag the first `n_test` shuffled values in one vectorised
        # assignment instead of a Python-level loop.
        test_mask = np.zeros(n_samples, dtype=bool)
        test_mask[indices[:n_test]] = True

        # The mask is applied to the *shuffled* array. Because `indices` is
        # a permutation, this still gives two disjoint sets that together
        # cover every sample, with `n_test` samples in the test set.
        yield indices[~test_mask], indices[test_mask]

class LeaveOneOut(CrossValidation):
    """Leave-one-out scheme delegating to scikit-learn's ``LeaveOneOut``."""

    def __init__(self):
        self.__splitter = sk_LeaveOneOut()

    def split(self, X, y = None):
        """Return the train/test index pairs produced by the wrapped splitter."""
        index_pairs = self.__splitter.split(X, y)
        return index_pairs

class KFold(CrossValidation):
    """K-fold cross-validation with shuffling.

    Thin wrapper around scikit-learn's ``KFold`` that always enables
    ``shuffle=True``.

    Parameters
    ----------
    n_splits : int, default=5
        Number of folds, forwarded to the wrapped splitter.

    random_state : int, default=None
        Seed forwarded to the wrapped splitter to control the shuffling.
    """

    def __init__(self, n_splits: int = 5, random_state: int = None):
        self.__cv = sk_KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    def split(self, X, y = None):
        """Return the train/test index pairs produced by the wrapped splitter."""
        return self.__cv.split(X, y)

class AccuracyScore(Evaluator):
    """Accuracy metric delegating to scikit-learn's ``accuracy_score``.

    A prediction counts as correct only when it matches the true label
    exactly; the score is the fraction of samples predicted correctly.
    """

    def __init__(self):
        """Create the evaluator; it holds no state."""

    def score(self, y_true, y_pred):
        """Return the accuracy of `y_pred` measured against `y_true`."""
        accuracy = accuracy_score(y_true, y_pred)
        return accuracy

def run(X, y, classifier: Classifier, cross_validation: CrossValidation, evaluator: Evaluator):
    """Cross-validate a classifier and return its mean score over all splits.

    For every train/test split the classifier is re-fitted on the training
    rows and scored on the test rows. The per-split scores are averaged;
    taking the best split instead would overstate performance (with
    leave-one-out it is 1.0 as soon as a single sample is classified
    correctly).

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        Feature matrix; must support indexing with an index array.

    y : ndarray of shape (n_samples,)
        Target values; must support indexing with an index array.

    classifier : Classifier
        Classifier to evaluate. The same instance is re-fitted per split.

    cross_validation : CrossValidation
        Scheme producing the ``(train_index, test_index)`` pairs.

    evaluator : Evaluator
        Metric used to score each split.

    Returns
    -------
    score : float
        Unweighted mean of the per-split scores, or ``0`` when the scheme
        produces no splits.
    """
    fold_scores = []
    for train_index, test_index in cross_validation.split(X, y):
        fitted = classifier.fit(X[train_index], y[train_index])
        y_pred = fitted.predict(X[test_index])
        fold_scores.append(evaluator.score(y[test_index], y_pred))

    if not fold_scores:
        # No split was generated; keep the historical "no result" value.
        return 0
    return sum(fold_scores) / len(fold_scores)


In [None]:
# Single seed shared by the row shuffle, the classifiers and the cross
# validators, so that repeated runs print the same scores.
random_state: int = 42

feature_columns = ["age", "sex", "bmi", "children", "smoker", "region"]

insurance_dataframe = pd.read_csv("data/insurance.csv")
print('Original dataset:')
print(insurance_dataframe)

# Turn the continuous "charges" column into four equal-width classes by
# cutting the [min, max] range at its quarter points.
charges = insurance_dataframe["charges"]
min_val = charges.min()
max_val = charges.max()

midpoint = (min_val + max_val) / 2
first_quarter = (midpoint + min_val) / 2
third_quarter = (midpoint + max_val) / 2

split_1 = insurance_dataframe.loc[charges <= first_quarter, :].copy()
split_2 = insurance_dataframe.loc[(first_quarter < charges) & (charges <= midpoint), :].copy()
split_3 = insurance_dataframe.loc[(midpoint < charges) & (charges <= third_quarter), :].copy()
split_4 = insurance_dataframe.loc[third_quarter < charges, :].copy()

# Class label = position of the bin, from cheapest (0) to most expensive (3).
for charges_code, split in enumerate([split_1, split_2, split_3, split_4]):
    split["charges_code"] = charges_code

categorized_df = pd.concat([split_1, split_2, split_3, split_4])

# The concatenation above orders the rows by class, so shuffle them. The
# shuffle is seeded; otherwise the scores would differ between runs even
# though every classifier and cross validator is seeded.
insurance_dataframe = (
    categorized_df[feature_columns + ["charges_code"]]
    .sample(frac=1, random_state=random_state)
    .reset_index(drop=True)
)

# Replace the string-valued columns with integer codes.
le = LabelEncoder()
for column in ("sex", "smoker", "region"):
    insurance_dataframe[column] = le.fit_transform(insurance_dataframe[column])

# Standardise every feature column.
# NOTE(review): the scaler is fitted on the full dataset before any
# train/test split is made in run(), so test-row statistics influence the
# scaling of the training rows. Confirm whether this is acceptable.
scaler = StandardScaler()
insurance_dataframe[feature_columns] = scaler.fit_transform(insurance_dataframe[feature_columns])

insurance_dataframe = np.array(insurance_dataframe)
print('Label encoded and scaled dataset:')
print(insurance_dataframe)
# The first six columns are the features, the last one is the class label.
X = insurance_dataframe[:, 0:6]
y = insurance_dataframe[:, 6]

classifiers = [
    ('Naive Bayes', NaiveBayes()),
    ('Decision Trees', DecisionTrees(random_state=random_state)),
    ('K-Nearest Neighbors', KNearestNeighbors()),
    ('SVM', SVM(random_state=random_state)),
    ('Perceptron', Perceptron(random_state=random_state)),
]

cross_validators = [
    ('Test Set 10%', TestSet(0.1, random_state=random_state)),
    ('Test Set 20%', TestSet(0.2, random_state=random_state)),
    ('Test Set 30%', TestSet(0.3, random_state=random_state)),
    ('Leave One Out', LeaveOneOut()),
    ('5-Fold', KFold(5, random_state=random_state)),
    ('10-Fold', KFold(10, random_state=random_state)),
    ('15-Fold', KFold(15, random_state=random_state)),
]

accuracy_score_evaluator = AccuracyScore()

# Evaluate every classifier under every validation scheme.
for classifier_name, classifier in classifiers:
    for cv_name, cross_validator in cross_validators:
        result = run(X, y, classifier, cross_validator, accuracy_score_evaluator)
        print(f'{classifier_name}, {cv_name}: {result}')