In [7]:
from collections import Counter

import numpy as np
from scipy.spatial.distance import euclidean

from mla.base import BaseEstimator


class KNNBase(BaseEstimator):
    def __init__(self, k=5, distance_func=euclidean):
        """Base class for Nearest neighbors classifier and regressor.

        Parameters
        ----------
        k : int, default 5
            The number of neighbors to take into account. If 0, all the
            training examples are used.
        distance_func : function, default euclidean distance
            A distance function taking two arguments. Any function from
            scipy.spatial.distance will do.

        Raises
        ------
        ValueError
            If k is negative.
        """

        # A negative k would not fail later: the slice ``neighbors[:k]`` in
        # _predict_x would silently drop the |k| furthest examples instead.
        # Reject it here so the mistake is visible.
        if k is not None and k < 0:
            raise ValueError("k must be a non-negative integer, got %r" % (k,))

        self.k = None if k == 0 else k  # l[:None] returns the whole list
        self.distance_func = distance_func

    def aggregate(self, neighbors_targets):
        """Combine the targets of the nearest neighbors into one prediction.

        Must be overridden by subclasses; this base implementation always
        raises NotImplementedError.
        """
        raise NotImplementedError()

    def _predict(self, X=None):
        """Predict a value for every instance in X.

        Each element obtained by iterating over X is passed to _predict_x,
        and the individual results are returned as a numpy array.
        """
        predictions = [self._predict_x(x) for x in X]

        return np.array(predictions)

    def _predict_x(self, x):
        """Predict the label of a single instance x."""

        # Reads self.X and self.y, which are not set in this class --
        # presumably assigned by the base estimator when fitting; confirm.

        # Compute distances between x and all examples in the training set.
        distances = (self.distance_func(x, example) for example in self.X)

        # Sort all examples by their distance to x and keep their target
        # value. Only the distance is used as the sort key, so targets are
        # never compared with each other.
        neighbors = sorted(zip(distances, self.y), key=lambda pair: pair[0])

        # Get targets of the k-nn and aggregate them (most common one or
        # average). self.k is None when every example should be used.
        neighbors_targets = [target for (_, target) in neighbors[: self.k]]

        return self.aggregate(neighbors_targets)


class KNNClassifier(KNNBase):
    """Nearest neighbors classifier.

    Predicts the label that occurs most often among the selected neighbors.

    Note: when several labels are tied for the highest count, which of them
    is returned is decided by Counter.most_common and should be treated as
    arbitrary."""

    def aggregate(self, neighbors_targets):
        """Return the label with the highest count in neighbors_targets."""

        label_counts = Counter(neighbors_targets)
        # most_common(1) yields a one-element list of (label, count).
        top_label, _count = label_counts.most_common(1)[0]
        return top_label

In [None]:
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
from sklearn.datasets import make_classification
from sklearn.datasets import make_regression
from scipy.spatial import distance
import os 
import pandas as pd 
import numpy as np
from mla import knn
from mla.metrics.metrics import mean_squared_error, accuracy

# Evaluate a 5-nearest-neighbors classifier on every CSV dataset found in
# base_dir, printing the file name and the test-set accuracy for each one.
base_dir = "class_imbalance"

# os.listdir returns entries in arbitrary order and also lists
# sub-directories and non-data files. Sort the names so runs are
# reproducible, and skip anything that is not a regular .csv file so
# pd.read_csv is never handed a directory or an unrelated file.
for filename in sorted(os.listdir(base_dir)):
    file_path = os.path.join(base_dir, filename)
    if not (filename.lower().endswith(".csv") and os.path.isfile(file_path)):
        continue

    df = pd.read_csv(file_path)

    # Remove rows with NaN values
    newdf = df.dropna(axis=0, how="any")
    print(filename)

    # Every column except the last is a feature; the last column is the target.
    X = newdf.iloc[:, :-1]
    y = newdf.iloc[:, -1]

    # Hold out 30% of the rows for testing; the fixed seed keeps the split
    # identical between runs.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    clf = knn.KNNClassifier(k=5, distance_func=distance.euclidean)

    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    print("classification accuracy", accuracy(y_test, predictions))


dataset_978_mfeat-factors.csv
data length 2000
new data length 2000
dataset_947_arsenic-male-bladder.csv
data length 559
new data length 559
dataset_1004_synthetic_control.csv
data length 600
new data length 600
dataset_1056_mc1.csv
data length 9466
new data length 9466
dataset_940_water-treatment.csv
data length 527
new data length 397
dataset_950_arsenic-female-lung.csv
data length 559
new data length 559
dataset_1039_hiva_agnostic.csv
data length 4229
new data length 4229
dataset_1045_kc1-top5.csv
data length 145
new data length 145
dataset_1013_analcatdata_challenger.csv
data length 138
new data length 138
dataset_450_analcatdata_lawsuit.csv
data length 264
new data length 264
dataset_312_scene.csv
data length 2407
new data length 2407
dataset_995_mfeat-zernike.csv
data length 2000
new data length 2000
dataset_311_oil_spill.csv
data length 937
new data length 937
dataset_980_optdigits.csv
data length 5620
new data length 5620
dataset_987_collins.csv
data length 500
new data length 