In [1]:
# Library of my own functions
import sys
sys.path.append("../../src/")
import my_functions

# Libraries to deal with dataframes, vectors and formats
import pandas as pd
import numpy as np
import pickle as pkl

# sklearn to do Grid Search & ML models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.feature_selection import RFECV
from sklearn.metrics import f1_score, confusion_matrix 

# Imbalanced Learn
from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE, KMeansSMOTE, SVMSMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

# Plotting libraries to understand the models
import matplotlib.pyplot as plt
import seaborn as sns
import logging
# Raise the root logger threshold so that only CRITICAL records are emitted
logging.getLogger().setLevel(level=logging.CRITICAL)
# Default figure size (width, height) for every plot rendered in this notebook
plt.rcParams.update({'figure.figsize': [20, 4]})

# 1. Train Model

##  1.1 Create Features and Target

In [30]:
# Load the preprocessed Pima dataset. The raw frame gets its own name so that
# `X_train` only ever refers to the training feature matrix.
df_pima = pd.read_csv("../../data/pima/preprocess/pima_stratified_standard_scaler.csv", index_col="index")

# Split the data into train and validation sets using the precomputed "split" column
df_training = df_pima[df_pima["split"] == "train"]
df_validation = df_pima[df_pima["split"] == "test"]

# The first 8 columns are the features, "Outcome" is the target
X_train, X_test = df_training.iloc[:, 0:8], df_validation.iloc[:, 0:8]
y_train, y_test = df_training["Outcome"], df_validation["Outcome"]

## 1.2 Active Sampling

In [3]:
filename = "../../models/pima/"
model_type = "_STANDARD.pkl"


def _load_model(model_name):
    """Unpickle the model stored at ``filename + model_name + model_type``.

    The file is opened in a ``with`` block so the handle is closed as soon as
    the model has been read.
    """
    # NOTE(review): pickle.load can execute arbitrary code; only load model
    # files that come from a trusted source.
    with open(filename + model_name + model_type, 'rb') as model_file:
        return pkl.load(model_file)


log_reg = _load_model("logistic_regression")
svc = _load_model("svc")
ada_boost = _load_model("ada_boost")
gradient_boosting = _load_model("gradient_boosting")
random_forest = _load_model("random_forest")
xg_boost = _load_model("xgboost")

In [5]:
# Keep only the training examples labelled with the minority class ("diabetes")
indexes_minority_class = y_train.loc[y_train.eq("diabetes")].index
X_minority_class = X_train.loc[X_train.index.isin(indexes_minority_class)]

In [6]:
def get_least_confidence_samples(model, X_train, perc_uncertain=0.5, minority_class_index=0):
    """
    Return the rows of X_train for which the model is least confident about the minority class.

    For every row the score ``(1 - P(minority class)) * num_labels / (num_labels - 1)``
    is computed, where ``num_labels`` is the number of columns returned by
    ``model.predict_proba``. A higher score means the model assigns a lower
    probability to the minority class. The score starts at 0 and can reach
    ``num_labels / (num_labels - 1)`` (i.e. 2 for a binary problem), so it is only
    used to rank the rows.

    Keyword arguments:
      model -- fitted model exposing ``predict_proba``.
      X_train -- pandas DataFrame on which the confidence of the model is calculated.
                 Its index is kept as-is; it does not need to have any particular name.
      perc_uncertain -- fraction (0-1) of the least confident rows to retain; the number
                        of rows is truncated towards zero.
      minority_class_index -- column of ``predict_proba`` that holds the minority class
                              probability (default 0, which may differ in other datasets).

    Returns:
      DataFrame with the selected rows of X_train, in their original order.
    """
    prob_dist = np.asarray(model.predict_proba(X_train))
    prob_dist_minority_class = prob_dist[:, minority_class_index]

    # Calculating the normalized uncertainty score
    num_labels = prob_dist.shape[1]
    simple_least_conf = 1 - prob_dist_minority_class
    normalized_least_conf = simple_least_conf * (num_labels / (num_labels - 1))

    # Attach the scores to the original row labels directly, so the function
    # does not depend on the index being named "index"
    confidence = pd.Series(normalized_least_conf, index=X_train.index, name="normalized_least_confidence")

    # Sorting and keeping the least confident samples
    number_rows = int(len(confidence) * perc_uncertain)  # first n rows based on perc_uncertain
    least_confidence = confidence.sort_values(ascending=False).head(number_rows)

    # Obtaining training data where the model is most uncertain
    return X_train[X_train.index.isin(least_confidence.index)]

In [7]:
# Keep the half of the minority-class rows on which the logistic regression is least confident
X_least_confidence = get_least_confidence_samples(
    model=log_reg,
    X_train=X_minority_class,
    perc_uncertain=0.5,
)

In [8]:
from sklearn.cluster import KMeans
def get_kmeans_samples(X_least_confidence, n_clusters=5, perc_sample=0.4, random_state=1):
    """
    Cluster the rows with KMeans and draw the same number of rows from every cluster.

    Keyword arguments:
      X_least_confidence -- pandas DataFrame to cluster; its index (whatever its name)
                            is kept on the returned frame.
      n_clusters -- number of KMeans clusters (default 5).
      perc_sample -- fraction (0-1) of the rows to sample in total, spread evenly
                     over the clusters (default 0.4).
      random_state -- seed used both for KMeans and for the per-cluster sampling (default 1).

    Returns:
      DataFrame with a single "kmeans_cluster" column, indexed by the labels of the
      sampled rows. If the smallest cluster has fewer rows than requested per cluster,
      a message is printed and every cluster is sampled down to the size of the
      smallest one instead.
    """
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state).fit(X_least_confidence)

    # Keep the original row labels next to each cluster assignment, without
    # assuming the index has any particular name
    df_kmeans = pd.DataFrame({"kmeans_cluster": kmeans.labels_}, index=X_least_confidence.index)

    number_samples_by_cluster = round(int(len(df_kmeans) * perc_sample) / n_clusters)

    # Sampling without replacement fails when a cluster is smaller than the request,
    # so check the smallest cluster up front instead of catching the error
    cluster_sizes = df_kmeans["kmeans_cluster"].value_counts()
    min_samples_cluster = int(cluster_sizes.min())
    if number_samples_by_cluster > min_samples_cluster:
        num_clusters = cluster_sizes.shape[0]
        perc_to_sample = round((num_clusters * min_samples_cluster) / df_kmeans.shape[0] * 100, 2)
        print("There is a cluster with only " + str(min_samples_cluster) + " samples, to have a significant representation of every cluster we can only sample maximum " + str(perc_to_sample)+ "% of the data.")
        print("Setting the number of samples per cluster to be: " + str(min_samples_cluster))
        number_samples_by_cluster = min_samples_cluster

    # Sample each cluster independently (same seed per cluster) and stack the
    # results in cluster order
    sampled_clusters = [
        cluster_rows.sample(number_samples_by_cluster, random_state=random_state)
        for _, cluster_rows in df_kmeans.groupby("kmeans_cluster")
    ]
    return pd.concat(sampled_clusters)

In [9]:
# Draw an equally sized sample from every KMeans cluster of the least-confident rows
kmeans_samples = get_kmeans_samples(X_least_confidence=X_least_confidence)

There is a cluster with only 5 samples, to have a significant representation of every cluster we can only sample maximum 23.36% of the data.
Setting the number of samples per cluster to be: 5


In [10]:
# Row labels to keep: the KMeans-sampled rows plus every majority-class ("healthy") row
indexes_kmeans = kmeans_samples.index
indexes_majority_class = y_train.loc[y_train.eq("healthy")].index
indexes_to_filter = indexes_kmeans.append(indexes_majority_class)

# Restrict both features and target to those labels
X_least_kmeans = X_train.loc[X_train.index.isin(indexes_to_filter)]
y_least_kmeans = y_train.loc[y_train.index.isin(indexes_to_filter)]

In [None]:
from sklearn.neighbors import NearestNeighbors
def nearest_neighbour(X, n_neighbors=5):
    """
    Return, for every row of X, the positions of its nearest rows in X.

    Keyword arguments:
      X -- array-like of shape (n_samples, n_features) used both to fit the
           neighbour index and as the query points.
      n_neighbors -- number of neighbours to return per row (default 5).

    Returns:
      Array of shape (n_samples, n_neighbors) with row positions. Because X is also
      the query set, each row counts as one of its own neighbours (distance 0).
    """
    nbs = NearestNeighbors(n_neighbors=n_neighbors, metric='euclidean', algorithm='kd_tree').fit(X)
    # Only the indices are needed, so skip computing/returning the distances
    return nbs.kneighbors(X, return_distance=False)

### Developing

In [3]:
import numpy as np
from sklearn.neighbors import NearestNeighbors

In [6]:
# Fit a 3-nearest-neighbour index on the training data, then look up the neighbour
# indices of every training row (no query points are passed, so the fitted data is queried)
nbrs = NearestNeighbors(n_neighbors=3).fit(X_train)
knn = nbrs.kneighbors(return_distance=False)

In [7]:
"""Python implementation of SMOTE.
This implementation is based on the original variant of SMOTE.
Original paper: https://www.jair.org/media/953/live-953-2037-jair.pdf
"""

import numpy as np
from sklearn.neighbors import NearestNeighbors


class SMOTE:
    """Python implementation of SMOTE.

    This implementation is based on the original variant of SMOTE: every
    synthetic sample is built from a seed sample and one of its k nearest
    neighbours, drawing an independent random gap for each attribute.

    NOTE(review): this class has the same name as ``imblearn.over_sampling.SMOTE``,
    which this notebook also imports; after this cell runs, ``SMOTE`` refers to
    this class.

    Parameters
    ----------
    ratio : int, optional (default=100)
        The ratio percentage of generated samples to original samples.
        - If ratio < 100, then randomly choose ratio% of samples to SMOTE.
        - If ratio >= 100, it must be an integer multiple of 100.
    k_neighbors : int, optional (default=6)
        Number of nearest neighbors to use for SMOTE. Must be smaller than
        the number of samples passed to ``oversample``.
    random_state : int, optional (default=None)
        Seed of a random number generator private to this instance. With any
        other value the global NumPy random state is used.
    """

    def __init__(self,
                 ratio=100,
                 k_neighbors=6,
                 random_state=None):
        # check input arguments
        if ratio <= 0:
            raise ValueError(
                'ratio should be greater than 0')
        if ratio >= 100 and ratio % 100 != 0:
            raise ValueError(
                'ratio over 100 should be multiples of 100')
        self.ratio = ratio

        if not self._is_integer(k_neighbors):
            raise TypeError(
                'Expect integer for k_neighbors')
        if k_neighbors <= 0:
            raise ValueError(
                'k_neighbors should be integer greater than 0')
        self.k_neighbors = int(k_neighbors)

        # A per-instance generator keeps results reproducible no matter what other
        # code does with NumPy's global random state between construction and use.
        # For the same seed it yields the same stream as np.random.seed(seed).
        if self._is_integer(random_state):
            self._rng = np.random.RandomState(random_state)
        else:
            self._rng = np.random

    @staticmethod
    def _is_integer(value):
        """Return True for Python or NumPy integers (booleans are not accepted)."""
        return isinstance(value, (int, np.integer)) and not isinstance(value, bool)

    def _random_subset_indices(self, length, fraction):
        """Return int(length * fraction) distinct row positions in random order."""
        target_size = int(length * fraction)
        return self._rng.permutation(length)[:target_size]

    def _randomize(self, samples, ratio):
        """Return a random subset of rows holding the fraction ``ratio`` (0-1) of ``samples``."""
        idx = self._random_subset_indices(samples.shape[0], ratio)

        return samples[idx, :]

    def _populate(self, idx, nnarray):
        """Write ``self.N`` synthetic samples derived from ``self.samples[idx]``.

        ``nnarray`` holds the positions of the nearest neighbours of sample ``idx``.
        """
        base = self.samples[idx]
        for _ in range(self.N):
            nn = self._rng.randint(low=0, high=self.k_neighbors)  # Obtain a random K-Neighbor
            dif = self.samples[nnarray[nn]] - base
            # One independent gap per attribute, drawn in attribute order
            gap = self._rng.uniform(size=self.numattrs)
            self.synthetic[self.newidx] = base + gap * dif
            self.newidx += 1

    def oversample(self, samples, merge=False):
        """Perform oversampling using SMOTE

        Parameters
        ----------
        samples : list or ndarray, shape (n_samples, n_features)
            The samples to apply SMOTE to.
        merge : bool, optional (default=False)
            If set to true, merge the synthetic samples to original samples.

        Returns
        -------
        output : ndarray
            The output synthetic samples (preceded by all the original samples
            when ``merge`` is true).

        Raises
        ------
        TypeError
            If ``samples`` is neither a list nor an ndarray.
        ValueError
            If ``samples`` is not 2-dimensional or does not contain more than
            ``k_neighbors`` rows.
        """
        if isinstance(samples, list):
            self.samples = np.array(samples)
        elif isinstance(samples, np.ndarray):
            self.samples = samples
        else:
            raise TypeError(
                'Expect a built-in list or an ndarray for samples')

        if self.samples.ndim != 2:
            raise ValueError(
                'samples should be 2-dimensional: (n_samples, n_features)')

        n_samples, self.numattrs = self.samples.shape
        if n_samples <= self.k_neighbors:
            raise ValueError(
                'k_neighbors should be smaller than the number of samples')

        # self.ratio is configuration and is left untouched, so calling
        # oversample repeatedly always behaves the same way
        if self.ratio < 100:
            # One synthetic sample for a random ratio% of the original samples
            seed_indices = self._random_subset_indices(n_samples, self.ratio / 100.0)
            self.N = 1
        else:
            seed_indices = np.arange(n_samples)
            self.N = int(self.ratio / 100)

        new_shape = (len(seed_indices) * self.N, self.numattrs)
        self.synthetic = np.empty(shape=new_shape)
        self.newidx = 0

        # Neighbours are searched among the very array that _populate indexes into
        self.nbrs = NearestNeighbors(n_neighbors=self.k_neighbors)
        self.nbrs.fit(self.samples)
        self.knn = self.nbrs.kneighbors()[1]

        for idx in seed_indices:
            self._populate(idx, self.knn[idx])

        if merge:
            return np.concatenate((self.samples, self.synthetic))
        return self.synthetic

In [26]:
# ratio=200 asks for two synthetic samples per original sample
smote = SMOTE(ratio=200)

In [10]:
# Convert the features to a NumPy array for the custom SMOTE class.
# np.asarray accepts both a DataFrame and an ndarray, so re-running this cell
# is safe (calling .to_numpy() a second time would fail on the ndarray).
X_train = np.asarray(X_train)

In [27]:
# Generate only the synthetic samples (the originals are not merged in)
syntethic = smote.oversample(samples=X_train, merge=False)

In [None]:
# TODO: scratch cell. Its previous content ("hika") was not valid Python and
# raised a NameError when the notebook was run top to bottom.