In [2]:
import pandas as pd
import datetime as dt
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import scipy as scipy
from tqdm import tqdm

from sklearn.preprocessing import StandardScaler, MaxAbsScaler
from sklearn.model_selection import train_test_split,cross_val_score, StratifiedKFold, GridSearchCV, KFold
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, recall_score

from sklearn.utils import resample

from sklearn.linear_model import LogisticRegression 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB

from imblearn.under_sampling import (RandomUnderSampler, 
                                     ClusterCentroids,
                                     TomekLinks,
                                     NeighbourhoodCleaningRule,
                                     NearMiss)
from imblearn.over_sampling import SMOTE, RandomOverSampler, ADASYN



In [None]:
# create dataframe to track params & scoring
cols = ["Model", "Method", "cv_score", "training_score", "accuracy", "precision", "recall", "f1-score", "TP", "FP", "TN", "FN"]
model_tracker = pd.DataFrame(columns= cols)

In [3]:
# creating class called tiny target which will evaluate several different sampling techniques on a user selected model
class tiny_target:
    def __init__(self, X, y, scaler= MaxAbsScaler(), model= LogisticRegression(), sampling_strategy= 0.5, train_size = 0.3, run_all = True, cv_scoring = "recall", stratify = y, cv= 5, tracker = None, name = "model"):
        if tracker is not None:
            self.tracker = tracker
        else:
            cols = ["Model", "Method", "cv_score", "training_score", "accuracy", "precision", "recall", "f1-score", "TP", "FP", "TN", "FN"]
            model_tracker = pd.DataFrame(columns= cols)
            self.tracker = model_tracker
            
        self.X = X
        self.y = y
        self.model = model
        self.sampling_strategy = sampling_strategy
        self.train_size = train_size
        self.stratify = stratify
        self.cv = cv
        self.name = name
        self.cv_scoring = cv_scoring
        self.tracker = tracker
    
    def model_runner(self):
        """runs models and appends various scoring metrics (y target focused) to model tracker"""
        self.model.fit(self.X_train_new, self.y_train_new)
        self.scores = cross_val_score(self.model, self.X_train_new, self.y_train_new, cv= self.cv, scoring = self.cv_scoring)
        self.training_score = self.model.score(self.X_train_new, self.y_train_new)

        # predict and get classification report & confusion matrix info
        self.predictions = self.model.predict(self.X_test)
        self.class_report = classification_report(self.y_test, self.predictions, output_dict= True)
        self.tn, self.fp, self.fn, self.tp = confusion_matrix(self.y_test, self.predictions).ravel()
        
        self.model_tracker()
        
    def model_tracker(self):
        self.results = pd.DataFrame({'Model': self.name, 'Method': self.method, 'cv_score': self.scores.mean(), 
                                'training_score': self.training_score,'accuracy': self.class_report["accuracy"],
                                'precision': self.class_report["1.0"]["precision"], 'recall': self.class_report["1.0"]["recall"],
                                'f1-score': self.class_report["1.0"]["f1-score"], 'TP': self.tp, 'FP': self.fp,
                                'TN': self.tn, 'FN': self.fn}, index=[len(model_tracker.index)])
        self.tracker =  pd.concat([self.tracker, self.results])
    
    def run_all(self):
        self.random_under()
        self.clustercentroids()
        self.tomeklinks()
        self.neighbourhoodclean()
        self.nearmiss()
        self.smote_upsample()
        self.random_over()
        self.adasyn()
        return self.tracker
    
    def random_under(self):
        """Randomly undersamples the predictor class. Can lead to loss of information 
        unless the majority class is relatively uniform"""
        
        self.sampler = RandomUnderSampler(sampling_strategy= self.sampling_strategy, random_state= 13)
        
        self.X_train, self.X_test, self.y_train , self.y_test = train_test_split( \
            self.X, self.y, train_size=self.train_size, stratify=self.stratify, random_state=13)
        self.X_train_new, self.y_train_new = self.sampler.fit_resample(self.X_train, self.y_train)
        self.method = "Random Under"
        self.model_runner() 
        print("Random undersample run complete")
        return self.tracker
    
    def clustercentroids(self):
        """This method undersamples the majority class by replacing a cluster of majority samples. 
        Clusters of majority class found with K-mean algorithms. Then it keeps the cluster centroids of the 
        N clusters as the new majority samples"""
        self.sampler = ClusterCentroids(sampling_strategy= self.sampling_strategy, random_state= 13)
        
        self.X_train, self.X_test, self.y_train , self.y_test = train_test_split(self.X, self.y, train_size=self.train_size, stratify=self.stratify, random_state=13)
        self.X_train_new, self.y_train_new = self.sampler.fit_resample(self.X_train, self.y_train)
        self.method = "Cluster Centroids"
        self.model_runner()
        
        print("Cluster centroids run complete")
        return self.tracker
    
    def tomeklinks(self):
        """Finds samples near the borderline of the two classess. Given two instances, a & b seperated by distance
        d(a,b) the pair is called a Tomek link if there is no instance c such that d(a,c) < d(a,b) or d(b,c) < d(a,b)
        Instances within Tomek links are considered noise or borderline and are thus removed"""
        
        self.sampler = TomekLinks()
        
        self.X_train, self.X_test, self.y_train , self.y_test = train_test_split(self.X, self.y, train_size=self.train_size, stratify=self.stratify, random_state=13)
        self.X_train_new, self.y_train_new = self.sampler.fit_resample(self.X_train, self.y_train)
        self.method = "Tomek Links"
        self.model_runner()
        
        print("TomekLinks run complete")
        return self.tracker
    
    def neighbourhoodclean(self):
        """ Edited Nearest Neighbor Rule (ENN) to remove any instance whose class label is different from the class
        of at least two of its three nearest neighbors. Neighbourhood cleaning rule uses ENN to remove majority samples
        Finds three nearest neighbors for each training set instance, if majority class and opposite to its neighbours 
        it is removed. If belongs to the target class than the neighbours are removed"""
        
        self.sampler = NeighbourhoodCleaningRule()
        
        self.X_train, self.X_test, self.y_train , self.y_test = train_test_split(self.X, self.y, train_size=self.train_size, stratify=self.stratify, random_state=13)
        self.X_train_new, self.y_train_new = self.sampler.fit_resample(self.X_train, self.y_train)
        self.method = "Neighbourhood Clean"
        self.model_runner()
        
        print("Neighbourhood Cleaning run complete")
        return self.tracker
    
    def nearmiss(self):
        """Calculates distances between all instance of majority and minority classes. K instances of the majority
        class with smallest distances to minority are selected and removed"""
        
        self.sampler = NearMiss(sampling_strategy= self.sampling_strategy)
        
        self.X_train, self.X_test, self.y_train , self.y_test = train_test_split(self.X, self.y, train_size=self.train_size, stratify=self.stratify, random_state=13)
        self.X_train_new, self.y_train_new = self.sampler.fit_resample(self.X_train, self.y_train)
        self.method = "Near Miss"
        self.model_runner()
       
        print("Near Miss run complete")
        return self.tracker
        
    
    def smote_upsample(self, k_neighbors = 5):
        self.k_neighbors = k_neighbors
        self.sampler = SMOTE(sampling_strategy= self.sampling_strategy, random_state= 13, k_neighbors= self.k_neighbors)
        self.X_train, self.X_test, self.y_train , self.y_test = train_test_split(self.X, self.y, train_size=self.train_size, stratify=self.stratify, random_state=13)

        cv = KFold(n_splits=5, shuffle = True)
        self.scores = np.array([])
        for train_fold_index, val_fold_index in cv.split(self.X_train, self.y_train):
         
             # Get the training data
            X_train_fold, y_train_fold = self.X_train[train_fold_index], self.y_train[train_fold_index]
        # Get the validation data
            X_val_fold, y_val_fold = self.X_train[val_fold_index], self.y_train[val_fold_index]

        # Upsample only the data in the training section
            X_train_fold_upsample, y_train_fold_upsample = self.sampler.fit_resample(X_train_fold,
                                                                           y_train_fold)
            model_obj = self.model.fit(X_train_fold_upsample, y_train_fold_upsample)
            score = recall_score(y_val_fold, model_obj.predict(X_val_fold))
            self.scores = np.append(self.scores, score)  
        
        self.X_train_new, self.y_train_new = self.sampler.fit_resample(self.X_train, self.y_train)
        self.model.fit(self.X_train_new, self.y_train_new)
        self.training_score = self.model.score(self.X_train_new, self.y_train_new)

        # predict and get classification report & confusion matrix info
        self.predictions = self.model.predict(self.X_test)
        self.class_report = classification_report(self.y_test, self.predictions, output_dict= True)
        self.tn, self.fp, self.fn, self.tp = confusion_matrix(self.y_test, self.predictions).ravel()
        self.method = "SMOTE"
        self.model_tracker()
        
        print("SMOTE run complete")
        return self.tracker
    
    def random_over(self):
        self.sampler = RandomOverSampler(sampling_strategy= self.sampling_strategy, random_state= 13)
        self.X_train, self.X_test, self.y_train , self.y_test = train_test_split(self.X, self.y, train_size=self.train_size, stratify=self.stratify, random_state=13)

        cv = KFold(n_splits=5, shuffle = True)
        self.scores = np.array([])
        for train_fold_index, val_fold_index in cv.split(self.X_train, self.y_train):
         
             # Get the training data
            X_train_fold, y_train_fold = self.X_train[train_fold_index], self.y_train[train_fold_index]
        # Get the validation data
            X_val_fold, y_val_fold = self.X_train[val_fold_index], self.y_train[val_fold_index]

        # Upsample only the data in the training section
            X_train_fold_upsample, y_train_fold_upsample = self.sampler.fit_resample(X_train_fold,
                                                                           y_train_fold)
            model_obj = self.model.fit(X_train_fold_upsample, y_train_fold_upsample)
            score = recall_score(y_val_fold, model_obj.predict(X_val_fold))
            self.scores = np.append(self.scores, score)  
        
        self.X_train_new, self.y_train_new = self.sampler.fit_resample(self.X_train, self.y_train)
        self.model.fit(self.X_train_new, self.y_train_new)
        self.training_score = self.model.score(self.X_train_new, self.y_train_new)
      

        # predict and get classification report & confusion matrix info
        self.predictions = self.model.predict(self.X_test)
        self.class_report = classification_report(self.y_test, self.predictions, output_dict= True)
        self.tn, self.fp, self.fn, self.tp = confusion_matrix(self.y_test, self.predictions).ravel()
        self.method = "Random oversample"
        self.model_tracker()
        
        print("Random oversample run complete")
        return self.tracker
        

        
    def adasyn(self, n_neighbors = 5):
        self.n_neighbors = n_neighbors
        self.sampler = ADASYN(sampling_strategy= self.sampling_strategy, random_state= 13, n_neighbors= self.n_neighbors)
        self.X_train, self.X_test, self.y_train , self.y_test = train_test_split(self.X, self.y, train_size=self.train_size, stratify=self.stratify, random_state=13)

        cv = KFold(n_splits=5, shuffle = True)
        self.scores = np.array([])
        for train_fold_index, val_fold_index in cv.split(self.X_train, self.y_train):
         
             # Get the training data
            X_train_fold, y_train_fold = self.X_train[train_fold_index], self.y_train[train_fold_index]
        # Get the validation data
            X_val_fold, y_val_fold = self.X_train[val_fold_index], self.y_train[val_fold_index]

        # Upsample only the data in the training section
            X_train_fold_upsample, y_train_fold_upsample = self.sampler.fit_resample(X_train_fold,
                                                                           y_train_fold)
            model_obj = self.model.fit(X_train_fold_upsample, y_train_fold_upsample)
            score = recall_score(y_val_fold, model_obj.predict(X_val_fold))
            self.scores = np.append(self.scores, score)  
        
        self.X_train_new, self.y_train_new = self.sampler.fit_resample(self.X_train, self.y_train)
        self.model.fit(self.X_train_new, self.y_train_new)
        self.training_score = self.model.score(self.X_train_new, self.y_train_new)

        # predict and get classification report & confusion matrix info
        self.predictions = self.model.predict(self.X_test)
        self.class_report = classification_report(self.y_test, self.predictions, output_dict= True)
        self.tn, self.fp, self.fn, self.tp = confusion_matrix(self.y_test, self.predictions).ravel()
        self.method = "ADASYN"
        self.model_tracker()
        
        print("ADASYN run complete")
        return self.tracker
        

NameError: name 'y' is not defined