<h2><center>Imbalanced Classification</center></h2>
<div style="font-family:verdana; word-spacing:1.5px;">
Imbalanced classification involves developing predictive models on classification datasets that have a severe class imbalance. The challenge of working with imbalanced datasets is that most machine learning techniques will ignore, and in turn have poor performance on, the minority class, although typically it is performance on the minority class that is most important.
</div>    

In [None]:
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from scipy.stats import kurtosis, skew
from scipy import stats

from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score
from sklearn.metrics import brier_score_loss, make_scorer

from sklearn.dummy import DummyClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

from sklearn.metrics import f1_score

from sklearn.svm import SVC

from sklearn.metrics import classification_report, confusion_matrix, roc_curve, precision_recall_curve

from sklearn.model_selection import GridSearchCV
%matplotlib inline

# Notebook-wide seaborn defaults: white background with grid lines, and the
# "paper" context for element scaling.
sns.set_style("whitegrid")
sns.set_context("paper")
# NOTE(review): the matplotlib call below is disabled; presumably 'whitegrid' is
# not accepted as a matplotlib style-sheet name — confirm before re-enabling.
#plt.style.use('whitegrid')

In [None]:
class color:
    """ANSI escape sequences used to style text written with ``print``.

    Prefix a string with one of the codes and terminate it with ``END``,
    e.g. ``color.BOLD + 'text' + color.END``.
    """

    # --- text attributes ---
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # --- foreground colours ---
    DARKCYAN = '\033[36m'
    RED = '\033[91m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    BLUE = '\033[94m'
    PURPLE = '\033[95m'
    CYAN = '\033[96m'

    # --- reset: switches every attribute above back off ---
    END = '\033[0m'

def DataDesc(df):
    """Print the shape of ``df`` and return a one-row-per-column summary.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe.

    Returns
    -------
    pandas.DataFrame
        Columns: ``Name``, ``dtypes``, ``Missing`` (null count), ``Uniques``
        (distinct non-null values), ``First Value`` / ``Second Value`` /
        ``Third Value`` (the first three rows by position) and ``Entropy``
        (base-2 Shannon entropy of the column's value frequencies, rounded to
        two decimals).
    """
    print(f"Dataset Shape: {df.shape}")

    summary = pd.DataFrame({'Name': df.columns, 'dtypes': df.dtypes.values})
    summary['Missing'] = df.isnull().sum().values
    summary['Uniques'] = df.nunique().values

    # Positional access (iloc) keeps this working when the index is not the
    # default 0..n-1 range; frames shorter than three rows are padded with NaN
    # instead of raising.
    for position, label in enumerate(('First Value', 'Second Value', 'Third Value')):
        if position < len(df):
            summary[label] = df.iloc[position].values
        else:
            summary[label] = np.nan

    summary['Entropy'] = [
        round(stats.entropy(df[name].value_counts(normalize=True), base=2), 2)
        for name in summary['Name']
    ]

    return summary

def CalOutliers(df_num):
    '''
    Print 3-sigma outlier statistics for one numerical feature.

    Original implementation: Leonardo Ferreira 20/10/2018.

    A value is an outlier when it lies more than three standard deviations
    below or above the mean. Values exactly on a cut-off count as
    non-outliers, so the outlier and non-outlier counts add up to the number
    of values compared.

    Parameters
    ----------
    df_num : iterable of numbers (e.g. a pandas Series or a list)

    Returns
    -------
    None
        The statistics are printed, not returned.
    '''
    # mean and (population) standard deviation of the feature
    data_mean, data_std = np.mean(df_num), np.std(df_num)

    # distance from the mean beyond which a value is an outlier
    # You can change this value
    cut = data_std * 3
    lower, upper = data_mean - cut, data_mean + cut

    outliers_lower = [x for x in df_num if x < lower]
    outliers_higher = [x for x in df_num if x > upper]
    n_outliers = len(outliers_lower) + len(outliers_higher)

    # inclusive bounds: a value sitting exactly on a cut-off is not an outlier
    outliers_removed = [x for x in df_num if lower <= x <= upper]

    # Share of outliers among ALL compared values (not among the non-outliers),
    # guarded so an empty input reports 0 instead of dividing by zero.
    n_compared = n_outliers + len(outliers_removed)
    pct_outliers = round((n_outliers / n_compared) * 100, 4) if n_compared else 0.0

    print(color.BOLD+f'Lower outliers: {len(outliers_lower)}'+ color.END)
    print(color.BOLD+f'Upper outliers: {len(outliers_higher)}'+ color.END)
    print(color.BOLD+f'Total outliers: {n_outliers}'+ color.END)
    print(color.BOLD+f'Non - outliers: {len(outliers_removed)}'+ color.END)
    print(color.BOLD+f'% of Outliers : {pct_outliers}'+ color.END )

<h3><center>1. Reading Data </center></h3>

In [None]:
# Location of the Haberman survival CSV; it is read with header=None, so the
# column names are supplied here.
FILE_PATH = '../input/habermans-survival-data-set/haberman.csv'
columns = ['age', 'year', 'nodes', 'class']

haberman_df = pd.read_csv(FILE_PATH, header=None, names=columns)

# Recode the target: original label 1 -> 0 (negative class), 2 -> 1 (positive class).
# The result is assigned back to the column: calling replace(..., inplace=True)
# on the selection `haberman_df['class']` may act on a temporary object and
# leave the frame unchanged under pandas copy-on-write.
haberman_df['class'] = haberman_df['class'].replace({1: 0, 2: 1})
haberman_df.sample(5)

<h3><center>2. Exploring Data </center></h3>

In [None]:
# Per-column overview of the loaded frame (dtype, missing count, unique count,
# first three rows, entropy); the returned table is rendered as the cell output.
DataDesc(haberman_df)

<h3>2.1. Age Distribution & Outliers</h3>

In [None]:
# Summary statistics of the age column
display(haberman_df['age'].describe())

# One bar per distinct age
plt.figure(figsize=(15,5))
plt.suptitle('Age Distribution', fontsize=30)
_ = sns.countplot(data=haberman_df, x='age', color='#963559')
_ = plt.ylabel('Count', fontsize=20)
_ = plt.xlabel('Age', fontsize=20)


print("\n")
# CalOutliers prints its report and returns None, so it is called directly;
# wrapping it in display() only rendered a stray "None".
CalOutliers(haberman_df['age'])

<div style="font-family:verdana; word-spacing:1.5px;">
<h4 style="font-family:verdana; word-spacing:1.5px;font-size:16px">Analysis of feature "Age" :</h4>
<ul><li>Entropy/Uncertainty of column age is the highest(5.34), having 49 unique values out of 306 total samples.
    <li>Mean age value is 52 and the distribution is similar to a normal Distribution.
        <li> As of now we have not detected any outliers using naive calculation.
    </ul>
</div>

<h3>2.2 Distribution of Age Based on Cancer</h3>

In [None]:
# Side-by-side age distributions, one panel per class label.
fig, (ax_negative, ax_positive) = plt.subplots(1, 2, figsize=(15, 5))

# (class label, panel title, colour, target axes)
panels = [
    (0, 'Non-Cancer', '#f5dd90', ax_negative),
    (1, 'Cancer', '#0d3b66', ax_positive),
]

for label, title, shade, axis in panels:
    ages = haberman_df.loc[haberman_df['class'] == label, 'age']
    sns.distplot(ages, bins=24, color=shade, ax=axis)
    axis.set_title(title, fontsize=20)
    axis.set_xlabel("Age", fontsize=15)
    axis.set_ylabel("")

<h3>2.3 Nodes Distribution</h3>

In [None]:
# Summary statistics of the nodes column
display(haberman_df['nodes'].describe())

# One bar per distinct node count
plt.figure(figsize=(15,5))
plt.suptitle('Nodes Distribution', fontsize=20)
_ = sns.countplot(data=haberman_df, x='nodes', color='#963559')
_ = plt.ylabel('Count', fontsize=15)
_ = plt.xlabel('Nodes', fontsize=15)


print("\n")
# CalOutliers prints its report and returns None, so it is called directly;
# wrapping it in display() only rendered a stray "None".
CalOutliers(haberman_df['nodes'])

<h3>2.4 Distribution of nodes Based on Cancer</h3>

In [None]:
# Swarm plot of node counts, one column of points per class label.
fig, ax = plt.subplots(figsize=(10, 5))
sns.swarmplot(data=haberman_df, x='class', y='nodes', palette=['#9fb8ad', '#475841'], ax=ax)

ax.set_ylabel('Nodes', fontsize=15)
ax.set_xlabel('Class', fontsize=15)
ax.set_xticks([0, 1])
ax.set_xticklabels(['Non-Cancer', 'Cancer'], fontsize=15)
ax.set_title('Nodes vs Cancer', fontsize=20);

A cancer patient cannot be directly identified by number of nodes

<h3>2.5 Cancer vs Non-Cancer</h3>

In [None]:
# Bar chart of the class balance. The column is passed by keyword
# (data=..., x=...) like the other count plots in this notebook; passing the
# Series positionally is interpreted as `data` by recent seaborn releases.
_ = plt.figure(figsize=(8,6))
_ = sns.countplot(data=haberman_df, x='class', palette=['#9fb8ad','#475841'])
_ = plt.xticks([0,1],['Non-Cancer','Cancer'], fontsize=15)

# Absolute and relative frequency of each class label
target = haberman_df['class'].values
counter = Counter(target)
for k,v in counter.items():
    per = v / len(target) * 100
    print('Class=%d, Count=%d, Percentage=%.3f%%' % (k, v, per))

<h3><center>3. Model Test and Baseline Result</center></h3>

<div style="font-family:verdana; word-spacing:1.5px;font-size:16px">
Because we are interested in predicting a probability of survival, we need a performance metric that evaluates the skill of a model based on the predicted probabilities. In this case, we will use the Brier score, which calculates the mean squared error between the predicted probabilities and the expected probabilities.<br><br><br>
    We need a Brier score for a reference prediction. A reference prediction for a problem in which we are predicting probabilities is the probability of the positive class label in the dataset. In this case, the positive class label represents non-survival and occurs in about 26% of the dataset. Therefore, predicting about 0.26471 represents the worst-case or baseline performance for a predictive model on this dataset. Any model that has a Brier score better (lower) than this baseline has some skill, whereas any model that has a Brier score worse (higher) than this baseline has no skill.<br><br>
    We will evaluate the baseline strategy of predicting the distribution of positive examples in the training set as the probability of each case in the test set.<br> This can be implemented automatically using the DummyClassifier class and setting the strategy to ‘prior’ that will predict the prior probability of each class in the training dataset, which for the positive class we know is about 0.26471.
    </div>

In [None]:
def brier_skill_score(y_true, y_prob, ref_prob=0.26471):
    """Brier skill score of ``y_prob`` relative to a constant reference forecast.

    Computed as ``1 - BS_model / BS_ref``, where ``BS_ref`` is the Brier score
    of predicting ``ref_prob`` for every sample. A score of 0 means no better
    than the reference, a positive score means better, and 1.0 is a perfect
    forecast.

    Parameters
    ----------
    y_true : array-like
        True binary labels.
    y_prob : array-like
        Predicted probabilities of the positive class.
    ref_prob : float, default 0.26471
        Constant probability used as the reference forecast. The default is
        the positive-class rate quoted for this dataset; pass another value to
        reuse the metric on different data.

    Returns
    -------
    float
    """
    ref_probs = np.full(len(y_true), ref_prob)

    bs_ref = brier_score_loss(y_true, ref_probs)
    bs_model = brier_score_loss(y_true, y_prob)

    return 1.0 - (bs_model / bs_ref)

def evaluate_model(X, y, model):
    """Cross-validate ``model`` and return its Brier skill score for every fold.

    Uses stratified 10-fold cross-validation repeated 3 times with a fixed
    seed, so 30 scores are returned.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    y : array-like
        Binary target.
    model : estimator
        Classifier (or pipeline) exposing ``predict_proba``.

    Returns
    -------
    numpy.ndarray
        One Brier skill score per fold.
    """
    import inspect  # local import: only needed for the version probe below

    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

    # The scorer must receive predicted probabilities, not class labels. Newer
    # scikit-learn spells this `response_method='predict_proba'` and no longer
    # accepts `needs_proba`; older releases only know `needs_proba`. Probe the
    # signature so the notebook scores correctly on both.
    if 'response_method' in inspect.signature(make_scorer).parameters:
        metric = make_scorer(brier_skill_score, response_method='predict_proba')
    else:
        metric = make_scorer(brier_skill_score, needs_proba=True)

    scores = cross_val_score(model, X, y, cv=cv, scoring=metric, n_jobs=-1)

    return scores

In [None]:
# Split the frame's underlying array: every column except the last is a
# feature, the last column is the target.
data_matrix = haberman_df.values
X = data_matrix[:, :-1]
y = data_matrix[:, -1]

# Reference model: always predicts the class priors seen during training.
model = DummyClassifier(strategy='prior')
scores = evaluate_model(X, y, model)

print(f'Mean BSS {np.mean(scores)}')

<h3><center>4. Evaluate Probabilistic Models</center></h3>
<div style="font-family:verdana; word-spacing:1.5px;font-size:16px">
    <ul>
        <li>Logistic Regression (LR)
        <li>Linear Discriminant Analysis (LDA)
        <li>Quadratic Discriminant Analysis (QDA)
        <li>Gaussian Naive Bayes (GNB)
        <li>Multinomial Naive Bayes (MNB)
        <li>Gaussian Process (GPC)
     </ul>
</div>

<h3>4.1. Probabilistic Algorithm Evaluation</h3>

In [None]:
def get_models():
    """Build the probabilistic classifiers to compare.

    Returns
    -------
    (list, list)
        ``models`` — freshly constructed, unfitted estimators — and ``names``,
        the matching short labels, in the same order.
    """
    # (short label, estimator) pairs; the order fixes the order of both lists.
    catalogue = [
        ('LR', LogisticRegression(solver='lbfgs')),
        ('LDA', LinearDiscriminantAnalysis()),
        ('QDA', QuadraticDiscriminantAnalysis()),
        ('GNB', GaussianNB()),
        ('MNB', MultinomialNB()),
        ('GPC', GaussianProcessClassifier()),
    ]

    models = [estimator for _, estimator in catalogue]
    names = [label for label, _ in catalogue]
    return models, names

In [None]:
# Score every candidate on the raw (unscaled) features.
models, names = get_models()
results = []

for name, candidate in zip(names, models):
    scores = evaluate_model(X, y, candidate)
    results.append(scores)

    print('Model : %s, Mean : %.3f, STD : %.3f' % (name, np.mean(scores), np.std(scores)))

In [None]:
# Box-and-whisker plot of the per-fold scores, one box per model.
fig, ax = plt.subplots(figsize=(15, 7))
ax.boxplot(results, labels=names, showmeans=True)
plt.show()

<blockquote>
Interestingly, most if not all algorithms show a spread indicating that they may be unskillful on some of the runs. The distribution between the two top-performing models appears roughly equivalent, so choosing a model based on mean performance might be a good start.</blockquote>

<h3>4.2. Model Evaluation With Scaled Inputs </h3>

In [None]:
models, names = get_models()

# MultinomialNB rejects negative feature values, and StandardScaler centres
# every feature on zero, so that model cannot be fitted inside this pipeline
# (its folds would all fail). Drop it from BOTH lists so the labels used by
# the box plot stay aligned with the results.
kept = [(m, n) for m, n in zip(models, names) if not isinstance(m, MultinomialNB)]
models = [m for m, _ in kept]
names = [n for _, n in kept]

results = list()

for i in range(len(models)):
    # standardise the inputs, then fit the classifier
    steps = [('t',StandardScaler()),('m',models[i])]
    pipeline = Pipeline(steps=steps)
    scores = evaluate_model(X, y, pipeline)
    results.append(scores)

    print('Model : %s, Mean : %.3f, STD : %.3f' % (names[i], np.mean(scores), np.std(scores)))

In [None]:
# Box-and-whisker plot of the per-fold scores, one box per model.
fig, ax = plt.subplots(figsize=(15, 7))
ax.boxplot(results, labels=names, showmeans=True)
plt.show()

<h3>4.3. Model Evaluation With Power Transform</h3>
<div style="font-family:verdana; word-spacing:1.5px;font-size:16px">
Power transforms, such as the Box-Cox and Yeo-Johnson transforms, are designed to change the distribution to be more Gaussian. This will help with the age input variable in our dataset and may help with the nodes variable and un-bunch the distribution slightly.<br><br>
The power transform may make use of a log() function, which does not work on zero values. We have zero values in our dataset, therefore we will scale the dataset prior to the power transform using a MinMaxScaler.</div>

In [None]:
models, names = get_models()

# MultinomialNB rejects negative feature values, and PowerTransformer
# standardises its output by default (zero mean), so that model cannot be
# fitted inside this pipeline. Drop it from BOTH lists so the labels used by
# the box plot stay aligned with the results.
kept = [(m, n) for m, n in zip(models, names) if not isinstance(m, MultinomialNB)]
models = [m for m, _ in kept]
names = [n for _, n in kept]

results = list()

for i in range(len(models)):
    # rescale to [0, 1], apply the power transform, then fit the classifier
    steps = [('scale',MinMaxScaler()),('powert',PowerTransformer()),('model',models[i])]
    pipeline = Pipeline(steps=steps)
    scores = evaluate_model(X, y, pipeline)
    results.append(scores)

    print('Model : %s, Mean : %.3f, STD : %.3f' % (names[i], np.mean(scores), np.std(scores)))


In [None]:
# Box-and-whisker plot of the per-fold scores, one box per model.
fig, ax = plt.subplots(figsize=(15, 7))
ax.boxplot(results, labels=names, showmeans=True)
plt.show()

<blockquote>
    Box and whisker plots are created for the results from each algorithm, suggesting perhaps a smaller and more focused spread for LR compared to the LDA, which was the second-best performing method. All methods still show skill on average, however the distribution of scores show runs that drop below 0.0 (no skill) in some cases.
    </blockquote>

<h3><center>5. Selecting Logistic regression & Fitting</center></h3>

In [None]:
# Hold out 10% of the rows, stratified on the target, for the final check.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, stratify=y, random_state=1
)

# Pipeline: min-max scaling -> power transform -> logistic regression
model = LogisticRegression(solver='lbfgs')
steps = [('scaler', MinMaxScaler()), ('powert', PowerTransformer()), ('model', model)]
pipeline = Pipeline(steps=steps)

# Candidate values for the logistic-regression C parameter
c_values = [100, 10, 1.0, 0.1, 0.01]
grid = {'model__C': c_values}

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# Candidates are ranked by accuracy; a candidate whose fit fails scores 0.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=grid,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    error_score=0,
)
grid_result = grid_search.fit(X_train, y_train)

In [None]:
# Best cross-validated score and the parameters that produced it
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']

# These per-candidate results were extracted but never shown; report the mean
# and standard deviation of the score for every parameter setting tried.
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

<h3>Prediction</h3>

In [None]:
# Pipeline refitted by the grid search with the best parameters
best_model = grid_result.best_estimator_
y_pred = best_model.predict(X_test)

# Confusion matrix followed by the per-class precision / recall / F1 report
for report in (confusion_matrix(y_test, y_pred), classification_report(y_test, y_pred)):
    print(report)

<blockquote> Predictions are very poor, especially for the positive class, for which all of the predictions are wrong.</blockquote>

<h3>Using Heuristic (balanced) class weights in Logistic Regression</h3>

In [None]:
# Same hold-out split as for the unweighted model (same seed and stratification).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, stratify=y, random_state=1)

# class_weight='balanced' makes the logistic regression weight the classes
# automatically instead of treating every sample equally.
model = LogisticRegression(solver='lbfgs', class_weight='balanced')

# Pipeline: min-max scaling -> power transform -> logistic regression
steps = [('scaler',MinMaxScaler()), ('powert',PowerTransformer()), ('model',model)]
pipeline = Pipeline(steps=steps)

# Candidate values for the logistic-regression C parameter
c_values = [100, 10, 1.0, 0.1, 0.01]
grid = dict(model__C=c_values)

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)


# Candidates are ranked by accuracy; a candidate whose fit fails scores 0.
grid_search = GridSearchCV(estimator=pipeline, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(X_train, y_train)

print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']

# These per-candidate results were extracted but never shown; report the mean
# and standard deviation of the score for every parameter setting tried.
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

In [None]:
# Pipeline refitted by the grid search with the best parameters
best_model = grid_result.best_estimator_
y_pred = best_model.predict(X_test)

# Confusion matrix followed by the per-class precision / recall / F1 report
for report in (confusion_matrix(y_test, y_pred), classification_report(y_test, y_pred)):
    print(report)

<h3>Predicting Probabilities</h3>

In [None]:
# Hand-picked rows in feature order [age, year, nodes], grouped by the header
# printed above them.
case_groups = [
    ('Survival Cases:', [[31,59,2], [31,65,4], [34,60,1]]),
    ('Non-Survival Cases:', [[44,64,6], [34,66,9], [38,69,21]]),
]

for header, data in case_groups:
    print(header)
    for row in data:
        yhat = best_model.predict_proba([row])
        # column 0 holds the probability of class 0, reported here as survival
        p_survive = yhat[0, 0] * 100
        print('>data=%s, Survival=%.3f%%' % (row, p_survive))