## In this practice session, we will go hands-on with imbalanced classification problems. We will leverage the imbalanced-learn (`imblearn`) package to deal with class imbalance in the dataset.

## Import the required libraries

In [None]:
!python -m pip install pip --upgrade --user -q
!python -m pip install numpy pandas seaborn matplotlib scipy statsmodels sklearn imblearn --user -q

In [None]:
import IPython

# Grab the running IPython application and ask its kernel to shut down.
# The True argument is passed as the restart flag (presumably so the packages
# installed above are picked up by a fresh kernel).
app = IPython.Application.instance()
app.kernel.do_shutdown(True)

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt

In [None]:
# Load the dataset from the CSV file in the working directory.
df = pd.read_csv('diabetes.csv')

# Preview the first five rows.
df.head(n=5)

In [None]:
# Number of rows for each value of the `Outcome` column.
df["Outcome"].value_counts()

In [None]:
# Rows per class, most frequent first. `Series.value_counts` is used instead
# of the top-level `pd.value_counts`, which pandas has deprecated.
count_classes = df['Outcome'].value_counts(sort=True)

# Bar chart of the class frequencies; rot=0 keeps the tick labels horizontal.
count_classes.plot(kind='bar', rot=0)

plt.title("Class Distribution")
plt.xlabel("Class")
plt.ylabel("Frequency")

In [None]:
# Features: every column except the `Outcome` label.
X = df.drop(columns='Outcome')

# Target: the `Outcome` column on its own.
Y = df['Outcome']

In [None]:
# Dimensions of the feature matrix and of the target vector.
(X.shape, Y.shape)

In [None]:
## Split the rows into the diabetic (Outcome == 1) and normal (Outcome == 0) subsets

diabetic = df.loc[df['Outcome'] == 1]

normal = df.loc[df['Outcome'] == 0]

In [None]:
# Show the (rows, columns) shape of each subset on one line.
print(diabetic.shape, normal.shape, sep=' ')

## NearMiss

In [None]:
from imblearn.under_sampling import NearMiss

# NearMiss under-sampler from imblearn, left at its default settings.
nm = NearMiss()

# Resample the full feature matrix and target together.
X_res, y_res = nm.fit_resample(X, Y)

In [None]:
# Dimensions of the resampled features and labels.
(X_res.shape, y_res.shape)

In [None]:
from collections import Counter

# Compare the per-class counts before and after resampling.
print(f'Original dataset shape {Counter(Y)}')
print(f'Resampled dataset shape {Counter(y_res)}')

## Random Undersampling

In [None]:
from imblearn.under_sampling import RandomUnderSampler

# Random under-sampler from imblearn. It is seeded with the same value the
# notebook uses for train_test_split so that re-running the cell gives the
# same resampled rows. Named `rus` rather than reusing `nm`, which is the
# NearMiss sampler's name in the earlier cell.
rus = RandomUnderSampler(random_state=42)

# Resample the full feature matrix and target together.
X_res, y_res = rus.fit_resample(X, Y)

# Dimensions of the resampled features and labels.
X_res.shape, y_res.shape

In [None]:
# Compare the per-class counts before and after resampling.
print(f'Original dataset shape {Counter(Y)}')
print(f'Resampled dataset shape {Counter(y_res)}')

## SMOTE-Tomek

In [None]:
from imblearn.combine import SMOTETomek

# SMOTETomek resampler from imblearn's `combine` module. It is seeded with
# the same value the notebook uses for train_test_split so that re-running
# the cell gives the same resampled data.
smk = SMOTETomek(random_state=42)

# Resample the full feature matrix and target together.
X_res, y_res = smk.fit_resample(X, Y)

In [None]:
# Dimensions of the resampled features and labels.
(X_res.shape, y_res.shape)

In [None]:
# Compare the per-class counts before and after resampling.
print(f'Original dataset shape {Counter(Y)}')
print(f'Resampled dataset shape {Counter(y_res)}')

## SMOTE-ENN

In [None]:
from imblearn.combine import SMOTEENN

# SMOTEENN resampler from imblearn's `combine` module. It is seeded with the
# same value the notebook uses for train_test_split so that re-running the
# cell gives the same resampled data.
smk = SMOTEENN(random_state=42)

# Resample the full feature matrix and target together.
X_res, y_res = smk.fit_resample(X, Y)

# Dimensions of the resampled features and labels.
X_res.shape, y_res.shape

In [None]:
# Compare the per-class counts before and after resampling.
print(f'Original dataset shape {Counter(Y)}')
print(f'Resampled dataset shape {Counter(y_res)}')

## Random OverSampling

In [None]:
from imblearn.over_sampling import RandomOverSampler

# Random over-sampler from imblearn. It is named `ros` so it does not shadow
# the standard-library `os` module name, and seeded with the same value the
# notebook uses for train_test_split so that re-runs are reproducible.
ros = RandomOverSampler(random_state=42)

# NOTE: despite the `_train_` in the names, this resamples the full X, Y.
X_train_res, y_train_res = ros.fit_resample(X, Y)

# Dimensions of the resampled features and labels.
X_train_res.shape, y_train_res.shape

In [None]:
# Compare the per-class counts before and after over-sampling.
print(f'Original dataset shape {Counter(Y)}')
print(f'Resampled dataset shape {Counter(y_train_res)}')

## SMOTE

In [None]:
from imblearn.over_sampling import SMOTE

# SMOTE over-sampler from imblearn. It is named `smote` so it does not shadow
# the standard-library `os` module name, and seeded with the same value the
# notebook uses for train_test_split so that re-runs are reproducible.
smote = SMOTE(random_state=42)

# NOTE: despite the `_train_` in the names, this resamples the full X, Y.
X_train_res, y_train_res = smote.fit_resample(X, Y)

# Dimensions of the resampled features and labels.
X_train_res.shape, y_train_res.shape

In [None]:
# Compare the per-class counts before and after over-sampling.
print(f'Original dataset shape {Counter(Y)}')
print(f'Resampled dataset shape {Counter(y_train_res)}')

## Easy Ensemble

In [None]:
# NOTE: the Easy Ensemble example is commented out, so this cell does nothing
# when run. Uncomment the lines below to fit the classifier on the full X, Y.
# from imblearn.ensemble import EasyEnsembleClassifier

# easy = EasyEnsembleClassifier()
# easy.fit(X,Y)

In [None]:
# NOTE: disabled as well. `easy` only exists if the commented-out
# EasyEnsembleClassifier fit in the preceding cell is re-enabled first.
# easy.score(X,Y)

## Balanced RandomForest

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows as a test set; the fixed seed keeps the split
# identical across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=42
)

In [None]:
from imblearn.ensemble import BalancedRandomForestClassifier

# Balanced random forest from imblearn. It is seeded with the same value as
# the train/test split above so that the fitted model, and the scores in the
# cells that follow, are reproducible between runs.
brf = BalancedRandomForestClassifier(random_state=42)

# Fit on the training split only; the test split is kept for evaluation.
brf.fit(X_train, Y_train)

In [None]:
# Score of the fitted forest on the data it was trained on.
brf.score(X_train, Y_train)

In [None]:
# Score of the fitted forest on the held-out test split.
brf.score(X_test, Y_test)