#### Test on Classification with Imbalanced Dataset

The purpose of this exercise is to test the effect on classification model performance when:

- balanced test set is used within each fold of CV
- imbalanced test set is used within each fold of CV


[Reference article](https://medium.com/lumiata/cross-validation-for-imbalanced-datasets-9d203ba47e8)

[Data source](https://github.com/lumiata/tech_blog/blob/master/Cross_Validation_Imbalanced_Datasets/data/data_updated.csv)

In [1]:
# pandas is needed from the very first cell; import it here so the notebook
# survives Restart Kernel -> Run All (no other visible cell imports it).
import pandas as pd

# Load the dataset, using the `id` column as the row index.
data = pd.read_csv('data_updated.txt', index_col='id')
data.head()

Unnamed: 0_level_0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,5,1,1,1,2,1,3,1,1,0
1,5,4,4,5,7,10,3,2,1,0
2,3,1,1,1,2,2,3,1,1,0
3,6,8,8,1,3,4,3,7,1,0
4,4,1,1,3,2,1,3,1,1,0


In [2]:
data.Class.value_counts()

0    444
1     12
Name: Class, dtype: int64

In [3]:
# import packages
from imblearn.over_sampling import SMOTE
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, StratifiedKFold
from sklearn.model_selection import cross_validate

In [4]:
# define metrics function for convenience

def metrics(true, pred):
    """Score a set of predictions against the ground-truth labels.

    Parameters
    ----------
    true : ground-truth labels.
    pred : predicted labels, aligned with ``true``.

    Returns
    -------
    pd.Series
        Accuracy, recall, precision and F1, keyed by
        ``accuracy``, ``recall``, ``precision`` and ``f1_score``.
    """
    # (label, scorer) pairs, listed in the order they appear in the result.
    scorers = (
        ('accuracy', accuracy_score),
        ('recall', recall_score),
        ('precision', precision_score),
        ('f1_score', f1_score),
    )
    scores = {label: scorer(true, pred) for label, scorer in scorers}
    return pd.Series(scores)

def cv_metrics(model, train, test, n=5):
    """Cross-validate ``model`` and report per-fold accuracy, recall, precision and F1.

    All four metrics are collected in a single ``cross_validate`` run, so each
    fold is fitted once and every metric in a row comes from the same fitted
    model (instead of refitting the model separately for every metric).

    Parameters
    ----------
    model : estimator handed to ``cross_validate``.
    train : feature matrix, passed as ``X``. Despite the name, this is the
        complete set being cross-validated, not one side of a split.
    test : target vector, passed as ``y``. Despite the name, these are the
        labels belonging to ``train``.
    n : value forwarded as ``cv`` (default 5).

    Returns
    -------
    pd.DataFrame
        One row per fold, with columns ``cv_accuracy``, ``cv_recall``,
        ``cv_precision`` and ``cv_f1_score``.
    """
    scores = cross_validate(
        model,
        train,
        test,
        scoring=['accuracy', 'recall', 'precision', 'f1'],
        cv=n,
        # Only validation-fold scores are reported; skip scoring the training folds.
        return_train_score=False,
    )
    # cross_validate prefixes each scorer name with "test_".
    return pd.DataFrame({
        'cv_accuracy': scores['test_accuracy'],
        'cv_recall': scores['test_recall'],
        'cv_precision': scores['test_precision'],
        'cv_f1_score': scores['test_f1'],
    })

### Case 1 - balance dataset first, then cross validation

In [5]:
# hold out 30% of the rows as a test set, stratified on the class label

y = data.Class
X = data.drop(columns='Class')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=63445,
)

In [6]:
# oversample with SMOTE

# Case 1 balances the training data BEFORE cross-validation.
# NOTE: this rebinds X_train / y_train, so re-running this cell without first
# re-running the split cell would oversample already-resampled data.
sm = SMOTE(random_state=63445)
# `fit_resample` replaces `fit_sample`, which current imbalanced-learn
# releases no longer provide.
X_train, y_train = sm.fit_resample(X_train, y_train)

In [8]:
## mean cross-validated scores on the (oversampled) training set
rf = RandomForestClassifier(
    n_estimators=5,
    random_state=63445,
)
cv_metrics(rf, X_train, y_train).mean()

cv_accuracy     0.993548
cv_recall       1.000000
cv_precision    0.987595
cv_f1_score     0.993676
dtype: float64

In [9]:
# refit on the oversampled training set, then score on the held-out test split
rf.fit(X_train, y_train)
pred = rf.predict(X_test)
metrics(y_test, pred)

accuracy     0.978102
recall       0.500000
precision    0.666667
f1_score     0.571429
dtype: float64

### Case 2 - balance data set during cross validation

In [10]:
# rebuild the original (not oversampled) train / test split from the raw data

X = data.drop(columns='Class')
y = data.Class
split = train_test_split(X, y, test_size=0.3, stratify=y, random_state=63445)
X_train, X_test, y_train, y_test = split

In [11]:
# Folds are taken in order (no shuffling), so no random_state is passed:
# it has no effect without shuffle=True, and scikit-learn >= 0.24 raises a
# ValueError when it is set while shuffle is False.
kf = StratifiedKFold(n_splits=5)

# One metrics Series per validation fold.
cv_val = []

for train_idx, vali_idx in kf.split(X_train, y_train):
    train, vali = X_train.iloc[train_idx], X_train.iloc[vali_idx]
    tgt_train, tgt_vali = y_train.iloc[train_idx], y_train.iloc[vali_idx]

    # Oversample only the training part of the fold; the validation part is
    # left as-is, so it keeps its original class balance.
    sm = SMOTE(random_state=63445)
    # `fit_resample` replaces `fit_sample`, which current imbalanced-learn
    # releases no longer provide.
    X_train_sm, y_train_sm = sm.fit_resample(train, tgt_train)

    rf = RandomForestClassifier(n_estimators=5, random_state=63445)

    pred = rf.fit(X_train_sm, y_train_sm).predict(vali)

    cv_val.append(metrics(tgt_vali, pred))

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [12]:
# put the per-fold score Series side by side (one column per fold),
# then average each metric across the folds
pd.concat(cv_val, axis='columns').mean(axis='columns')

accuracy     0.977927
recall       0.500000
precision    0.450000
f1_score     0.413333
dtype: float64