## Handle unbalanced dataset

In [None]:
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler, TomekLinks, EditedNearestNeighbours
from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.pipeline import Pipeline
from sklearn.metrics import classification_report



In [None]:
def fit_and_evaluate(model, X_train, y_train, X_test, y_test, target_names=None):
    """Fit ``model`` on the training split and print its test-set report.

    Prints the fitted estimator, then the ``classification_report`` computed
    from the model's predictions on ``X_test``.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
    X_train, y_train : features and labels passed to ``model.fit``.
    X_test, y_test : held-out features and labels used for the report.
    target_names : class display names passed to ``classification_report``.
        Defaults to ``['No', 'Yes']``.

    Returns
    -------
    None
    """
    if target_names is None:
        target_names = ['No', 'Yes']
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(model)
    print(classification_report(y_test, y_pred, target_names=target_names))
def random_oversample(X_train, y_train, ratio=1, random_state=0):
    """Randomly oversample the training data with ``RandomOverSampler``.

    Parameters
    ----------
    X_train, y_train : features and labels to resample.
    ratio : value forwarded as ``sampling_strategy``.
    random_state : seed forwarded to the sampler (default 0).

    Returns
    -------
    The ``(X_resampled, y_resampled)`` pair returned by the sampler.
    """
    ros = RandomOverSampler(sampling_strategy=ratio, random_state=random_state)
    # fit_sample was removed from imbalanced-learn; fit_resample replaces it.
    return ros.fit_resample(X_train, y_train)
def SMOTE_oversample(X_train, y_train, ratio=1, random_state=0):
    """Oversample the training data with ``SMOTE``.

    Parameters
    ----------
    X_train, y_train : features and labels to resample.
    ratio : value forwarded as ``sampling_strategy``.
    random_state : seed forwarded to SMOTE (default 0) so repeated runs
        produce the same resampled set, matching the other sampling helpers.

    Returns
    -------
    The ``(X_resampled, y_resampled)`` pair returned by the sampler.
    """
    smote = SMOTE(sampling_strategy=ratio, random_state=random_state)
    # fit_sample was removed from imbalanced-learn; fit_resample replaces it.
    return smote.fit_resample(X_train, y_train)
def random_undersample(X_train, y_train, ratio=1, random_state=0):
    """Randomly undersample the training data with ``RandomUnderSampler``.

    Parameters
    ----------
    X_train, y_train : features and labels to resample.
    ratio : value forwarded as ``sampling_strategy``.
    random_state : seed forwarded to the sampler (default 0).

    Returns
    -------
    The ``(X_resampled, y_resampled)`` pair returned by the sampler.
    """
    rus = RandomUnderSampler(sampling_strategy=ratio, random_state=random_state)
    # fit_sample was removed from imbalanced-learn; fit_resample replaces it.
    return rus.fit_resample(X_train, y_train)



### Oversampling

#### Naive Random Oversampling

##### 1:1 ratio

In [None]:
# 1:1 ratio: the sampler is created without an explicit sampling_strategy.
ros = RandomOverSampler(random_state=0)
# fit_sample was removed from imbalanced-learn; fit_resample replaces it.
X_train_ros_11, y_train_ros_11 = ros.fit_resample(X_train_transformed, y_train)



In [None]:
# Class counts of the 1:1 oversampled labels (the cell above defines
# y_train_ros_11; a bare y_train_ros is never assigned in this section).
pd.Series(y_train_ros_11).value_counts()

1    88249
0    88249
dtype: int64

In [None]:
from sklearn.linear_model import LogisticRegression 
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

In [None]:
# Baseline classifiers reused by the cells below; every model is seeded with 0.
linear_model_params = dict(random_state=0, max_iter=10000)
log_clf = LogisticRegression(**linear_model_params)
linear_svm = LinearSVC(**linear_model_params)
tree = DecisionTreeClassifier(random_state=0)

In [None]:
# Logistic regression on the 1:1 oversampled set (X_train_ros_11 is the
# name produced by the resampling cell; X_train_ros is never assigned here).
_ = log_clf.fit(X_train_ros_11, y_train_ros_11)
y_pred = log_clf.predict(X_test_transformed)
target_names = ['No', 'Yes']
print(classification_report(y_test, y_pred, target_names=target_names))

              precision    recall  f1-score   support

          No       0.93      0.80      0.86     22067
         Yes       0.53      0.78      0.63      6372

    accuracy                           0.80     28439
   macro avg       0.73      0.79      0.75     28439
weighted avg       0.84      0.80      0.81     28439



In [None]:
# Linear SVM on the 1:1 oversampled set (X_train_ros_11 is the name
# produced by the resampling cell; X_train_ros is never assigned here).
_ = linear_svm.fit(X_train_ros_11, y_train_ros_11)
y_pred = linear_svm.predict(X_test_transformed)
target_names = ['No', 'Yes']
print(classification_report(y_test, y_pred, target_names=target_names))

              precision    recall  f1-score   support

          No       0.93      0.81      0.86     22067
         Yes       0.54      0.78      0.64      6372

    accuracy                           0.80     28439
   macro avg       0.73      0.79      0.75     28439
weighted avg       0.84      0.80      0.81     28439



In [None]:
# Decision tree on the 1:1 oversampled set (X_train_ros_11 is the name
# produced by the resampling cell; X_train_ros is never assigned here).
_ = tree.fit(X_train_ros_11, y_train_ros_11)
y_pred = tree.predict(X_test_transformed)
target_names = ['No', 'Yes']
print(classification_report(y_test, y_pred, target_names=target_names))

              precision    recall  f1-score   support

          No       0.87      0.87      0.87     22067
         Yes       0.54      0.54      0.54      6372

    accuracy                           0.79     28439
   macro avg       0.70      0.70      0.70     28439
weighted avg       0.79      0.79      0.79     28439



##### 3:4 ratio

In [None]:
# 3:4 ratio: sampling_strategy=.75 is passed to the random oversampler.
ros = RandomOverSampler(random_state=0, sampling_strategy=.75)
# fit_sample was removed from imbalanced-learn; fit_resample replaces it.
X_train_ros_34, y_train_ros_34 = ros.fit_resample(X_train_transformed, y_train)



In [None]:
# Evaluate the three baseline classifiers on the 3:4 randomly oversampled set.
for clf in (
    LogisticRegression(random_state=0, max_iter=10000),
    LinearSVC(random_state=0, max_iter=10000),
    DecisionTreeClassifier(random_state=0),
):
    fit_and_evaluate(clf, X_train_ros_34, y_train_ros_34, X_test_transformed, y_test)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=10000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=0, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
              precision    recall  f1-score   support

          No       0.92      0.85      0.88     22067
         Yes       0.58      0.73      0.65      6372

    accuracy                           0.82     28439
   macro avg       0.75      0.79      0.76     28439
weighted avg       0.84      0.82      0.83     28439

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=10000,
          multi_class='ovr', penalty='l2', random_state=0, tol=0.0001,
          verbose=0)
              precision    recall  f1-score   support

          No       0.91      0.85      0.88     22067
         Ye

##### 2:3 ratio 

In [None]:
# Build the 2:3 randomly oversampled training set (ratio=2/3 is forwarded
# to the helper as the sampling strategy).
X_train_ros_23, y_train_ros_23 = random_oversample(X_train_transformed, y_train, ratio=2/3)
# Evaluation is disabled for this ratio; uncomment to run it:
# fit_and_evaluate(LogisticRegression(random_state=0, max_iter=10000), X_train_ros_23, y_train_ros_23, X_test_transformed, y_test)
# fit_and_evaluate(LinearSVC(random_state=0, max_iter=10000), X_train_ros_23, y_train_ros_23, X_test_transformed, y_test)
# fit_and_evaluate(DecisionTreeClassifier(random_state=0), X_train_ros_23, y_train_ros_23, X_test_transformed, y_test)



##### 1:2 ratio

In [None]:
# Build the 1:2 randomly oversampled training set (ratio=0.5 is forwarded
# to the helper as the sampling strategy).
X_train_ros_12, y_train_ros_12 = random_oversample(X_train_transformed, y_train, ratio=0.5)
# Evaluation is disabled for this ratio; uncomment to run it
# (the disabled lines previously named the 2:3 arrays by mistake):
# fit_and_evaluate(LogisticRegression(random_state=0, max_iter=10000), X_train_ros_12, y_train_ros_12, X_test_transformed, y_test)
# fit_and_evaluate(LinearSVC(random_state=0, max_iter=10000), X_train_ros_12, y_train_ros_12, X_test_transformed, y_test)
# fit_and_evaluate(DecisionTreeClassifier(random_state=0), X_train_ros_12, y_train_ros_12, X_test_transformed, y_test)



#### SMOTE 



##### 1:1

In [None]:
# SMOTE with the helper's default ratio (1).
X_train_smote_11, y_train_smote_11 = SMOTE_oversample(
    X_train=X_train_transformed,
    y_train=y_train,
)




In [None]:
# Evaluate the three baseline classifiers on the 1:1 SMOTE training set.
for clf in (
    LogisticRegression(random_state=0, max_iter=10000),
    LinearSVC(random_state=0, max_iter=10000),
    DecisionTreeClassifier(random_state=0),
):
    fit_and_evaluate(clf, X_train_smote_11, y_train_smote_11, X_test_transformed, y_test)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=10000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=0, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
              precision    recall  f1-score   support

          No       0.92      0.81      0.86     22067
         Yes       0.53      0.77      0.63      6372

    accuracy                           0.80     28439
   macro avg       0.73      0.79      0.75     28439
weighted avg       0.84      0.80      0.81     28439

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=10000,
          multi_class='ovr', penalty='l2', random_state=0, tol=0.0001,
          verbose=0)
              precision    recall  f1-score   support

          No       0.92      0.81      0.86     22067
         Ye

##### 3:4 ratio

In [None]:
# SMOTE with ratio 3/4.
X_train_smote_34, y_train_smote_34 = SMOTE_oversample(
    X_train=X_train_transformed,
    y_train=y_train,
    ratio=3/4,
)




In [None]:
# Evaluate the three baseline classifiers on the 3:4 SMOTE training set.
for clf in (
    LogisticRegression(random_state=0, max_iter=10000),
    LinearSVC(random_state=0, max_iter=10000),
    DecisionTreeClassifier(random_state=0),
):
    fit_and_evaluate(clf, X_train_smote_34, y_train_smote_34, X_test_transformed, y_test)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=10000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=0, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
              precision    recall  f1-score   support

          No       0.91      0.85      0.88     22067
         Yes       0.58      0.72      0.65      6372

    accuracy                           0.82     28439
   macro avg       0.75      0.79      0.76     28439
weighted avg       0.84      0.82      0.83     28439

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=10000,
          multi_class='ovr', penalty='l2', random_state=0, tol=0.0001,
          verbose=0)
              precision    recall  f1-score   support

          No       0.91      0.85      0.88     22067
         Ye

##### 2:3 ratio

In [None]:
# SMOTE with ratio 2/3.
X_train_smote_23, y_train_smote_23 = SMOTE_oversample(
    X_train=X_train_transformed,
    y_train=y_train,
    ratio=2/3,
)




In [None]:
# Evaluate the three baseline classifiers on the 2:3 SMOTE training set.
for clf in (
    LogisticRegression(random_state=0, max_iter=10000),
    LinearSVC(random_state=0, max_iter=10000),
    DecisionTreeClassifier(random_state=0),
):
    fit_and_evaluate(clf, X_train_smote_23, y_train_smote_23, X_test_transformed, y_test)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=10000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=0, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
              precision    recall  f1-score   support

          No       0.91      0.86      0.89     22067
         Yes       0.60      0.70      0.65      6372

    accuracy                           0.83     28439
   macro avg       0.75      0.78      0.77     28439
weighted avg       0.84      0.83      0.83     28439

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=10000,
          multi_class='ovr', penalty='l2', random_state=0, tol=0.0001,
          verbose=0)
              precision    recall  f1-score   support

          No       0.91      0.87      0.89     22067
         Ye

##### 1:2 ratio

In [None]:
# SMOTE with ratio 0.5.
X_train_smote_12, y_train_smote_12 = SMOTE_oversample(
    X_train=X_train_transformed,
    y_train=y_train,
    ratio=0.5,
)




In [None]:
# Evaluate the three baseline classifiers on the 1:2 SMOTE training set.
for clf in (
    LogisticRegression(random_state=0, max_iter=10000),
    LinearSVC(random_state=0, max_iter=10000),
    DecisionTreeClassifier(random_state=0),
):
    fit_and_evaluate(clf, X_train_smote_12, y_train_smote_12, X_test_transformed, y_test)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=10000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=0, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
              precision    recall  f1-score   support

          No       0.90      0.90      0.90     22067
         Yes       0.65      0.64      0.64      6372

    accuracy                           0.84     28439
   macro avg       0.77      0.77      0.77     28439
weighted avg       0.84      0.84      0.84     28439

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=10000,
          multi_class='ovr', penalty='l2', random_state=0, tol=0.0001,
          verbose=0)
              precision    recall  f1-score   support

          No       0.89      0.90      0.90     22067
         Ye

In [None]:
# 1:1 ratio
# NOTE(review): this cell does not fit log_clf; the report reflects whatever
# data log_clf was last trained on. Presumably a 1:1 resampled set -- confirm
# by refitting before trusting the numbers.
y_pred = log_clf.predict(X_test_transformed)
target_names = ['No', 'Yes']
print(classification_report(y_test, y_pred, target_names=target_names))

              precision    recall  f1-score   support

          No       0.92      0.80      0.86     22067
         Yes       0.53      0.77      0.63      6372

    accuracy                           0.80     28439
   macro avg       0.73      0.79      0.75     28439
weighted avg       0.84      0.80      0.81     28439



In [None]:
# 1:3 ratio
# NOTE(review): this cell does not fit log_clf, and no 1:3 resampled set is
# created in the visible cells; the report depends on hidden execution
# order. Confirm which training data produced it.
y_pred = log_clf.predict(X_test_transformed)
target_names = ['No', 'Yes']
print(classification_report(y_test, y_pred, target_names=target_names))

              precision    recall  f1-score   support

          No       0.88      0.94      0.91     22067
         Yes       0.71      0.55      0.62      6372

    accuracy                           0.85     28439
   macro avg       0.80      0.74      0.76     28439
weighted avg       0.84      0.85      0.84     28439



In [None]:
# 1:2 ratio
# NOTE(review): this cell does not fit log_clf; the report reflects whatever
# data log_clf was last trained on. Presumably a 1:2 resampled set -- confirm.
y_pred = log_clf.predict(X_test_transformed)
target_names = ['No', 'Yes']
print(classification_report(y_test, y_pred, target_names=target_names))

              precision    recall  f1-score   support

          No       0.90      0.90      0.90     22067
         Yes       0.65      0.64      0.64      6372

    accuracy                           0.84     28439
   macro avg       0.77      0.77      0.77     28439
weighted avg       0.84      0.84      0.84     28439



In [None]:
# 2:3 ratio
# NOTE(review): this cell does not fit log_clf; the report reflects whatever
# data log_clf was last trained on. Presumably a 2:3 resampled set -- confirm.
y_pred = log_clf.predict(X_test_transformed)
target_names = ['No', 'Yes']
print(classification_report(y_test, y_pred, target_names=target_names))

              precision    recall  f1-score   support

          No       0.91      0.86      0.89     22067
         Yes       0.60      0.70      0.65      6372

    accuracy                           0.83     28439
   macro avg       0.75      0.78      0.77     28439
weighted avg       0.84      0.83      0.83     28439



In [None]:
# Test-set score of log_clf in its current (most recently fitted) state.
log_clf.score(X_test_transformed, y_test)

0.8280530257744646

### Undersampling

#### Controlled Undersampling

##### 1:1 ratio

In [None]:
# Random undersampling with the helper's default ratio (1).
X_train_rus_11, y_train_rus_11 = random_undersample(
    X_train=X_train_transformed,
    y_train=y_train,
)



In [None]:
# Class counts of the 1:1 undersampled labels.
pd.Series(y_train_rus_11).value_counts()

1    25505
0    25505
dtype: int64

In [None]:
# Evaluate the three baseline classifiers on the 1:1 undersampled training set.
for clf in (
    LogisticRegression(random_state=0, max_iter=10000),
    LinearSVC(random_state=0, max_iter=10000),
    DecisionTreeClassifier(random_state=0),
):
    fit_and_evaluate(clf, X_train_rus_11, y_train_rus_11, X_test_transformed, y_test)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=10000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=0, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
              precision    recall  f1-score   support

          No       0.93      0.80      0.86     22067
         Yes       0.53      0.78      0.63      6372

    accuracy                           0.80     28439
   macro avg       0.73      0.79      0.75     28439
weighted avg       0.84      0.80      0.81     28439

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=10000,
          multi_class='ovr', penalty='l2', random_state=0, tol=0.0001,
          verbose=0)
              precision    recall  f1-score   support

          No       0.93      0.81      0.86     22067
         Ye

##### 3:4 ratio

In [None]:
# Random undersampling with ratio 0.75.
X_train_rus_34, y_train_rus_34 = random_undersample(
    X_train=X_train_transformed,
    y_train=y_train,
    ratio=0.75,
)



In [None]:
# Class counts of the 3:4 undersampled labels.
pd.Series(y_train_rus_34).value_counts()

0    34006
1    25505
dtype: int64

In [None]:
# Evaluate the three baseline classifiers on the 3:4 undersampled training set.
for clf in (
    LogisticRegression(random_state=0, max_iter=10000),
    LinearSVC(random_state=0, max_iter=10000),
    DecisionTreeClassifier(random_state=0),
):
    fit_and_evaluate(clf, X_train_rus_34, y_train_rus_34, X_test_transformed, y_test)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=10000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=0, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
              precision    recall  f1-score   support

          No       0.91      0.85      0.88     22067
         Yes       0.58      0.72      0.64      6372

    accuracy                           0.82     28439
   macro avg       0.75      0.79      0.76     28439
weighted avg       0.84      0.82      0.83     28439

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=10000,
          multi_class='ovr', penalty='l2', random_state=0, tol=0.0001,
          verbose=0)
              precision    recall  f1-score   support

          No       0.91      0.85      0.88     22067
         Ye

##### 2:3 ratio

In [None]:
# Random undersampling with ratio 2/3.
X_train_rus_23, y_train_rus_23 = random_undersample(
    X_train=X_train_transformed,
    y_train=y_train,
    ratio=2/3,
)



In [None]:
# Class counts of the 2:3 undersampled labels.
pd.Series(y_train_rus_23).value_counts()

0    38257
1    25505
dtype: int64

In [None]:
# Evaluate the three baseline classifiers on the 2:3 undersampled training set.
for clf in (
    LogisticRegression(random_state=0, max_iter=10000),
    LinearSVC(random_state=0, max_iter=10000),
    DecisionTreeClassifier(random_state=0),
):
    fit_and_evaluate(clf, X_train_rus_23, y_train_rus_23, X_test_transformed, y_test)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=10000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=0, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
              precision    recall  f1-score   support

          No       0.91      0.87      0.89     22067
         Yes       0.60      0.70      0.65      6372

    accuracy                           0.83     28439
   macro avg       0.76      0.78      0.77     28439
weighted avg       0.84      0.83      0.83     28439

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=10000,
          multi_class='ovr', penalty='l2', random_state=0, tol=0.0001,
          verbose=0)
              precision    recall  f1-score   support

          No       0.91      0.87      0.89     22067
         Ye

##### 1:2 ratio

In [None]:
# Random undersampling with ratio 0.5.
X_train_rus_12, y_train_rus_12 = random_undersample(
    X_train=X_train_transformed,
    y_train=y_train,
    ratio=0.5,
)



In [None]:
# Shape of the 1:2 undersampled feature matrix.
X_train_rus_12.shape

(76515, 118)

In [None]:
# Class counts of the 1:2 undersampled labels.
pd.Series(y_train_rus_12).value_counts()

0    51010
1    25505
dtype: int64

In [None]:
# Evaluate the three baseline classifiers on the 1:2 undersampled training set.
for clf in (
    LogisticRegression(random_state=0, max_iter=10000),
    LinearSVC(random_state=0, max_iter=10000),
    DecisionTreeClassifier(random_state=0),
):
    fit_and_evaluate(clf, X_train_rus_12, y_train_rus_12, X_test_transformed, y_test)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=10000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=0, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
              precision    recall  f1-score   support

          No       0.89      0.90      0.90     22067
         Yes       0.65      0.63      0.64      6372

    accuracy                           0.84     28439
   macro avg       0.77      0.77      0.77     28439
weighted avg       0.84      0.84      0.84     28439

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=10000,
          multi_class='ovr', penalty='l2', random_state=0, tol=0.0001,
          verbose=0)
              precision    recall  f1-score   support

          No       0.89      0.90      0.90     22067
         Ye

#### Cleaning Undersampling

##### Tomek Links

In [None]:
# TomekLinks no longer accepts random_state (deprecated in imbalanced-learn
# 0.4 and later removed), so only n_jobs is passed.
tomek = TomekLinks(n_jobs=-1)

In [None]:
# fit_sample was removed from imbalanced-learn; fit_resample replaces it.
X_train_tomek, y_train_tomek = tomek.fit_resample(X_train_transformed, y_train)



In [None]:
# Class counts after Tomek-link cleaning.
pd.Series(y_train_tomek).value_counts()

0    81474
1    25505
dtype: int64

In [None]:
# Evaluate the three baseline classifiers on the Tomek-cleaned training set.
for clf in (
    LogisticRegression(random_state=0, max_iter=10000),
    LinearSVC(random_state=0, max_iter=10000),
    DecisionTreeClassifier(random_state=0),
):
    fit_and_evaluate(clf, X_train_tomek, y_train_tomek, X_test_transformed, y_test)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=10000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=0, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
              precision    recall  f1-score   support

          No       0.88      0.93      0.91     22067
         Yes       0.70      0.56      0.62      6372

    accuracy                           0.85     28439
   macro avg       0.79      0.75      0.76     28439
weighted avg       0.84      0.85      0.84     28439

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=10000,
          multi_class='ovr', penalty='l2', random_state=0, tol=0.0001,
          verbose=0)
              precision    recall  f1-score   support

          No       0.88      0.94      0.91     22067
         Ye

##### Edited Nearest Neighbours

In [None]:
# Two Edited Nearest Neighbours cleaners that differ only in kind_sel
# ('all' vs 'mode'); both are applied to the same training data below.
enn_all = EditedNearestNeighbours(kind_sel='all')
enn_mode = EditedNearestNeighbours(kind_sel='mode')

In [None]:
# fit_sample was removed from imbalanced-learn; fit_resample replaces it.
X_train_enn_all, y_train_enn_all = enn_all.fit_resample(X_train_transformed, y_train)
X_train_enn_mode, y_train_enn_mode = enn_mode.fit_resample(X_train_transformed, y_train)



In [None]:
# Shape of the feature matrix after ENN cleaning with kind_sel='mode'.
X_train_enn_mode.shape

(105466, 118)

In [None]:
# Class counts after ENN cleaning with kind_sel='all'.
pd.Series(y_train_enn_all).value_counts()

0    58889
1    25505
dtype: int64

In [None]:
# Class counts after ENN cleaning with kind_sel='mode'.
pd.Series(y_train_enn_mode).value_counts()

0    79961
1    25505
dtype: int64

In [None]:
# Evaluate the three baseline classifiers on the ENN (kind_sel='all') training set.
for clf in (
    LogisticRegression(random_state=0, max_iter=10000),
    LinearSVC(random_state=0, max_iter=10000),
    DecisionTreeClassifier(random_state=0),
):
    fit_and_evaluate(clf, X_train_enn_all, y_train_enn_all, X_test_transformed, y_test)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=10000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=0, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
              precision    recall  f1-score   support

          No       0.91      0.85      0.88     22067
         Yes       0.58      0.72      0.64      6372

    accuracy                           0.82     28439
   macro avg       0.75      0.78      0.76     28439
weighted avg       0.84      0.82      0.83     28439

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=10000,
          multi_class='ovr', penalty='l2', random_state=0, tol=0.0001,
          verbose=0)
              precision    recall  f1-score   support

          No       0.91      0.86      0.88     22067
         Ye

In [None]:
# Evaluate the three baseline classifiers on the ENN (kind_sel='mode') training set.
for clf in (
    LogisticRegression(random_state=0, max_iter=10000),
    LinearSVC(random_state=0, max_iter=10000),
    DecisionTreeClassifier(random_state=0),
):
    fit_and_evaluate(clf, X_train_enn_mode, y_train_enn_mode, X_test_transformed, y_test)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=10000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=0, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
              precision    recall  f1-score   support

          No       0.89      0.92      0.90     22067
         Yes       0.68      0.59      0.63      6372

    accuracy                           0.85     28439
   macro avg       0.78      0.76      0.77     28439
weighted avg       0.84      0.85      0.84     28439

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=10000,
          multi_class='ovr', penalty='l2', random_state=0, tol=0.0001,
          verbose=0)
              precision    recall  f1-score   support

          No       0.88      0.92      0.90     22067
         Ye

In [None]:
# 1:3 ratio - no sampling
y_pred = log_clf.predict(X_test_transformed)
target_names = ['No', 'Yes']
print(classification_report(y_test, y_pred, target_names=target_names))

              precision    recall  f1-score   support

          No       0.87      0.95      0.91     22067
         Yes       0.74      0.52      0.61      6372

    accuracy                           0.85     28439
   macro avg       0.80      0.73      0.76     28439
weighted avg       0.84      0.85      0.84     28439



In [None]:
# 1:3 ratio - TomeK Links
y_pred = log_clf.predict(X_test_transformed)
target_names = ['No', 'Yes']
print(classification_report(y_test, y_pred, target_names=target_names))

              precision    recall  f1-score   support

          No       0.88      0.93      0.91     22067
         Yes       0.70      0.56      0.62      6372

    accuracy                           0.85     28439
   macro avg       0.79      0.75      0.76     28439
weighted avg       0.84      0.85      0.84     28439



### Combine

#### SMOTETomek

In [None]:
# Combined SMOTE oversampling + Tomek-link cleaning, seeded with 0.
smote_tomek = SMOTETomek(random_state=0)
# fit_sample was removed from imbalanced-learn; fit_resample replaces it.
X_train_smote_tomek, y_train_smote_tomek = smote_tomek.fit_resample(X_train_transformed, y_train)



#### SMOTEENN

In [None]:
# Combined SMOTE oversampling + Edited Nearest Neighbours cleaning, seeded with 0.
smote_enn = SMOTEENN(random_state=0)
# fit_sample was removed from imbalanced-learn; fit_resample replaces it.
X_train_smote_enn, y_train_smote_enn = smote_enn.fit_resample(X_train_transformed, y_train)

### Save dataset

In [None]:
import os 

In [None]:
import numpy as np


In [None]:
# Directory that receives the resampled arrays written by the cells below.
SAVE_DIR = 'datasets'
# np.save does not create missing parent directories, so make sure it exists.
os.makedirs(SAVE_DIR, exist_ok=True)

In [None]:
# Write every randomly oversampled training set to SAVE_DIR as <name>.npy.
ros_arrays = (
    ('X_train_ros_11', X_train_ros_11),
    ('y_train_ros_11', y_train_ros_11),
    ('X_train_ros_34', X_train_ros_34),
    ('y_train_ros_34', y_train_ros_34),
    ('X_train_ros_23', X_train_ros_23),
    ('y_train_ros_23', y_train_ros_23),
    ('X_train_ros_12', X_train_ros_12),
    ('y_train_ros_12', y_train_ros_12),
)
for array_name, array in ros_arrays:
    np.save(os.path.join(SAVE_DIR, array_name + '.npy'), array)

In [None]:
# Write every SMOTE training set to SAVE_DIR as <name>.npy.
smote_arrays = (
    ('X_train_smote_11', X_train_smote_11),
    ('y_train_smote_11', y_train_smote_11),
    ('X_train_smote_34', X_train_smote_34),
    ('y_train_smote_34', y_train_smote_34),
    ('X_train_smote_23', X_train_smote_23),
    ('y_train_smote_23', y_train_smote_23),
    ('X_train_smote_12', X_train_smote_12),
    ('y_train_smote_12', y_train_smote_12),
)
for array_name, array in smote_arrays:
    np.save(os.path.join(SAVE_DIR, array_name + '.npy'), array)


In [None]:
# Write every randomly undersampled training set to SAVE_DIR as <name>.npy.
rus_arrays = (
    ('X_train_rus_11', X_train_rus_11),
    ('y_train_rus_11', y_train_rus_11),
    ('X_train_rus_34', X_train_rus_34),
    ('y_train_rus_34', y_train_rus_34),
    ('X_train_rus_23', X_train_rus_23),
    ('y_train_rus_23', y_train_rus_23),
    ('X_train_rus_12', X_train_rus_12),
    ('y_train_rus_12', y_train_rus_12),
)
for array_name, array in rus_arrays:
    np.save(os.path.join(SAVE_DIR, array_name + '.npy'), array)

In [None]:
# Write the Tomek-cleaned and ENN (kind_sel='all') training sets to SAVE_DIR.
# NOTE(review): the ENN 'mode', SMOTETomek and SMOTEENN sets are not saved
# here -- confirm that is intentional.
cleaned_arrays = (
    ('X_train_tomek', X_train_tomek),
    ('y_train_tomek', y_train_tomek),
    ('X_train_enn_all', X_train_enn_all),
    ('y_train_enn_all', y_train_enn_all),
)
for array_name, array in cleaned_arrays:
    np.save(os.path.join(SAVE_DIR, array_name + '.npy'), array)

In [None]:
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC, SVC 

### Class Weights

In [None]:
from sklearn.model_selection import cross_validate
# 5-fold cross-validation of a class-weighted logistic regression, scored on
# ROC AUC and F1. The class_weight option is spelled 'balanced'; the earlier
# 'balance' spelling is not a valid value.
scores = cross_validate(
    LogisticRegression(class_weight='balanced'),
    X_train_transformed,
    y_train,
    cv=5,
    n_jobs=-1,
    scoring=['roc_auc', 'f1'],
    return_train_score=True,
    verbose=1,
)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   16.5s finished


In [None]:
# Display the cross-validation results collected above.
scores 

{'fit_time': array([5.52871084, 5.66914654, 5.3084178 , 5.29882812, 3.19924784]),
 'score_time': array([0.05131459, 0.05093265, 0.04524946, 0.04461932, 0.0280056 ]),
 'test_f1': array([0.60036664, 0.59551341, 0.5945758 , 0.6026444 , 0.60163727]),
 'test_roc_auc': array([0.86746188, 0.86593195, 0.87193621, 0.87292159, 0.86923002]),
 'train_f1': array([0.59954973, 0.60104284, 0.60042723, 0.5970944 , 0.59883906]),
 'train_roc_auc': array([0.87092843, 0.87138209, 0.87001671, 0.86969852, 0.87056826])}

In [None]:
# Class-weight settings to compare: no weighting (None), scikit-learn's
# 'balanced' heuristic, and three explicit per-class weight maps.
# 'auto' and 'balance' are not valid class_weight values; they are replaced
# by None and 'balanced'.
class_weights = [None, 'balanced', {0: .3, 1: .7}, {0: .2, 1: .8}, {0: .4, 1: .6}]

In [None]:
# Train one logistic regression per class-weight setting, then print its
# test accuracy and per-class report, separated by a dashed rule.
for weight_setting in class_weights:
    print('Class weight: {}'.format(weight_setting))
    logreg = LogisticRegression(class_weight=weight_setting, max_iter=10000)
    logreg.fit(X_train_transformed, y_train)
    test_accuracy = logreg.score(X_test_transformed, y_test)
    print('Accuracy: {}'.format(test_accuracy))
    y_pred = logreg.predict(X_test_transformed)
    print(classification_report(y_test, y_pred, target_names=['No', 'Yes']))
    print('-'*50)

Class weight: auto
Accuracy: 0.850170540455009
              precision    recall  f1-score   support

          No       0.87      0.95      0.91     22067
         Yes       0.74      0.52      0.61      6372

    accuracy                           0.85     28439
   macro avg       0.80      0.73      0.76     28439
weighted avg       0.84      0.85      0.84     28439

--------------------------------------------------
Class weight: balance
Accuracy: 0.850170540455009
              precision    recall  f1-score   support

          No       0.87      0.95      0.91     22067
         Yes       0.74      0.52      0.61      6372

    accuracy                           0.85     28439
   macro avg       0.80      0.73      0.76     28439
weighted avg       0.84      0.85      0.84     28439

--------------------------------------------------
Class weight: {0: 0.3, 1: 0.7}
Accuracy: 0.8286156334610921
              precision    recall  f1-score   support

          No       0.91      0.8

In [None]:
# Report for logreg in its current state.
# NOTE(review): this cell does not fit logreg; after the loop above it holds
# the model for the last class-weight setting, unless cells were run out of
# order -- confirm which configuration produced the output.
y_pred = logreg.predict(X_test_transformed)
target_names = ['No', 'Yes']
print(classification_report(y_test, y_pred, target_names=target_names))

              precision    recall  f1-score   support

          No       0.93      0.80      0.86     22067
         Yes       0.53      0.78      0.63      6372

    accuracy                           0.80     28439
   macro avg       0.73      0.79      0.75     28439
weighted avg       0.84      0.80      0.81     28439



In [None]:
# Test-set score of logreg in its current (most recently fitted) state.
logreg.score(X_test_transformed, y_test)

0.850170540455009