In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from mlxtend.classifier import StackingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, balanced_accuracy_score
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve, classification_report, precision_recall_curve
from sklearn.model_selection import train_test_split
from imblearn.metrics import geometric_mean_score, sensitivity_score, specificity_score
from xgboost import XGBClassifier
from shaphypetune import BoostRFE
from sklearn.base import BaseEstimator
from sklearn.utils.metaestimators import if_delegate_has_method

import torch
import re, pickle, random, os
import warnings
warnings.filterwarnings('ignore')

from collections import Counter

In [2]:
def seed_everything(seed=1234):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

RANDOM_STATE = 42
seed_everything(seed=RANDOM_STATE)

In [3]:
col_names = ['age','edu','hus_edu','chil','rel','work','hus_ocu','sol','media','cmc']
df = pd.read_csv('cmc.csv', names = col_names)
df.shape

(1473, 10)

In [4]:
df.head()

Unnamed: 0,age,edu,hus_edu,chil,rel,work,hus_ocu,sol,media,cmc
0,24,2,3,3,1,1,2,3,0,1
1,45,1,3,10,1,1,3,4,0,1
2,43,2,3,7,1,1,3,4,0,1
3,42,3,2,9,1,1,3,3,0,1
4,36,3,3,8,1,1,3,2,0,1


In [5]:
df['cmc'].value_counts()

1    629
3    511
2    333
Name: cmc, dtype: int64

In [6]:
df.isnull().sum()

age        0
edu        0
hus_edu    0
chil       0
rel        0
work       0
hus_ocu    0
sol        0
media      0
cmc        0
dtype: int64

In [7]:
df.describe()

Unnamed: 0,age,edu,hus_edu,chil,rel,work,hus_ocu,sol,media,cmc
count,1473.0,1473.0,1473.0,1473.0,1473.0,1473.0,1473.0,1473.0,1473.0,1473.0
mean,32.538357,2.958588,3.429735,3.261371,0.850645,0.749491,2.137814,3.133741,0.073999,1.919891
std,8.227245,1.014994,0.816349,2.358549,0.356559,0.433453,0.864857,0.976161,0.261858,0.876376
min,16.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0
25%,26.0,2.0,3.0,1.0,1.0,0.0,1.0,3.0,0.0,1.0
50%,32.0,3.0,4.0,3.0,1.0,1.0,2.0,3.0,0.0,2.0
75%,39.0,4.0,4.0,4.0,1.0,1.0,3.0,4.0,0.0,3.0
max,49.0,4.0,4.0,16.0,1.0,1.0,4.0,4.0,1.0,3.0


# Train and Test Split

In [8]:
x = df.drop(['cmc'], axis=1)
y = df['cmc']

X_train,X_test,y_train,y_test=train_test_split(x, y, train_size=0.8, stratify = y, random_state=100)

In [9]:
y_train.shape, y_test.shape

((1178,), (295,))

In [10]:
y_train.value_counts()/len(y_train)

1    0.426995
3    0.347199
2    0.225806
Name: cmc, dtype: float64

In [11]:
y_test.value_counts()/len(y_test)

1    0.427119
3    0.345763
2    0.227119
Name: cmc, dtype: float64

In [12]:
from sklearn.preprocessing import StandardScaler
Scaler_X = StandardScaler()
X_train = Scaler_X.fit_transform(X_train)
X_test = Scaler_X.transform(X_test)

# Model Building

In [13]:
model = list()
precision = list()
recall = list()
F1score = list()
AUCROC = list()
balanced_acc = list()
GMean = list()
sensi = list()
speci = list()

In [14]:
def test_eval(clf_model, X_test, y_test, algo=None):

    y_prob=clf_model.predict_proba(X_test)
    y_pred=clf_model.predict(X_test)

    print('Confusion Matrix')
    print('='*60)
    print(confusion_matrix(y_test,y_pred),"\n")
    print('Classification Report')
    print('='*60)
    print(classification_report(y_test,y_pred),"\n")
    print('='*60)
    print('AUC-ROC')
    print(roc_auc_score(y_test, y_prob, multi_class='ovr'), "\n")
    print('Balanced Accuracy')
    print(balanced_accuracy_score(y_test, y_pred),"\n")
    print('Geometric Mean Score')
    print(geometric_mean_score(y_test, y_pred, average='macro'),"\n")
    print('Sensitivity')
    print(sensitivity_score(y_test, y_pred, average='macro'),"\n")
    print('Specificity')
    print(specificity_score(y_test, y_pred, average='macro'))
          
    model.append(algo)
    precision.append(precision_score(y_test,y_pred, average='macro'))
    recall.append(recall_score(y_test,y_pred, average='macro'))
    F1score.append(f1_score(y_test,y_pred, average='macro'))
    AUCROC.append(roc_auc_score(y_test, y_prob, multi_class='ovr', average='macro'))
    balanced_acc.append(balanced_accuracy_score(y_test, y_pred))
    GMean.append(geometric_mean_score(y_test, y_pred, average='macro'))
    sensi.append(sensitivity_score(y_test, y_pred, average='macro'))
    speci.append(specificity_score(y_test, y_pred, average='macro'))


## Model-1: Logistic Regression

In [15]:
clf_LR = LogisticRegression(multi_class='ovr', solver='liblinear')
clf_LR.fit(X_train, y_train)

LogisticRegression(multi_class='ovr', solver='liblinear')

In [16]:
test_eval(clf_LR, X_test, y_test, 'Logistic Regression')

Confusion Matrix
[[85  4 37]
 [26 21 20]
 [47 14 41]] 

Classification Report
              precision    recall  f1-score   support

           1       0.54      0.67      0.60       126
           2       0.54      0.31      0.40        67
           3       0.42      0.40      0.41       102

    accuracy                           0.50       295
   macro avg       0.50      0.46      0.47       295
weighted avg       0.50      0.50      0.49       295
 

AUC-ROC
0.672170574764154 

Balanced Accuracy
0.46333226491259855 

Geometric Mean Score
0.5820771035610383 

Sensitivity
0.46333226491259855 

Specificity
0.7312543937640957


## Model-2: Decision Tree

In [17]:
clf_DT = DecisionTreeClassifier()
clf_DT.fit(X_train, y_train)

DecisionTreeClassifier()

In [18]:
test_eval(clf_DT, X_test, y_test, 'Decision Tree')

Confusion Matrix
[[73 19 34]
 [23 20 24]
 [32 22 48]] 

Classification Report
              precision    recall  f1-score   support

           1       0.57      0.58      0.57       126
           2       0.33      0.30      0.31        67
           3       0.45      0.47      0.46       102

    accuracy                           0.48       295
   macro avg       0.45      0.45      0.45       295
weighted avg       0.47      0.48      0.48       295
 

AUC-ROC
0.5923161444254756 

Balanced Accuracy
0.4494869257819214 

Geometric Mean Score
0.5733731444182364 

Sensitivity
0.4494869257819214 

Specificity
0.7314045056330724


## Model-3: Gaussian NB

In [19]:
clf_NB = GaussianNB()
clf_NB.fit(X_train, y_train)

GaussianNB()

In [20]:
test_eval(clf_NB, X_test, y_test, 'Gaussian NB')

Confusion Matrix
[[46 43 37]
 [11 41 15]
 [20 34 48]] 

Classification Report
              precision    recall  f1-score   support

           1       0.60      0.37      0.45       126
           2       0.35      0.61      0.44        67
           3       0.48      0.47      0.48       102

    accuracy                           0.46       295
   macro avg       0.47      0.48      0.46       295
weighted avg       0.50      0.46      0.46       295
 

AUC-ROC
0.6350590191360824 

Balanced Accuracy
0.4825359662936484 

Geometric Mean Score
0.5961330908553857 

Sensitivity
0.4825359662936484 

Specificity
0.7364728990927308


## Model-4: K-Nearest Neighbour

In [21]:
clf_KNN = KNeighborsClassifier()
clf_KNN.fit(X_train, y_train)

KNeighborsClassifier()

In [22]:
test_eval(clf_KNN, X_test, y_test, 'KNN')

Confusion Matrix
[[69 20 37]
 [31 19 17]
 [41 18 43]] 

Classification Report
              precision    recall  f1-score   support

           1       0.49      0.55      0.52       126
           2       0.33      0.28      0.31        67
           3       0.44      0.42      0.43       102

    accuracy                           0.44       295
   macro avg       0.42      0.42      0.42       295
weighted avg       0.44      0.44      0.44       295
 

AUC-ROC
0.6204733239440317 

Balanced Accuracy
0.41758992154075564 

Geometric Mean Score
0.5441889013979156 

Sensitivity
0.41758992154075564 

Specificity
0.7091683614202546


## MODEL-5 Support Vector Classifier

In [23]:
clf_SVC = SVC(probability=True)
clf_SVC.fit(X_train, y_train)

SVC(probability=True)

In [24]:
test_eval(clf_SVC, X_test, y_test, 'SVC')

Confusion Matrix
[[85 10 31]
 [21 27 19]
 [35 12 55]] 

Classification Report
              precision    recall  f1-score   support

           1       0.60      0.67      0.64       126
           2       0.55      0.40      0.47        67
           3       0.52      0.54      0.53       102

    accuracy                           0.57       295
   macro avg       0.56      0.54      0.54       295
weighted avg       0.56      0.57      0.56       295
 

AUC-ROC
0.7030879007308452 

Balanced Accuracy
0.5389346451681835 

Geometric Mean Score
0.6446185438148395 

Sensitivity
0.5389346451681835 

Specificity
0.771026822557103


# MODEL-6 XGBoost

In [25]:
class BoostRFEWrap(BaseEstimator, BoostRFE):

    @if_delegate_has_method(delegate='estimator')
    def predict_proba(self, X):
        return self.predict(X, method='predict_proba')


xgb_params = {'max_depth': (5,10),
          'learning_rate': (0.001, 0.3),
          'n_estimators': (25, 200),
          'reg_alpha' : (0.1, 1),
          'reg_lambda': (0.1, 1),
          'subsample': (0.5,  0.9),
          'colsample_bytree': (0.5,1),
          'min_child_weight': (0,10),
          'gamma': (0,1)}

clf_XGB = BoostRFEWrap(XGBClassifier(), param_grid=xgb_params, importance_type='shap_importances', train_importance=False, min_features_to_select=1, step=1)

In [26]:
xclf = clf_XGB.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=6, verbose=0)


512 trials detected for ('max_depth', 'learning_rate', 'n_estimators', 'reg_alpha', 'reg_lambda', 'subsample', 'colsample_bytree', 'min_child_weight', 'gamma')

trial: 0001 ### iterations: 00005 ### eval_score: 0.45763
trial: 0002 ### iterations: 00015 ### eval_score: 0.44068
trial: 0003 ### iterations: 00007 ### eval_score: 0.41695
trial: 0004 ### iterations: 00011 ### eval_score: 0.44068
trial: 0005 ### iterations: 00012 ### eval_score: 0.4678
trial: 0006 ### iterations: 00009 ### eval_score: 0.45763
trial: 0007 ### iterations: 00002 ### eval_score: 0.42712
trial: 0008 ### iterations: 00003 ### eval_score: 0.42373
trial: 0009 ### iterations: 00008 ### eval_score: 0.46102
trial: 0010 ### iterations: 00012 ### eval_score: 0.44746
trial: 0011 ### iterations: 00013 ### eval_score: 0.4339
trial: 0012 ### iterations: 00007 ### eval_score: 0.43729
trial: 0013 ### iterations: 00008 ### eval_score: 0.4678
trial: 0014 ### iterations: 00002 ### eval_score: 0.46102
trial: 0015 ### iterations: 0

trial: 0140 ### iterations: 00017 ### eval_score: 0.4339
trial: 0141 ### iterations: 00010 ### eval_score: 0.45085
trial: 0142 ### iterations: 00002 ### eval_score: 0.45424
trial: 0143 ### iterations: 00003 ### eval_score: 0.44407
trial: 0144 ### iterations: 00010 ### eval_score: 0.45763
trial: 0145 ### iterations: 00010 ### eval_score: 0.44407
trial: 0146 ### iterations: 00002 ### eval_score: 0.42373
trial: 0147 ### iterations: 00009 ### eval_score: 0.43729
trial: 0148 ### iterations: 00009 ### eval_score: 0.45763
trial: 0149 ### iterations: 00004 ### eval_score: 0.45085
trial: 0150 ### iterations: 00007 ### eval_score: 0.43729
trial: 0151 ### iterations: 00006 ### eval_score: 0.4339
trial: 0152 ### iterations: 00006 ### eval_score: 0.42034
trial: 0153 ### iterations: 00013 ### eval_score: 0.43051
trial: 0154 ### iterations: 00007 ### eval_score: 0.45085
trial: 0155 ### iterations: 00004 ### eval_score: 0.42712
trial: 0156 ### iterations: 00010 ### eval_score: 0.43051
trial: 0157 ### 

trial: 0282 ### iterations: 00010 ### eval_score: 0.43729
trial: 0283 ### iterations: 00009 ### eval_score: 0.4339
trial: 0284 ### iterations: 00011 ### eval_score: 0.4339
trial: 0285 ### iterations: 00006 ### eval_score: 0.44407
trial: 0286 ### iterations: 00004 ### eval_score: 0.44407
trial: 0287 ### iterations: 00003 ### eval_score: 0.46102
trial: 0288 ### iterations: 00003 ### eval_score: 0.45763
trial: 0289 ### iterations: 00003 ### eval_score: 0.43051
trial: 0290 ### iterations: 00014 ### eval_score: 0.41695
trial: 0291 ### iterations: 00012 ### eval_score: 0.42034
trial: 0292 ### iterations: 00011 ### eval_score: 0.42712
trial: 0293 ### iterations: 00003 ### eval_score: 0.4339
trial: 0294 ### iterations: 00006 ### eval_score: 0.42373
trial: 0295 ### iterations: 00003 ### eval_score: 0.42034
trial: 0296 ### iterations: 00006 ### eval_score: 0.42034
trial: 0297 ### iterations: 00005 ### eval_score: 0.44068
trial: 0298 ### iterations: 00004 ### eval_score: 0.44746
trial: 0299 ### i

trial: 0424 ### iterations: 00008 ### eval_score: 0.42373
trial: 0425 ### iterations: 00008 ### eval_score: 0.44407
trial: 0426 ### iterations: 00005 ### eval_score: 0.44746
trial: 0427 ### iterations: 00004 ### eval_score: 0.44407
trial: 0428 ### iterations: 00009 ### eval_score: 0.44068
trial: 0429 ### iterations: 00002 ### eval_score: 0.45424
trial: 0430 ### iterations: 00003 ### eval_score: 0.45085
trial: 0431 ### iterations: 00002 ### eval_score: 0.44746
trial: 0432 ### iterations: 00002 ### eval_score: 0.44407
trial: 0433 ### iterations: 00001 ### eval_score: 0.43729
trial: 0434 ### iterations: 00001 ### eval_score: 0.43729
trial: 0435 ### iterations: 00004 ### eval_score: 0.42373
trial: 0436 ### iterations: 00003 ### eval_score: 0.43729
trial: 0437 ### iterations: 00006 ### eval_score: 0.43729
trial: 0438 ### iterations: 00005 ### eval_score: 0.42712
trial: 0439 ### iterations: 00006 ### eval_score: 0.43051
trial: 0440 ### iterations: 00011 ### eval_score: 0.42712
trial: 0441 ##

In [27]:
test_eval(xclf, X_test, y_test, 'XGB')

Confusion Matrix
[[90  5 31]
 [24 21 22]
 [29  9 64]] 

Classification Report
              precision    recall  f1-score   support

           1       0.63      0.71      0.67       126
           2       0.60      0.31      0.41        67
           3       0.55      0.63      0.58       102

    accuracy                           0.59       295
   macro avg       0.59      0.55      0.56       295
weighted avg       0.59      0.59      0.58       295
 

AUC-ROC
0.724572410178223 

Balanced Accuracy
0.5517231768329222 

Geometric Mean Score
0.657458923043899 

Sensitivity
0.5517231768329222 

Specificity
0.7834585416029061


# MODEL-7 Stacking

In [28]:
clf_DT = DecisionTreeClassifier()
clf_SVC = SVC(probability=True)
clf_KNN = KNeighborsClassifier()
clf_LR = LogisticRegression()

sclf = StackingClassifier(classifiers=[clf_DT, clf_SVC, clf_KNN], use_probas=True, average_probas=False, meta_classifier=clf_LR)

In [29]:
sclf.fit(X_train, y_train)
test_eval(sclf, X_test, y_test, 'Stacking')

Confusion Matrix
[[70 20 36]
 [23 21 23]
 [30 22 50]] 

Classification Report
              precision    recall  f1-score   support

           1       0.57      0.56      0.56       126
           2       0.33      0.31      0.32        67
           3       0.46      0.49      0.47       102

    accuracy                           0.48       295
   macro avg       0.45      0.45      0.45       295
weighted avg       0.48      0.48      0.48       295
 

AUC-ROC
0.6483633538888772 

Balanced Accuracy
0.45306148993594125 

Geometric Mean Score
0.5759458131292434 

Sensitivity
0.45306148993594125 

Specificity
0.7321601747877681


# MODEL-8 Bagging

In [30]:
clf_bg = BaggingClassifier(clf_DT)

In [31]:
clf_bg.fit(X_train, y_train)
test_eval(clf_bg, X_test, y_test, 'Bagging')

Confusion Matrix
[[79 14 33]
 [23 23 21]
 [30 22 50]] 

Classification Report
              precision    recall  f1-score   support

           1       0.60      0.63      0.61       126
           2       0.39      0.34      0.37        67
           3       0.48      0.49      0.49       102

    accuracy                           0.52       295
   macro avg       0.49      0.49      0.49       295
weighted avg       0.51      0.52      0.51       295
 

AUC-ROC
0.6754073578449772 

Balanced Accuracy
0.4868212625016839 

Geometric Mean Score
0.604074073159894 

Sensitivity
0.4868212625016839 

Specificity
0.7495676831960946


In [32]:
clf_eval_df = pd.DataFrame({'model':model,
                            'precision':precision,
                            'recall':recall,
                            'f1-score':F1score,
                            'AUC-ROC':AUCROC,
                            'balanced_acc':balanced_acc,
                            'GMean':GMean,
                            'sensitivity':sensi,
                            'specificity':speci})
clf_eval_df

Unnamed: 0,model,precision,recall,f1-score,AUC-ROC,balanced_acc,GMean,sensitivity,specificity
0,Logistic Regression,0.498268,0.463332,0.468273,0.672171,0.463332,0.582077,0.463332,0.731254
1,Decision Tree,0.450337,0.449487,0.449614,0.592316,0.449487,0.573373,0.449487,0.731405
2,Gaussian NB,0.474953,0.482536,0.457231,0.635059,0.482536,0.596133,0.482536,0.736473
3,KNN,0.421998,0.41759,0.418489,0.620473,0.41759,0.544189,0.41759,0.709168
4,SVC,0.559222,0.538935,0.544541,0.703088,0.538935,0.644619,0.538935,0.771027
5,XGB,0.592126,0.551723,0.555128,0.724572,0.551723,0.657459,0.551723,0.783459
6,Stacking,0.453718,0.453061,0.453087,0.648363,0.453061,0.575946,0.453061,0.73216
7,Bagging,0.489695,0.486821,0.48764,0.675407,0.486821,0.604074,0.486821,0.749568


In [33]:
clf_eval_df.to_excel("D:\Skripsi\Final\project_contraceptive_final.xlsx")