In [70]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import copy
import joblib
import time
import threading
from datetime import datetime
# from contextlib import contextmanager
from timeit import default_timer

from scipy.stats import chi2_contingency

from sklearn.model_selection import train_test_split
# from pycaret.classification import setup,compare_models,create_model,plot_model,evaluate_model
# from pycaret.regression import *

# from autosklearn.classification import AutoSklearnClassifier
import optuna

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
# import sklearn.lda.LDA as LDA # <-- this is throwing the ModuleNotFoundError

In [71]:
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

In [72]:
# simple function to generate random integers

def rand_gen(low=1,high=1e4):
    '''
    Generates a pseudo-random integer
    consisting of up to four digits
    '''
    import numpy as np
    rng=np.random.default_rng()
    random_state=int(rng.integers(low=low,high=high))
    
    return random_state

In [73]:
seed=rand_gen()
seed

1379

In [74]:
test_size=0.2
test_size

0.2

In [75]:
# set the randomness seed throughout the notebook
# source: # https://odsc.medium.com/properly-setting-the-random-seed-in-ml-experiments-not-as-simple-as-you-might-imagine-219969c84752

## set `PYTHONHASHSEED` environment variable at a fixed value
import os
os.environ['PYTHONHASHSEED']=str(seed)
## set `python` built-in pseudo-random generator at a fixed value
import random
random.seed(seed)
## set `numpy` pseudo-random generator at a fixed value
np.random.seed(seed)
np.random.default_rng(seed)

Generator(PCG64) at 0x169709F20

In [76]:
import json
from pathlib import Path
import inspect

def get_variable_name(var):
    callers_local_vars = inspect.currentframe().f_back.f_locals.items()
    return [name for name, val in callers_local_vars if val is var]

def fileDaterSaver(location: str,
                   filetype: str,
                   object_,
                   extra: str = '',
                   verbose: bool = True):

    '''
    Function that gets a timestamped filename and saves it
    to a user-specified location.

    Parameters:
    -----------
    location: str - The location where the file will be saved.
    filetype: str - The type of the file to save ('csv' or 'json').
    object_: The object to be saved. Should be a pandas DataFrame
        for 'csv' or serializable for 'json'.
    extra: str - Additional string to include in the filename.
    verbose: bool - Whether to print verbose messages.
    '''

    # get current date and time
    current_datetime = datetime.now()

    # print current date and time to check
    if verbose:
        print('current_datetime:', current_datetime)

    # format the datetime for a filename
    datetime_suffix = current_datetime.strftime("%Y-%m-%d_%H-%M-%S")

    # create filename with the datetime suffix
    if extra != '':
        file_name = f'{location}{extra}_{datetime_suffix}.{filetype}'
    else:
        file_name = f'{location}{datetime_suffix}.{filetype}'

    # print file name
    if verbose:
        print(file_name)

    # save object
    if filetype == 'csv':
        object_.to_csv(file_name, index=True)
    elif filetype == 'json':
        with open(file_name, 'w') as file:
            file.write(json.dumps(object_, default=str))
    else:
        raise ValueError("Unsupported file type. Use 'csv' or 'json'.")

    # confirm save
    file_path = Path(file_name)
    if file_path.exists():
        variable_name = get_variable_name(object_)
        if variable_name:
            print(f'Successfully saved {variable_name[0]} to {file_path}')
        else:
            print(f'Successfully saved object to {file_path}')
    else:
        print("File save error.")

In [77]:
df=pd.read_csv('../data/2_data.csv')
dbd=[ # demographic and banking data
    'age',
    'job',
    'marital',
    'education',
    'default',
    'balance',
    'housing',
    'loan'
]

csd=[ # campaign-specific data
    'contact',
    'day',
    'month',
    'duration',
    'campaign'
]
X_1=df[[col for col in df.columns if col != 'y' and col not in csd]]
y=df['y'].map({'yes':1,'no':0})

X_2=df[[col for col in df.columns if col != 'y']]

X_3=df[[col for col in df.columns if col != 'y']]

In [78]:
from sklearn.preprocessing import StandardScaler
from pandas.api.types import is_string_dtype

In [79]:
# dummify X_1

cols_to_transform=[col for col in X_1.columns if is_string_dtype(X_1[col])]

X_1_dummy=pd.get_dummies(data=X_1,
                         columns=cols_to_transform,
                         drop_first=True)

bool_cols=[col for col in X_1_dummy.columns if '_' in col]

X_1_dummy[bool_cols]=X_1_dummy[bool_cols].astype(int)

X_1_dummy

scaler=StandardScaler()
X_1_scaled=pd.DataFrame(scaler.fit_transform(X_1_dummy),columns=X_1_dummy.columns)

X_1_scaled.shape

(40000, 21)

In [80]:
# edit number to determine subsampling fraction
frac=1

In [81]:
X_1_scaled_sample=X_1_scaled.sample(frac=frac,random_state=seed)
X_1_scaled_sample

Unnamed: 0,age,balance,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,...,job_unemployed,job_unknown,marital_married,marital_single,education_secondary,education_tertiary,education_unknown,default_yes,housing_yes,loan_yes
23745,2.017849,-0.138538,-0.553592,-0.190797,-0.167135,-0.506476,5.180325,-0.191430,-0.329151,-0.115212,...,-0.168474,-0.076875,0.800178,-0.611597,-1.050946,1.602971,-0.199495,-0.143675,-1.226725,-0.457772
22457,0.358382,-0.447795,-0.553592,-0.190797,-0.167135,1.974426,-0.193038,-0.191430,-0.329151,-0.115212,...,-0.168474,-0.076875,-1.249721,-0.611597,-1.050946,1.602971,-0.199495,6.960155,-1.226725,-0.457772
12777,-1.197367,-0.270782,-0.553592,-0.190797,-0.167135,-0.506476,-0.193038,-0.191430,-0.329151,-0.115212,...,-0.168474,-0.076875,0.800178,-0.611597,0.951524,-0.623842,-0.199495,-0.143675,-1.226725,2.184492
385,1.810415,0.135247,-0.553592,-0.190797,-0.167135,-0.506476,-0.193038,-0.191430,3.038121,-0.115212,...,-0.168474,-0.076875,0.800178,-0.611597,0.951524,-0.623842,-0.199495,-0.143675,0.815179,2.184492
19053,1.914132,-0.438841,-0.553592,-0.190797,-0.167135,-0.506476,-0.193038,-0.191430,-0.329151,-0.115212,...,-0.168474,-0.076875,-1.249721,-0.611597,0.951524,-0.623842,-0.199495,-0.143675,-1.226725,-0.457772
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1520,-0.367634,-0.307631,-0.553592,-0.190797,-0.167135,-0.506476,-0.193038,-0.191430,-0.329151,-0.115212,...,5.935645,-0.076875,-1.249721,1.635063,0.951524,-0.623842,-0.199495,-0.143675,0.815179,-0.457772
35081,-0.263917,-0.326227,-0.553592,-0.190797,-0.167135,-0.506476,-0.193038,5.223844,-0.329151,-0.115212,...,-0.168474,-0.076875,0.800178,-0.611597,-1.050946,1.602971,-0.199495,-0.143675,0.815179,-0.457772
36486,-0.471351,-0.401303,1.806386,-0.190797,-0.167135,-0.506476,-0.193038,-0.191430,-0.329151,-0.115212,...,-0.168474,-0.076875,-1.249721,1.635063,-1.050946,-0.623842,-0.199495,-0.143675,0.815179,-0.457772
14130,0.565816,-0.151625,-0.553592,-0.190797,-0.167135,-0.506476,-0.193038,-0.191430,-0.329151,-0.115212,...,-0.168474,-0.076875,-1.249721,1.635063,0.951524,-0.623842,-0.199495,-0.143675,0.815179,-0.457772


In [108]:
rus=RandomUnderSampler(random_state=seed)
X_res,y_res=rus.fit_resample(X_1_scaled_sample,y)
print(Counter(y_res))

Counter({0: 2896, 1: 2896})


In [109]:
X_train, X_test, y_train, y_test = train_test_split(X_res, 
                                                    y_res, 
                                                    test_size=test_size, 
                                                    random_state=seed)

In [84]:
from sklearn.svm import SVC

In [85]:
# count number of samples per class
class_counts=Counter(y_train)

# calculate class weights
class_weights={cls:len(y_train)/count for cls,count in class_counts.items()}

print(class_weights)

{0: 1.078857759347291, 1: 13.681060282171869}


In [86]:
n_trials=5

In [99]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, accuracy_score
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import StratifiedKFold,cross_validate,cross_val_predict

In [100]:
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score, zero_division=0),
    'recall': make_scorer(recall_score, zero_division=0),
    'f1': make_scorer(f1_score, zero_division=0)
}

In [101]:
kernel_choice='linear'

In [105]:
start_time = time.time()

def objective(trial):
    # Suggest hyperparameters for SVC
    print('kernel_choice:', kernel_choice)
    
    # Handling 'poly' and 'precomputed' kernels
    if kernel_choice == 'poly':
        print('kernel_choice is poly')
        kernel = 'poly'
        degree = trial.suggest_int('degree', 1, 10)
        coef0 = trial.suggest_float('coef0', -10, 10)
    elif kernel_choice == 'precomputed':
        print('kernel_choice is precomputed; computing kernel')
        X_kernel = compute_kernel(X_train, random.choice(['linear', 'poly', 'rbf', 'sigmoid']))
        print('computed kernel')
        kernel = 'precomputed'
        degree = 3
        coef0 = 0
    else:
        print('kernel_choice is not poly or precomputed')
        kernel = kernel_choice
        coef0 = trial.suggest_float('coef0', -10, 10)
        degree = 3  # Default for non-'poly' kernels

    # Gamma choice
    gamma_choice = trial.suggest_categorical('gamma_choice', ['scale', 'auto', 'variable'])
    print('gamma_choice:', gamma_choice)
        
    if gamma_choice == 'variable':
        gamma = trial.suggest_float('gamma_value', 0.001, 10)
        print(f'gamma is: {gamma}')
    else:
        gamma = gamma_choice
        print('gamma_choice is NOT variable. Gamma is:', gamma)
    
    # Decision function shape and break ties
    decision_function_shape = trial.suggest_categorical('decision_function_shape', ['ovo', 'ovr'])
    print(f'decision_function_shape: {decision_function_shape}')

    # Break ties only when decision_function_shape is 'ovr'
    break_ties = (decision_function_shape == 'ovr')
    
    # C parameter
    C = trial.suggest_float('C', 1E-4, 1E4, log=True)
    print(f'C: {C}')
    
    # Base SVC classifier definition
    base_svc = SVC(        
        C=C,
        kernel=kernel,
        degree=degree,
        gamma=gamma,
        coef0=coef0,
        probability=True,
        cache_size=200,
        class_weight='balanced',#trial.suggest_categorical('class_weight', ['balanced', class_weights]),
        decision_function_shape=decision_function_shape,
        break_ties=break_ties,
        random_state=seed  # Ensuring reproducibility
    )

    n_estimators=trial.suggest_int('n_estimators',10,1000)
    
    # BaggingClassifier with SVC as the base estimator
    bagging_clf = BaggingClassifier(
        estimator=base_svc,
        max_samples=1.0 / n_estimators,
        n_estimators=n_estimators,
        bootstrap=False,  # Using sub-sampling without replacement
        n_jobs=-1,  # Utilize all cores
        random_state=seed  # Ensuring reproducibility
    )
    print('bagging_clf:',bagging_clf)    
    
    # OneVsRestClassifier to handle multi-class classification
    # clf = OneVsRestClassifier(bagging_clf)

    # Define StratifiedKFold cross-validation with consistent random state
    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=seed)
    print(f'cv: {cv}')
   
    # Cross-validation to evaluate the model
    X_data = X_kernel if kernel == 'precomputed' else X_train
    scores = cross_validate(
        bagging_clf, 
        X_data, 
        y_train, 
        cv=cv, 
        n_jobs=-1, 
        scoring=scoring,
        verbose=0
    )
    
    print(f'scores: {scores}')
    return scores['test_accuracy'].mean()

# Optuna study for hyperparameter tuning
svc_study = optuna.create_study(direction='maximize')
svc_study.optimize(objective, n_trials=n_trials)

# Storing results and timings
results['svc'] = svc_study.best_params
end_time = time.time()
elapsed_time = end_time - start_time

times['svc'] = {
    'start_time_svc': start_time,
    'end_time_svc': end_time,
    'elapsed_time_svc': elapsed_time
}

print(f"Optuna tuning completed in {elapsed_time:.2f} seconds")

[I 2024-10-10 21:22:38,031] A new study created in memory with name: no-name-defbdf81-9245-4637-8de1-5a8e6c96746b


kernel_choice: linear
kernel_choice is not poly or precomputed
gamma_choice: auto
gamma_choice is NOT variable. Gamma is: auto
decision_function_shape: ovo
C: 0.06934724248412098
bagging_clf: BaggingClassifier(bootstrap=False,
                  estimator=SVC(C=0.06934724248412098, class_weight='balanced',
                                coef0=-7.35909451141379,
                                decision_function_shape='ovo', gamma='auto',
                                kernel='linear', probability=True,
                                random_state=1379),
                  max_samples=0.007575757575757576, n_estimators=132, n_jobs=-1,
                  random_state=1379)
cv: StratifiedKFold(n_splits=3, random_state=1379, shuffle=True)


[I 2024-10-10 21:24:02,577] Trial 0 finished with value: 0.9269062518104203 and parameters: {'coef0': -7.35909451141379, 'gamma_choice': 'auto', 'decision_function_shape': 'ovo', 'C': 0.06934724248412098, 'n_estimators': 132}. Best is trial 0 with value: 0.9269062518104203.


scores: {'fit_time': array([5.94867206, 7.8318181 , 6.97284317]), 'score_time': array([60.53515816, 68.28523588, 75.72018504]), 'test_accuracy': array([0.92687729, 0.92687729, 0.92696419]), 'test_precision': array([0., 0., 0.]), 'test_recall': array([0., 0., 0.]), 'test_f1': array([0., 0., 0.])}
kernel_choice: linear
kernel_choice is not poly or precomputed
gamma_choice: variable
gamma is: 2.773251876861079
decision_function_shape: ovr
C: 4686.51869510569
bagging_clf: BaggingClassifier(bootstrap=False,
                  estimator=SVC(C=4686.51869510569, break_ties=True,
                                class_weight='balanced',
                                coef0=-7.292256432077984,
                                gamma=2.773251876861079, kernel='linear',
                                probability=True, random_state=1379),
                  max_samples=0.001366120218579235, n_estimators=732, n_jobs=-1,
                  random_state=1379)
cv: StratifiedKFold(n_splits=3, random_state=1

[W 2024-10-10 21:24:03,694] Trial 1 failed with parameters: {'coef0': -7.292256432077984, 'gamma_choice': 'variable', 'gamma_value': 2.773251876861079, 'decision_function_shape': 'ovr', 'C': 4686.51869510569, 'n_estimators': 732} because of the following error: ValueError('\nAll the 3 fits failed.\nIt is very likely that your model is misconfigured.\nYou can try to debug the error by setting error_score=\'raise\'.\n\nBelow are more details about the failures:\n--------------------------------------------------------------------------------\n3 fits failed with the following error:\njoblib.externals.loky.process_executor._RemoteTraceback: \n"""\nTraceback (most recent call last):\n  File "/Applications/Anaconda/anaconda3/envs/apziva/lib/python3.9/site-packages/joblib/_utils.py", line 72, in __call__\n    return self.func(**kwargs)\n  File "/Applications/Anaconda/anaconda3/envs/apziva/lib/python3.9/site-packages/joblib/parallel.py", line 598, in __call__\n    return [func(*args, **kwargs)

ValueError: 
All the 3 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
3 fits failed with the following error:
joblib.externals.loky.process_executor._RemoteTraceback: 
"""
Traceback (most recent call last):
  File "/Applications/Anaconda/anaconda3/envs/apziva/lib/python3.9/site-packages/joblib/_utils.py", line 72, in __call__
    return self.func(**kwargs)
  File "/Applications/Anaconda/anaconda3/envs/apziva/lib/python3.9/site-packages/joblib/parallel.py", line 598, in __call__
    return [func(*args, **kwargs)
  File "/Applications/Anaconda/anaconda3/envs/apziva/lib/python3.9/site-packages/joblib/parallel.py", line 598, in <listcomp>
    return [func(*args, **kwargs)
  File "/Applications/Anaconda/anaconda3/envs/apziva/lib/python3.9/site-packages/sklearn/utils/parallel.py", line 136, in __call__
    return self.function(*args, **kwargs)
  File "/Applications/Anaconda/anaconda3/envs/apziva/lib/python3.9/site-packages/sklearn/ensemble/_bagging.py", line 187, in _parallel_build_estimators
    estimator_fit(X_, y, **fit_params_)
  File "/Applications/Anaconda/anaconda3/envs/apziva/lib/python3.9/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Applications/Anaconda/anaconda3/envs/apziva/lib/python3.9/site-packages/sklearn/svm/_base.py", line 250, in fit
    fit(X, y, sample_weight, solver_type, kernel, random_seed=seed)
  File "/Applications/Anaconda/anaconda3/envs/apziva/lib/python3.9/site-packages/sklearn/svm/_base.py", line 328, in _dense_fit
    ) = libsvm.fit(
  File "_libsvm.pyx", line 216, in sklearn.svm._libsvm.fit
ValueError: Invalid input - all samples with positive weights belong to the same class.
"""

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/Applications/Anaconda/anaconda3/envs/apziva/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Applications/Anaconda/anaconda3/envs/apziva/lib/python3.9/site-packages/sklearn/utils/validation.py", line 66, in inner_f
    return f(*args, **kwargs)
  File "/Applications/Anaconda/anaconda3/envs/apziva/lib/python3.9/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Applications/Anaconda/anaconda3/envs/apziva/lib/python3.9/site-packages/sklearn/ensemble/_bagging.py", line 402, in fit
    return self._fit(X, y, max_samples=self.max_samples, **fit_params)
  File "/Applications/Anaconda/anaconda3/envs/apziva/lib/python3.9/site-packages/sklearn/ensemble/_bagging.py", line 545, in _fit
    all_results = Parallel(
  File "/Applications/Anaconda/anaconda3/envs/apziva/lib/python3.9/site-packages/sklearn/utils/parallel.py", line 74, in __call__
    return super().__call__(iterable_with_config)
  File "/Applications/Anaconda/anaconda3/envs/apziva/lib/python3.9/site-packages/joblib/parallel.py", line 2007, in __call__
    return output if self.return_generator else list(output)
  File "/Applications/Anaconda/anaconda3/envs/apziva/lib/python3.9/site-packages/joblib/parallel.py", line 1650, in _get_outputs
    yield from self._retrieve()
  File "/Applications/Anaconda/anaconda3/envs/apziva/lib/python3.9/site-packages/joblib/parallel.py", line 1754, in _retrieve
    self._raise_error_fast()
  File "/Applications/Anaconda/anaconda3/envs/apziva/lib/python3.9/site-packages/joblib/parallel.py", line 1789, in _raise_error_fast
    error_job.get_result(self.timeout)
  File "/Applications/Anaconda/anaconda3/envs/apziva/lib/python3.9/site-packages/joblib/parallel.py", line 745, in get_result
    return self._return_or_raise()
  File "/Applications/Anaconda/anaconda3/envs/apziva/lib/python3.9/site-packages/joblib/parallel.py", line 763, in _return_or_raise
    raise self._result
ValueError: Invalid input - all samples with positive weights belong to the same class.


In [110]:
clf=RandomForestClassifier(n_estimators=1000,
                           max_depth=5,
                           random_state=seed,
                           n_jobs=-1,
                           verbose=0)

clf.fit(X_train,y_train)

y_pred=clf.predict(X_test)
report=classification_report(y_test,y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.49      0.60      0.54       571
           1       0.51      0.40      0.45       588

    accuracy                           0.50      1159
   macro avg       0.50      0.50      0.50      1159
weighted avg       0.50      0.50      0.49      1159



In [113]:
n_estimators=10

clf_svc=SVC(random_state=seed)
clf=BaggingClassifier(estimator=clf_svc,
                      random_state=seed,
                      max_samples=1.0/n_estimators)

clf.fit(X_train,y_train)

y_pred=clf.predict(X_test)
report=classification_report(y_test,y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.48      0.64      0.55       571
           1       0.48      0.33      0.39       588

    accuracy                           0.48      1159
   macro avg       0.48      0.48      0.47      1159
weighted avg       0.48      0.48      0.47      1159

