In [70]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import copy
import joblib
import time
import threading
from datetime import datetime
# from contextlib import contextmanager
from timeit import default_timer

from scipy.stats import chi2_contingency

from sklearn.model_selection import train_test_split
# from pycaret.classification import setup,compare_models,create_model,plot_model,evaluate_model
# from pycaret.regression import *

# from autosklearn.classification import AutoSklearnClassifier
import optuna

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
# import sklearn.lda.LDA as LDA # <-- this is throwing the ModuleNotFoundError

In [71]:
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

In [72]:
# simple function to generate random integers

def rand_gen(low=1,high=1e4):
    """
    Draw a single pseudo-random integer from the half-open range [low, high).

    With the default bounds the result has at most four digits.
    A brand-new, unseeded NumPy generator is created on every call,
    so successive calls (and successive kernel runs) are not reproducible.
    """
    from numpy.random import default_rng

    # cast the NumPy integer to a plain Python int before handing it back
    return int(default_rng().integers(low=low,high=high))

In [73]:
# Notebook-wide seed. rand_gen() uses an unseeded generator, so this value
# changes on every run; to reproduce an earlier run, assign the value that
# run displayed below instead of calling rand_gen().
seed=rand_gen()
seed

1379

In [74]:
# fraction of rows held out as the test set (passed to train_test_split below)
test_size=0.2
test_size

0.2

In [75]:
# set the randomness seed throughout the notebook
# source: https://odsc.medium.com/properly-setting-the-random-seed-in-ml-experiments-not-as-simple-as-you-might-imagine-219969c84752

## set `PYTHONHASHSEED` environment variable at a fixed value
## NOTE: the running interpreter reads this variable only at start-up, so
## setting it here does not change hashing in this kernel; it is inherited
## only by child processes started afterwards
import os
os.environ['PYTHONHASHSEED']=str(seed)
## set `python` built-in pseudo-random generator at a fixed value
import random
random.seed(seed)
## set `numpy` legacy global pseudo-random generator at a fixed value
np.random.seed(seed)
## `default_rng` does not seed anything globally: it returns a new, independent
## Generator. Keep a reference so seeded draws can actually be made from it
## (previously the Generator was created and immediately discarded).
rng=np.random.default_rng(seed)
rng

Generator(PCG64) at 0x169709F20

In [76]:
import json
from pathlib import Path
import inspect

def get_variable_name(var):
    callers_local_vars = inspect.currentframe().f_back.f_locals.items()
    return [name for name, val in callers_local_vars if val is var]

def fileDaterSaver(location: str,
                   filetype: str,
                   object_,
                   extra: str = '',
                   verbose: bool = True):

    '''
    Save an object under a timestamped filename and report the result.

    The filename is built as ``{location}{extra}_{timestamp}.{filetype}``
    (or ``{location}{timestamp}.{filetype}`` when ``extra`` is empty), with
    the timestamp formatted as ``YYYY-MM-DD_HH-MM-SS``.

    Parameters:
    -----------
    location: str - Prefix prepended verbatim to the filename; include a
        trailing path separator when it is a directory.
    filetype: str - The type of the file to save ('csv' or 'json').
    object_: The object to be saved. Must provide ``to_csv`` (e.g. a pandas
        DataFrame) for 'csv'; for 'json', values that are not JSON
        serializable are written via ``str()``.
    extra: str - Additional string to include in the filename.
    verbose: bool - Whether to print the timestamp and the filename
        before saving.

    Raises:
    -------
    ValueError - If ``filetype`` is neither 'csv' nor 'json'.
    '''

    # get current date and time
    current_datetime = datetime.now()

    # print current date and time to check
    if verbose:
        print('current_datetime:', current_datetime)

    # format the datetime for a filename
    datetime_suffix = current_datetime.strftime("%Y-%m-%d_%H-%M-%S")

    # create filename with the datetime suffix
    if extra != '':
        file_name = f'{location}{extra}_{datetime_suffix}.{filetype}'
    else:
        file_name = f'{location}{datetime_suffix}.{filetype}'

    # print file name
    if verbose:
        print(file_name)

    # save object
    if filetype == 'csv':
        object_.to_csv(file_name, index=True)
    elif filetype == 'json':
        with open(file_name, 'w') as file:
            json.dump(object_, file, default=str)
    else:
        raise ValueError("Unsupported file type. Use 'csv' or 'json'.")

    # confirm save
    file_path = Path(file_name)
    if file_path.exists():
        # Look the object up in the namespace of whoever called this function.
        # get_variable_name() inspects its *immediate* caller, which from here
        # would be this function itself, so it would always report the
        # parameter name 'object_' instead of the user's variable name.
        frame = inspect.currentframe()
        caller = frame.f_back if frame is not None else None
        try:
            if caller is None:
                names = []
            else:
                names = [name for name, val in caller.f_locals.items()
                         if val is object_]
        finally:
            # drop frame references promptly to avoid reference cycles
            del frame, caller

        # prefer user-chosen names over underscore-prefixed aliases
        # (e.g. IPython's `_` / `_N` output-history variables)
        public_names = [name for name in names if not name.startswith('_')]
        variable_name = public_names or names

        if variable_name:
            print(f'Successfully saved {variable_name[0]} to {file_path}')
        else:
            print(f'Successfully saved object to {file_path}')
    else:
        print("File save error.")

In [77]:
df=pd.read_csv('../data/2_data.csv')

# demographic and banking data
dbd=['age','job','marital','education','default','balance','housing','loan']

# campaign-specific data
csd=['contact','day','month','duration','campaign']

# binary target: 'yes' -> 1, 'no' -> 0
y=df['y'].map({'yes':1,'no':0})

# X_1: every column except the target and the campaign-specific ones
X_1=df.loc[:,~df.columns.isin(['y']+csd)]

# X_2 and X_3: every column except the target
# (the two selections are currently identical)
X_2=df.loc[:,df.columns!='y']

X_3=df.loc[:,df.columns!='y']

In [78]:
from sklearn.preprocessing import StandardScaler
from pandas.api.types import is_string_dtype

In [79]:
# dummify X_1

# one-hot encode every string-typed column; drop_first removes one level per
# feature so the remaining indicators are not perfectly collinear
cols_to_transform=[col for col in X_1.columns if is_string_dtype(X_1[col])]

# dtype=int yields 0/1 integer indicators directly, independent of the
# pandas version's default dummy dtype
X_1_dummy=pd.get_dummies(data=X_1,
                         columns=cols_to_transform,
                         drop_first=True,
                         dtype=int)

# indicator columns are exactly the ones get_dummies added; matching on '_'
# in the name would also catch (and integer-cast) any numeric feature whose
# name happens to contain an underscore
bool_cols=[col for col in X_1_dummy.columns if col not in X_1.columns]

# NOTE(review): the scaler is fit on all rows, before the train/test split
# performed in a later cell, so test-set statistics influence the scaling.
# Consider fitting on the training rows only (e.g. inside a Pipeline).
scaler=StandardScaler()
# keep the original index so the scaled rows stay aligned with `y`
X_1_scaled=pd.DataFrame(scaler.fit_transform(X_1_dummy),
                        columns=X_1_dummy.columns,
                        index=X_1_dummy.index)

X_1_scaled.shape

(40000, 21)

In [80]:
# edit number to determine subsampling fraction
# (passed to DataFrame.sample as `frac` below; 1 keeps every row)
frac=1

In [81]:
# Randomly draw `frac` of the rows (seeded for repeatability). The rows come
# back in shuffled order but keep their original index labels, so any label
# vector paired with this frame must be matched by index, not by position.
X_1_scaled_sample=X_1_scaled.sample(frac=frac,random_state=seed)
X_1_scaled_sample

Unnamed: 0,age,balance,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,...,job_unemployed,job_unknown,marital_married,marital_single,education_secondary,education_tertiary,education_unknown,default_yes,housing_yes,loan_yes
23745,2.017849,-0.138538,-0.553592,-0.190797,-0.167135,-0.506476,5.180325,-0.191430,-0.329151,-0.115212,...,-0.168474,-0.076875,0.800178,-0.611597,-1.050946,1.602971,-0.199495,-0.143675,-1.226725,-0.457772
22457,0.358382,-0.447795,-0.553592,-0.190797,-0.167135,1.974426,-0.193038,-0.191430,-0.329151,-0.115212,...,-0.168474,-0.076875,-1.249721,-0.611597,-1.050946,1.602971,-0.199495,6.960155,-1.226725,-0.457772
12777,-1.197367,-0.270782,-0.553592,-0.190797,-0.167135,-0.506476,-0.193038,-0.191430,-0.329151,-0.115212,...,-0.168474,-0.076875,0.800178,-0.611597,0.951524,-0.623842,-0.199495,-0.143675,-1.226725,2.184492
385,1.810415,0.135247,-0.553592,-0.190797,-0.167135,-0.506476,-0.193038,-0.191430,3.038121,-0.115212,...,-0.168474,-0.076875,0.800178,-0.611597,0.951524,-0.623842,-0.199495,-0.143675,0.815179,2.184492
19053,1.914132,-0.438841,-0.553592,-0.190797,-0.167135,-0.506476,-0.193038,-0.191430,-0.329151,-0.115212,...,-0.168474,-0.076875,-1.249721,-0.611597,0.951524,-0.623842,-0.199495,-0.143675,-1.226725,-0.457772
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1520,-0.367634,-0.307631,-0.553592,-0.190797,-0.167135,-0.506476,-0.193038,-0.191430,-0.329151,-0.115212,...,5.935645,-0.076875,-1.249721,1.635063,0.951524,-0.623842,-0.199495,-0.143675,0.815179,-0.457772
35081,-0.263917,-0.326227,-0.553592,-0.190797,-0.167135,-0.506476,-0.193038,5.223844,-0.329151,-0.115212,...,-0.168474,-0.076875,0.800178,-0.611597,-1.050946,1.602971,-0.199495,-0.143675,0.815179,-0.457772
36486,-0.471351,-0.401303,1.806386,-0.190797,-0.167135,-0.506476,-0.193038,-0.191430,-0.329151,-0.115212,...,-0.168474,-0.076875,-1.249721,1.635063,-1.050946,-0.623842,-0.199495,-0.143675,0.815179,-0.457772
14130,0.565816,-0.151625,-0.553592,-0.190797,-0.167135,-0.506476,-0.193038,-0.191430,-0.329151,-0.115212,...,-0.168474,-0.076875,-1.249721,1.635063,0.951524,-0.623842,-0.199495,-0.143675,0.815179,-0.457772


In [82]:
# X_1_scaled_sample was row-shuffled (and possibly subsampled) by .sample(),
# while `y` is still in the original row order. train_test_split pairs rows
# by position, so select the labels by index first to keep every feature row
# with its own target.
X_train, X_test, y_train, y_test = train_test_split(X_1_scaled_sample,
                                                    y.loc[X_1_scaled_sample.index],
                                                    test_size=test_size,
                                                    random_state=seed)

# sanity check: features and labels refer to the same rows in both splits
assert X_train.index.equals(y_train.index) and X_test.index.equals(y_test.index)

In [83]:
# Balance the classes by randomly discarding majority-class rows.
# The labels are selected by index because X_1_scaled_sample was row-shuffled
# by .sample() while `y` is still in the original order; fit_resample pairs
# rows by position.
# NOTE(review): this resamples all rows, not just the training split created
# above - confirm that is intended before evaluating on X_test.
rus=RandomUnderSampler(random_state=seed)
X_res,y_res=rus.fit_resample(X_1_scaled_sample,y.loc[X_1_scaled_sample.index])
print(Counter(y_res))

Counter({0: 2896, 1: 2896})
