In [1]:
# Paso 0. Importar librerías

# Datos ----------------------------------------------------------------
import pandas as pd
import numpy as np
# Graficos -------------------------------------------------------------
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning -----------------------------------------------------
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    FunctionTransformer, 
    OneHotEncoder, 
    StandardScaler)
from sklearn.compose import ColumnTransformer
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import PowerTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from xgboost import XGBClassifier
from sklearn.metrics import (accuracy_score,
                             classification_report,
                             f1_score,
                             recall_score,
                             confusion_matrix,
                             classification_report)
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

import plotly.express as px
import requests
from io import StringIO

# Modelo ---------------------------------------------------------------
from joblib import dump

### 1. Cargar datos
Objetivo: Obtener los datos de la fuente y una primera visión de las propiedades, más una presentación.

In [4]:
# Read the semicolon-delimited raw CSV into `df_raw`.
df_raw = pd.read_csv(
    filepath_or_buffer="../data/raw/bank-additional-full.csv",
    sep=";",
)
# Print the schema summary: row count, column names, non-null counts and dtypes.
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41188 non-null  int64  
 1   job             41188 non-null  object 
 2   marital         41188 non-null  object 
 3   education       41188 non-null  object 
 4   default         41188 non-null  object 
 5   housing         41188 non-null  object 
 6   loan            41188 non-null  object 
 7   contact         41188 non-null  object 
 8   month           41188 non-null  object 
 9   day_of_week     41188 non-null  object 
 10  duration        41188 non-null  int64  
 11  campaign        41188 non-null  int64  
 12  pdays           41188 non-null  int64  
 13  previous        41188 non-null  int64  
 14  poutcome        41188 non-null  object 
 15  emp.var.rate    41188 non-null  float64
 16  cons.price.idx  41188 non-null  float64
 17  cons.conf.idx   41188 non-null 

## Dictionary:

|# | Feature | Type        | Description   |
|--|---------|-------------|---------------|
|1 | age     | numeric      | age of person |
|2 | job     | categorical  | type of job:  <br>'admin.'<br>'blue-collar'<br>'entrepreneur'<br>'housemaid'<br>'management'<br>'retired'<br>'self-employed'<br>'services'<br>'student'<br>'technician'<br>'unemployed'<br>'unknown'|
|3 | marital | categorical |marital status: <br>'divorced'<br>'married'<br>'single'<br>'unknown'<br> note: 'divorced' means divorced or widowed|
|4 | education | categorical | education level: <br>'basic.4y'<br>'basic.6y'<br>'basic.9y'<br>'high.school'<br>'illiterate'<br>'professional.course'<br>'university.degree'<br>'unknown'|
|5 | default | categorical | has credit in default? <br>'no'<br>'yes'<br>'unknown'|
|6 | housing | categorical | has housing loan?: <br>'no'<br>'yes'<br>'unknown'|
|7 | loan | categorical | has personal loan?: <br>'no'<br>'yes'<br>'unknown'|

### Related to the last contact of the current campaign:
|# | Feature | Type        | Description   |
|--|---------|-------------|---------------|
|8 | contact | categorical | contact communication type: <br>'cellular'<br>'telephone'|
|9 | month | categorical | last contact month of year: <br>'jan'<br>'feb'<br>'mar'<br> ...<br> 'nov'<br>'dec'|
|10| day_of_week | categorical | last contact day of the week: <br>'mon'<br>'tue'<br>'wed'<br>'thu'<br>'fri'|
|11| duration | numeric | last contact duration, in seconds. <br>Important note: this attribute highly affects the output target (e.g., if duration=0 then y='no'). Yet, the duration is not known before a call is performed. Also, after the end of the call y is obviously known. Thus, this input should only be included for benchmark purposes and should be discarded if the intention is to have a realistic predictive model.|

### Other attributes:

|# | Feature | Type        | Description   |
|--|---------|-------------|---------------|
|12| campaign | numeric | number of contacts performed during this campaign and for this client (numeric, includes last contact)|
|13| pdays | numeric | number of days that passed by after the client was last contacted from a previous campaign (numeric; 999 means client was not previously contacted)|
|14| previous | numeric | number of contacts performed before this campaign and for this client|
|15| poutcome | categorical | outcome of the previous marketing campaign: <br>'failure'<br>'nonexistent'<br>'success'|

### Social and economic context attributes

|# | Feature | Type        | Description   |
|--|---------|-------------|---------------|
|16| emp.var.rate | numeric | employment variation rate - quarterly indicator |
|17| cons.price.idx | numeric | consumer price index - monthly indicator |
|18| cons.conf.idx | numeric | consumer confidence index - monthly indicator |
|19| euribor3m | numeric | euribor 3 month rate - daily indicator |
|20| nr.employed | numeric | number of employees - quarterly indicator | 

### Output variable (desired target):
|# | Feature | Type        | Description   |
|--|---------|-------------|---------------|
|21| y | binary | has the client subscribed to a term deposit?: <br>'yes'<br>'no'|

In [6]:
# Show 10 randomly drawn rows; the fixed seed keeps the preview reproducible.
df_raw.sample(n=10, random_state=2025)

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
4479,58,retired,married,basic.4y,unknown,no,no,telephone,may,tue,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.856,5191.0,yes
24230,51,entrepreneur,married,university.degree,no,yes,no,cellular,nov,mon,...,1,999,0,nonexistent,-0.1,93.2,-42.0,4.191,5195.8,no
8307,43,admin.,married,high.school,no,yes,no,telephone,jun,tue,...,2,999,0,nonexistent,1.4,94.465,-41.8,4.864,5228.1,no
4824,31,admin.,married,high.school,no,no,no,telephone,may,wed,...,2,999,0,nonexistent,1.1,93.994,-36.4,4.858,5191.0,no
12201,41,blue-collar,married,basic.6y,no,no,no,telephone,jul,wed,...,5,999,0,nonexistent,1.4,93.918,-42.7,4.956,5228.1,no
20123,36,housemaid,married,basic.6y,no,no,no,cellular,aug,mon,...,2,999,0,nonexistent,1.4,93.444,-36.1,4.965,5228.1,no
11995,44,admin.,married,university.degree,unknown,yes,yes,telephone,jun,thu,...,4,999,0,nonexistent,1.4,94.465,-41.8,4.955,5228.1,no
9514,45,admin.,married,university.degree,no,yes,no,telephone,jun,mon,...,5,999,0,nonexistent,1.4,94.465,-41.8,4.961,5228.1,no
20217,48,entrepreneur,married,university.degree,no,yes,no,cellular,aug,mon,...,1,999,0,nonexistent,1.4,93.444,-36.1,4.965,5228.1,no
8739,22,services,single,high.school,no,no,no,telephone,jun,wed,...,1,999,0,nonexistent,1.4,94.465,-41.8,4.864,5228.1,no


In [7]:
# Count the rows in each class of the target column `y`.
df_raw["y"].value_counts()

y
no     36548
yes     4640
Name: count, dtype: int64