## Leitura dos dados

In [2]:
# Bibliotecas
import os
import pandas as pd
import plotly.express as px


In [3]:
# Load the dataset: the CSV is read from the "raw" directory that sits next to
# the current working directory (i.e. "<cwd>/../raw").
current_dir = os.getcwd()
data_path = os.path.abspath(
    os.path.join(current_dir, "..", "raw")
)
dados = pd.read_csv(
    os.path.join(data_path, "marketing_investimento.csv")
)
# Last expression of the cell: renders the loaded DataFrame.
dados

Unnamed: 0,idade,estado_civil,escolaridade,inadimplencia,saldo,fez_emprestimo,tempo_ult_contato,numero_contatos,aderencia_investimento
0,45,casado (a),superior,nao,242,nao,587,1,sim
1,42,casado (a),medio,nao,1289,nao,250,4,sim
2,23,solteiro (a),superior,nao,363,nao,16,18,nao
3,58,divorciado (a),superior,nao,1382,nao,700,1,sim
4,50,casado (a),medio,nao,3357,nao,239,4,sim
...,...,...,...,...,...,...,...,...,...
1263,52,solteiro (a),superior,nao,83,nao,1223,6,sim
1264,35,solteiro (a),superior,nao,5958,nao,215,1,sim
1265,30,solteiro (a),superior,nao,-477,sim,1532,2,sim
1266,42,casado (a),superior,nao,2187,nao,525,3,sim


In [4]:
# Print each column's dtype and non-null count for the loaded DataFrame.
dados.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1268 entries, 0 to 1267
Data columns (total 9 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   idade                   1268 non-null   int64 
 1   estado_civil            1268 non-null   object
 2   escolaridade            1268 non-null   object
 3   inadimplencia           1268 non-null   object
 4   saldo                   1268 non-null   int64 
 5   fez_emprestimo          1268 non-null   object
 6   tempo_ult_contato       1268 non-null   int64 
 7   numero_contatos         1268 non-null   int64 
 8   aderencia_investimento  1268 non-null   object
dtypes: int64(4), object(5)
memory usage: 89.3+ KB


## Análise Exploratória dos Dados

### Variáveis Categóricas

In [5]:
# Investment adherence (target): number of records per class, with the count
# written on each bar.
px.histogram(
    dados,
    x='aderencia_investimento',
    text_auto=True,
)

In [6]:
# Marital status: record counts per category, with side-by-side bars for each
# investment-adherence class.
px.histogram(
    dados,
    x='estado_civil',
    color='aderencia_investimento',
    barmode='group',
    text_auto=True,
)

In [7]:
# Education level: record counts per category, with side-by-side bars for each
# investment-adherence class.
px.histogram(
    dados,
    x='escolaridade',
    color='aderencia_investimento',
    barmode='group',
    text_auto=True,
)

In [8]:
# Default status (inadimplencia): record counts per category, with
# side-by-side bars for each investment-adherence class.
px.histogram(
    dados,
    x='inadimplencia',
    color='aderencia_investimento',
    barmode='group',
    text_auto=True,
)

In [9]:
# Took a loan (fez_emprestimo): record counts per category, with side-by-side
# bars for each investment-adherence class.
px.histogram(
    dados,
    x='fez_emprestimo',
    color='aderencia_investimento',
    barmode='group',
    text_auto=True,
)

### Variáveis Numéricas

In [10]:
# Age: box plot of `idade`, one box per investment-adherence class.
px.box(
    dados,
    x="idade",
    color="aderencia_investimento",
)

In [11]:
# Balance: box plot of `saldo`, one box per investment-adherence class.
px.box(
    dados,
    x="saldo",
    color="aderencia_investimento",
)

In [12]:
# Time since last contact: box plot of `tempo_ult_contato`, one box per
# investment-adherence class.
px.box(
    dados,
    x="tempo_ult_contato",
    color="aderencia_investimento",
)

In [13]:
# Number of contacts: box plot of `numero_contatos`, one box per
# investment-adherence class.
px.box(
    dados,
    x="numero_contatos",
    color="aderencia_investimento",
)

### Modelamento

In [14]:
# Separate target and features: `y` is the adherence column on its own,
# `x` is every other column of the dataset.
y = dados['aderencia_investimento']
x = dados.drop(columns='aderencia_investimento')

#### Transformando variáveis categóricas em numéricas

A transformação com o OneHotEncoder é recomendada principalmente para variáveis categóricas nominais, isto é, aquelas em que não existe ordem entre as categorias.
O OneHotEncoder cria uma coluna binária para cada categoria; por isso, a limitação do método aparece quando existem múltiplas categorias, que podem aumentar muito o número de colunas do dataframe.

O label encoder será utilizado para realizar a transformação da variável alvo.

In [15]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder

# Keep the feature column names as they are before encoding
colunas = x.columns

# Column transformer that one-hot encodes the categorical columns.
# With drop='if_binary', a feature with exactly two categories is encoded as a
# single 0/1 column; features with more categories keep one column each.
one_hot = make_column_transformer(
    (
        OneHotEncoder(drop='if_binary'),
        ['estado_civil', 'escolaridade', 'inadimplencia', 'fez_emprestimo'],
    ),
    remainder='passthrough',  # columns not listed above are kept unchanged
    sparse_threshold=0,       # always return a dense array
)


In [16]:
# Fit the column transformer and encode the features.
# The input is rebuilt from `dados` rather than taken from `x`: the result
# overwrites `x` with a plain array, and the transformer selects columns by
# name, so feeding `x` back in would fail if this cell were run a second time.
x = one_hot.fit_transform(dados.drop('aderencia_investimento', axis=1))

In [17]:
# Column names produced by the fitted transformer (after encoding)
one_hot.get_feature_names_out(colunas)

array(['onehotencoder__estado_civil_casado (a)',
       'onehotencoder__estado_civil_divorciado (a)',
       'onehotencoder__estado_civil_solteiro (a)',
       'onehotencoder__escolaridade_fundamental',
       'onehotencoder__escolaridade_medio',
       'onehotencoder__escolaridade_superior',
       'onehotencoder__inadimplencia_sim',
       'onehotencoder__fez_emprestimo_sim', 'remainder__idade',
       'remainder__saldo', 'remainder__tempo_ult_contato',
       'remainder__numero_contatos'], dtype=object)

In [18]:
# Preview the encoded features as a DataFrame labelled with the transformer's
# output column names. Display only: the result is not assigned, so `x` stays
# as it is.
pd.DataFrame(
    x,
    columns=one_hot.get_feature_names_out(colunas),
)

Unnamed: 0,onehotencoder__estado_civil_casado (a),onehotencoder__estado_civil_divorciado (a),onehotencoder__estado_civil_solteiro (a),onehotencoder__escolaridade_fundamental,onehotencoder__escolaridade_medio,onehotencoder__escolaridade_superior,onehotencoder__inadimplencia_sim,onehotencoder__fez_emprestimo_sim,remainder__idade,remainder__saldo,remainder__tempo_ult_contato,remainder__numero_contatos
0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,45.0,242.0,587.0,1.0
1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,42.0,1289.0,250.0,4.0
2,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,23.0,363.0,16.0,18.0
3,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,58.0,1382.0,700.0,1.0
4,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,50.0,3357.0,239.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1263,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,52.0,83.0,1223.0,6.0
1264,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,35.0,5958.0,215.0,1.0
1265,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,30.0,-477.0,1532.0,2.0
1266,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,42.0,2187.0,525.0,3.0


In [19]:
# Realizando a transformação da variável alvo
from sklearn.preprocessing import LabelEncoder

In [20]:
# Encoder used to turn the target labels into integer codes
label_encoder = LabelEncoder()

In [21]:
# Encode the target as integers.
# The encoder is fitted on the original column from `dados` rather than on `y`:
# this cell overwrites `y` with the integer codes, so refitting on `y` in a
# second run would make the encoder learn the codes instead of the original
# labels.
y = label_encoder.fit_transform(dados['aderencia_investimento'])

In [None]:
# Display the encoded target array
y

array([1, 1, 0, ..., 1, 1, 1])

#### Treinando o modelo

In [23]:
# Train/test split
from sklearn.model_selection import train_test_split

# stratify=y keeps the class proportions of the target in both subsets;
# random_state fixes the shuffle so the split is reproducible.
x_treino, x_teste, y_treino, y_teste = train_test_split(
    x,
    y,
    stratify=y,
    random_state=5,
)

In [24]:
# Decision tree
from sklearn.tree import DecisionTreeClassifier

# Tree limited to depth 3, with a fixed seed for reproducible fitting;
# trained on the training split only.
arvore = DecisionTreeClassifier(max_depth=3, random_state=5)
arvore.fit(x_treino, y_treino)

In [25]:
# Predicted classes of the decision tree for the test split
arvore.predict(x_teste)

array([0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0,
       0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1,

In [26]:
# Score of the decision tree on the test split
arvore.score(x_teste, y_teste)

0.7160883280757098

In [27]:
# Plot tree
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

In [28]:
# Human-readable names for the 12 encoded features, in the same order as the
# columns returned by `one_hot.get_feature_names_out`:
# 3 marital-status columns, 3 education columns, 2 binary columns, then the
# 4 numeric columns passed through unchanged.
# 'solteiro (a)' was previously missing, which left the list one entry short
# and shifted every label after 'divorciado (a)'.
nome_colunas = ['casado (a)',
                'divorciado (a)',
                'solteiro (a)',
                'fundamental',
                'medio',
                'superior',
                'inadimplencia',
                'fez_emprestimo',
                'idade',
                'saldo',
                'tempo_ult_contato',
                'numero_contatos']

### Segundo Modelo - Método Knn
O método KNN classifica cada registro com base na distância entre ele e os demais registros da base de dados, usando os vizinhos mais próximos. Por isso, é preciso colocar as variáveis na mesma escala: caso contrário, variáveis com valores numericamente maiores dominariam o cálculo da distância e o algoritmo daria a elas um peso maior do que às demais, o que não necessariamente deve acontecer.

In [29]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scaler: its parameters are learned from the training split only,
# and the same training split is then scaled with them.
normalizacao = MinMaxScaler()
x_treino_normalizado = normalizacao.fit(x_treino).transform(x_treino)

In [30]:
# Preview the scaled training features (display only; nothing is assigned)
pd.DataFrame(x_treino_normalizado)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.205882,0.065564,0.123734,0.032258
1,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.147059,0.045792,0.396527,0.032258
2,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.338235,0.076036,0.335022,0.000000
3,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.573529,0.062866,0.315123,0.000000
4,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.338235,0.148380,0.065847,0.129032
...,...,...,...,...,...,...,...,...,...,...,...,...
946,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.191176,0.044265,0.246382,0.129032
947,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.205882,0.028043,0.275687,0.032258
948,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.323529,0.042952,0.024964,0.129032
949,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.176471,0.042810,0.023878,0.000000


In [31]:
# Instantiate the k-nearest-neighbours classifier with its default settings
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()

In [32]:
# Train KNN on the scaled training features
knn.fit(x_treino_normalizado, y_treino)

In [33]:
# Scale the test split with the scaler already fitted on the training split
# (transform only, no re-fit)
x_teste_normalizado = normalizacao.transform(x_teste)

In [34]:
# Score of KNN on the scaled test split
knn.score(x_teste_normalizado, y_teste)

0.6876971608832808

### Escolhendo o melhor modelo

In [35]:
# Report both models' test-set scores, one per line
print(
    f'Acurácia Árvore: {arvore.score(x_teste, y_teste)}',
    f'Acurácia KNN: {knn.score(x_teste_normalizado, y_teste)}',
    sep='\n',
)

Acurácia Árvore: 0.7160883280757098
Acurácia KNN: 0.6876971608832808


In [36]:
import pickle

In [None]:
# Persist the fitted column transformer (one-hot encoding step) to disk
with open('modelo_onehotenc.pkl', mode='wb') as arquivo:
    pickle.dump(one_hot, arquivo)

In [39]:
# Persist the trained decision tree to disk
with open('modelo_arvore.pkl', mode='wb') as arquivo:
    pickle.dump(arvore, arquivo)