In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from feature_engine.encoding import OneHotEncoder

In [2]:
df = pd.read_csv("../data/Churn_Modelling.csv")

In [3]:
df

Unnamed: 0,NumeroLinha,IdUsuario,Sobrenome,PontosCredito,Pais,Sexo,Idade,VinculoEmpresa,DinheiroRestante,NumProdutosComprados,CartaoCredito,MembroAtivo,SalarioEstimado,Churn
0,1,15634602,Hargrave,619,France,Female,42,2,0.00,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.80,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.00,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,15606229,Obijiaku,771,France,Male,39,5,0.00,2,1,0,96270.64,0
9996,9997,15569892,Johnstone,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9997,9998,15584532,Liu,709,France,Female,36,7,0.00,1,0,1,42085.58,1
9998,9999,15682355,Sabbatini,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1


In [4]:
df.dtypes

NumeroLinha               int64
IdUsuario                 int64
Sobrenome                object
PontosCredito             int64
Pais                     object
Sexo                     object
Idade                     int64
VinculoEmpresa            int64
DinheiroRestante        float64
NumProdutosComprados      int64
CartaoCredito             int64
MembroAtivo               int64
SalarioEstimado         float64
Churn                     int64
dtype: object

In [5]:
df.describe()

Unnamed: 0,NumeroLinha,IdUsuario,PontosCredito,Idade,VinculoEmpresa,DinheiroRestante,NumProdutosComprados,CartaoCredito,MembroAtivo,SalarioEstimado,Churn
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5000.5,15690940.0,650.5288,38.9218,5.0128,76485.889288,1.5302,0.7055,0.5151,100090.239881,0.2037
std,2886.89568,71936.19,96.653299,10.487806,2.892174,62397.405202,0.581654,0.45584,0.499797,57510.492818,0.402769
min,1.0,15565700.0,350.0,18.0,0.0,0.0,1.0,0.0,0.0,11.58,0.0
25%,2500.75,15628530.0,584.0,32.0,3.0,0.0,1.0,0.0,0.0,51002.11,0.0
50%,5000.5,15690740.0,652.0,37.0,5.0,97198.54,1.0,1.0,1.0,100193.915,0.0
75%,7500.25,15753230.0,718.0,44.0,7.0,127644.24,2.0,1.0,1.0,149388.2475,0.0
max,10000.0,15815690.0,850.0,92.0,10.0,250898.09,4.0,1.0,1.0,199992.48,1.0


## Removendo coluna
- Removendo coluna: NumeroLinha. Remoção pois a coluna é um contador do numero de linha.
- Removendo coluna: IdUsuario. Remoção pois a coluna é um identificador não sendo utilizado para o treinamento.
- Removendo coluna: Sobrenome.  Remoção pois a coluna de nome não é necessario para treinamento.

In [6]:
df.drop(columns=["NumeroLinha", "IdUsuario", "Sobrenome"], inplace=True);

In [7]:
df

Unnamed: 0,PontosCredito,Pais,Sexo,Idade,VinculoEmpresa,DinheiroRestante,NumProdutosComprados,CartaoCredito,MembroAtivo,SalarioEstimado,Churn
0,619,France,Female,42,2,0.00,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.80,3,1,0,113931.57,1
3,699,France,Female,39,1,0.00,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...
9995,771,France,Male,39,5,0.00,2,1,0,96270.64,0
9996,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9997,709,France,Female,36,7,0.00,1,0,1,42085.58,1
9998,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1


## Verificando se existe valores nulos

In [8]:
df.isna().sum()

PontosCredito           0
Pais                    0
Sexo                    0
Idade                   0
VinculoEmpresa          0
DinheiroRestante        0
NumProdutosComprados    0
CartaoCredito           0
MembroAtivo             0
SalarioEstimado         0
Churn                   0
dtype: int64

## Separando feature e target

In [15]:
features= df.drop(columns="Churn").columns.to_list()
cat_features = ["Pais", "Sexo"]
target = "Churn"

X = df[features]
y = df[target]

## Separando treino e teste

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

## Enconding

In [17]:
onehot = OneHotEncoder(variables=cat_features)

In [18]:
onehot.fit(X_train)
df_train = onehot.fit_transform(X_train)
df_test = onehot.fit_transform(X_test)

## Concat para exportar

In [19]:
df_train = pd.concat([df_train, y_train], axis=1).reset_index(drop=True)
df_train

Unnamed: 0,PontosCredito,Idade,VinculoEmpresa,DinheiroRestante,NumProdutosComprados,CartaoCredito,MembroAtivo,SalarioEstimado,Pais_France,Pais_Germany,Pais_Spain,Sexo_Male,Sexo_Female,Churn
0,673,37,2,0.00,1,1,1,13624.02,1,0,0,1,0,0
1,731,43,9,79120.27,1,0,0,548.52,0,1,0,0,1,1
2,668,42,8,187534.79,1,1,1,32900.41,1,0,0,1,0,1
3,677,26,3,102395.79,1,1,0,119368.99,0,1,0,0,1,0
4,595,41,9,150463.11,2,0,1,81548.38,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8495,768,54,8,69712.74,1,1,1,69381.05,1,0,0,1,0,0
8496,682,58,1,0.00,1,1,1,706.50,1,0,0,0,1,0
8497,735,38,1,0.00,3,0,0,92220.12,1,0,0,0,1,1
8498,667,43,8,190227.46,1,1,0,97508.04,1,0,0,1,0,1


In [20]:
df_test = pd.concat([df_test, y_test], axis=1).reset_index(drop=True)
df_test

Unnamed: 0,PontosCredito,Idade,VinculoEmpresa,DinheiroRestante,NumProdutosComprados,CartaoCredito,MembroAtivo,SalarioEstimado,Pais_Germany,Pais_France,Pais_Spain,Sexo_Male,Sexo_Female,Churn
0,596,32,3,96709.07,2,0,0,41788.37,1,0,0,1,0,0
1,623,43,1,0.00,2,1,1,146379.30,0,1,0,1,0,0
2,601,44,4,0.00,2,1,0,58561.31,0,0,1,0,1,0
3,506,59,8,119152.10,2,1,1,170679.74,1,0,0,1,0,0
4,560,27,7,124995.98,1,1,1,114669.79,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1495,637,34,9,0.00,2,0,0,26057.08,0,0,1,0,1,0
1496,629,31,6,0.00,2,1,0,93881.75,0,1,0,1,0,0
1497,665,25,7,90920.75,1,0,1,112256.57,0,1,0,1,0,0
1498,469,48,5,0.00,1,1,0,160529.71,0,1,0,0,1,1


## Exportar

In [24]:
df_train.to_csv('../data/processing/df_train.csv')
df_test.to_csv('../data/processing/df_test.csv')