# 0. Introdução

O objetivo deste notebook é gerar um modelo preditivo que busca prever qual será o churn de clientes para o próximo mes da empresa Telco telecom, desta maneira é possível estimar a receita esperada para o proximo mes, avaliando-se quais dos clientes presentes irão continuar.

**Objetivo: Acurácia >= 95%**

**Algoritmos a serem implementados:** 
* Redes neurais devido a sua ja conhecida competencia em problemas de classificação
* ? e por que?
* ? e por que?

# 1. Pré processamento dos dados

In [52]:
import pandas as pd
import numpy as np
import torch.utils.data as data_utils
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
import sklearn.metrics as sm



### Primeiramente os dados categoricos são codificados no formato "one-hot-encoding"

In [2]:
df = pd.read_csv('../Data/churn.csv')
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [3]:
categorical_columns = ['gender','Partner','Dependents','PhoneService','MultipleLines',
                    'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
                    'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
                    'PaperlessBilling', 'PaymentMethod','Churn']

yes_no_other_columns = categorical_columns[1:12] + [categorical_columns[-3]] + [categorical_columns[-1]]
categorical_columns = list(set(categorical_columns) - set(yes_no_other_columns))
categorical_columns



['gender', 'Contract', 'PaymentMethod']

In [4]:
categorical_columns = ['PaymentMethod', 'Contract']

In [5]:
dict_yes_no_other = {
    'Yes'                   :  1,
    'No'                    :  0, 
    'No internet service'   :  -1,
    'No phone service'      :  -1,
}

df_categorical = df.copy()
dict_categorical_reference = {}

try:
    #Linha de execução única
    yes_no_other_columns.remove('InternetService')
except:
    print('Execute novamente os blocos anteriores')
    pass

# Casos especiais: Internet service, gender, 'PaymentMethod', 'Contract' e InternetService

# PaymentMethod, Contract
for value in categorical_columns:
    for item in df[value].unique():
        df_categorical[value + '_' + item] = df[value].apply(lambda x: 1 if x == item else 0)
    
    # Remove coluna convertida
    df_categorical.drop(columns = value,inplace=True)

# Pre tratamento de InternetService para a conversao
df_categorical[df_categorical['InternetService'] == 'No'] = 0 
print(df_categorical['InternetService'].unique())

# Convertendo InternetService
for item in df_categorical['InternetService'].unique():
    if item != 0:
        df_categorical['InternetService' + '_' + item] = df_categorical['InternetService'].apply(lambda x: 1 if x == item else 0)

df_categorical['InternetService'] = df_categorical['InternetService'].apply(lambda x: 1 if x != 0 else x)


# Gender Male 0, Female 1
df_categorical['gender'] = df['gender'].apply(lambda x: 0 if x == 'Male' else 1)

for item in yes_no_other_columns:
    df_categorical[item] =  df[item].apply(lambda x: dict_yes_no_other[x])

# Ajustando ordem e nome de colunas
df_categorical['CHURN'] = df_categorical['Churn']
df_categorical.drop(columns = 'Churn',inplace=True)
categorical_columns = df_categorical.columns.values
translate_columns = {x : x.replace(' ','_').upper() for x in categorical_columns}
df_categorical.rename(columns = translate_columns,inplace=True)
df_categorical['TOTALCHARGES'] = df_categorical['TOTALCHARGES'].apply(lambda x: float(x) if x != ' ' else 0) 

 


['DSL' 'Fiber optic' 0]


In [6]:
df_categorical.sample(5)

Unnamed: 0,CUSTOMERID,GENDER,SENIORCITIZEN,PARTNER,DEPENDENTS,TENURE,PHONESERVICE,MULTIPLELINES,INTERNETSERVICE,ONLINESECURITY,...,PAYMENTMETHOD_ELECTRONIC_CHECK,PAYMENTMETHOD_MAILED_CHECK,PAYMENTMETHOD_BANK_TRANSFER_(AUTOMATIC),PAYMENTMETHOD_CREDIT_CARD_(AUTOMATIC),CONTRACT_MONTH-TO-MONTH,CONTRACT_ONE_YEAR,CONTRACT_TWO_YEAR,INTERNETSERVICE_DSL,INTERNETSERVICE_FIBER_OPTIC,CHURN
3550,0963-ZBDRN,0,0,0,0,32,1,0,1,0,...,1,0,0,0,1,0,0,0,1,1
3308,0,1,0,1,0,0,1,1,0,-1,...,0,0,0,0,0,0,0,0,0,0
6464,4726-DLWQN,0,1,0,0,50,1,1,1,1,...,0,0,1,0,1,0,0,1,0,0
5393,5376-DEQCP,1,0,0,0,1,1,0,1,0,...,1,0,0,0,1,0,0,0,1,1
2105,0925-VYDLG,1,0,0,0,3,1,1,1,0,...,1,0,0,0,1,0,0,0,1,1


In [7]:
df_categorical_selected = df_categorical[df_categorical['TENURE'] > 0]
print('Items removidos {}' .format(len(df_categorical) - len(df_categorical_selected)))

Items removidos 1531


### Normalização

In [8]:
def normaliza_dados(df_referencia, df_to_normalize, _type : str = 'min'):
    df_normalizado = df_to_normalize.copy()

    for c in df_to_normalize.iloc[:,1:-1].columns.values:
        #if len(df[c].unique()) > 2:
            _max_ =  df_referencia[c].max()
            _min_ =  df_referencia[c].min()
            DEN = _max_ - _min_
        
            if _type == 'min':
                df_normalizado[c] = df_normalizado[c].apply(lambda x: (x - _min_)/DEN)
            else:
                _mean_ = df_referencia[c].mean()
                df_normalizado[c] = df_normalizado[c].apply(lambda x: (x - _mean_)/DEN)            
    
    # Ultima coluna nao é normalizada
    #df_normalizado['CHURN'] = df_to_normalize.iloc[:,-1]
    return df_normalizado
    
def z_score(df_referencia, df_to_standard):
    df_z_score = df_to_standard.copy()
    
    for c in df_to_standard.iloc[:,1:-1].columns.values:
        _mean_ =  df_referencia[c].mean()
        DEN = df_referencia[c].std()
    
        df_z_score[c] = df_z_score[c].apply(lambda x: (x - _mean_)/DEN)
    
    #df_z_score['CHURN'] = df.iloc[:,-1]

    return df_z_score    
df_normalizado = normaliza_dados(df_categorical,df_categorical, 'mean')

df_z_score = z_score(df_categorical, df_categorical)

df_normalizado


Unnamed: 0,CUSTOMERID,GENDER,SENIORCITIZEN,PARTNER,DEPENDENTS,TENURE,PHONESERVICE,MULTIPLELINES,INTERNETSERVICE,ONLINESECURITY,...,PAYMENTMETHOD_ELECTRONIC_CHECK,PAYMENTMETHOD_MAILED_CHECK,PAYMENTMETHOD_BANK_TRANSFER_(AUTOMATIC),PAYMENTMETHOD_CREDIT_CARD_(AUTOMATIC),CONTRACT_MONTH-TO-MONTH,CONTRACT_ONE_YEAR,CONTRACT_TWO_YEAR,INTERNETSERVICE_DSL,INTERNETSERVICE_FIBER_OPTIC,CHURN
0,7590-VHVEG,0.504756,-0.154764,0.516967,-0.299588,-0.343785,-0.903166,-0.662502,0.216669,-0.034999,...,0.681528,-0.123669,-0.172086,-0.169104,0.524208,-0.157461,-0.150078,0.656254,-0.439585,0
1,5575-GNVDE,-0.495244,-0.154764,-0.483033,-0.299588,0.114548,0.096834,-0.162502,0.216669,0.465001,...,-0.318472,0.876331,-0.172086,-0.169104,-0.475792,0.842539,-0.150078,0.656254,-0.439585,0
2,3668-QPYBK,-0.495244,-0.154764,-0.483033,-0.299588,-0.329896,0.096834,-0.162502,0.216669,0.465001,...,-0.318472,0.876331,-0.172086,-0.169104,0.524208,-0.157461,-0.150078,0.656254,-0.439585,1
3,7795-CFOCW,-0.495244,-0.154764,-0.483033,-0.299588,0.267326,-0.903166,-0.662502,0.216669,0.465001,...,-0.318472,-0.123669,0.827914,-0.169104,-0.475792,0.842539,-0.150078,0.656254,-0.439585,0
4,9237-HQITU,0.504756,-0.154764,-0.483033,-0.299588,-0.329896,0.096834,-0.162502,0.216669,-0.034999,...,0.681528,-0.123669,-0.172086,-0.169104,0.524208,-0.157461,-0.150078,-0.343746,0.560415,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,-0.495244,-0.154764,0.516967,0.700412,-0.024341,0.096834,0.337498,0.216669,0.465001,...,-0.318472,0.876331,-0.172086,-0.169104,-0.475792,0.842539,-0.150078,0.656254,-0.439585,0
7039,2234-XADUH,0.504756,-0.154764,0.516967,0.700412,0.642326,0.096834,0.337498,0.216669,-0.034999,...,-0.318472,-0.123669,-0.172086,0.830896,-0.475792,0.842539,-0.150078,-0.343746,0.560415,0
7040,4801-JZAZL,0.504756,-0.154764,0.516967,0.700412,-0.204896,-0.903166,-0.662502,0.216669,0.465001,...,0.681528,-0.123669,-0.172086,-0.169104,0.524208,-0.157461,-0.150078,0.656254,-0.439585,0
7041,8361-LTMKD,-0.495244,0.845236,0.516967,-0.299588,-0.302118,0.096834,0.337498,0.216669,-0.034999,...,-0.318472,0.876331,-0.172086,-0.169104,0.524208,-0.157461,-0.150078,-0.343746,0.560415,1


In [9]:
df_z_score

Unnamed: 0,CUSTOMERID,GENDER,SENIORCITIZEN,PARTNER,DEPENDENTS,TENURE,PHONESERVICE,MULTIPLELINES,INTERNETSERVICE,ONLINESECURITY,...,PAYMENTMETHOD_ELECTRONIC_CHECK,PAYMENTMETHOD_MAILED_CHECK,PAYMENTMETHOD_BANK_TRANSFER_(AUTOMATIC),PAYMENTMETHOD_CREDIT_CARD_(AUTOMATIC),CONTRACT_MONTH-TO-MONTH,CONTRACT_ONE_YEAR,CONTRACT_TWO_YEAR,INTERNETSERVICE_DSL,INTERNETSERVICE_FIBER_OPTIC,CHURN
0,7590-VHVEG,1.009487,-0.427873,1.034457,-0.653965,-0.965510,-3.053794,-2.061523,0.52589,-0.099141,...,1.462767,-0.375634,-0.455878,-0.451100,1.049573,-0.432276,-0.420183,1.381614,-0.885597,0
1,5575-GNVDE,-0.990462,-0.427873,-0.966554,-0.653965,0.321706,0.327415,-0.505661,0.52589,1.317188,...,-0.683539,2.661785,-0.455878,-0.451100,-0.952633,2.313008,-0.420183,1.381614,-0.885597,0
2,3668-QPYBK,-0.990462,-0.427873,-0.966554,-0.653965,-0.926504,0.327415,-0.505661,0.52589,1.317188,...,-0.683539,2.661785,-0.455878,-0.451100,1.049573,-0.432276,-0.420183,1.381614,-0.885597,1
3,7795-CFOCW,-0.990462,-0.427873,-0.966554,-0.653965,0.750777,-3.053794,-2.061523,0.52589,1.317188,...,-0.683539,-0.375634,2.193256,-0.451100,-0.952633,2.313008,-0.420183,1.381614,-0.885597,0
4,9237-HQITU,1.009487,-0.427873,-0.966554,-0.653965,-0.926504,0.327415,-0.505661,0.52589,-0.099141,...,1.462767,-0.375634,-0.455878,-0.451100,1.049573,-0.432276,-0.420183,-0.723688,1.129022,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,-0.990462,-0.427873,1.034457,1.528916,-0.068360,0.327415,1.050202,0.52589,1.317188,...,-0.683539,2.661785,-0.455878,-0.451100,-0.952633,2.313008,-0.420183,1.381614,-0.885597,0
7039,2234-XADUH,1.009487,-0.427873,1.034457,1.528916,1.803954,0.327415,1.050202,0.52589,-0.099141,...,-0.683539,-0.375634,-0.455878,2.216488,-0.952633,2.313008,-0.420183,-0.723688,1.129022,0
7040,4801-JZAZL,1.009487,-0.427873,1.034457,1.528916,-0.575445,-3.053794,-2.061523,0.52589,1.317188,...,1.462767,-0.375634,-0.455878,-0.451100,1.049573,-0.432276,-0.420183,1.381614,-0.885597,0
7041,8361-LTMKD,-0.990462,2.336812,1.034457,-0.653965,-0.848490,0.327415,1.050202,0.52589,-0.099141,...,-0.683539,2.661785,-0.455878,-0.451100,1.049573,-0.432276,-0.420183,-0.723688,1.129022,1


In [10]:
df_normalizado.CONTRACT_TWO_YEAR.value_counts()

CONTRACT_TWO_YEAR
-0.150078    5986
 0.849922    1057
Name: count, dtype: int64

### DataLoaders (Pythorch)

Quando trabalhamos com pythorch nos é fornecido as primitivas dataset e datalaodrs, os datasets são um conjunto data/target enquanto dataloaders tornam os datasets iteraveis

In [11]:
batch_size = 8
def create_dataloaders(dataset_train,batch_size = batch_size,print_separation = True,train_size = 0.7):
  
  #Separa em teste e treino
  N_train = int(train_size*len(dataset_train))
  train_all, test = train_test_split(dataset_train, test_size= 1 - train_size)
  train, validation = train_test_split(train_all, test_size = 1 - train_size)

  x_train = train.iloc[:,0:-1]
  y_train = train.iloc[:,-1]

  x_train = torch.tensor(x_train.astype(np.float32).values)
  y_train  = torch.tensor(y_train.astype(np.int8).values)

  train_dataset = data_utils.TensorDataset(x_train, y_train)
  train_dataloader = data_utils.DataLoader(train_dataset,batch_size = batch_size ,shuffle=True)

  #valid_samples = [x for x in range(0,len(train)) if x not in train_samples]
  x_valid = validation.iloc[:,0:-1]
  y_valid = validation.iloc[:,-1]
  x_valid = torch.tensor((x_valid.astype(np.float32)).values)
  y_valid = torch.tensor((y_valid.astype(np.int8)).values)

  valid_dataset = data_utils.TensorDataset(x_valid, y_valid)
  valid_dataloader = data_utils.DataLoader(valid_dataset, batch_size = batch_size, shuffle=True)

  x_test = test.iloc[:,0:-1]
  y_test =  test.iloc[:,-1]
  x_test = torch.tensor(x_test.astype(np.float32).values)
  y_test  = torch.tensor(y_test.astype(np.int8).values)
  
  test_dataset = data_utils.TensorDataset(x_test, y_test)
  test_dataloader = data_utils.DataLoader(test_dataset ,batch_size = batch_size, shuffle=True)

  if (print_separation == True):
   print('''
    Dados de treinamento: {}
    Dados de teste: {}
    Dados de validação: {}
    '''.format(len(train_dataset),len(test_dataset),len(valid_dataset)))
  return train_dataloader, valid_dataloader, test_dataloader



In [12]:
train_dataloader, valid_dataloader, test_dataloader = create_dataloaders(dataset_train = df_normalizado.iloc[:,1:])


    Dados de treinamento: 3450
    Dados de teste: 2113
    Dados de validação: 1480
    


## Machine Learning

#### Cross Validation

In [75]:
import random
random.seed()
def sorted_k_fold(dataset : pd.DataFrame, k : int) -> list:
    
    TAMANHO = len(dataset)
    qtd_itens_grupo = int(TAMANHO/k)
    resto = TAMANHO
    idx_groups = []
    idx_avaliable = list(dataset.index.values)
    
    # Indica que a divisao é não homogenea
    if resto % k != 0:
        adicao = resto%k
        resto = resto - (resto%k)
        qtd_amostras = qtd_itens_grupo + adicao

        amostra = random.sample(idx_avaliable, qtd_amostras)
        idx_groups.append(amostra)
        idx_avaliable = list(set(idx_avaliable) - set(amostra))

    for i in range(1, k):
        amostra = random.sample(idx_avaliable, qtd_itens_grupo)
        idx_groups.append(amostra) 
        idx_avaliable = list(set(idx_avaliable) - set(amostra))
    
    return idx_groups

def k_fold_validation(model, k : int , dataset : pd.DataFrame, data_prep : str = 'z_score'):
    # data_prep: Norm normalização min max media, dados variam de -1 ate 1
    # data_prep: z_score, padronização para que os dados tenham media 0 desvio padrao 1, ou seja distribuição normal

    idx_groups = sorted_k_fold(dataset, k)
    print(len(idx_groups))
    data_groups = [dataset.iloc[idx] for idx in idx_groups]
    acc = []
    precision = []
    recall = []

    for data_group in data_groups:

        df_referencia = dataset.drop(data_group.index.values)
        if data_prep == 'mean':
            train = normaliza_dados(df_referencia, dataset.drop(data_group.index.values), 'mean')
            test = normaliza_dados(df_referencia, data_group, 'mean')
        
        else:
            train = z_score(df_referencia, dataset.drop(data_group.index.values))
            test = z_score(df_referencia, data_group)


        x_train = train.iloc[:, 1:-1].values
        y_train = train.iloc[:, -1].values

        x_test = test.iloc[:,1:-1].values 
        y_test = test.iloc[:, -1].values
        
        model.fit_model(x_train, y_train)
        model_metrics = model.evaluate(x_test, y_test) 
        acc.append(model_metrics[0])
        precision.append(round(model_metrics[1],3))
        recall.append(round(model_metrics[2],3))
    
    print('Acuracia : {}'.format(acc))
    print('Std: {}'.format((np.array(acc).std())))
    print('Mean {}'.format(np.array(acc).mean()))
    
    print('Precision : {}'.format(precision))
    print('Std: {}'.format((np.array(precision).std())))
    print('Mean {}'.format(np.array(precision).mean()))
    
    print('Recall {}'.format(recall))
    print('Std: {}'.format((np.array(recall).std())))
    print('Mean {}'.format(np.array(recall).mean()))

    return acc



### Árvore de decisão

Árvores de decisão basicamente encontram o melhor "caminho" para um dada entrada tentando, para o caso de classificação, classificar essa entrada com base em seus atributos. 

O ponto mais crítico do algoritmo é a função de escolha do parametro a ser utilizado em uma regra de ramificação, para tal a implementação mais comumum utiliza como base a busca gulosa baseando-se no ganho de informação, que nada mais é que a variação da entropia do meu conjunto(entropia olhando-se para uma variável objetivo, no caso de classificação a label da classe) ao subdividi-lo com relação a uma variável de teste "A"

Um outro ponto importante do modelo é o tratamento de overfeeting que será discutido mais adiante

In [84]:
df_normalizado.columns

Index(['CUSTOMERID', 'GENDER', 'SENIORCITIZEN', 'PARTNER', 'DEPENDENTS',
       'TENURE', 'PHONESERVICE', 'MULTIPLELINES', 'INTERNETSERVICE',
       'ONLINESECURITY', 'ONLINEBACKUP', 'DEVICEPROTECTION', 'TECHSUPPORT',
       'STREAMINGTV', 'STREAMINGMOVIES', 'PAPERLESSBILLING', 'MONTHLYCHARGES',
       'TOTALCHARGES', 'PAYMENTMETHOD_ELECTRONIC_CHECK',
       'PAYMENTMETHOD_MAILED_CHECK', 'PAYMENTMETHOD_BANK_TRANSFER_(AUTOMATIC)',
       'PAYMENTMETHOD_CREDIT_CARD_(AUTOMATIC)', 'CONTRACT_MONTH-TO-MONTH',
       'CONTRACT_ONE_YEAR', 'CONTRACT_TWO_YEAR', 'INTERNETSERVICE_DSL',
       'INTERNETSERVICE_FIBER_OPTIC', 'CHURN'],
      dtype='object')

In [86]:
df_categorical.describe()

Unnamed: 0,GENDER,SENIORCITIZEN,PARTNER,DEPENDENTS,TENURE,PHONESERVICE,MULTIPLELINES,INTERNETSERVICE,ONLINESECURITY,ONLINEBACKUP,...,PAYMENTMETHOD_ELECTRONIC_CHECK,PAYMENTMETHOD_MAILED_CHECK,PAYMENTMETHOD_BANK_TRANSFER_(AUTOMATIC),PAYMENTMETHOD_CREDIT_CARD_(AUTOMATIC),CONTRACT_MONTH-TO-MONTH,CONTRACT_ONE_YEAR,CONTRACT_TWO_YEAR,INTERNETSERVICE_DSL,INTERNETSERVICE_FIBER_OPTIC,CHURN
count,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,...,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0
mean,0.495244,0.154764,0.483033,0.299588,25.75252,0.903166,0.325004,0.783331,0.069999,0.128212,...,0.318472,0.123669,0.172086,0.169104,0.475792,0.157461,0.150078,0.343746,0.439585,0.26537
std,0.500013,0.361705,0.499748,0.45811,25.63673,0.295752,0.64273,0.412004,0.706051,0.738369,...,0.465917,0.329227,0.377482,0.37487,0.499449,0.364261,0.357173,0.474991,0.496372,0.441561
min,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,-1.0,-1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,17.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,0.0,1.0,1.0,49.0,1.0,1.0,1.0,1.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0
max,1.0,1.0,1.0,1.0,72.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [94]:
feature_selected = ['CUSTOMERID', 'SENIORCITIZEN', 
       'TENURE', 'PHONESERVICE', 'INTERNETSERVICE',
       'PAPERLESSBILLING', 'PAYMENTMETHOD_ELECTRONIC_CHECK',
       'CONTRACT_MONTH-TO-MONTH',
       'CONTRACT_ONE_YEAR', 'CONTRACT_TWO_YEAR', 'INTERNETSERVICE_DSL',
       'INTERNETSERVICE_FIBER_OPTIC', 'CHURN']

In [87]:
df_normalizado.columns

Index(['CUSTOMERID', 'GENDER', 'SENIORCITIZEN', 'PARTNER', 'DEPENDENTS',
       'TENURE', 'PHONESERVICE', 'MULTIPLELINES', 'INTERNETSERVICE',
       'ONLINESECURITY', 'ONLINEBACKUP', 'DEVICEPROTECTION', 'TECHSUPPORT',
       'STREAMINGTV', 'STREAMINGMOVIES', 'PAPERLESSBILLING', 'MONTHLYCHARGES',
       'TOTALCHARGES', 'PAYMENTMETHOD_ELECTRONIC_CHECK',
       'PAYMENTMETHOD_MAILED_CHECK', 'PAYMENTMETHOD_BANK_TRANSFER_(AUTOMATIC)',
       'PAYMENTMETHOD_CREDIT_CARD_(AUTOMATIC)', 'CONTRACT_MONTH-TO-MONTH',
       'CONTRACT_ONE_YEAR', 'CONTRACT_TWO_YEAR', 'INTERNETSERVICE_DSL',
       'INTERNETSERVICE_FIBER_OPTIC', 'CHURN'],
      dtype='object')

In [89]:
TEST, TRAIN = train_test_split(df_normalizado[feature_selected].copy(), test_size= 1 - 0.5)
idx_train = TEST.index.values 
idx_test = TRAIN.index.values


In [76]:
from sklearn import tree

class DecisionTree():

    def __init__(self):
        self.arvore = tree.DecisionTreeClassifier(criterion="log_loss")

    def fit_model(self, X_train, Y_train):
        #X_train = train.iloc[:,1:-1].values
        #Y_train = train.iloc[:,-1].values
        self.arvore.fit(X_train, Y_train)

    def evaluate(self, X_test, Y_test):
        #X_test = test.iloc[:, 1:-1].values
        #Y_test = test.iloc[:, -1].values
        
        result = self.arvore.predict(X_test)
        acertos = (result == Y_test).sum()
        self.confusion_matrix = sm.confusion_matrix(y_true = Y_test, y_pred = result)
        self.precision = sm.precision_score(y_true = Y_test, y_pred = result)
        self.recall = sm.recall_score(y_true = Y_test, y_pred = result)

        acc = round(acertos/len(Y_test),2) * 100
        print('Accuracy: {} %'.format(acc))
        return acc, self.precision, self.recall

In [99]:
arv = DecisionTree()
acc = k_fold_validation(arv, 10, df_categorical[feature_selected])

10
Accuracy: 78.0 %
Accuracy: 79.0 %
Accuracy: 75.0 %
Accuracy: 75.0 %
Accuracy: 79.0 %
Accuracy: 78.0 %
Accuracy: 78.0 %
Accuracy: 75.0 %
Accuracy: 76.0 %
Accuracy: 78.0 %
Acuracia : [78.0, 79.0, 75.0, 75.0, 79.0, 78.0, 78.0, 75.0, 76.0, 78.0]
Std: 1.57797338380595
Mean 77.1
Precision : [0.58, 0.583, 0.567, 0.553, 0.63, 0.589, 0.556, 0.543, 0.601, 0.59]
Std: 0.02442867167899227
Mean 0.5791999999999999
Recall [0.497, 0.523, 0.48, 0.459, 0.542, 0.522, 0.468, 0.495, 0.452, 0.467]
Std: 0.028993964889266182
Mean 0.49050000000000005


In [36]:
arv.confusion_matrix

array([[456,  69],
       [ 89,  90]], dtype=int64)

IMPRIMA SUA ARVORE AQUI

In [17]:
a = np.array([1,2,3])
b = np.array([1,2,6])
(a == b).sum()

2

### Regressão logística

In [100]:
from sklearn.linear_model import LogisticRegression
class logisticReg():
    def __init__(self, solver):
        self.solver = solver
    def fit_model(self, X_train, Y_train):
        self.reg = LogisticRegression(random_state=0, solver=self.solver)
        #X_train = train.iloc[:,1:-1].values
        #Y_train = train.iloc[:,-1].values
        self.reg.fit(X_train, Y_train)

    def evaluate(self, X_test, Y_test):
        #X_test = test.iloc[:, 1:-1].values
        #Y_test = test.iloc[:, -1].values
        
        result = self.reg.predict(X_test)
        acertos = (result == Y_test).sum()
        self.confusion_matrix = sm.confusion_matrix(y_true = Y_test, y_pred = result)
        self.precision = sm.precision_score(y_true = Y_test, y_pred = result)
        self.recall = sm.recall_score(y_true = Y_test, y_pred = result)
        
        acc = round(acertos/len(Y_test),2) * 100
        print('Accuracy: {} %'.format(acc))
        
        return acc, self.precision, self.recall

In [101]:
log_reg = logisticReg(solver = 'newton-cholesky')
acc = k_fold_validation(log_reg, 10, df_categorical[feature_selected])

10
Accuracy: 79.0 %
Accuracy: 82.0 %
Accuracy: 80.0 %
Accuracy: 81.0 %
Accuracy: 80.0 %
Accuracy: 80.0 %
Accuracy: 79.0 %
Accuracy: 80.0 %
Accuracy: 81.0 %
Accuracy: 78.0 %
Acuracia : [79.0, 82.0, 80.0, 81.0, 80.0, 80.0, 79.0, 80.0, 81.0, 78.0]
Std: 1.0954451150103321
Mean 80.0
Precision : [0.682, 0.714, 0.627, 0.631, 0.656, 0.696, 0.634, 0.611, 0.656, 0.671]
Std: 0.0313170879872315
Mean 0.6577999999999999
Recall [0.512, 0.538, 0.549, 0.537, 0.535, 0.482, 0.539, 0.479, 0.5, 0.522]
Std: 0.023723616924912632
Mean 0.5193000000000001


### Knn

In [103]:
from sklearn.neighbors import KNeighborsClassifier as KnnClassifier
class KNN():
    def __init__(self, k_neighbors):
        self.k_neighbors = k_neighbors

    def fit_model(self, X_train, Y_train):
        self.reg = KnnClassifier()
        #X_train = train.iloc[:,1:-1].values
        #Y_train = train.iloc[:,-1].values
        self.reg.fit(X_train, Y_train)

    def evaluate(self, X_test, Y_test):
        #X_test = test.iloc[:, 1:-1].values
        #Y_test = test.iloc[:, -1].values
        
        result = self.reg.predict(X_test)
        
        self.confusion_matrix = sm.confusion_matrix(y_true = Y_test, y_pred = result)
        self.precision = sm.precision_score(y_true = Y_test, y_pred = result)
        self.recall = sm.recall_score(y_true = Y_test, y_pred = result)

        acertos = (result == Y_test).sum()
        acc = round(acertos/len(Y_test),2) * 100
        print('Accuracy: {} %'.format(acc))
        
        return acc, self.precision, self.recall

In [104]:
knn_model = KNN(k_neighbors=5)
acc = k_fold_validation(knn_model, 10, df_categorical[feature_selected])

10
Accuracy: 79.0 %
Accuracy: 77.0 %
Accuracy: 77.0 %
Accuracy: 80.0 %
Accuracy: 77.0 %
Accuracy: 77.0 %
Accuracy: 80.0 %
Accuracy: 80.0 %
Accuracy: 78.0 %
Accuracy: 78.0 %
Acuracia : [79.0, 77.0, 77.0, 80.0, 77.0, 77.0, 80.0, 80.0, 78.0, 78.0]
Std: 1.2688577540449522
Mean 78.3
Precision : [0.57, 0.585, 0.591, 0.599, 0.602, 0.62, 0.631, 0.643, 0.592, 0.662]
Std: 0.02725160545729373
Mean 0.6094999999999999
Recall [0.54, 0.515, 0.468, 0.521, 0.49, 0.502, 0.522, 0.532, 0.467, 0.472]
Std: 0.02586677405475991
Mean 0.5029


In [45]:
knn_model.confusion_matrix

array([[423,  89],
       [ 99,  93]], dtype=int64)

### SVM

### Naive bayes

### Gradient boosting

# 2 - Redes neurais

## 2.1 Estrategia 1: um único modelo

### 2.1.1 Definindo o modelo

### 2.1.2 Treinando o modelo

### 2.1.2 Validação

## 2.2 Estrategia 2: modelos por range de tenure, 1-x...

### 2.2.1 Definição

### 2.2.2 Treino

### 2.2.3 Validação