# Análise de influência das variáveis no churn dos clientes

**Objetivo:** Identificar quais variáveis estão diretamente relacionadas com o churn de clientes, através de métodos como análise de correlação, regressões múltiplas e análise de sensibilidade.


## 0. Imports, carga e preparação de dados

In [84]:
import pandas as pd
from ydata_profiling import ProfileReport
from matplotlib import pyplot as plt
import seaborn as sns

## 1. Correlações

As correlações podem ser extraídas diretamente de ferramentas de auto-report, que, além de correlações lineares e não lineares, nos fornecem uma boa visão geral dos dados.

In [49]:
# Load the raw churn dataset and build the automatic EDA report object.
df = pd.read_csv('../Data/churn.csv')
profile = ProfileReport(df=df, title='churn_eda_report')

# Columns that the encoding cell below converts from labels to integer codes.
categorical_columns = [
    'gender',
    'Partner',
    'Dependents',
    'PhoneService',
    'MultipleLines',
    'InternetService',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
    'Contract',
    'PaperlessBilling',
    'PaymentMethod',
    'Churn',
]

In [48]:
# Preview the last five columns of the raw frame.
df[df.columns[-5:]]

Unnamed: 0,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Yes,Electronic check,29.85,29.85,No
1,No,Mailed check,56.95,1889.5,No
2,Yes,Mailed check,53.85,108.15,Yes
3,No,Bank transfer (automatic),42.30,1840.75,No
4,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...
7038,Yes,Mailed check,84.80,1990.5,No
7039,Yes,Credit card (automatic),103.20,7362.9,No
7040,Yes,Electronic check,29.60,346.45,No
7041,Yes,Mailed check,74.40,306.6,Yes


In [50]:

# Encode every categorical column as integer codes, assigned in order of first
# appearance in `df` (first distinct value -> 0, second -> 1, ...).
# The value -> code mapping of each column is kept in `category_mappings`
# so the codes stored in `df_categorical` can be decoded later without
# comparing value counts by hand.
# NOTE(review): the codes are arbitrary labels, not an ordering; correlations
# on columns with more than two classes should be read with that in mind.
df_categorical = df.copy()
category_mappings = {}
for column in categorical_columns:
    mapping = {value: code for code, value in enumerate(df[column].unique())}
    category_mappings[column] = mapping
    df_categorical[column] = df[column].map(mapping)
    

In [52]:
# Spot-check a few encoded rows; the fixed seed keeps the displayed sample
# identical across re-runs of the notebook.
df_categorical.sample(3, random_state=42)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
3409,4806-DXQCE,0,1,0,0,70,1,2,1,1,...,1,1,1,1,2,0,0,113.65,7714.65,0
1565,6968-GMKPR,0,0,1,0,55,1,1,1,0,...,1,0,0,0,0,0,0,81.55,4509.5,0
6891,7853-OETYL,0,0,0,0,4,0,0,0,1,...,0,0,0,0,0,0,0,29.05,129.6,0


Convertendo os campos numéricos para tipo numérico (até então podem estar como tipo object)

In [61]:
# Try to cast every non-categorical column of `df_categorical` to float and
# report, column by column, whether the cast succeeded. A column is only
# overwritten when the whole cast succeeds; on failure the exception text is
# printed and the column is left as it was.
for item in df.columns:
    if item in categorical_columns:
        continue
    print('='*30)
    try:
        df_categorical[item] = df_categorical[item].astype(float)
        print('{} --> Ok'.format(item))
    except (ValueError, TypeError) as e:
        # Only conversion failures are expected here; anything else should
        # surface instead of being reported as a failed column.
        print('{} -- FALHOU'.format(item))
        print(e)
    print('='*30)

customerID -- FALHOU
could not convert string to float: '7590-VHVEG'
SeniorCitizen --> Ok
tenure --> Ok
MonthlyCharges --> Ok
TotalCharges -- FALHOU
could not convert string to float: ' '


In [None]:
# Tenure distribution of the customers whose TotalCharges is blank (' ').
# Reads from the raw `df`, which the visible cells never modify, so the result
# does not depend on whether the TotalCharges conversion cell has already run.
df.loc[df['TotalCharges'] == ' ', 'tenure'].value_counts()

Quando não há vencimentos, o `TotalCharges` é nulo (vem como um espaço em branco); por isso, ele é convertido para 0 na célula a seguir.

In [70]:
# Convert TotalCharges to float, treating blank entries as 0.
# Any empty or whitespace-only value counts as blank, not just the single
# space string, so stray padding in the CSV cannot break the conversion.
total_charges_text = df['TotalCharges'].astype(str).str.strip()
df_categorical['TotalCharges'] = total_charges_text.mask(total_charges_text == '', '0').astype(float)


In [75]:
# Frequency of each integer code assigned to InternetService.
df_categorical['InternetService'].value_counts()

InternetService
1    3096
0    2421
2    1526
Name: count, dtype: int64

In [76]:
# Frequency of each original InternetService label, for comparison with the
# encoded counts.
df['InternetService'].value_counts()

InternetService
Fiber optic    3096
DSL            2421
No             1526
Name: count, dtype: int64

### 1.1 Linear

In [77]:
# Pearson (linear) correlation between all encoded/numeric columns.
# The customer identifier is dropped by name rather than by position, so the
# cell does not silently depend on it being the first column.
pear_corr = df_categorical.drop(columns=['customerID']).corr(method='pearson')
pear_corr

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
gender,1.0,-0.001874,0.001808,0.010517,0.005106,-0.006488,-0.009451,-0.000863,-0.003429,0.01223,0.005092,0.000985,0.001156,-0.000191,0.000126,0.011754,-0.005209,-0.014569,-8e-05,-0.008612
SeniorCitizen,-0.001874,1.0,-0.016479,-0.211185,0.016567,0.008576,0.113791,-0.03231,-0.210897,-0.144828,-0.157095,-0.22377,-0.13013,-0.120802,-0.142554,-0.15653,-0.093704,0.220173,0.103006,0.150889
Partner,0.001808,-0.016479,1.0,-0.452676,-0.379697,-0.017706,-0.117307,-0.000891,-0.08185,0.090753,-0.094451,-0.069072,-0.080127,-0.075779,-0.294806,-0.014877,-0.133115,-0.096848,-0.317504,0.150448
Dependents,0.010517,-0.211185,-0.452676,1.0,0.159712,-0.001762,-0.019657,0.04459,0.190523,0.062775,0.156439,0.180832,0.140395,0.12582,0.243187,0.111377,0.123844,-0.11389,0.062078,-0.164221
tenure,0.005106,0.016567,-0.379697,0.159712,1.0,0.008448,0.258958,-0.030359,0.145298,-0.253743,0.178649,0.144459,0.136145,0.140781,0.671607,-0.006152,0.340305,0.2479,0.826178,-0.352229
PhoneService,-0.006488,0.008576,-0.017706,-0.001762,0.008448,1.0,0.67507,0.387436,0.125353,0.12977,0.138755,0.12335,0.171538,0.165205,0.002247,-0.016505,-0.00407,0.247398,0.113214,0.011942
MultipleLines,-0.009451,0.113791,-0.117307,-0.019657,0.258958,0.67507,1.0,0.186826,-0.066844,-0.130619,-0.013069,-0.066684,0.030195,0.028187,0.083343,-0.133255,0.025676,0.4907,0.412104,0.03631
InternetService,-0.000863,-0.03231,-0.000891,0.04459,-0.030359,0.387436,0.186826,1.0,0.607788,0.650962,0.662957,0.609795,0.71289,0.70902,0.099721,0.138625,0.008124,-0.32326,-0.175755,-0.047291
OnlineSecurity,-0.003429,-0.210897,-0.08185,0.190523,0.145298,0.125353,-0.066844,0.607788,1.0,0.621739,0.74904,0.791225,0.701976,0.704984,0.389978,0.334003,0.2138,-0.621227,-0.15438,-0.332819
OnlineBackup,0.01223,-0.144828,0.090753,0.062775,-0.253743,0.12977,-0.130619,0.650962,0.621739,1.0,0.601503,0.617003,0.604117,0.606863,0.035407,0.260715,0.003183,-0.710477,-0.537212,-0.074205


### 1.2 Não Linear

In [94]:
# Spearman (rank-based) correlation between all encoded/numeric columns.
# The customer identifier is dropped by name rather than by position.
spear_corr = df_categorical.drop(columns=['customerID']).corr(method='spearman')

# Plot the matrix computed above instead of recomputing it; the figure is
# enlarged so the per-cell annotations stay legible.
fig, ax = plt.subplots(figsize=(16, 12))
sns.heatmap(spear_corr, annot=True, ax=ax)
ax.set_title('Correlação de Spearman entre as variáveis');


<Axes: >

In [90]:
# Display the Spearman correlation matrix computed in the previous cell.
spear_corr

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
gender,1.0,-0.001874,0.001808,0.010517,0.003486,-0.006488,-0.009342,-0.001517,-0.005346,0.012583,0.004757,-0.000136,0.000443,-0.001036,0.001094,0.011754,-0.004904,-0.013736,-0.001394,-0.008612
SeniorCitizen,-0.001874,1.0,-0.016479,-0.211185,0.018556,0.008576,0.125333,-0.016964,-0.208471,-0.138051,-0.147298,-0.223571,-0.119002,-0.108873,-0.14385,-0.15653,-0.103416,0.221067,0.108171,0.150889
Partner,0.001808,-0.016479,1.0,-0.452676,-0.384931,-0.017706,-0.127568,-0.0009,-0.0962,0.097835,-0.107242,-0.080979,-0.088886,-0.083891,-0.295925,-0.014877,-0.130628,-0.108463,-0.340298,0.150448
Dependents,0.010517,-0.211185,-0.452676,1.0,0.164175,-0.001762,-0.021599,0.034373,0.193828,0.053874,0.153893,0.182357,0.136389,0.120306,0.24412,0.111377,0.128865,-0.107011,0.078409,-0.164221
tenure,0.003486,0.018556,-0.384931,0.164175,1.0,0.008483,0.292002,-0.028575,0.179074,-0.270135,0.209984,0.178136,0.158364,0.163182,0.667623,-0.007695,0.336847,0.276417,0.889696,-0.367062
PhoneService,-0.006488,0.008576,-0.017706,-0.001762,0.008483,1.0,0.568254,0.400349,0.110079,0.122864,0.128255,0.107906,0.16645,0.159519,0.001842,-0.016505,-0.003994,0.239701,0.085371,0.011942
MultipleLines,-0.009342,0.125333,-0.127568,-0.021599,0.292002,0.568254,1.0,0.159399,-0.088049,-0.170587,-0.024509,-0.087514,0.019501,0.017974,0.08924,-0.145577,0.019531,0.525844,0.430005,0.038293
InternetService,-0.001517,-0.016964,-0.0009,0.034373,-0.028575,0.400349,0.159399,1.0,0.506265,0.595125,0.58847,0.509715,0.656963,0.653,0.079586,0.117944,0.004893,-0.206889,-0.188862,-0.02869
OnlineSecurity,-0.005346,-0.208471,-0.0962,0.193828,0.179074,0.110079,-0.088049,0.506265,1.0,0.523612,0.664998,0.711453,0.612247,0.617051,0.406696,0.323286,0.24401,-0.526906,-0.114527,-0.342148
OnlineBackup,0.012583,-0.138051,0.097835,0.053874,-0.270135,0.122864,-0.170587,0.595125,0.523612,1.0,0.512947,0.518495,0.526616,0.530811,0.009146,0.249298,0.007245,-0.659711,-0.533806,-0.057523


## 2. Regressões múltiplas

## 3. Análise de sensibilidade