In [None]:
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(folder, name)
        print(full_path)

# Any results you write to the current directory are saved as output.

In [None]:
import pandas as pd 
import numpy  as np
import matplotlib.pyplot as plt 
# Apply the 'ggplot' matplotlib style to every figure drawn after this point.
plt.style.use('ggplot')
import seaborn as sns 
# None removes the column limit, so wide DataFrames are displayed in full.
pd.set_option('display.max_columns', None)

In [None]:
# Read the Telco churn CSV and lower-case every column name in one chained
# expression, so later cells can use lower-case column names throughout.
df = (
    pd.read_csv('/kaggle/input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv')
      .rename(columns=str.lower)
)

### Creating the list of variables to use in the EDA

In [None]:
# Binary target derived from the textual churn flag: 'Yes' -> 1, 'No' -> 0.
df['target'] = df['churn'].map({'Yes': 1, 'No': 0})

# customerid is excluded from the analysis. Rebinding `df` (instead of an
# in-place drop) together with errors='ignore' keeps this cell re-runnable:
# a second execution no longer raises KeyError because the column is gone.
df = df.drop(columns='customerid', errors='ignore')

# Every remaining column (including churn/target) takes part in the EDA.
vars_to_eda = df.columns.tolist()

### The TotalCharges column contains blank strings; let's replace them with zero

In [None]:
# totalcharges is read as text because some rows hold a single blank (" ").
# Replace the blank with "0" and convert to float in one assignment.
# The previous `df[col].replace(..., inplace=True)` mutated a temporary
# Series: with pandas Copy-on-Write that change never reaches `df`, and the
# float conversion then fails on the remaining " " values.
df['totalcharges'] = df['totalcharges'].replace(' ', '0').astype(float)

### Convert tenure to int and treat seniorcitizen as a categorical (object) column

In [None]:
# Cast both columns in a single astype call: tenure becomes an integer and
# seniorcitizen becomes object, so the dtype-based selections used later
# pick it up alongside the other categorical columns.
df = df.astype({'tenure': 'int', 'seniorcitizen': 'object'})

### Exploratory Data Analysis

### Plot of distribution of each variable

In [None]:
# One figure per categorical (object) column:
#   left panel  -> absolute frequency of each category
#   right panel -> relative frequency of each category
for col in df[vars_to_eda].select_dtypes('object').columns:
    fig, (ax_qty, ax_pct) = plt.subplots(1, 2, figsize=(14, 4))

    # Plotting the value_counts Series directly draws one bar per category
    # and adds no legend, so no legend-hiding workaround is needed.
    df[col].value_counts(normalize=False).plot(kind='bar', ax=ax_qty, color='navy')
    ax_qty.set_title('Quantity - ' + str(col), fontsize=16)
    ax_qty.set_xlabel(col)
    ax_qty.set_ylabel('Quantity')

    df[col].value_counts(normalize=True).plot(kind='bar', ax=ax_pct, color='navy')
    ax_pct.set_title('Pct - ' + str(col), fontsize=16)
    ax_pct.set_xlabel(col)
    ax_pct.set_ylabel('%')

    fig.tight_layout()
    # plt.show() does not take an Axes; its only argument is the `block` flag,
    # so the former plt.show(ax) call was incorrect.
    plt.show()
    
    

-> Apenas 30% da base possui dependentes

-> Mais de 90% da base possui serviço telefonico

-> Aproximadamente 44% da base possui Fibra Ótica, 34% possui rede DSL e 21% não possui serviços de internet

-> 50% da base não possui serviços de segurança online

-> 45% Não possui backup online

-> 45% não possui proteção ao aparelho

-> 50% não possui suporte técnico

-> A distribuição de quem possui e de quem não possui serviços de streaming de TV, mas possui internet, é bem próxima: aproximadamente 40%

-> O mesmo comportamento acima é descrito para o serviço de streaming de Filmes 

-> A maior parte da base faz contrato mensal

In [None]:
# For every categorical column except churn itself, compare churners and
# non-churners: absolute counts on the left, within-category shares on the right.
for col in df[vars_to_eda].drop('churn', axis=1).select_dtypes('object').columns.tolist():
    fig, (ax_qty, ax_pct) = plt.subplots(1, 2, figsize=(12, 4))

    counts = (
        df.groupby(col)['churn']
          .value_counts(normalize=False)
          .to_frame('quantity')
          .reset_index()
    )
    sns.barplot(data=counts, x=col, y='quantity', hue='churn', ax=ax_qty)
    ax_qty.set_title('Quantity by Churn - ' + str(col), fontsize=15)
    ax_qty.tick_params(axis='x', labelrotation=90)

    shares = (
        df.groupby(col)['churn']
          .value_counts(normalize=True)
          .to_frame('pct')
          .reset_index()
    )
    sns.barplot(data=shares, x=col, y='pct', hue='churn', ax=ax_pct)
    ax_pct.set_title('Pct by Churn - ' + str(col), fontsize=15)
    ax_pct.tick_params(axis='x', labelrotation=90)

    # The layout must be adjusted BEFORE the figure is shown; calling
    # tight_layout() after plt.show() has no effect on the rendered figure.
    fig.tight_layout()
    plt.show()

### Univariate analysis of the continuous variables

In [None]:
# Annotated correlation heatmap of the int/float columns selected for the EDA.
# fig/ax now hold what their names say (previously `ax` was bound to the
# Figure and then to the title Text), and the title typo "Matix" is fixed.
fig, ax = plt.subplots(figsize=(14, 6))
sns.heatmap(df[vars_to_eda].select_dtypes(['int', 'float']).corr(), annot=True, ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

In [None]:
# Mean of the binary target per tenure value = churn rate for that tenure.
churn_by_tenure = (
    df.groupby('tenure')['target']
      .mean()
      .to_frame('pct_churn')
      .reset_index()
)

# Scatter of the churn rate against tenure with a fitted regression line.
ax = sns.regplot(data=churn_by_tenure, x='tenure', y='pct_churn', color='blue')
ax = plt.title('Tenure x Churn')

Como pode-se ver, a taxa de churn diminui conforme a variável tenure aumenta

### Calculate the WOE of the categorical variables in order to group them:

In [None]:
def calculate_woe(feature, data=None):
    """Display and return the Weight of Evidence (WOE) table of one feature.

    For each category of ``feature`` the table holds the share of all
    non-churners (``target == 0``) and the share of all churners
    (``target == 1``) that fall in that category, and
    ``woe = ln(churn share / no_churn share)``.

    Parameters
    ----------
    feature : str
        Name of the categorical column to analyse.
    data : pandas.DataFrame, optional
        Frame holding ``feature`` and a 0/1 ``target`` column. Defaults to the
        notebook-level ``df`` so existing ``calculate_woe(col)`` calls work
        unchanged.

    Returns
    -------
    pandas.DataFrame
        Columns ``[feature, 'no_churn', 'churn', 'woe']``, one row per category.
        A category with no churners or no non-churners gets ``-inf`` / ``inf``.
    """
    frame = df if data is None else data

    # normalize='columns' makes each target class sum to 1 across categories.
    # Columns are renamed by label (0/1) rather than by position, so the
    # naming cannot silently swap if the column order ever changes.
    woe = (
        pd.crosstab(frame[feature], frame['target'], normalize='columns')
          .rename(columns={0: 'no_churn', 1: 'churn'})
          .rename_axis(columns=None)
          .reset_index()
    )
    woe['woe'] = np.log(woe['churn'] / woe['no_churn'])
    display(woe)
    return woe

In [None]:
# Show the WOE table of every categorical column (the two label columns,
# target and churn, are excluded).
categorical_features = (
    df.drop(['target', 'churn'], axis=1)
      .select_dtypes('object')
      .columns
      .tolist()
)
for feature in categorical_features:
    calculate_woe(feature)

### Grouping some categories

Com base no WOE encontrado acima, vamos fazer a junção das categorias que apresentam um WOE próximo. Vale ressaltar que em alguns casos o agrupamento não faria sentido por questões de negócio.

In [None]:
# Lookup tables for the coarser categories. Values missing from a table
# become NaN after .map().
multiplelines_groups = {
    'No': 'No',
    'No phone service': 'No',
    'Yes': 'Yes',
}
# NOTE(review): 'Mailed check' is placed in the 'automatic' bucket together
# with the two automatic methods - presumably because of its similar WOE; confirm.
paymentmethod_groups = {
    'Bank transfer (automatic)': 'automatic',
    'Credit card (automatic)': 'automatic',
    'Mailed check': 'automatic',
    'Electronic check': 'eletronic',
}

df['multiplelines'] = df['multiplelines'].map(multiplelines_groups)
df['group_paymentmethod'] = df['paymentmethod'].map(paymentmethod_groups)

In [None]:
# Re-draw the churn comparison for the two regrouped variables.
# Each panel is described by (subplot position, normalize flag, value column, title prefix).
panels = [
    (1, False, 'quantity', 'Quantity by Churn - '),
    (2, True, 'pct', 'Pct by Churn - '),
]
for col in ['multiplelines', 'group_paymentmethod']:
    plt.figure(figsize=(12, 4))

    for position, as_share, value_name, title_prefix in panels:
        plt.subplot(1, 2, position)
        summary = (
            df.groupby(col)['churn']
              .value_counts(normalize=as_share)
              .to_frame(value_name)
              .reset_index()
        )
        sns.barplot(data=summary, x=col, y=value_name, hue='churn')
        ax = plt.title(title_prefix + str(col), fontsize=15)
        ax = plt.xticks(rotation=90)

    plt.tight_layout()
    plt.show();

In [None]:
# Box plot of every numeric column against every categorical column,
# split by churn. The two column lists are computed once up front.
numeric_cols = df.drop('target', axis=1).select_dtypes(['int', 'float']).columns.tolist()
categorical_cols = df.drop(['target', 'churn'], axis=1).select_dtypes('object').columns.tolist()

for num_col in numeric_cols:
    for cat_col in categorical_cols:
        ax = plt.figure(figsize=(14, 4))
        ax = sns.boxplot(data=df, x=cat_col, y=num_col, hue='churn')
        ax = plt.title(f'BoxPlot from variable {num_col} hue by {cat_col}')

        # Put the legend outside the axes, to the right, so it never covers a box.
        ax = plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
        plt.show()

In [None]:
# Bar chart of the churn rate (mean of the 0/1 target) per category,
# one chart for each categorical column.
for col in df.drop(['target', 'churn'], axis=1).select_dtypes('object'):
    churn_rate = (
        df.groupby(col)['target']
          .mean()
          .to_frame('churn_rate')
          .reset_index()
    )
    ax = churn_rate.plot(x=col, y='churn_rate', kind='bar')
    ax = plt.title(col)
    plt.show()

In [None]:
# Pairwise scatter plots / histograms of the numeric columns (target excluded).
numeric_features = df.drop('target', axis=1).select_dtypes(['float', 'int'])
sns.pairplot(numeric_features);

In [None]:
def metrics(model, X_eval=None, y_eval=None):
    """Display a decile table and three diagnostic plots for a fitted classifier.

    The evaluation rows are scored with ``model.predict_proba`` and split
    into ten equally sized score bands (decile 1 = lowest scores,
    decile 10 = highest). For each decile the table shows the mean predicted
    probability, the observed event rate, the row count, the number of
    events / non-events and the share of all events / non-events that fall
    in the decile.

    Plots, left to right: share of events vs. non-events per decile, mean
    predicted probability vs. observed event rate per decile, and the score
    distribution of each target class.

    Parameters
    ----------
    model : fitted estimator exposing ``predict_proba``.
    X_eval : pandas.DataFrame, optional
        Feature matrix to score. Defaults to the notebook-level ``X_test``.
    y_eval : pandas.Series, optional
        0/1 labels aligned with ``X_eval`` by index. Defaults to the
        notebook-level ``y_test``.

    Returns
    -------
    None
        The table and the plots are displayed only.
    """
    X_eval = X_test if X_eval is None else X_eval
    y_eval = y_test if y_eval is None else y_eval

    aux = pd.DataFrame(index=X_eval.index)
    aux['prob'] = model.predict_proba(X_eval)[:, 1]
    # Bin on the rank of the score rather than on the raw score: models that
    # emit few distinct probabilities (e.g. a shallow decision tree) produce
    # duplicated quantile edges, and pd.qcut then raises
    # "Bin edges must be unique". method='first' breaks ties by row order.
    aux['decil'] = pd.qcut(aux['prob'].rank(method='first'), 10,
                           labels=np.arange(1, 11, 1))
    aux['target'] = y_eval

    # observed=False keeps every decile label in the output.
    by_decile = aux.groupby('decil', observed=False)
    result = pd.DataFrame({
        'prob': by_decile['prob'].mean(),
        'target': by_decile['target'].mean(),
        'count': by_decile['target'].size(),
        'event': by_decile['target'].sum(),
    })
    result['non_event'] = result['count'] - result['event']
    result['pct_event'] = result['event'] / result['event'].sum()
    result['pct_non_event'] = result['non_event'] / result['non_event'].sum()
    result = result.reset_index()
    display(result)

    fig, (ax_share, ax_rate, ax_dist) = plt.subplots(1, 3, figsize=(14, 5))

    result.plot(x='decil', y=['pct_event', 'pct_non_event'], kind='bar', ax=ax_share)
    sns.scatterplot(x='prob', y='target', data=result, ax=ax_rate)

    # NOTE(review): sns.distplot is deprecated in recent seaborn releases;
    # it is kept so the notebook still runs on the seaborn version it was written for.
    for label in aux['target'].unique():
        sns.distplot(aux.loc[aux['target'] == label, 'prob'], ax=ax_dist)
    return

### Let's Fit Some Models

### Scaling continuous variables

In [None]:
from sklearn.preprocessing import StandardScaler

# Feature matrix: everything except the two label columns and the raw
# paymentmethod column (its grouped version stays in). y is the 0/1 target.
X = df.drop(columns=['target', 'churn', 'paymentmethod'])
y = df['target']

# ---------------------------------------------------------------------------
# Scaling: add a standardised "<name>_std" copy of every numeric column.
# The column list is fixed before the loop, so the new *_std columns are not
# themselves re-scaled.
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split done in a later cell - consider fitting on the training
# rows only to avoid leaking test statistics.
# ---------------------------------------------------------------------------
numeric_cols = X.select_dtypes(['int', 'float']).columns.tolist()
for col in numeric_cols:
    scaled_name = col + '_std'
    X[scaled_name] = StandardScaler().fit_transform(X[[col]].to_numpy())

### Splitting DataFrame

In [None]:
# Feature subset selected for modelling.
# Candidates currently switched off (kept here for reference):
#   gender, partner, dependents, tenure, multiplelines, onlinesecurity,
#   onlinebackup, deviceprotection, techsupport, streamingtv, monthlycharges,
#   tenure_std, monthlycharges_std, totalcharges_std
# NOTE(review): no cell visible in this notebook applies this list to X - confirm
# whether the models are meant to be restricted to it.
vars_to_model = [
    'seniorcitizen',
    'phoneservice',
    'internetservice',
    'streamingmovies',
    'contract',
    'paperlessbilling',
    'totalcharges',
    'group_paymentmethod',
]


from sklearn.model_selection import train_test_split

############################################################################
#                       Splitting the data set                             #
############################################################################
# 60% train / 40% test, with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

# One-hot encode the categorical columns. The test matrix is aligned to the
# training columns: encoding each split independently can yield different
# column sets (a category missing from one split), which would break or
# silently misalign predict/predict_proba. Dummies absent from the test rows
# are filled with 0; dummies unseen during training are dropped.
X_train = pd.get_dummies(X_train)
X_test = pd.get_dummies(X_test).reindex(columns=X_train.columns, fill_value=0)

### Fitting Logistic Regression:

In [None]:
from sklearn.linear_model import LogisticRegression

# Very high iteration cap so the solver is not stopped by the default limit.
logistic_model = LogisticRegression(max_iter=5000000)
logistic_model.fit(X_train, y_train)

# metrics() displays its table and plots and returns nothing, so
# logistic_metrics ends up holding None.
logistic_metrics = metrics(logistic_model)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with at most 50 leaves per tree. random_state is fixed (same
# seed as the train/test split) so a Restart & Run All reproduces the same
# forest and the same decile table.
model_random_forest = RandomForestClassifier(max_leaf_nodes=50, random_state=42).fit(X_train, y_train)
metrics(model_random_forest)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting with library defaults. random_state is fixed (same seed
# as the train/test split) so repeated runs give the same model.
model_gradient = GradientBoostingClassifier(random_state=42).fit(X_train, y_train)
metrics(model_gradient)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Shallow entropy-based tree (depth <= 5, at least 20 rows per leaf).
# random_state is fixed so ties between equally good splits are always
# resolved the same way and the fitted tree is reproducible.
# NOTE(review): a tree this shallow emits only a few distinct probabilities;
# confirm that the decile binning in metrics() copes with heavily tied scores.
model_decision_tree = DecisionTreeClassifier(
    max_depth=5,
    min_samples_leaf=20,
    criterion='entropy',
    random_state=42,
).fit(X_train, y_train)
metrics(model_decision_tree)