In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# Any results you write to the current directory are saved as output.

In [None]:
# Importando o train_test_split
from sklearn.model_selection import train_test_split

# Importando o RandomForest
from sklearn.ensemble import RandomForestClassifier

# Importando a métrica
from sklearn.metrics import accuracy_score

# GridSearch e Cross_val_score
from sklearn.model_selection import cross_val_score, GridSearchCV

In [None]:
# Load the Telco customer-churn CSV from the Kaggle input directory
csv_path = '/kaggle/input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv'
df = pd.read_csv(csv_path)

# Cell output: (rows, columns) of the loaded frame
df.shape

In [None]:
# Preview the first rows, transposed so that each column is shown as a row
df.head().T

In [None]:
# Check the column dtypes and the non-null counts
df.info()

In [None]:
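# Converting TotalCharges straight to float is left commented out: the column
# still contains blank strings at this point (handled in the following cells).
#df['TotalCharges'] = df['TotalCharges'].astype(float)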

In [None]:
# Locate the blank values: rows whose TotalCharges text contains a space
has_space = df['TotalCharges'].str.contains(' ')
df[has_space]

In [None]:
# TotalCharges arrives as text with some blank entries. Replace those blanks
# with the sentinel -1 and convert the column to float.
# The dtype guard makes the cell safe to re-run: once the column is numeric the
# .str accessor would raise, so the text clean-up is skipped.
total_charges = df['TotalCharges']
if not pd.api.types.is_numeric_dtype(total_charges):
    # Only whole-blank values become -1. Surrounding whitespace is stripped
    # rather than rewritten (so " 29.85" stays 29.85 instead of -129.85), and a
    # space inside a malformed number still makes the float conversion fail loudly.
    total_charges = total_charges.str.strip().replace('', '-1')
df['TotalCharges'] = total_charges.astype(float)

In [None]:
# Keep two independent snapshots of the cleaned frame before it is one-hot
# encoded; df2 and rg are each label-encoded separately in later cells.
df2, rg = df.copy(), df.copy()

In [None]:
# List the column names (used to pick the categorical columns to encode)
df.columns

In [None]:
# One-hot encode the categorical columns; each listed column is replaced by
# one indicator column per distinct value.
categorical_cols = [
    'gender', 'Partner', 'Dependents', 'PhoneService',
    'MultipleLines', 'InternetService', 'OnlineSecurity',
    'OnlineBackup', 'DeviceProtection', 'TechSupport',
    'StreamingTV', 'StreamingMovies', 'Contract',
    'PaperlessBilling', 'PaymentMethod',
]
df = pd.get_dummies(df, columns=categorical_cols)

In [None]:
# (rows, columns) of the frame after the one-hot encoding above
df.shape

In [None]:
# Inspect the encoded data (first rows, transposed)
df.head().T

In [None]:
# Input columns for the model: everything except customerID and the Churn target
excluded = {'customerID', 'Churn'}
feats = [column for column in df.columns if column not in excluded]

In [None]:
# Split the frame into train / validation / test partitions

# First hold out 20% of all rows as the test set
train_full, test = train_test_split(df, test_size=0.20, random_state=42)

# Then take 20% of the remaining rows as the validation set
train, valid = train_test_split(train_full, test_size=0.20, random_state=42)

# Cell output: shape of each partition
train.shape, valid.shape, test.shape

In [None]:
# Create the random-forest classifier (it is fitted in the next cell)
rf = RandomForestClassifier(
    n_estimators=200,
    random_state=42,
)

In [None]:
# Fit the forest on the training partition
train_features, train_target = train[feats], train['Churn']
rf.fit(train_features, train_target)

In [None]:
# Predict on the validation partition and report the accuracy
preds_val = rf.predict(valid[feats])

accuracy_score(y_true=valid['Churn'], y_pred=preds_val)

In [None]:
# Inspect the labels predicted for the validation partition
preds_val

In [None]:
# Predict on the held-out test partition and report the accuracy
preds_test = rf.predict(test[feats])

accuracy_score(y_true=test['Churn'], y_pred=preds_test)

In [None]:
# Relative frequency of each Churn value over the complete frame
# (a baseline to compare the accuracy figures against)
df['Churn'].value_counts(normalize=True)

In [None]:
# Switch to df2, the copy taken before the one-hot encoding, and inspect it
df2.info()

In [None]:
# pandas category dtype: list the distinct categories found in gender
df2['gender'].astype('category').cat.categories

In [None]:
# Integer code that the category dtype assigns to each row's gender value
df2['gender'].astype('category').cat.codes

In [None]:
# Distinct categories found in PaymentMethod
df2['PaymentMethod'].astype('category').cat.categories

In [None]:
# Label-encode df2: every object-typed column is replaced by its integer category codes
object_cols = [name for name in df2.columns if df2[name].dtype == 'object']
for name in object_cols:
    df2[name] = df2[name].astype('category').cat.codes

In [None]:
# Check the resulting frame (dtypes after label encoding)
df2.info()

In [None]:
# Split df2 into three partitions: train2, valid2 and test2

# First hold out 20% of all rows as the test set
train2_full, test2 = train_test_split(df2, test_size=0.2, random_state=42)

# Then take 20% of the remaining rows as the validation set
train2, valid2 = train_test_split(train2_full, test_size=0.2, random_state=42)

# Cell output: shape of each partition
train2.shape, valid2.shape, test2.shape

In [None]:
# Input columns for the second model: everything except customerID and the
# Churn target, mirroring the exclusions used for `feats`.
# The identifier must be spelled 'customerID' exactly; a misspelling matches no
# column and silently lets the label-encoded ID into the feature list.
feats2 = [c for c in df2.columns if c not in ['customerID', 'Churn']]
feats2

In [None]:
# Build a second forest with the same settings as the first one
rf2 = RandomForestClassifier(
    n_estimators=200,
    random_state=42,
)

# Fit it on the label-encoded training partition
train2_features, train2_target = train2[feats2], train2['Churn']
rf2.fit(train2_features, train2_target)

In [None]:
# Predictions of the second model on the validation partition
preds2 = rf2.predict(valid2[feats2])

# Accuracy on the validation partition
accuracy_score(y_true=valid2['Churn'], y_pred=preds2)

In [None]:
# Predictions of the second model on the held-out test partition
preds_test2 = rf2.predict(test2[feats2])

# Accuracy on the test partition
accuracy_score(y_true=test2['Churn'], y_pred=preds_test2)

In [None]:
# Import matplotlib
import matplotlib.pyplot as plt

# Open a wide figure; the pandas plot below draws onto the current figure
plt.figure(figsize=(20, 10))

# Importance of each input column for the first model (rf), sorted ascending
# so the most important feature ends up at the top of the horizontal bar chart
importance_by_feature = pd.Series(rf.feature_importances_, index=feats).sort_values()
importance_by_feature.plot.barh()

In [None]:
# Importando a biblioteca para plotar o gráfico de matriz de confusão
import scikitplot as skplt

In [None]:
# Confusion matrix of the first model on the validation data
# (true Churn labels vs. the predictions stored in preds_val)
skplt.metrics.plot_confusion_matrix(valid['Churn'], preds_val)

# Parte - Aluno

In [None]:
# Start from rg, the copy of the cleaned frame taken earlier, and label-encode
# it: every object-typed column is replaced by its integer category codes
rg_object_cols = [name for name in rg.columns if rg[name].dtype == 'object']
for name in rg_object_cols:
    rg[name] = rg[name].astype('category').cat.codes

In [None]:
# Look at the result (dtypes after label encoding)
rg.info()

In [None]:
# Input columns for the model: everything except customerID and the Churn target
rg_excluded = {'customerID', 'Churn'}
rfeats = [column for column in rg.columns if column not in rg_excluded]

In [None]:
# Split the frame into train / validation / test partitions.
# First hold out 20% of all rows as the test set, stratified on the target.
rtrain_full, rtest = train_test_split(
    rg, test_size=0.20, random_state=42, stratify=rg['Churn']
)

# Then take 20% of the remaining rows as the validation set. This split is
# stratified too, so all three partitions keep the same Churn proportions.
rtrain, rvalid = train_test_split(
    rtrain_full, test_size=0.20, random_state=42, stratify=rtrain_full['Churn']
)

# A bare expression in the middle of a cell is not displayed, so print the shapes
print(rtrain.shape, rvalid.shape, rtest.shape)

# Instantiate the model with hand-picked hyper-parameters
rrf = RandomForestClassifier(
    n_estimators=200,
    random_state=42,
    criterion="entropy",
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=10,
    max_features=4,
)

# Train the model
rrf.fit(rtrain[rfeats], rtrain['Churn'])

# Predictions on the validation data
rpreds_val = rrf.predict(rvalid[rfeats])

# Predictions on the test data
rpreds_test = rrf.predict(rtest[rfeats])

# Accuracy: validation first, then test
print(accuracy_score(rvalid['Churn'], rpreds_val))
print(accuracy_score(rtest['Churn'], rpreds_test))

In [None]:
%%time
# Hyper-parameter grid for the random forest; GridSearchCV cross-validates
# every combination of the values below.
param_grid = {
    'n_estimators': [100, 200],
    'min_samples_split': [2, 5, 10, 15],
    'min_samples_leaf': [1, 2, 5, 10],
    'max_depth': range(1, 5, 2),  # i.e. depths 1 and 3
    'max_features': (4, 'log2', 'sqrt'),
    # Weight given to class label 1; two alternatives are tried
    'class_weight': [{1: w} for w in [1, 1.5]],
}

# The grid has several hundred combinations, each fitted once per CV fold.
# n_jobs=-1 runs those fits in parallel on all available CPU cores; the search
# result is unchanged because every estimator uses a fixed random_state.
GridRF = GridSearchCV(RandomForestClassifier(random_state=15), param_grid, n_jobs=-1)

GridRF.fit(rtrain[rfeats], rtrain['Churn'])

print("\nBest parameters \n" + str(GridRF.best_params_))
print("Best cross-validated score: " + str(GridRF.best_score_))