In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# Any results you write to the current directory are saved as output.

In [None]:
# Read the Telco customer-churn CSV from the Kaggle input directory.
churn_csv = '/kaggle/input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv'
df = pd.read_csv(churn_csv)

# Show the first three records transposed (one column per record).
df.iloc[:3].transpose()

In [None]:
# Print the DataFrame summary (columns, non-null counts, dtypes).
df.info()

In [None]:
# Convert 'TotalCharges' to float. Entries that cannot be parsed as a number
# (presumably blank strings in the raw CSV -- TODO confirm) become NaN and are
# then filled with 0.
# The previous `.str.replace(' ', '0')` replaced *every* space, so a value with
# an embedded space would have been silently corrupted, and the cell raised on
# a second run because `.str` is unavailable once the column is float.
# pd.to_numeric accepts an already-numeric column, so this cell is re-runnable.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce').fillna(0.0)

In [None]:
# Preview: one-hot encode 'gender', leaving out the column of the first level.
pd.get_dummies(df['gender'], drop_first=True)

In [None]:
# Preview: one-hot encode 'PaymentMethod' (one indicator column per distinct value).
pd.get_dummies(df['PaymentMethod'])

In [None]:
# Columns to be replaced by one-hot indicator columns.
categorical_cols = [
    'gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
    'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
    'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
    'PaperlessBilling', 'PaymentMethod',
]
# Note: `df` is overwritten, so this cell needs the freshly loaded frame.
df = pd.get_dummies(df, columns=categorical_cols)

In [None]:
# Show the first rows transposed, so each column of the frame is displayed as a row.
df.head().T

In [None]:
# Model features: every column except the customer identifier and the target.
non_feature_cols = ('customerID', 'Churn')
feats = [col for col in df.columns if col not in non_feature_cols]

In [None]:
from sklearn.model_selection import train_test_split

# Both splits hold out 20% and use the same seed.
split_options = dict(test_size=0.2, random_state=42)

# First hold out the test set, then carve the validation set out of the rest.
train, test = train_test_split(df, **split_options)
train, valid = train_test_split(train, **split_options)

# Display the shape of each subset.
tuple(subset.shape for subset in (train, valid, test))

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline model: a 200-tree random forest with a fixed seed.
rf = RandomForestClassifier(n_estimators=200, random_state=42)

In [None]:
# Fit the baseline forest on the training features against the 'Churn' target.
rf.fit(train[feats], train['Churn'])

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Predict churn for the validation rows and score the predictions.
preds_val = rf.predict(valid.loc[:, feats])

accuracy_score(y_true=valid['Churn'], y_pred=preds_val)

In [None]:
# Predict churn for the held-out test rows and score the predictions.
preds_test = rf.predict(test.loc[:, feats])

accuracy_score(y_true=test['Churn'], y_pred=preds_test)

In [None]:
# Horizontal bar chart of the baseline forest's feature importances, sorted ascending.
importances = pd.Series(data=rf.feature_importances_, index=feats)
importances.sort_values(ascending=True).plot(kind='barh')

In [None]:
import scikitplot as skplt

In [None]:
# Plot the confusion matrix for the validation set (true labels first, then predictions).
skplt.metrics.plot_confusion_matrix(valid['Churn'],preds_val)

# A Partir daqui Trabalho

In [None]:
# Experiment: cap the depth of every tree in the forest.
rft = RandomForestClassifier(n_estimators=200, random_state=42, max_depth=9).fit(
    train[feats], train['Churn']
)
pred_teste = rft.predict(valid[feats])
accuracy_score(y_true=valid['Churn'], y_pred=pred_teste)

# The result varies a little; it can improve accuracy by almost 1%.

In [None]:
# Experiment: grow more trees (300 estimators).
rft = RandomForestClassifier(random_state=42, n_estimators=300)
rft.fit(X=train[feats], y=train['Churn'])
pred_teste = rft.predict(X=valid[feats])
accuracy_score(y_true=valid['Churn'], y_pred=pred_teste)

# More estimators do not necessarily increase accuracy.

In [None]:
# Experiment: require a minimum number of samples in a node before it may split.
rft = RandomForestClassifier(
    min_samples_split=1000,
    n_estimators=200,
    random_state=42,
).fit(train[feats], train['Churn'])
pred_teste = rft.predict(valid[feats])
accuracy_score(y_true=valid['Churn'], y_pred=pred_teste)

# Results vary a lot. With very high values the trees cannot grow, which hurts accuracy.

In [None]:
# Experiment: require a minimum number of samples in each leaf at the end of a tree.
rft = RandomForestClassifier(
    min_samples_leaf=100,
    n_estimators=200,
    random_state=42,
)
rft.fit(X=train[feats], y=train['Churn'])
pred_teste = rft.predict(X=valid[feats])
accuracy_score(y_true=valid['Churn'], y_pred=pred_teste)

# As with min_samples_split, results vary a lot. Very high values hurt accuracy.

In [None]:
# Split the data again, this time accounting for the class imbalance by
# stratifying both the test and the validation split on the target variable.

train, test = train_test_split(df, test_size=0.2, random_state=42, stratify=df['Churn'])

# The validation split is stratified too. Previously only the test split was,
# so the validation set was still drawn without regard to the 'Churn' ratio.
train, valid = train_test_split(train, test_size=0.2, random_state=42, stratify=train['Churn'])

train.shape, valid.shape, test.shape

In [None]:
# Baseline settings again, for comparison. For reference, the earlier accuracy was
# 0.7888198757763976 on validation and 0.794889992902768 on test.
rft = RandomForestClassifier(n_estimators=200, random_state=42).fit(
    train[feats], train['Churn']
)

pred_teste = rft.predict(valid[feats])
print(accuracy_score(y_true=valid['Churn'], y_pred=pred_teste))

pred_teste_test = rft.predict(test[feats])
print(accuracy_score(y_true=test['Churn'], y_pred=pred_teste_test))

# A different result, but nothing encouraging.

In [None]:
# Same comparison, now with tuned options (depth cap and minimum split size).
rft = RandomForestClassifier(
    max_depth=9,
    min_samples_split=10,
    n_estimators=200,
    random_state=42,
)
rft.fit(X=train[feats], y=train['Churn'])

pred_teste = rft.predict(X=valid[feats])
print(accuracy_score(y_true=valid['Churn'], y_pred=pred_teste))

pred_teste_test = rft.predict(X=test[feats])
print(accuracy_score(y_true=test['Churn'], y_pred=pred_teste_test))

# A slightly better result.

In [None]:
# Inspect the class imbalance: count the rows for each 'Churn' value.
df['Churn'].value_counts()

# Shares noted by the author:
# No: 73.46%
# Yes: 26.54%

In [None]:
# Experiment: weight the 'Churn' classes to counter the class imbalance.
class_weight = {'No': 1, 'Yes': 1.1}

# `min_impurity_split` and `max_features='auto'` were removed from scikit-learn,
# so passing them raises on current versions. 'sqrt' is what 'auto' meant for
# classifiers. The remaining arguments of the original call were the library
# defaults and are therefore omitted.
rdf = RandomForestClassifier(
    n_estimators=300,
    criterion='gini',
    max_depth=8,
    max_features='sqrt',
    min_samples_split=10,
    min_samples_leaf=4,
    class_weight=class_weight,
    random_state=42,
)

rdf.fit(train[feats], train['Churn'])

pred_teste = rdf.predict(valid[feats])
print(accuracy_score(valid['Churn'], pred_teste))

pred_teste_test = rdf.predict(test[feats])
print(accuracy_score(test['Churn'], pred_teste_test))

# Weighting by the ratio between the classes did not improve the model.
# With trial-and-error weights, accuracy can reach about 80%.