In [1]:
import pandas as pd
import numpy as np

from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import recall_score

### Carga de Dados

In [2]:
# Carregar dados
df_churn = pd.read_csv('./datasets/churn_telecom.csv')

In [3]:
# Visualizar Estrutura
df_churn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7032 entries, 0 to 7031
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   IDCliente         7032 non-null   object 
 1   Genero            7032 non-null   object 
 2   Mais65anos        7032 non-null   int64  
 3   TemParceiro       7032 non-null   object 
 4   TemDependentes    7032 non-null   object 
 5   PhoneService      7032 non-null   object 
 6   MultipleLines     7032 non-null   object 
 7   InternetService   7032 non-null   object 
 8   OnlineSecurity    7032 non-null   object 
 9   OnlineBackup      7032 non-null   object 
 10  DeviceProtection  7032 non-null   object 
 11  TechSupport       7032 non-null   object 
 12  StreamingTV       7032 non-null   object 
 13  StreamingMovies   7032 non-null   object 
 14  tenure            7032 non-null   int64  
 15  Contract          7032 non-null   object 
 16  PaperlessBilling  7032 non-null   object 


In [4]:
# Visualizar as primeiras linhas
df_churn.head(5)

Unnamed: 0,IDCliente,Genero,Mais65anos,TemParceiro,TemDependentes,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,...,TechSupport,StreamingTV,StreamingMovies,tenure,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,No,No phone service,DSL,No,Yes,...,No,No,No,1,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,Yes,No,DSL,Yes,No,...,No,No,No,34,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,Yes,No,DSL,Yes,Yes,...,No,No,No,2,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,No,No phone service,DSL,Yes,No,...,Yes,No,No,45,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,Yes,No,Fiber optic,No,No,...,No,No,No,2,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [5]:
# Visualizar últimos registros
df_churn.tail(5)

Unnamed: 0,IDCliente,Genero,Mais65anos,TemParceiro,TemDependentes,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,...,TechSupport,StreamingTV,StreamingMovies,tenure,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
7027,6840-RESVB,Male,0,Yes,Yes,Yes,Yes,DSL,Yes,No,...,Yes,Yes,Yes,24,One year,Yes,Mailed check,84.8,1990.5,No
7028,2234-XADUH,Female,0,Yes,Yes,Yes,Yes,Fiber optic,No,Yes,...,No,Yes,Yes,72,One year,Yes,Credit card (automatic),103.2,7362.9,No
7029,4801-JZAZL,Female,0,Yes,Yes,No,No phone service,DSL,Yes,No,...,No,No,No,11,Month-to-month,Yes,Electronic check,29.6,346.45,No
7030,8361-LTMKD,Male,1,Yes,No,Yes,Yes,Fiber optic,No,No,...,No,No,No,4,Month-to-month,Yes,Mailed check,74.4,306.6,Yes
7031,3186-AJIEK,Male,0,No,No,Yes,No,Fiber optic,Yes,No,...,Yes,Yes,Yes,66,Two year,Yes,Bank transfer (automatic),105.65,6844.5,No


In [6]:
# Contar clientes usando a variável Churn como referência
df_churn['Churn'].value_counts()

No     5163
Yes    1869
Name: Churn, dtype: int64

In [7]:
# Distribuição percentual de Clientes pela variável Churn
df_churn['Churn'].value_counts(normalize=True)

No     0.734215
Yes    0.265785
Name: Churn, dtype: float64

### Preparação da Base para Algoritmo LOF

In [8]:
# Selecionando as colunas para o algoritmo
X = df_churn.drop(columns=['IDCliente', 'Churn'])
y = df_churn['Churn']

In [9]:
# Definir uma função para transformar "Yes" em 1 e "No" em 0
def binary_transformer_function(X):
  return X.applymap(lambda x: 1 if x == 'Yes' else 0)

In [10]:
# Transformações
numeric_features = ['tenure', 'MonthlyCharges', 'TotalCharges']
categorical_features = ['Genero', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'DeviceProtection', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaymentMethod']
binary_features = ['TemParceiro', 'TemDependentes', 'TechSupport', 'PhoneService', 'PaperlessBilling']
no_transformation_features = ['Mais65anos']

# Criar Transformers
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder()
binary_tranformer = FunctionTransformer(binary_transformer_function)

# Criar o preprocessor
preprocessor = ColumnTransformer(
  transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
    ('bin', binary_tranformer, binary_features),
    ('pass', 'passthrough', no_transformation_features)
  ]
)

# Transformar os dados
X_transformed = preprocessor.fit_transform(X)

In [11]:
# Visualizar X_transformed
X_transformed.shape

(7032, 36)

### Treinar o algoritmo LOF

In [12]:
# Instânciar um objeto LOF
lof = LocalOutlierFactor(n_neighbors=30, contamination=0.26)

In [13]:
# Treinar algoritmo e já gerar as classificações de anomalia para registro (ponto de dados)
y_pred = lof.fit_predict(X_transformed)

In [14]:
# Mostrar valores preditos (anomalia ou não anomalia)
# No sklearn, o predict gera um valor = -1 (anomalia) e valor = 1 para (pontos normais)
y_pred

array([ 1,  1, -1, ...,  1, -1,  1])

In [15]:
# Mostrar o LOF calculado para cada ponto de dados
# No sklearn, o LOF calculado fica na propriedade negative_outliers_factor_
# negative_outlier_factor_ é o inverso do LOF. Quanto menor, mais anormal.
-lof.negative_outlier_factor_

array([0.99111539, 1.01897903, 1.17653636, ..., 1.07937064, 1.19307812,
       1.07615609])

### Apresentar resultados

In [16]:
# Identificar anomalias
outliers = y_pred == -1
inliers = y_pred == 1

# Contar anomalias e os pontos normais
num_outliers = np.sum(outliers)
num_inliers = np.sum(inliers)

# Apresentar estatísticas
print(f'Anomalias detectadas: {num_outliers}')
print(f'Pontos normais: {num_inliers}')

Anomalias detectadas: 1829
Pontos normais: 5203


In [17]:
# Converter y para a mesma base do y_pred
y_true = y.map(lambda x: -1 if x == 'Yes' else 1)

In [18]:
# Calcular Score com base no valor de y (Churn real da base)
# Usar Recall, pois o objetivo principal é maximizar o TPR (True Positive Rate)
recall_score(y_true, y_pred)

0.7549874104202983