# Proyecto Telecom X - Etapa 2 - Machine Learning 
#    (Previsión de Evasión (Churn) de clientes.)


In [1]:
# Libraries used here

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
import tabulate as tabulate
import folium
from pandas import json_normalize
from folium.plugins import HeatMap
import warnings
import dfply as dp
from pathlib import Path

warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

import json




import requests
import pickle

from pathlib import Path
from typing import Tuple, Callable, List

from imblearn.under_sampling import NearMiss
from imblearn.over_sampling import SMOTENC

from sklearn.model_selection import train_test_split as tts
from sklearn.model_selection import StratifiedKFold, cross_validate, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, roc_auc_score, RocCurveDisplay
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier as knn
from sklearn.svm import SVC
from sklearn.inspection import permutation_importance

import xgboost
from xgboost import XGBClassifier


### Project description

In this project, predictive models will be developed that can predict which customers are most likely to cancel their services.

The company wants to anticipate the churn problem explored in the previous stage. Therefore, in this notebook, a robust pipeline will be built for this initial modeling stage, in which the following tasks will be carried out:

- Prepare the data for modeling (processing, encoding, normalization).
- Perform evaluation analysis and variable selection.
- Train different classification models, evaluate their performance using different metrics, and interpret the results of each.
- Finally, create a report with a strategic conclusion highlighting the main factors that influence churn.


#### Data dictionary 

- `customerID`: número de identificación único de cada cliente
- `Churn`: si el cliente dejó o no la empresa
- `gender`: género (masculino y femenino)
- `SeniorCitizen`: información sobre si un cliente tiene o no una edad igual o mayor a 65 años
- `Partner`: si el cliente tiene o no una pareja
- `Dependents`: si el cliente tiene o no dependientes
- `tenure`: meses de contrato del cliente
- `PhoneService`: suscripción al servicio telefónico
- `MultipleLines`: suscripción a más de una línea telefónica
- `InternetService`: suscripción a un proveedor de internet
- `OnlineSecurity`: suscripción adicional de seguridad en línea
- `OnlineBackup`: suscripción adicional de respaldo en línea
- `DeviceProtection`: suscripción adicional de protección del dispositivo
- `TechSupport`: suscripción adicional de soporte técnico, menor tiempo de espera
- `StreamingTV`: suscripción de televisión por cable
- `StreamingMovies`: suscripción de streaming de películas
- `Contract`: tipo de contrato
- `PaperlessBilling`: si el cliente prefiere recibir la factura en línea
- `PaymentMethod`: forma de pago
- `Charges.Monthly`: total de todos los servicios del cliente por mes
- `Charges.Total`: total gastado por el cliente

### Preprocessing data

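De acuerdo con el challenge anterior (Parte 1), en el que ya se realizaron la limpieza de los datos, la transformación de variables y el análisis exploratorio, en esta etapa se parte del conjunto de datos ya limpio y solo se ajustan los tipos de datos de las columnas antes de modelar.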

In [3]:
# Load the dataset.
# The path is relative: "." is the current folder and ".." is its parent
# (one level up), so the CSV is looked up in a folder next to this one.
csv_path = Path("..", "challenge-telecomX_Latam", "TelecomX_Data_Cleaned.csv")
df1 = pd.read_csv(csv_path)

# Preview the first rows as the cell output.
df1.head()

Unnamed: 0,customerID,Churn,customer.gender,customer.SeniorCitizen,customer.Partner,customer.Dependents,customer.tenure,phone.PhoneService,phone.MultipleLines,internet.InternetService,...,internet.OnlineBackup,internet.DeviceProtection,internet.TechSupport,internet.StreamingTV,internet.StreamingMovies,account.Contract,account.PaperlessBilling,account.PaymentMethod,account.Charges.Monthly,account.Charges.Total
0,0002-ORFBO,False,Female,False,True,True,9,True,False,DSL,...,1,0,1,1,0,One year,True,Mailed check,65.6,593.3
1,0003-MKNFE,False,Male,False,False,False,9,True,True,DSL,...,0,0,0,0,1,Month-to-month,False,Mailed check,59.9,542.4
2,0004-TLHLJ,True,Male,False,False,False,4,True,False,Fiber optic,...,0,1,0,0,0,Month-to-month,True,Electronic check,73.9,280.85
3,0011-IGKFF,True,Male,True,True,False,13,True,False,Fiber optic,...,1,1,0,1,1,Month-to-month,True,Electronic check,98.0,1237.85
4,0013-EXCHZ,True,Female,True,True,False,3,True,False,Fiber optic,...,0,0,1,1,0,Month-to-month,True,Mailed check,83.9,267.4


In [4]:
# Display the column labels of the loaded frame.
df1.columns

Index(['customerID', 'Churn', 'customer.gender', 'customer.SeniorCitizen',
       'customer.Partner', 'customer.Dependents', 'customer.tenure',
       'phone.PhoneService', 'phone.MultipleLines', 'internet.InternetService',
       'internet.OnlineSecurity', 'internet.OnlineBackup',
       'internet.DeviceProtection', 'internet.TechSupport',
       'internet.StreamingTV', 'internet.StreamingMovies', 'account.Contract',
       'account.PaperlessBilling', 'account.PaymentMethod',
       'account.Charges.Monthly', 'account.Charges.Total'],
      dtype='object')

In [8]:
# Print a summary of df2 (column dtypes and non-null counts).
# NOTE(review): df2 is only created in a later cell (`df2 = df1.drop(...)`), so on a
# fresh kernel run top-to-bottom this cell raises NameError; it must run after that cell.
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column                     Non-Null Count  Dtype   
---  ------                     --------------  -----   
 0   Churn                      7043 non-null   bool    
 1   customer.gender            7043 non-null   category
 2   customer.SeniorCitizen     7043 non-null   bool    
 3   customer.Partner           7043 non-null   bool    
 4   customer.Dependents        7043 non-null   bool    
 5   customer.tenure            7043 non-null   int64   
 6   phone.PhoneService         7043 non-null   bool    
 7   phone.MultipleLines        7043 non-null   bool    
 8   internet.InternetService   7043 non-null   category
 9   internet.OnlineSecurity    7043 non-null   category
 10  internet.OnlineBackup      7043 non-null   category
 11  internet.DeviceProtection  7043 non-null   category
 12  internet.TechSupport       7043 non-null   category
 13  internet.StreamingTV       7043 n

In [6]:
# Transform the column dtypes so the DataFrame is easier to read and to model.

# Columns stored as pandas categoricals.
category_cols = [
    'customer.gender', 'internet.InternetService', 'account.Contract', 'account.PaymentMethod',
    'internet.OnlineSecurity', 'internet.OnlineBackup', 'internet.DeviceProtection',
    'internet.TechSupport', 'internet.StreamingTV', 'internet.StreamingMovies',
]
for col in category_cols:
    df1[col] = df1[col].astype('category')

# Flag columns converted to boolean.
bool_cols = [
    'customer.Partner', 'customer.Dependents', 'phone.PhoneService', 'phone.MultipleLines',
    'account.PaperlessBilling', 'Churn', 'customer.SeniorCitizen',
]

# Map 'Yes'/'No' to 1/0 on the flag columns only. A frame-wide replace would also
# rewrite any 'No' label inside the categorical columns above (mixing the integer 0
# with string categories) and depends on replace-on-categorical behavior that pandas
# has deprecated.
# NOTE(review): astype('bool') turns NaN into True -- presumably these columns have
# no missing values in the loaded CSV; verify.
for col in bool_cols:
    df1[col] = df1[col].replace({'Yes': 1, 'No': 0}).astype('bool')

# Backward compatibility: this cell used to leave `cat_cols` bound to the boolean
# column list, so keep that binding for any later cell that reads it.
cat_cols = bool_cols

# Convert the 'account.Charges.Total' column to float.
df1['account.Charges.Total'] = df1['account.Charges.Total'].astype(float)

In [7]:
# Build df2 from df1 without the customerID column.
df2 = df1.drop(columns=['customerID'])

# Summary statistics of df1, shown as the cell output.
df1.describe()

Unnamed: 0,customer.tenure,account.Charges.Monthly,account.Charges.Total
count,7043.0,7043.0,7032.0
mean,32.371149,64.761692,2283.300441
std,24.559481,30.090047,2266.771362
min,0.0,18.25,18.8
25%,9.0,35.5,401.45
50%,29.0,70.35,1397.475
75%,55.0,89.85,3794.7375
max,72.0,118.75,8684.8


In Challenge Part 1 of the project, we successfully conducted a comprehensive Exploratory Data Analysis (EDA) focused on understanding customer churn behavior within TelecomX LATAM. This included thorough data cleaning, variable transformation, and the identification of key trends and correlations. We analyzed how different factors such as contract type, tenure, payment method, and monthly charges influence churn rates. Additionally, we examined variable distributions, addressed missing values, detected multicollinearity, and gained actionable business insights.