In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [2]:
df_1 = pd.read_csv("./data/churn.xls")
df = df_1.copy()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB


In [3]:
# Verificamos que no existan valores missing en el dataset
df.isnull().sum()

RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [4]:
# Comprobamos si existen filas duplicadas completas
df.duplicated().sum()

np.int64(0)

In [5]:
# Eliminamos identificadores y variables sin valor predictivo
df = df.drop(columns=['RowNumber', 'CustomerId', 'Surname'])

In [6]:
# Convertimos variables categóricas a numéricas mediante One-Hot Encoding
df = pd.get_dummies(df, columns=['Geography', 'Gender'], drop_first=True)

In [7]:
# Definimos las columnas numéricas que serán escaladas
numericas = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']

In [8]:
# Escalamos las variables numéricas
df[numericas] = scaler.fit_transform(df[numericas])

In [9]:
# Feature: cliente con balance positivo
df['HasBalance'] = (df['Balance'] > 0).astype(int)

In [10]:
# Feature: ratio salario / edad
df['SalaryAgeRatio'] = df['EstimatedSalary'] / (df['Age'] + 1)

In [11]:
# Feature: cliente comprometido
df['EngagedCustomer'] = ((df['IsActiveMember'] == 1) & (df['NumOfProducts'] > 1)).astype(int)

In [12]:
# Separamos variables predictoras y objetivo
X = df.drop('Exited', axis=1)
y = df['Exited']

In [13]:
# Correlación con la variable objetivo
df.corr(numeric_only=True)['Exited'].sort_values(ascending=False)

Exited               1.000000
Age                  0.285323
EngagedCustomer      0.178537
Geography_Germany    0.173488
Balance              0.118533
HasBalance           0.111294
EstimatedSalary      0.012097
SalaryAgeRatio       0.003483
HasCrCard           -0.007138
Tenure              -0.014001
CreditScore         -0.027094
NumOfProducts       -0.047820
Geography_Spain     -0.052667
Gender_Male         -0.106512
IsActiveMember      -0.156128
Name: Exited, dtype: float64

In [14]:
# División en train y test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)