In [1]:
import pandas as pd
from pandas import get_dummies
from sklearn.impute import SimpleImputer
from utilities.evaluation import ModelEvaluation
from sklearn.tree import DecisionTreeClassifier
from mlxtend.evaluate import bias_variance_decomp
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
churn = pd.read_csv("https://raw.githubusercontent.com/stivenlopezg/DS-ONLINE-64/main/data/churn-modeling.csv", sep=',')
churn.head()

Unnamed: 0,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,15634602,Hargrave,619,France,Female,42,2,0.0,1,Yes,Yes,101348.88,1
1,15647311,Hill,608,Spain,Female,41,1,83807.86,1,No,Yes,112542.58,0
2,15619304,Onio,502,France,Female,42,8,159660.8,3,Yes,No,113931.57,1
3,15701354,Boni,699,France,Female,39,1,0.0,2,No,No,93826.63,0
4,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,Yes,Yes,79084.1,0


In [3]:
churn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CustomerId       10000 non-null  int64  
 1   Surname          10000 non-null  object 
 2   CreditScore      10000 non-null  int64  
 3   Geography        9980 non-null   object 
 4   Gender           10000 non-null  object 
 5   Age              10000 non-null  int64  
 6   Tenure           10000 non-null  int64  
 7   Balance          10000 non-null  float64
 8   NumOfProducts    10000 non-null  int64  
 9   HasCrCard        10000 non-null  object 
 10  IsActiveMember   10000 non-null  object 
 11  EstimatedSalary  9988 non-null   float64
 12  Exited           10000 non-null  int64  
dtypes: float64(2), int64(6), object(5)
memory usage: 1015.8+ KB


In [4]:
churn['Exited'].value_counts(normalize=True)

0    0.7963
1    0.2037
Name: Exited, dtype: float64

In [5]:
exited = churn.pop('Exited')

train_data, test_data, train_label, test_label = train_test_split(churn, exited,
                                                                  test_size=0.3,
                                                                  stratify=exited, random_state=42)

print(f'El set de entrenamiento tiene {train_data.shape[0]} observaciones, y {train_data.shape[1]} variables.')
print(f'El set de prueba tiene {test_data.shape[0]} observaciones, y {test_data.shape[1]} variables.')

El set de entrenamiento tiene 7000 observaciones, y 12 variables.
El set de prueba tiene 3000 observaciones, y 12 variables.


In [6]:
numerical_features = ['CreditScore', 'Age', 'Tenure',
                      'Balance', 'NumOfProducts', 'EstimatedSalary']

categorical_features = ['Gender', 'Geography', 'HasCrCard', 'IsActiveMember']

drop_features = ["CustomerId", "Surname"]

In [7]:
train_data.drop(labels=drop_features, axis=1, inplace=True)
test_data.drop(labels=drop_features, axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


## Preprocesamiento

### Numericas

In [8]:
# Imputacion de datos missing

imputer_num = SimpleImputer(strategy='median')

train_data[numerical_features] = imputer_num.fit_transform(train_data[numerical_features])

test_data[numerical_features] = imputer_num.transform(test_data[numerical_features])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data[numerical_features] = imputer_num.fit_transform(train_data[numerical_features])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value[:, i].tolist(), pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data[numerical_features] = imputer_num.transform(t

In [9]:
# Escalar datos numericos

scaler = StandardScaler()

train_data[numerical_features] = scaler.fit_transform(train_data[numerical_features])

test_data[numerical_features] = scaler.transform(test_data[numerical_features])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data[numerical_features] = scaler.fit_transform(train_data[numerical_features])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value[:, i].tolist(), pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data[numerical_features] = scaler.transform(test_data[n

### Categoricos

In [10]:
imputer_cat = SimpleImputer(strategy='most_frequent')

train_data[categorical_features] = imputer_cat.fit_transform(train_data[categorical_features])

test_data[categorical_features] = imputer_cat.transform(test_data[categorical_features])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data[categorical_features] = imputer_cat.fit_transform(train_data[categorical_features])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value[:, i].tolist(), pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data[categorical_features] = imputer_cat.trans

In [11]:
# OHE

train_data = get_dummies(data=train_data, columns=categorical_features)

test_data = get_dummies(data=test_data, columns=categorical_features)

In [16]:
lr = LogisticRegression(random_state=42)

avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(estimator=lr,
                                                            X_train=train_data.values,
                                                            y_train=train_label.values,
                                                            X_test=test_data.values,
                                                            y_test=test_label.values,
                                                            loss='0-1_loss',
                                                            random_seed=42)

print(f'El promedio de la perdida esperada es: {avg_expected_loss}')
print(f'El promedio del sesgo: {avg_bias}')
print(f'E promedio de la varianza: {avg_var}')

El promedio de la perdida esperada es: 0.18661833333333333
El promedio del sesgo: 0.18766666666666668
E promedio de la varianza: 0.010461666666666666


In [17]:
dtree = DecisionTreeClassifier(random_state=42)

avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(estimator=dtree,
                                                            X_train=train_data.values,
                                                            y_train=train_label.values,
                                                            X_test=test_data.values,
                                                            y_test=test_label.values,
                                                            loss='0-1_loss',
                                                            random_seed=42)

print(f'El promedio de la perdida esperada es: {avg_expected_loss}')
print(f'El promedio del sesgo: {avg_bias}')
print(f'E promedio de la varianza: {avg_var}')

El promedio de la perdida esperada es: 0.21040333333333336
El promedio del sesgo: 0.14066666666666666
E promedio de la varianza: 0.14594666666666667
