In [1]:
### Codificación de etiquetas
### La clase LabelEncoder se usa para codificar una sola columna, 
### mientras que la clase OrdinalEncoder se usa para dos o más columnas a la vez, hasta el conjunto de datos completo.

import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

data = pd.read_csv('E:/datasets/travel_insurance_us.csv')

data.head()

Unnamed: 0,Agency,Agency Type,Distribution Channel,Product Name,Claim,Duration,Destination,Net Sales,Commission (in value),Gender,Age
0,JZI,Airlines,Online,Value Plan,0,12,"TAIWAN, PROVINCE OF CHINA",45.0,15.75,M,39
1,EPX,Travel Agency,Online,Cancellation Plan,0,50,JAPAN,22.0,0.0,,36
2,EPX,Travel Agency,Online,Cancellation Plan,0,251,BRAZIL,80.0,0.0,,36
3,EPX,Travel Agency,Online,Cancellation Plan,0,6,INDIA,-10.0,0.0,,36
4,JZI,Airlines,Online,Value Plan,0,5,CHINA,45.0,15.75,M,34


In [52]:
encoder = OrdinalEncoder()
encoder.fit(data)

data_ordinal = encoder.transform(data)

data_ordinal

array([[  9.,   0.,   1., ..., 486.,   1.,  37.],
       [  7.,   1.,   1., ...,   0.,   2.,  34.],
       [  7.,   1.,   1., ...,   0.,   2.,  34.],
       ...,
       [  7.,   1.,   1., ...,   0.,   2.,  34.],
       [  7.,   1.,   1., ...,   0.,   2.,  22.],
       [  2.,   0.,   1., ..., 482.,   1.,  23.]])

In [53]:
data_ordinal = pd.DataFrame(data_ordinal, columns=data.columns)
data_ordinal.head()

Unnamed: 0,Agency,Agency Type,Distribution Channel,Product Name,Claim,Duration,Destination,Net Sales,Commission (in value),Gender,Age
0,9.0,0.0,1.0,24.0,0.0,14.0,121.0,602.0,486.0,1.0,37.0
1,7.0,1.0,1.0,10.0,0.0,52.0,61.0,470.0,0.0,2.0,34.0
2,7.0,1.0,1.0,10.0,0.0,253.0,16.0,735.0,0.0,2.0,34.0
3,7.0,1.0,1.0,10.0,0.0,8.0,54.0,121.0,0.0,2.0,34.0
4,9.0,0.0,1.0,24.0,0.0,7.0,24.0,602.0,486.0,1.0,32.0


In [54]:
target = data_ordinal['Claim']
features = data_ordinal.drop('Claim', axis=1)
features_train, features_valid, target_train, target_valid = train_test_split(
    features, target, test_size=0.25, random_state=12345
)

In [55]:
model = DecisionTreeClassifier(random_state=12345)
model.fit(features_train, target_train)

DecisionTreeClassifier(random_state=12345)

In [56]:
### Codificacion ordinal

### Este tipo de variable categórica se denomina variable ordinal, 
### a diferencia de una variable nominal (una variable de categorías sin orden). 
### La codificación ordinal es una codificación de una variable ordinal con etiquetas numéricas dispuestas en un orden natural
### específico, generalmente realizada mediante enumeración manual de etiquetas.

## temperature_dict = {'cold': 0, 'warm': 1, 'hot': 2}
## df['temperature'] = df['temperature'].map(temperature_dict)

### Las variables que tienen muchas categorías se denominan variables de alta cardinalidad, 
### y el uso de OHE para codificarlas conduce a un alto consumo de memoria

### Para resumir todo, la codificación de una variable categórica con OHE generalmente dará como resultado el mejor rendimiento 
### a menos que:

### Se use en un algoritmo basado en árboles
### Sea una variable ordinal (la codificación ordinal generalmente funciona mejor para las variables ordinales).
### Sea una variable de alta cardinalidad (es posible que se necesiten técnicas de codificación más avanzadas).

In [57]:
### Escalado de características
from sklearn.preprocessing import StandardScaler

pd.options.mode.chained_assignment = None

numeric = ['Duration', 'Net Sales', 'Commission (in value)', 'Age'] ## Lista de todas las características numéricas.

In [58]:
features_train.head()

Unnamed: 0,Agency,Agency Type,Distribution Channel,Product Name,Duration,Destination,Net Sales,Commission (in value),Gender,Age
33312,7.0,1.0,1.0,10.0,79.0,24.0,414.0,0.0,2.0,34.0
50154,7.0,1.0,1.0,10.0,22.0,113.0,398.0,0.0,2.0,34.0
26729,7.0,1.0,1.0,10.0,15.0,76.0,343.0,0.0,2.0,28.0
37842,7.0,1.0,1.0,1.0,44.0,124.0,597.0,0.0,2.0,33.0
23588,7.0,1.0,1.0,1.0,10.0,76.0,457.0,0.0,2.0,32.0


In [59]:
scaler = StandardScaler()
scaler.fit(features_train[numeric])

features_train[numeric] = scaler.transform(features_train[numeric])
features_valid[numeric] = scaler.transform(features_valid[numeric])

In [60]:
features_train.head()

Unnamed: 0,Agency,Agency Type,Distribution Channel,Product Name,Duration,Destination,Net Sales,Commission (in value),Gender,Age
33312,7.0,1.0,1.0,10.0,0.422711,24.0,-0.638946,-0.770269,2.0,-0.298857
50154,7.0,1.0,1.0,10.0,-0.392566,113.0,-0.732311,-0.770269,2.0,-0.298857
26729,7.0,1.0,1.0,10.0,-0.492687,76.0,-1.053254,-0.770269,2.0,-0.807193
37842,7.0,1.0,1.0,1.0,-0.077897,124.0,0.428917,-0.770269,2.0,-0.383579
23588,7.0,1.0,1.0,1.0,-0.564203,76.0,-0.388027,-0.770269,2.0,-0.468302


In [63]:
features_train.shape

(37995, 188)

In [2]:
### Resumen

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

data = pd.read_csv('E:/datasets/travel_insurance_us.csv')

data_ohe = pd.get_dummies(data, drop_first=True)
target = data_ohe['Claim']
features = data_ohe.drop('Claim', axis=1)
features_train, features_valid, target_train, target_valid = train_test_split(
    features, target, test_size=0.25, random_state=12345)

numeric = ["Duration", "Net Sales", "Commission (in value)", "Age"]

scaler = StandardScaler()
scaler.fit(features_train[numeric])
features_train[numeric] = scaler.transform(features_train[numeric])
features_valid[numeric] = scaler.transform(features_valid[numeric])

print(features_train.shape)

(37995, 187)
