# Datasets para modelos

In [1]:
# Números
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

In [2]:
# Data directories, relative to the notebook's location.
# Forward slashes are used instead of Windows-only backslashes so the same
# paths resolve on Windows, Linux and macOS. Each path keeps its trailing
# separator because later cells build file names by string concatenation.
path_original = "../data/original/"
path_proc = "../data/processed/"
path_ext = "../data/external/"

## Datasets

In [3]:
# Read data_clean.csv from the processed-data folder and print its summary.
data = pd.read_csv(f"{path_proc}data_clean.csv")
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13470 entries, 0 to 13469
Data columns (total 47 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        13470 non-null  int64  
 1   REGION    13470 non-null  int64  
 2   SEX       13470 non-null  int64  
 3   AHEIGHT   13470 non-null  float64
 4   AWEIGHTP  13470 non-null  float64
 5   BMI       13470 non-null  float64
 6   AGE_P     13470 non-null  int64  
 7   RACERPI2  13470 non-null  int64  
 8   R_MARITL  13470 non-null  float64
 9   DOINGLWA  13470 non-null  float64
 10  WRKLYR4   13470 non-null  float64
 11  HYPYR1    13470 non-null  float64
 12  HYPMED2   13470 non-null  float64
 13  CHLYR     13470 non-null  float64
 14  CHLMDNW2  13470 non-null  float64
 15  ANGEV     13470 non-null  float64
 16  MIEV      13470 non-null  float64
 17  HRTEV     13470 non-null  float64
 18  STREV     13470 non-null  float64
 19  EPHEV     13470 non-null  float64
 20  COPDEV    13470 non-null  fl

In [4]:
# Feature metadata: a semicolon-delimited CSV in the external-data folder.
meta = pd.read_csv(f"{path_ext}feat_metadata.csv", delimiter=";")

## Variables dummy

Se obtienen las variables *dummies* de las variables categóricas.

In [5]:
# Group the feature codes by the type recorded in the metadata "object" column.
cat_features = meta.loc[meta["object"].eq("category"), "code"].tolist()
cont_features = meta.loc[meta["object"].eq("float"), "code"].tolist()
bool_features = meta.loc[meta["object"].eq("bool"), "code"].tolist()
# Identifier column; it is excluded from the dtype casts.
id_feature = ["id"]

In [6]:
# Cast each feature group to a compact dtype: categorical and boolean features
# become int32, continuous features become float32. The id column is never cast.
#
# The casts go through DataFrame.astype with a per-column mapping instead of
# assigning through `data.loc[:, mask] = ...`: on pandas >= 2.0 that kind of
# assignment writes into the existing columns and keeps their original dtype,
# so the conversion would silently have no effect.
dtype_by_group = (
    (cat_features, "int32"),
    (bool_features, "int32"),
    (cont_features, "float32"),
)
target_dtypes = {}
for group_features, group_dtype in dtype_by_group:
    for column in data.columns:
        if column in group_features and column not in id_feature:
            # Groups are visited in the original order (categorical, boolean,
            # continuous), so a column listed in more than one group ends up
            # with the dtype of the last group that contains it.
            target_dtypes[column] = group_dtype
data = data.astype(target_dtypes)

In [7]:
# Print the frame summary again to check the dtypes after the casts.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13470 entries, 0 to 13469
Data columns (total 47 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        13470 non-null  int64  
 1   REGION    13470 non-null  int32  
 2   SEX       13470 non-null  int32  
 3   AHEIGHT   13470 non-null  float32
 4   AWEIGHTP  13470 non-null  float32
 5   BMI       13470 non-null  float32
 6   AGE_P     13470 non-null  float32
 7   RACERPI2  13470 non-null  int32  
 8   R_MARITL  13470 non-null  int32  
 9   DOINGLWA  13470 non-null  int32  
 10  WRKLYR4   13470 non-null  int32  
 11  HYPYR1    13470 non-null  int32  
 12  HYPMED2   13470 non-null  int32  
 13  CHLYR     13470 non-null  int32  
 14  CHLMDNW2  13470 non-null  int32  
 15  ANGEV     13470 non-null  int32  
 16  MIEV      13470 non-null  int32  
 17  HRTEV     13470 non-null  int32  
 18  STREV     13470 non-null  int32  
 19  EPHEV     13470 non-null  int32  
 20  COPDEV    13470 non-null  in

In [8]:
# Keep only the columns whose names are listed as categorical features.
to_dummy = data.loc[:, data.columns.isin(cat_features)]
to_dummy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13470 entries, 0 to 13469
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   REGION    13470 non-null  int32
 1   SEX       13470 non-null  int32
 2   RACERPI2  13470 non-null  int32
 3   R_MARITL  13470 non-null  int32
 4   ASICPUSE  13470 non-null  int32
dtypes: int32(5)
memory usage: 263.2 KB


In [9]:
# Build the dummy variables. Values are cast to strings first so that
# get_dummies encodes every column; drop_first=True omits the first level of
# each variable.
cat_dummy = pd.get_dummies(data=to_dummy.astype("str"), drop_first=True)

In [10]:
# Regression dataset: id, continuous features, boolean features and the
# dummy-encoded categorical features placed side by side.
data_dummy = pd.concat(
    [data[["id"]], data[cont_features], data[bool_features], cat_dummy],
    axis="columns",
)
data_dummy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13470 entries, 0 to 13469
Data columns (total 58 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   id          13470 non-null  int64  
 1   AHEIGHT     13470 non-null  float32
 2   AWEIGHTP    13470 non-null  float32
 3   BMI         13470 non-null  float32
 4   AGE_P       13470 non-null  float32
 5   CIGSDAY     13470 non-null  float32
 6   VIG         13470 non-null  float32
 7   MOD         13470 non-null  float32
 8   ALC         13470 non-null  float32
 9   ASISLEEP    13470 non-null  float32
 10  DOINGLWA    13470 non-null  int32  
 11  WRKLYR4     13470 non-null  int32  
 12  HYPYR1      13470 non-null  int32  
 13  HYPMED2     13470 non-null  int32  
 14  CHLYR       13470 non-null  int32  
 15  CHLMDNW2    13470 non-null  int32  
 16  ANGEV       13470 non-null  int32  
 17  MIEV        13470 non-null  int32  
 18  HRTEV       13470 non-null  int32  
 19  STREV       13470 non-nul

In [11]:
# Save the dummy-encoded dataset to the processed-data folder, without the row index.
data_dummy.to_csv(f"{path_proc}data_dummy.csv", index=False)

## Datasets balanceados

Se generan 5 datasets, cada uno con un muestreo distinto de la clase 0.

In [12]:
# Use the "id" column as the index. The guard keeps this cell re-runnable:
# once "id" is the index it is no longer a column, and calling
# set_index("id") a second time would raise KeyError.
if "id" in data_dummy.columns:
    data_dummy = data_dummy.set_index("id")

In [13]:
# Number of rows in each target class.
print(f"Casos positivos: {data_dummy.target.eq(1).sum()}")
print(f"Casos negativos: {data_dummy.target.eq(0).sum()}")

Casos positivos: 1528
Casos negativos: 11942


In [14]:
# Balance the classes by undersampling the negative class (target == 0) down
# to the number of positive cases (target == 1).
target1 = data_dummy[data_dummy.target == 1]
# The sample size is taken from the positive class instead of being a
# hard-coded count, so the classes stay balanced if the input data changes.
# random_state fixes the draw so the sample is reproducible.
target0 = data_dummy[data_dummy.target == 0].sample(n=len(target1), random_state=55)
data_bal = pd.concat([target0, target1])
# Class counts after balancing.
print(f"Casos positivos: {len(data_bal[data_bal.target == 1])}")
print(f"Casos negativos: {len(data_bal[data_bal.target == 0])}")
# index=True writes the frame's index as the first column of the file.
data_bal.to_csv(f"{path_proc}data_bal.csv", index=True)

Casos positivos: 1528
Casos negativos: 1528


In [16]:
# Second copy of the balanced dataset, written without its index.
# NOTE(review): data_bal.csv is written with index=True while this file drops
# the index; confirm that omitting it here is intended.
data_bal.to_csv(f"{path_proc}data_bal_dummy.csv", index=False)

### Separación dev-test

In [17]:
# Hold out 20% of the balanced data as the test set; the rest is the dev set.
# The fixed random_state makes the split reproducible.
X_dev, X_test = train_test_split(data_bal, random_state=55, test_size=0.2)

In [18]:
# Write the dev and test partitions; to_csv includes the index by default.
X_dev.to_csv(f"{path_proc}data_bal_dummy_dev.csv")
X_test.to_csv(f"{path_proc}data_bal_dummy_test.csv")

### Separación train-val

In [19]:
# Create five train/validation splits of the dev set (20% validation each).
# Split i uses random_state 55*i, so the splits differ from each other but are
# reproducible. Each pair is saved as data_bal_dummy_train{i}.csv and
# data_bal_dummy_val{i}.csv.
for i in range(1, 6):
    seed = 55 * i
    print(f"Random State: {seed}")
    X_train, X_val = train_test_split(X_dev, test_size=0.2, random_state=seed)
    X_train.to_csv(f"{path_proc}data_bal_dummy_train{i}.csv")
    X_val.to_csv(f"{path_proc}data_bal_dummy_val{i}.csv")

Random State: 55
Random State: 110
Random State: 165
Random State: 220
Random State: 275
