In [4]:
### Ajuste de peso

import pandas as pd
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

data = pd.read_csv('E:/datasets/travel_insurance_us_preprocessed.csv')

target = data['Claim']
features = data.drop('Claim', axis=1)
features_train, features_valid, target_train, target_valid = train_test_split(
    features, target, test_size=0.25, random_state=12345
)

model = LogisticRegression(random_state=12345, solver='liblinear', class_weight='balanced')
model.fit(features_train, target_train)
predicted_valid = model.predict(features_valid)
print('F1:', f1_score(target_valid, predicted_valid))

F1: 0.08698830409356725


In [5]:
### Sobremuestreo
### 1. Se divide el dataset de entrenamiento en observaciones negativas y positivas.

import pandas as pd
from sklearn.model_selection import train_test_split

data = pd.read_csv('E:/datasets/travel_insurance_us_preprocessed.csv')

target = data['Claim']
features = data.drop('Claim', axis=1)
features_train, features_valid, target_train, target_valid = train_test_split(
    features, target, test_size=0.25, random_state=12345
)

# < escribe el código aquí>
features_zeros = features_train[target==0]
features_ones = features_train[target==1]
target_zeros = target_train[target == 0]
target_ones = target_train[target == 1]

print(features_zeros.shape)
print(features_ones.shape)
print(target_zeros.shape)
print(target_ones.shape)

(37411, 196)
(584, 196)
(37411,)
(584,)


  features_zeros = features_train[target==0]
  features_ones = features_train[target==1]


In [6]:
### Las observaciones positivas son escasas en comparación con las negativas, 
### por lo que debes apegarte al pensamiento positivo a través de la pura tiranía de la voluntad... y el sobremuestreo.

In [7]:
### 2. Se duplican varias veces las observaciones positivas (las que raramente ocurren).

repeat = 10
features_upsampled = pd.concat([features_zeros] + [features_ones] * repeat)
target_upsampled = pd.concat([target_zeros] + [target_ones] * repeat) # <escribe el código aquí  >

print(features_upsampled.shape)
print(target_upsampled.shape)

(43251, 196)
(43251,)


In [8]:
### 3. Se crea una nueva muestra de entrenamiento con base en los datos obtenidos.
### 4. Se mezclan los datos

from sklearn.utils import shuffle
from sklearn.utils import resample

features_upsampled, target_upsampled = shuffle(features_upsampled, target_upsampled, random_state=12345)

print(features_upsampled.shape)
print(target_upsampled.shape)

(43251, 196)
(43251,)


In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

model = LogisticRegression(random_state=12345, solver='liblinear')
model.fit(features_upsampled, target_upsampled)

predicted_valid = pd.Series(model.predict(features_valid))

model_f1 = f1_score(target_valid, predicted_valid)

print('F1 for upsampled data:', model_f1)

F1 for upsampled data: 0.13688212927756654


In [10]:
### Funcion para un sobremuestreo

def upsample(features, target, repeat):
    features_zeros = features[target == 0]
    features_ones = features[target == 1]
    target_zeros = target[target == 0]
    target_ones = target[target == 1]

    features_upsampled = pd.concat([features_zeros] + [features_ones] * repeat)
    target_upsampled = pd.concat([target_zeros] + [target_ones] * repeat)

    features_upsampled, target_upsampled = shuffle(
        features_upsampled, target_upsampled, random_state=12345
    )

    return features_upsampled, target_upsampled

In [11]:
features_upsampled, target_upsampled = upsample(
    features_train, target_train, 10
)

In [12]:
### Submuestreo
### Para eliminar aleatoriamente algunos elementos de la tabla, utiliza la función sample(). 
### Esta función requiere un parámetro llamado frac ('fraction' o fracción), 
### que especifica la proporción de los elementos totales que quieres retener.

def downsample(features, target, fraction):
    features_zeros = features[target == 0]
    features_ones = features[target == 1]
    target_zeros = target[target == 0]
    target_ones = target[target == 1]

    # < escribe el código aquí >
    features_downsampled = features_zeros.sample(frac=fraction, random_state=12345)
    features_downsampled = pd.concat([features_downsampled, features_ones])
    #features_downsampled = pd.concat([features_downsampled, features_ones.sample(frac=1-fraction, random_state=12345)])
    
    target_downsampled = target_zeros.sample(frac=fraction, random_state=12345)
    target_downsampled = pd.concat([target_downsampled, target_ones])
    #target_downsampled = pd.concat([target_downsampled, target_ones.sample(frac=1-fraction, random_state=12345)])
    
    features_downsampled, target_downsampled = shuffle(features_downsampled, target_downsampled, random_state=12345)

    return features_downsampled, target_downsampled


In [13]:
features_downsampled, target_downsampled = downsample(
    features_train, target_train, 0.1
)

In [14]:
print(features_downsampled.shape)
print(target_downsampled.shape)

(4325, 196)
(4325,)


In [15]:
model = LogisticRegression(random_state=12345, solver='liblinear')
model.fit(features_downsampled, target_downsampled)

predicted_valid = pd.Series(model.predict(features_valid))

model_f1 = f1_score(target_valid, predicted_valid)

print('F1 for downsampled data:', model_f1)

F1 for downsampled data: 0.13333333333333333
