# Clasificador de Riesgo de Enfermedad Coronaria

In [1]:
import pandas as pd
import numpy as np
import statistics
import datetime
import time

# Preprocess data
from sklearn.preprocessing import StandardScaler
# Create model
from sklearn.linear_model import LogisticRegression
# Optimize model
from sklearn.model_selection import GridSearchCV
# Train model
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
# Get metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

# Import data

In [2]:
df = pd.read_csv("./datos_train.csv")

# Preprocess data

In [114]:
class Preprocesor:
  def __init__(self, df):
    self.df = df.copy()


  # AUXILIARY FUNCTIONS
  
  def _fix_age(self, age):
    if not isinstance(age, str):
      return age
    age = age.replace("Age ", "").replace(" to ", "-").replace(" or older", "-")
    return age

  def _get_avg_from_num_str(self, age):
    if not isinstance(age, str):
      return age
    strnums = age.split('-')
    nums = [int(strnum) for strnum in strnums if strnum]
    avg = sum(nums) / len(nums)
    return int(avg)

  def set_id_as_index(self):
    self.df.set_index('id', inplace=True)

  def fix_ages(self):
    self.df["CategoriaDeEdad"] = self.df["CategoriaDeEdad"].apply(self._fix_age)


  # REPLACE STRINGS WITH NUMBERS

  def replace_yes_no(self, col):
    self.df[col] = self.df[col].map({'Yes': 1, 'No': 0})

  def set_to_dummie(self, cols):
    self.df = pd.get_dummies(self.df, columns=cols, dtype=float)

  def set_salud_general_to_ordinal_nums(self):
    self.df["SaludGeneral"] = self.df["SaludGeneral"].map({
      'Excellent': 5,
      'Very good' : 4,
      'Good': 3, 
      'Fair': 2,
      'Poor': 1
    })

  def set_categoria_de_edad_to_ordinal_nums(self):
    self.df["CategoriaDeEdad"] = self.df["CategoriaDeEdad"].apply(self._get_avg_from_num_str)

  def set_fumador_to_weights(self):
    self.df["Fumador"] = self.df["Fumador"].map({
      'Never smoked' : 0,
      'No': 2,
      'Former smoker' : 4,
      'Yes': 5, 
      'Current smoker - now smokes some days': 7,
      'Current smoker - now smokes every day': 10
    })

  def set_sex_to_boolean(self):
    self.df["Sexo"] = self.df["Sexo"].map({
      'Male': 1,
      'Female' : 0,
    })


  # NAN HANDLING

  def replace_nan_with_median(self, col_names):
    for col_name in col_names:
      col = self.df[col_name]
      median = statistics.median(col.dropna())
      self.df[col_name] = col.fillna(median)    

  def drop_column(self, col):
    self.df = self.df.drop(col, axis=1)


  # CLEAN DATA

  def clean_data(self):
    self.df = self.df.rename(columns={'AccidenteCerebrovascular ': 'AccidenteCerebroVascular'})
    self.fix_ages()
    self.set_id_as_index()
    self.replace_yes_no('ActividadFisica')
    self.replace_yes_no('AccidenteCerebroVascular')
    self.replace_yes_no('Asma')
    self.replace_yes_no('CáncerDePiel')
    self.replace_yes_no('Diabetes')
    self.replace_yes_no('ConsumoDeAlcohol')
    self.replace_yes_no('EnfermedadRenal')
    self.replace_yes_no('DificultadParaCaminar')
    self.set_to_dummie(['Sexo', 'Raza'])


  # TRANSFORM FUNCTIONS

  def transform_1(self):
    self.clean_data()
    self.set_salud_general_to_ordinal_nums()
    self.set_categoria_de_edad_to_ordinal_nums()
    self.drop_column("SaludFisica")
    self.drop_column("SaludMental")
    self.set_fumador_to_weights()
    self.replace_nan_with_median(self.df.columns)
  
  def transform_2(self):
    self.clean_data()
    self.set_salud_general_to_ordinal_nums()
    self.set_categoria_de_edad_to_ordinal_nums()
    self.drop_column("SaludFisica")
    self.drop_column("SaludMental")
    self.set_fumador_to_weights()
    self.df["IMC"].dropna()
    self.df["Fumador"].dropna()
    self.replace_nan_with_median(self.df.columns)

  def transform_3(self):
    self.df = self.df.rename(columns={'AccidenteCerebrovascular ': 'AccidenteCerebroVascular'})
    self.fix_ages()
    self.set_id_as_index()
    self.replace_yes_no('ActividadFisica')
    self.replace_yes_no('AccidenteCerebroVascular')
    self.replace_yes_no('Asma')
    self.replace_yes_no('CáncerDePiel')
    self.replace_yes_no('Diabetes')
    self.replace_yes_no('ConsumoDeAlcohol')
    self.replace_yes_no('EnfermedadRenal')
    self.replace_yes_no('DificultadParaCaminar')
    self.set_to_dummie(['Sexo'])
    self.drop_column("Raza")
    self.set_salud_general_to_ordinal_nums()
    self.set_categoria_de_edad_to_ordinal_nums()
    self.drop_column("SaludFisica")
    self.drop_column("SaludMental")
    self.set_fumador_to_weights()
    self.df["IMC"].dropna()
    self.df["Fumador"].dropna()
    self.replace_nan_with_median(self.df.columns)

  def get_df(self):
    return self.df.copy()

## Scales preprocessing

In [4]:
class CustomStandardScaler:
  def __init__(self, train):
    self.scaler = StandardScaler()
    self.scaler.fit(train)
  
  def transform(self, data):
    return self.scaler.transform(data)

## Create X and Y datasets

In [7]:
def getXandY(data):
  data = data.copy()
  X = data.drop('EnfermedadCoronaria', axis = 1)
  y = data[["EnfermedadCoronaria"]]
  return X, y

# Create model

## Evaluate model

In [133]:
def evaluate_model(model, X_train, y_train, X_test, y_test):
  model.fit(X_train, y_train.values.ravel())
  y_pred = model.predict(X_test)
  y_pred_proba = model.predict_proba(X_test)[:, 1]
  
  accuracy = accuracy_score(y_test,y_pred)
  precision = precision_score(y_test, y_pred)
  recall = recall_score(y_test, y_pred)
  f1 = f1_score(y_test, y_pred)
  auc_roc = roc_auc_score(y_test, y_pred_proba)
  
  metrics = {
    'Accuracy': accuracy,
    'Precision': precision,
    'Recall': recall,
    'F1 Score': f1,
    'AUC-ROC': auc_roc
  }
  
  return metrics


### StratifiedFoldStrategy 

In [29]:
def evaluate_model_cv(model, X, y, n_splits=5):
    # Initialize StratifiedKFold
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    
    # Initialize lists to store metrics for each fold
    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []
    auc_roc_scores = []
    fit_times = []

    # Iterate through each fold
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X.loc[train_index, :], X.loc[test_index, :]
        y_train, y_test = y.loc[train_index, :], y.loc[test_index, :]

        # Apply standard scaling
        standardScaler = CustomStandardScaler(X)
        X_train = standardScaler.transform(X_train)
        X_test  = standardScaler.transform(X_test)
        
        # Measure time to fit the model
        start_time = time.time()
        model.fit(X_train, y_train.values.ravel())
        end_time = time.time()

        # Calculate fitting time
        fit_time = end_time - start_time
        fit_times.append(fit_time)
        
        # Make predictions
        y_pred = model.predict(X_test)
        y_pred_proba = model.predict_proba(X_test)[:, 1]
        
        # Calculate metrics
        accuracy = accuracy_score(y_test,y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        auc_roc = roc_auc_score(y_test, y_pred_proba)
        
        # Append metrics to the lists
        accuracy_scores.append(accuracy)
        precision_scores.append(precision)
        recall_scores.append(recall)
        f1_scores.append(f1)
        auc_roc_scores.append(auc_roc)
    
    # Calculate the mean and standard deviation of each metric
    metrics = {
        'Accuracy': (np.mean(accuracy_scores), np.std(accuracy_scores)),
        'Precision': (np.mean(precision_scores), np.std(precision_scores)),
        'Recall': (np.mean(recall_scores), np.std(recall_scores)),
        'F1 Score': (np.mean(f1_scores), np.std(f1_scores)),
        'AUC-ROC': (np.mean(auc_roc_scores), np.std(auc_roc_scores)),
        'Fit Time': (np.mean(fit_times), np.std(fit_times))
    }
    
    return metrics

### Compare models

In [30]:
def compare_models(models, X, y, verbose = 0):
  results = []

  for name, model in models.items():
    metrics = evaluate_model_cv(model, X, y)
    result = {
      'Model': name,
      'Accuracy (Mean)': metrics['Accuracy'][0],
      'Accuracy (Std)': metrics['Accuracy'][1],
      'Precision (Mean)': metrics['Precision'][0],
      'Precision (Std)': metrics['Precision'][1],
      'Recall (Mean)': metrics['Recall'][0],
      'Recall (Std)': metrics['Recall'][1],
      'F1 Score (Mean)': metrics['F1 Score'][0],
      'F1 Score (Std)': metrics['F1 Score'][1],
      'AUC-ROC (Mean)': metrics['AUC-ROC'][0],
      'AUC-ROC (Std)': metrics['AUC-ROC'][1],
      'Fit Time (Mean)': metrics['Fit Time'][0],
      'Fit Time (Std)': metrics['Fit Time'][1]
    }
    if verbose:
      print(result)
    results.append(result)
  
  return pd.DataFrame(results)

## Train model

In [134]:
def train_model(model, data, verbose = 0):
  X, y = getXandY(data)
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
  # Apply standard scaling
  standardScaler = CustomStandardScaler(X)
  X_train = standardScaler.transform(X_train)
  # Measure time to fit the model
  model.fit(X_train, y_train.values.ravel())
  if verbose:
    X_test = standardScaler.transform(X_test)
    print(evaluate_model(model, X_train, y_train, X_test, y_test))
  return model

## Get best model

In [51]:
def getBestParams(X_train, y_train, model, param_grid):
  # Initialize the grid search model
  grid_search = GridSearchCV(model, param_grid, cv=5)
  # Fit the grid search model
  grid_search.fit(X_train, y_train)
  # Print best params
  print("Best parameters:", grid_search.best_params_)

# SAMPLE
# param_grid = {'C': [0.01, 0.1, 1, 10, 100], 'solver': ['liblinear', 'saga']}

# Playground

## Create preprocessed datasets

In [128]:
p1 = Preprocesor(df)
p1.transform_1()
data_1 = p1.get_df()

X_1, y_1 = getXandY(data)
standardScaler_1 = CustomStandardScaler(X_1)

In [129]:
p2 = Preprocesor(df)
p2.transform_2()
data_2 = p2.get_df()

X_2, y_2 = getXandY(data)
standardScaler_2 = CustomStandardScaler(X_2)

In [130]:
p3 = Preprocesor(df)
p3.transform_3()
data_3 = p3.get_df()

X_3, y_3 = getXandY(data)
standardScaler_3 = CustomStandardScaler(X_3)

## Models set

In [135]:
logisticRegression_1 = LogisticRegression(random_state=42, max_iter=1000)

logisticRegression_2 = LogisticRegression(class_weight='balanced', random_state=42, max_iter=1000)

logisticRegression_3 = LogisticRegression(C=0.01, solver="liblinear", random_state=42, max_iter=1000)

logisticRegression_4 = LogisticRegression(class_weight='balanced', C=0.01, solver="liblinear",
                                          random_state=42, max_iter=1000)

logisticRegression_5 = LogisticRegression(class_weight='balanced', C=0.01, solver="liblinear",
                                          penalty="l1", random_state=42, max_iter=1000)

logisticRegression_6 = LogisticRegression(class_weight='balanced', C=0.01, solver="lbfgs",
                                          random_state=42, max_iter=1000)
              
logisticRegression_7 = LogisticRegression(class_weight='balanced', C=0.001, solver="liblinear",
                                          random_state=42, max_iter=1000)                        


## Models evaluations

In [44]:
model = logisticRegression_3
data = data_1

# Get Crossed Validation datasets
X, y = getXandY(data)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Apply standard scaling
standardScaler = CustomStandardScaler(X)
X_train_scaled = standardScaler.transform(X_train)
X_test_scaled  = standardScaler.transform(X_test)

# evaluate_model(model, X_train_scaled, y_train, X_test_scaled, y_test)
evaluate_model_cv(model, X, y, n_splits=5)


{'Precision': (0.5282212896201022, 0.006865079920724799),
 'Recall': (0.07162081695843754, 0.0020386341718738256),
 'F1 Score': (0.12612401075156313, 0.003135026580383677),
 'AUC-ROC': (0.8376185270139803, 0.0017678177991564982)}

## Comparison

In [17]:
data = data_1

# Define your models
models = {
    'Logistic Regression 1': LogisticRegression(random_state=42, class_weight='balanced', max_iter=1000),
    'Logistic Regression 2': LogisticRegression(random_state=42, max_iter=1000),
}

X, y = getXandY(data)

# Compare the models
comparison_df = compare_models(models, X, y, 1)
print(comparison_df)

{'Model': 'Logistic Regression 1', 'Precision (Mean)': 0.1840236487723898, 'Precision (Std)': 0.0006686890539157039, 'Recall (Mean)': 0.7780159580802668, 'Recall (Std)': 0.005667990625645661, 'F1 Score (Mean)': 0.297643756185946, 'F1 Score (Std)': 0.0011640508001936147, 'AUC-ROC (Mean)': 0.8384853562476128, 'AUC-ROC (Std)': 0.0017524458122455645}
{'Model': 'Logistic Regression 2', 'Precision (Mean)': 0.5263322972045271, 'Precision (Std)': 0.006330585644712654, 'Recall (Mean)': 0.07424080028581637, 'Recall (Std)': 0.002256561968765422, 'F1 Score (Mean)': 0.13010683165371725, 'F1 Score (Std)': 0.0033752742842296855, 'AUC-ROC (Mean)': 0.838113572378874, 'AUC-ROC (Std)': 0.0017851364307249055}
                   Model  Precision (Mean)  Precision (Std)  Recall (Mean)  \
0  Logistic Regression 1          0.184024         0.000669       0.778016   
1  Logistic Regression 2          0.526332         0.006331       0.074241   

   Recall (Std)  F1 Score (Mean)  F1 Score (Std)  AUC-ROC (Mean)  

In [35]:
data = data_1

# Define your models
models = {
    'LR 2': logisticRegression_2,
    'LR 3': logisticRegression_3,
    'LR 4': logisticRegression_4,
}

X, y = getXandY(data)

# Compare the models
comparison_df = compare_models(models, X, y, 1)
print(comparison_df)

{'Model': 'LR 2', 'Accuracy (Mean)': 0.7470684493292525, 'Accuracy (Std)': 0.0011296632264012265, 'Precision (Mean)': 0.1840236487723898, 'Precision (Std)': 0.0006686890539157039, 'Recall (Mean)': 0.7780159580802668, 'Recall (Std)': 0.005667990625645661, 'F1 Score (Mean)': 0.297643756185946, 'F1 Score (Std)': 0.0011640508001936147, 'AUC-ROC (Mean)': 0.8384853562476128, 'AUC-ROC (Std)': 0.0017524458122455645, 'Fit Time (Mean)': 1.93329176902771, 'Fit Time (Std)': 0.15217076033949264}
{'Model': 'LR 3', 'Accuracy (Mean)': 0.9316443775616088, 'Accuracy (Std)': 0.00012880417582060956, 'Precision (Mean)': 0.528022599747968, 'Precision (Std)': 0.007328708705631654, 'Recall (Mean)': 0.07290699059187805, 'Recall (Std)': 0.002225934784795727, 'F1 Score (Mean)': 0.12810421141452608, 'F1 Score (Std)': 0.0033737782975798865, 'AUC-ROC (Mean)': 0.8381788572443535, 'AUC-ROC (Std)': 0.0017824239960852494, 'Fit Time (Mean)': 3.1697707653045653, 'Fit Time (Std)': 0.10357516411818854}
{'Model': 'LR 4', 'A

In [22]:
X_1, y_1 = getXandY(data_1)
X_2, y_2 = getXandY(data_2)

model = logisticRegression_4

print(evaluate_model_cv(model, X_1, y_1, n_splits=5))
print(evaluate_model_cv(model, X_2, y_2, n_splits=5))


{'Precision': (0.18397141092560784, 0.0006575583923760966), 'Recall': (0.7783255924735024, 0.005745331522153439), 'F1 Score': (0.2975980141512455, 0.0011542215086756938), 'AUC-ROC': (0.838491866183914, 0.0017527454326951727), 'Fit Time': (2.987709665298462, 0.06945670760275129)}
{'Precision': (0.18397141092560784, 0.0006575583923760966), 'Recall': (0.7783255924735024, 0.005745331522153439), 'F1 Score': (0.2975980141512455, 0.0011542215086756938), 'AUC-ROC': (0.838491866183914, 0.0017527454326951727), 'Fit Time': (3.643292236328125, 0.7537846196978392)}


In [61]:
data = data_1

# Define your models
models = {
    'LR 4':  logisticRegression_4,
    'LR 5':  logisticRegression_5,
    'LR 6':  logisticRegression_6,
}

X, y = getXandY(data)

# Compare the models
comparison_df = compare_models(models, X, y, 1)
print(comparison_df)

{'Model': 'LR 4', 'Accuracy (Mean)': 0.7469125810796309, 'Accuracy (Std)': 0.0011386879549406172, 'Precision (Mean)': 0.18397141092560784, 'Precision (Std)': 0.0006575583923760966, 'Recall (Mean)': 0.7783255924735024, 'Recall (Std)': 0.005745331522153439, 'F1 Score (Mean)': 0.2975980141512455, 'F1 Score (Std)': 0.0011542215086756938, 'AUC-ROC (Mean)': 0.838491866183914, 'AUC-ROC (Std)': 0.0017527454326951727, 'Fit Time (Mean)': 3.0690026760101317, 'Fit Time (Std)': 0.11631498425902}
{'Model': 'LR 5', 'Accuracy (Mean)': 0.7466517069381678, 'Accuracy (Std)': 0.0011833750893363256, 'Precision (Mean)': 0.18385689424684143, 'Precision (Std)': 0.0007381838095883737, 'Recall (Mean)': 0.778659044896987, 'Recall (Std)': 0.00581794088793942, 'F1 Score (Mean)': 0.2974724890654553, 'F1 Score (Std)': 0.0012594966378245923, 'AUC-ROC (Mean)': 0.8384899023240429, 'AUC-ROC (Std)': 0.0017545328576010818, 'Fit Time (Mean)': 10.835013103485107, 'Fit Time (Std)': 2.965923943297715}
{'Model': 'LR 6', 'Accur

ValueError: l1_ratio must be specified when penalty is elasticnet.

In [68]:
data = data_1

# Define your models
models = {
    'LR 4':  logisticRegression_4,
    'LR 7':  logisticRegression_7,
}

X, y = getXandY(data)

# Compare the models
comparison_df = compare_models(models, X, y)
print(comparison_df)

  Model  Accuracy (Mean)  Accuracy (Std)  Precision (Mean)  Precision (Std)  \
0  LR 4         0.746913        0.001139          0.183971         0.000658   
1  LR 7         0.745590        0.001104          0.183467         0.000649   

   Recall (Mean)  Recall (Std)  F1 Score (Mean)  F1 Score (Std)  \
0       0.778326      0.005745         0.297598        0.001154   
1       0.780517      0.005669         0.297097        0.001144   

   AUC-ROC (Mean)  AUC-ROC (Std)  Fit Time (Mean)  Fit Time (Std)  
0        0.838492       0.001753         3.823942        0.800882  
1        0.838538       0.001752         6.001464        2.416731  


# Get submit csv

In [123]:
validate_df = pd.read_csv('./nuevas_instancias_clasificar.csv')

In [131]:
# Set custom imputs
model = logisticRegression_5
data = data_3
standardScaler = standardScaler_3

# Create and transform validation data
v = Preprocesor(validate_df)
v.transform_3()
validate_data = v.get_df()
validate_data = standardScaler.transform(validate_data)

# Train model
train_model(model, data, verbose=1)
# Get prediction
validate_prediction = model.predict(validate_data)

# Format results dataframe
prediction_df = pd.DataFrame(validate_prediction)
prediction_df.columns = ['Predicted']
prediction_df.index.names = ['id']
# Create csv
date = datetime.datetime.now().strftime("%d-%Y-%I-%M-%p-%B")
filename = 'prediction-results/result-' + date + '.csv'
prediction_df.to_csv(filename,sep=',')


{'Accuracy': 0.7444530126280442, 'Precision': 0.1824545725350015, 'Recall': 0.7769145394006659, 'F1 Score': 0.2955100563881434, 'AUC-ROC': 0.8357840715242532}


## Kaggle results



**LR 4:** 0.76143

In [None]:
data = data_2

logisticRegression_4 = LogisticRegression(class_weight='balanced', C=0.01, solver="liblinear",
                                          random_state=42, max_iter=1000)

**LR 6:** 0.76155

In [None]:
data = data_2

logisticRegression_6 = LogisticRegression(class_weight='balanced', C=0.01, solver="lbfgs",
                                          random_state=42, max_iter=1000)


logisticRegression_5 = LogisticRegression(class_weight='balanced', C=0.01, solver="liblinear",
                                          penalty="l1", random_state=42, max_iter=1000)
