In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix

In [2]:
# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Location of the lung-cancer survey CSV under the mounted Drive folder.
file_path = "/content/drive/My Drive/Datasets/survey_lung_cancer.csv"

In [5]:
# Read the survey CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer=file_path)

In [33]:
# Encode the categorical columns GENDER and LUNG_CANCER as integers.
# Each column gets its own encoder: refitting a single shared encoder on the
# second column discards the GENDER mapping, so GENDER codes could no longer be
# turned back into their original labels with inverse_transform.
gender_encoder = LabelEncoder()
df['GENDER'] = gender_encoder.fit_transform(df['GENDER'])

# `label_encoder` still ends up fitted on LUNG_CANCER, as it did before.
label_encoder = LabelEncoder()
df['LUNG_CANCER'] = label_encoder.fit_transform(df['LUNG_CANCER'])

In [7]:
# Separate the target from the predictors.
y = df['LUNG_CANCER']               # label column
X = df.drop(columns='LUNG_CANCER')  # every other column is a feature

In [8]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [80]:
# Build and fit the logistic-regression model.
# The cell previously also fitted a default-max_iter model whose result was
# overwritten immediately; that redundant training pass is dropped and only the
# fit with the raised iteration cap is kept.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [81]:
# Predict labels for the held-out test rows.
y_pred = model.predict(X=X_test)

In [52]:
# Score the test-set predictions: confusion matrix and overall accuracy.
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

In [53]:
# Show accuracy and the confusion matrix, one item per line.
print(
    f"Exactitud del modelo: {accuracy}",
    f"Matriz de confusión:\n{conf_matrix}",
    sep="\n",
)

Exactitud del modelo: 0.978494623655914
Matriz de confusión:
[[ 5  2]
 [ 0 86]]


In [56]:
# Import the scorers used below: nothing earlier in the notebook brings them
# into scope, so a fresh Restart & Run All would otherwise stop here with a
# NameError.
from sklearn.metrics import precision_score, recall_score, f1_score

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Exactitud: {accuracy}")

# Precision
precision = precision_score(y_test, y_pred)
print(f"Precisión: {precision}")

# Recall
recall = recall_score(y_test, y_pred)
print(f"Recall: {recall}")

# F1 score
f1 = f1_score(y_test, y_pred)
print(f"Puntuación F1: {f1}")

Exactitud: 0.978494623655914
Precisión: 0.9772727272727273
Recall: 1.0
Puntuación F1: 0.9885057471264368


In [46]:
from sklearn.tree import DecisionTreeClassifier

In [57]:
# Decision tree with a fixed seed, fitted on the training split.
tree_model = DecisionTreeClassifier(random_state=42)
tree_model.fit(X=X_train, y=y_train)

In [58]:
# Test-set predictions from the decision tree.
y_pred_tree = tree_model.predict(X=X_test)

In [59]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the decision tree over the full X, y.
cross_val_accuracy = cross_val_score(estimator=tree_model, X=X, y=y, cv=10)

In [42]:
# Stale cell: it called best_tree_model.predict(X_test) before the grid search
# that defines best_tree_model had run, which raises NameError on a fresh
# top-to-bottom run. The premature call is removed; y_pred_best_tree is
# computed once the grid search has produced best_tree_model.

In [60]:
from sklearn.model_selection import GridSearchCV

# Grid over tree depth (1-9) and minimum samples per split (2-9),
# evaluated with 10-fold cross-validation on the training split.
parameters = {
    'max_depth': range(1, 10),
    'min_samples_split': range(2, 10),
}
grid_search = GridSearchCV(estimator=tree_model, param_grid=parameters, cv=10)
grid_search.fit(X=X_train, y=y_train)

# Keep the estimator the search selected as best.
best_tree_model = grid_search.best_estimator_


In [61]:
# Test-set predictions from the tree selected by the grid search.
y_pred_best_tree = best_tree_model.predict(X=X_test)

In [62]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Score the grid-searched tree's test-set predictions.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_best_tree)
precision = precision_score(y_true=y_test, y_pred=y_pred_best_tree)
recall = recall_score(y_true=y_test, y_pred=y_pred_best_tree)
f1 = f1_score(y_true=y_test, y_pred=y_pred_best_tree)

# Report accuracy, precision, recall and F1, one per line.
print(
    f"Exactitud: {accuracy}",
    f"Precisión: {precision}",
    f"Recall: {recall}",
    f"Puntuación F1: {f1}",
    sep="\n",
)

Exactitud: 0.9247311827956989
Precisión: 0.9247311827956989
Recall: 1.0
Puntuación F1: 0.9608938547486032


In [69]:
# Relies on X_train, X_test, y_train and y_test already being defined.
# Rebuild the seeded decision tree and fit it on the training split.
tree_model = DecisionTreeClassifier(random_state=42)
tree_model.fit(X=X_train, y=y_train)

Exactitud: 0.9247311827956989
Precisión: 0.9647058823529412
Recall: 0.9534883720930233
Puntuación F1: 0.9590643274853802
AUC-ROC: 0.7624584717607974


In [72]:
from sklearn.metrics import roc_auc_score

In [73]:
# Decision-tree metrics on the test split.
# AUC uses the second predict_proba column as the score.
tree_auc = roc_auc_score(y_true=y_test, y_score=tree_model.predict_proba(X=X_test)[:, 1])
tree_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_tree)
tree_precision = precision_score(y_true=y_test, y_pred=y_pred_tree)
tree_recall = recall_score(y_true=y_test, y_pred=y_pred_tree)
tree_f1 = f1_score(y_true=y_test, y_pred=y_pred_tree)

In [76]:
# Predict test-set labels with the decision tree.
y_pred_tree = tree_model.predict(X=X_test)

In [77]:
# Report the decision-tree metrics, one per line.
print(
    f"Exactitud: {tree_accuracy}",
    f"Precisión: {tree_precision}",
    f"Recall: {tree_recall}",
    f"Puntuación F1: {tree_f1}",
    f"AUC-ROC: {tree_auc}",
    sep="\n",
)

Exactitud: 0.9247311827956989
Precisión: 0.9647058823529412
Recall: 0.9534883720930233
Puntuación F1: 0.9590643274853802
AUC-ROC: 0.7624584717607974


In [92]:
# Split into training and test sets (30% test, seed 42).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [93]:
# Build a seeded decision tree and train it on the training split.
# Note: this rebinds the name `model`.
model = DecisionTreeClassifier(random_state=42)
model.fit(X=X_train, y=y_train)

In [68]:
# Predict on the test split, then compute the confusion matrix and accuracy.
y_pred = model.predict(X=X_test)
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

# Report both results, one item per line.
print(
    f"Exactitud del modelo: {accuracy}",
    f"Matriz de confusión:\n{conf_matrix}",
    sep="\n",
)

Exactitud del modelo: 0.9247311827956989
Matriz de confusión:
[[ 4  3]
 [ 4 82]]


In [78]:
# Metrics for the grid-searched decision tree on the test split.
# AUC uses the second predict_proba column as the score.
auc_roc_tree = roc_auc_score(y_true=y_test, y_score=best_tree_model.predict_proba(X=X_test)[:, 1])
accuracy_tree = accuracy_score(y_true=y_test, y_pred=y_pred_best_tree)
precision_tree = precision_score(y_true=y_test, y_pred=y_pred_best_tree)
recall_tree = recall_score(y_true=y_test, y_pred=y_pred_best_tree)
f1_tree = f1_score(y_true=y_test, y_pred=y_pred_best_tree)

In [99]:
# Decision-tree summary: heading followed by one metric per line.
print(
    "ARBOL DE DECISION",
    f"Exactitud: {accuracy_tree}",
    f"Precisión: {precision_tree}",
    f"Recall: {recall_tree}",
    f"Puntuación F1: {f1_tree}",
    f"AUC-ROC: {auc_roc_tree}",
    sep="\n",
)

ARBOL DE DECISION
Exactitud: 0.9247311827956989
Precisión: 0.9247311827956989
Recall: 1.0
Puntuación F1: 0.9608938547486032
AUC-ROC: 0.7541528239202658


In [94]:
# Logistic regression with a raised iteration cap (raise max_iter further if needed),
# trained on the training split.
logistic_model = LogisticRegression(max_iter=1000)
logistic_model.fit(X=X_train, y=y_train)

# Test-set predictions.
y_pred_logistic = logistic_model.predict(X=X_test)


In [96]:
# Logistic-regression metrics on the test split.
# AUC uses the second predict_proba column as the score.
auc_roc_logistic = roc_auc_score(y_true=y_test, y_score=logistic_model.predict_proba(X=X_test)[:, 1])
accuracy_logistic = accuracy_score(y_true=y_test, y_pred=y_pred_logistic)
precision_logistic = precision_score(y_true=y_test, y_pred=y_pred_logistic)
recall_logistic = recall_score(y_true=y_test, y_pred=y_pred_logistic)
f1_logistic = f1_score(y_true=y_test, y_pred=y_pred_logistic)

In [100]:
# Logistic-regression summary: heading followed by one metric per line.
print(
    "REGRESION LOGISTICA",
    f"Exactitud: {accuracy_logistic}",
    f"Precisión: {precision_logistic}",
    f"Recall: {recall_logistic}",
    f"Puntuación F1: {f1_logistic}",
    f"AUC-ROC: {auc_roc_logistic}",
    sep="\n",
)

REGRESION LOGISTICA
Exactitud: 0.978494623655914
Precisión: 0.9772727272727273
Recall: 1.0
Puntuación F1: 0.9885057471264368
AUC-ROC: 0.9850498338870433


In [89]:
from sklearn.metrics import roc_auc_score

# AUC-ROC of the logistic-regression model, scored with the second predict_proba column.
auc_roc_logistic = roc_auc_score(
    y_true=y_test,
    y_score=logistic_model.predict_proba(X=X_test)[:, 1],
)
print(f"AUC-ROC para el modelo de regresión logística: {auc_roc_logistic}")

AUC-ROC para el modelo de regresión logística: 0.9850498338870433


In [91]:
from sklearn.metrics import roc_auc_score

# Second predict_proba column of the grid-searched tree, used as the ranking score.
y_pred_proba_tree = best_tree_model.predict_proba(X=X_test)[:, 1]

# AUC-ROC computed from those scores.
auc_roc_tree = roc_auc_score(y_true=y_test, y_score=y_pred_proba_tree)
print(f"AUC-ROC para el modelo de árbol de decisión: {auc_roc_tree}")


AUC-ROC para el modelo de árbol de decisión: 0.7541528239202658
