In [None]:
# Configuração para não exibir os warnings
import warnings
warnings.filterwarnings("ignore")

# Imports necessários
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split # para particionar em bases de treino e teste (holdout)
from sklearn.model_selection import KFold # para preparar os folds da validação cruzada
from sklearn.model_selection import cross_val_score # para executar a validação cruzada
from sklearn.metrics import accuracy_score # para a exibição da acurácia do modelo

from sklearn.neighbors import KNeighborsClassifier # algoritmo KNN
from sklearn.tree import DecisionTreeClassifier # algoritmo Árvore de Classificação
from sklearn.naive_bayes import GaussianNB # algoritmo Naive Bayes
from sklearn.svm import SVC # algoritmo SVM

In [None]:
# Fonte: https://www.kaggle.com/datasets/khwaishsaxena/lung-cancer-dataset

import os

data_path = '../dados/lung_cancer.csv'
# url = ''

# NOTE(review): `data_path` is defined above but the read below loads
# 'Lung Cancer.csv' from the current working directory instead - confirm
# which location is the intended one.

# Load the dataset and discard the columns that are not used as features:
#   - id, country: not relevant for the analysis
#   - diagnosis_date, end_treatment_date: dates are not relevant for new cases
data = pd.read_csv('Lung Cancer.csv')
data = data.drop(
    columns=['id', 'country', 'diagnosis_date', 'end_treatment_date']
)

In [22]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler

# Column groups, declared once so that the fit, the generated column names
# and the drops below cannot drift apart.
categorical_cols = ['gender', 'cancer_stage', 'family_history', 'smoking_status', 'treatment_type']
numeric_cols = ['age', 'bmi', 'cholesterol_level']

# One-hot encode the categorical columns.
# handle_unknown='ignore' avoids an error if an unseen category shows up later;
# sparse_output=False returns a dense array instead of a sparse matrix.
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoded_features = encoder.fit_transform(data[categorical_cols])
# index=data.index keeps the new frame row-aligned with `data`:
# pd.concat(axis=1) joins on index labels, so a freshly built default
# RangeIndex would misalign rows (and introduce NaNs) whenever `data` does
# not carry a default 0..n-1 index (e.g. after rows were filtered upstream).
encoded_df = pd.DataFrame(
    encoded_features,
    columns=encoder.get_feature_names_out(categorical_cols),
    index=data.index,
)

# Encoded columns first, then the remaining original columns (raw categorical
# columns removed).
data_encoded = pd.concat([encoded_df, data], axis=1)
data_encoded = data_encoded.drop(columns=categorical_cols)


# Standardise the numeric columns with StandardScaler.
# NOTE(review): the encoder and the scaler are fitted on the full dataset,
# before the holdout split done later in the notebook, so statistics from the
# test rows influence the scaling - consider fitting on the training
# partition only.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(data_encoded[numeric_cols])
scaled_df = pd.DataFrame(
    scaled_features,
    columns=scaler.get_feature_names_out(numeric_cols),
    index=data_encoded.index,  # same alignment concern as for encoded_df
)

# Replace the raw numeric columns with their standardised versions; the final
# column order is: scaled numeric, one-hot encoded, remaining original columns.
data_encoded = data_encoded.drop(columns=numeric_cols)
data_scaled = pd.concat([scaled_df, data_encoded], axis=1)

In [23]:
# Display the preprocessed frame (scaled numeric columns, one-hot encoded
# columns and the remaining original columns) for a visual sanity check.
data_scaled

Unnamed: 0,age,bmi,cholesterol_level,gender_Female,gender_Male,cancer_stage_Stage I,cancer_stage_Stage II,cancer_stage_Stage III,cancer_stage_Stage IV,family_history_No,...,smoking_status_Passive Smoker,treatment_type_Chemotherapy,treatment_type_Combined,treatment_type_Radiation,treatment_type_Surgery,hypertension,asthma,cirrhosis,other_cancer,survived
0,0.899796,-0.130748,-0.797424,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0,0,1,0,0
1,-0.500977,1.279296,1.067550,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,1,1,0,0,1
2,0.999851,1.613882,0.791257,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1,1,0,0,0
3,-0.400922,1.494387,0.169599,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,1.0,1.0,0.0,0.0,0.0,1,1,0,0,0
4,-1.801696,-1.289852,-1.280936,0.0,1.0,1.0,0.0,0.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
889995,-1.501530,1.709478,0.215648,0.0,1.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,0.0,1.0,0.0,1,1,1,0,0
889996,0.699686,-1.062811,0.146575,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0,0,0,0,0
889997,-0.701088,0.968608,0.192624,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1,0,0,0,1
889998,1.199962,-1.421297,-0.912546,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1,1,0,0,0


In [24]:
# Print the column dtypes and non-null counts of the preprocessed frame.
data_scaled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 890000 entries, 0 to 889999
Data columns (total 24 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   age                            890000 non-null  float64
 1   bmi                            890000 non-null  float64
 2   cholesterol_level              890000 non-null  float64
 3   gender_Female                  890000 non-null  float64
 4   gender_Male                    890000 non-null  float64
 5   cancer_stage_Stage I           890000 non-null  float64
 6   cancer_stage_Stage II          890000 non-null  float64
 7   cancer_stage_Stage III         890000 non-null  float64
 8   cancer_stage_Stage IV          890000 non-null  float64
 9   family_history_No              890000 non-null  float64
 10  family_history_Yes             890000 non-null  float64
 11  smoking_status_Current Smoker  890000 non-null  float64
 12  smoking_status_Former Smoker  

In [None]:
# Data preparation

# Holdout split into training and test sets.
# The target column is located by name instead of by a hard-coded position:
# the number of columns in data_scaled depends on how many one-hot columns the
# encoder produced, so fixed indices (0:23 / 23) would silently pick the wrong
# target - or leak it into X - if the preprocessing ever changes.
array = data_scaled.values
target_idx = data_scaled.columns.get_loc('survived')
feature_idx = [i for i in range(array.shape[1]) if i != target_idx]
X = array[:, feature_idx]  # features: every column except the target
y = array[:, target_idx]   # class (target)

# 80/20 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=7)

# Folds for cross-validation
num_particoes = 3  # number of cross-validation folds (3, 5, 10)
kfold = KFold(n_splits=num_particoes, shuffle=True, random_state=7)  # shuffled split into num_particoes folds

In [None]:
# Modelling: compare baseline classifiers with cross-validation on the
# training partition.

# Global NumPy seed for this cell.
np.random.seed(7)

# Candidate models as (short name, estimator) pairs, all with default
# hyper-parameters.
# NOTE(review): SVC and KNN may be very slow on a training set of this size -
# confirm the cell finishes in acceptable time.
models = [
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier()),
    ('NB', GaussianNB()),
    ('SVM', SVC()),
]

# Per-model fold scores and the matching labels, in evaluation order.
results = []
names = []

# Evaluate one model at a time and report its accuracy.
for name, model in models:
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    # mean and standard deviation over the cross-validation folds
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

# Boxplot comparing the fold accuracies of each model.
fig, ax = plt.subplots()
fig.suptitle('Comparação da Acurácia dos Modelos')
ax.boxplot(results)
ax.set_xticklabels(names)
plt.show()