# 1. Load

# The Problem: 
### Predicting whether a person with a given set of characteristics is likely to have a heart attack.

In [None]:
import pandas as pd
!pwd
df_raw = pd.read_csv("final-test/data/heart.csv") 
print("Columns:", df_raw.columns.tolist())

In [None]:
print(df_raw.head(1))
print("Types :\n",df_raw.dtypes)

2. Pré-processamento

In [None]:
print(df_raw.isna())
print("SIZE: ", len(df_raw))

In [None]:
# same the df.shape[0]
df_raw.isna().count()

In [None]:
df_raw.isnull().sum()

Resultado: não existens ausentes

# 3. Normalization

In [15]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
# Normalization
# Apply StandardScaler ONLY to numeric columns.
# And you should NEVER try to transform categorical columns to 0 and 1 before standardizing.
# The same logic applies to OneHotEncoder (categorical columns ONLY)

df_categorical = df_raw.select_dtypes(exclude=['number'])
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoded = encoder.fit_transform(df_categorical)

encoded_df = pd.DataFrame(
    encoded,
    columns=encoder.get_feature_names_out(df_categorical.columns),
    index=df_raw.index
)

# Outliers: Statistical Rule vs. Quality Rule (Domain Violation)
Fundamental Difference (Very Important)

## Statistical Outlier (IQR)
- Extreme but plausible value
- Ex.: Cholesterol = 450
- Action: cap / flag / feature engineering

## Domain Violation (Data Quality)
- Impossible value in the real world
- Ex.: Age = 200
- Action: quality rule, not statistical

*Age = 200 is not an outlier

*Age = 200 is invalid data

# Treat outliers only in the original numeric columns.

In [None]:
df_numeric = df_raw.select_dtypes(include=['number'])
df_numeric.dtypes

In [None]:
print(df_numeric["RestingBP"].value_counts()) 

In [None]:
print(df_numeric["FastingBS"].value_counts()) ## is a binary variable; statistical IQR does not apply.

In [None]:
print(df_numeric["Age"].value_counts()) ## It is already numerical not to apply statistical IQR.

# What to do with these cases?

It depends on the project's maturity:

Option When to use Correct (cap at 120) legacy data Input few occurrences 

Delete critical error record Block ingestion from mature pipelines Create flag always recommended

In [20]:
target_col = "HeartDisease"
cols_numeric_features = [c for c in df_numeric.columns if c != target_col]

df_outliers = df_numeric.drop(columns=["HeartDisease"],  errors="ignore")

# Automatically detects binary characters (e.g., 0/1). Then we should not apply the IRQ.
binary_cols = [c for c in cols_numeric_features if df_outliers[c].dropna().nunique() <= 2] 

# continuous (where IQR makes sense)
cols_iqr = [c for c in cols_numeric_features if c not in binary_cols and c not in ['Age']] 

In [None]:
print(cols_iqr)

In [None]:
""" OLD check outliers statistic """

## data removed
for col in df_outliers.columns:
    Q1 = df_raw[col].quantile(0.25)
    Q3 = df_raw[col].quantile(0.75)
    IQR = Q3 - Q1
    
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR

    outliers = df_raw[(df_raw[col] < lower) | (df_raw[col] > upper)][col]
    print(f"{col}: {len(outliers)} outliers")

In [22]:
""" NEW FUNCTION OUTLIERS """
 
 # created new columns  as col_outlier_flag 
import numpy as np

def iqr_cap_with_flag(df, cols, k=1.5):
    df2 = df_outliers.copy()
    limits = {}

    for col in cols:
        q1 = df2[col].quantile(0.25)
        q3 = df2[col].quantile(0.75)
        iqr = q3 - q1
        low = q1 - k * iqr
        high = q3 + k * iqr

        limits[col] = (low, high)

        df2[f"{col}_outlier_flag"] = ((df2[col] < low) | (df2[col] > high)).astype(int)
        df2[f"{col}_capped"] = df2[col].clip(lower=low, upper=high)

    return df2, limits

df_numeric_treated, iqr_limits = iqr_cap_with_flag(df_outliers, cols=cols_iqr, k=1.5)

In [None]:
df_numeric.dtypes

In [None]:
df_outliers.dtypes

In [None]:
df_numeric_treated.dtypes

# df_numeric_treated

Interpretation of maturity (high level)
O teu dataset agora separa claramente:
| Tipo     | Exemplo     | Tratamento       |
| -------- | ----------- | ---------------- |
| Domínio  | Age         | regra de negócio |
| Binário  | FastingBS   | validação lógica |
| Contínuo | Cholesterol | IQR + cap + flag |

This mean Data Quality + Feature Engineering, it not just ML.


In [75]:
# concatenate all variables
df_encoded = pd.concat([df_numeric_treated, encoded_df], axis=1)

In [None]:
# check
df_encoded.dtypes

In [None]:
df_encoded.shape

In [None]:
df_encoded.isna().sum().sum()

In [None]:
# adds/returns the target variable
df_encoded["HeartDisease"] = df_raw["HeartDisease"]
df_encoded.dtypes

In [None]:
df_encoded["HeartDisease"].value_counts()

# Outlier Checking with IQR

IQR = Q3 – Q1
- Where:
- Q1 = 25th percentile
- Q3 = 75th percentile

A value is considered an outlier if it falls outside of: [ Q1 - 1.5*IQR , Q3 + 1.5*IQR ]

Important Rule: Outliers only make sense for:

original continuous numeric variables
- Never for one-hot columns (0/1)
-  Never for the target

Mental Summary (important):
- df_encoded is the correct DataFrame
- Outliers only in real continuous variables
-  Do not apply IQR to one-hot columns
-  Do not apply IQR to the target
- Save iqr_limits for production

# Technique -> RestingBP
I will use the most recommended pattern: Winsorization (cap) + flag only for continuous numeric columns.

In [None]:
print(df_encoded["RestingBP"].value_counts()) ### applied to the function: iqr_cap_with_flag

In [None]:
print(df_encoded["Cholesterol"].value_counts())

In [None]:
print(df_encoded["MaxHR"].value_counts())

In [None]:
print(df_encoded["Oldpeak"].value_counts())

# End of outlier analysis.

# ~~Remove Outliers~~ We handle outliers in original numeric columns, not binary ones.
Removing outliers reduces the number of rows in the dataset. If you do this for many columns and have a narrow IQR, you may lose a lot of data. Alternative: replace outliers with medians or limits, for example.

In [None]:
print(len(df_clean)) ## old, there were 587 rows left.

In [None]:
print(len(df_encoded)) # manteve 918 linhas mesmo tamanho de df_raw OK 

In [None]:
## Not executed after outlier treatment, only before and after check
## Check total record: BEFORE TREATING OUTLIERS

print("Total: ",df.shape[0],"Total sem outliers: ", df_clean.shape[0])
reduced = df.shape[0] - df_clean.shape[0]
print(f'A total of {reduced} were reduced')

# MODEL

# Separate the data into training/test sets with stratification.

In [93]:
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score

y = df_encoded["HeartDisease"]
X = df_encoded

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

In [None]:
print(X_train.dtypes)

In [95]:
# Select only numeric columns
numeric_cols = X_train.select_dtypes(include=['number']).columns

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train[numeric_cols])
X_test_scaled = scaler.transform(X_test[numeric_cols])

# Apply a K-Nearest Neighbors (KNN) model.

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_scaled, y_train)

In [None]:
y_train.value_counts()

In [None]:
print(y_train.value_counts())
print(y_test.value_counts())

# Evaluate the model using:

- Confusion matrix
- Accuracy, Precision, Recall, and F1-score

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

## predict: predictions with test data:
y_pred = knn.predict(X_test_scaled)
cm = confusion_matrix(y_test, y_pred)

disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap="Blues")    
plt.title("Confusion Matrix")
plt.show()

## Explanation:

TN (True Negative): The model predicted 0 and the actual label was 0 → correct prediction of negative.

FN (False Negative): The model predicted 0 but the actual label was 1 → "detection failure".

FP (False Positive): The model predicted 1 but the actual label was 0 → "false alarm".

TP (True Positive): The model predicted 1 and the actual label was 1 → correct prediction of positive.


| Real \ predicted | 0 (predicted) | 1 (predicted) |
|-----------------|--------------|--------------|
| 0 (Real)        | TN           | FP           |
| 1 (Real)        | FN           | TP           |


In [None]:
print("Confusion Matrix:\n", cm)

In [None]:
print("\n Classification Report: \n" , classification_report(y_test, y_pred))

# EXPLANATION 1
### After oneHotEncoder
### Only with numeric variables

# KNN Analysis (based on pdf 06-FT03)
``` knn = KNeighborsClassifier(n_neighbors=5) ```

In the first application with default hyperparameters and a cluster count of 5, we obtained:

**Total sample size: 918 rows (100%)**

- TN (True Negative): 76% 
- FP (False Positive): 6% 
- FN (False Negative): 3% 
- TP (True Positive): 99% 


In other words, more than 175% of the total TN + TP (76% TN + 99% TP) of our model trained correctly.

Conversely, we had 3 (FN) lines of the sample incorrectly classified; in the case of this problem, which could be fatal, we will consider it a failure in disease detection (FN) by the model.

As an area for improvement: reduce the 'False Negatives'.

* Regarding other metrics:


1. Accuracy: We obtained *0.936*, which we can consider excellent for the model, as it managed to predict 94% of the cases in the sample. However, it is important to adjust it to include the remaining 3% (FN) as TP.

Best parameters: {'knn__metric': 'manhattan', **'knn__n_neighbors': 11**, 'knn__weights': 'distance'}
Best accuracy (validation): 0.936
Accuracy in the test: 0.951



| Classe | Precision | Recall | F1-score | Support |
|--------|-----------|--------|----------|---------|
| 0      | 0.96      | 0.93   | 0.94     | 82      |
| 1      | 0.94      | 0.97   | 0.96     | 102     |
| **Accuracy** |           |        | **0.95** | 184     |
| **Macro avg** | 0.95      | 0.95   | 0.95     | 184     |
| **Weighted avg** | 0.95      | 0.95   | 0.95     | 184     |

Healthy people:
2. In the accuracy of positives and correct results, we had 96%, of which 4% are classified as imprecise, not 100% 'certain', such as 0 (they may or may not be healthy). In this case, they were marked as healthy but may be <u>sick</u>.

3. Recall = True Positives / (True Positives + False Negatives)
This means that of all the examples that were actually class 0, healthy people, the model was able to identify 93%, leaving only 7% as falsely healthy.

4. F-score
F1 = 2 * (precision * recall) / (precision + recall)
This means that this metric measures balance. It prevents a model with high recall and low precision (or vice versa) from looking good.
In short, we need to have both metrics balanced to actually have a model with good prediction.
f.Score = 94% and recall = 93% (we have balance)

Averages:
Macro Avg = 0.95 may represent an optimal model.
Weighted Avg = 0.95, similar to accuracy, therefore, we can say that the classes are balanced among themselves.
➡️ If weighted = accuracy, normally the classes are not very unbalanced.
Weighted Avg = 95% and comparing with accuracy = 94% (we have balance)

In conclusion, we found a critical area for potential future improvement:

1. Critical point for improvement, for recall in the class of sick people, positive for attack. We had a recall of 94%, meaning it failed to correctly predict 6% of the sick individuals, which can be fatal.

1.1 Suggested solution: improve the balancing of sick people (this was done, we applied OneHotEncoder), that is, increase the number of sick people in the model's training sample. To establish patterns for sick people, we need to confirm if we have more attributes that can better explain class 1 (sick people). DONE

1.2 We handled outliers with flag techniques, only treating original numeric variables from df_raw and not binary ones. DONE

Result: after improvements 1.1 and 1.2, the TP increased to 99% detection. The suggestion for a new application of KNN will be with 11K.

# ~~EXPLANATION 2~~ before oneHotEncoder
### Before oneHotEncoder
### With numeric + categorical variables transformed into numeric variables using OneHotEncoder
# KNN Analysis (based on pdf 06-FT03)
``` knn = KNeighborsClassifier(n_neighbors=5) ```

In the first application with standard hyperparameters and a cluster count of 5, we obtained:
TP (True Positive): 59 out of a total of 587 samples
TN (True Negative): 16 out of a total of 587 samples
FP (False Positive): 19 out of a total of 587 samples
FN (False Negative): 31 out of a total of 587 samples

That is, more than 50% of our model trained correctly. On the other hand, we had 31 lines of the sample incorrectly classified; in the case of this problem, it can be fatal, so we will consider it a failure in the model's disease detection.

Regarding other metrics:
1. Accuracy: we obtained *0.755*, which we can consider good for the model, as it managed to predict 70% of the cases in the sample. However, it is important to adjust it to include the remaining 30% as TP.

Best accuracy (validation): 0.755
Accuracy in the test: 0.703

Healthy people:

| Classe        | Precision | Recall | F1-Score | Support |
|---------------|-----------|--------|----------|---------|
| **0**         | 0.73      | 0.76   | 0.75     | 68      |
| **1**         | 0.66      | 0.62   | 0.64     | 50      |
| **Accuracy**  | —         | —      | 0.70     | 118     |
| **Macro Avg** | 0.70      | 0.69   | 0.69     | 118     |
| **Weighted Avg** | 0.70  | 0.70   | 0.70     | 118     |

2. In the Precision of positives, we had 73%.

3. Recall
Recall = True Positives / (True Positives + False Negatives)
This means that of all the examples that were actually class 0, healthy people, the model managed to identify 76%, leaving only 24% as false negatives.

4. F-score
F1 = 2 * (precision * recall) / (precision + recall)
This means that this metric measures balance. It prevents a model with high recall and low precision (or vice versa) from appearing good.

In short, we need to have both metrics balanced to actually have a model with good prediction.

Averages:
Macro Avg = 0.69 may represent a moderate model.
Weighted Avg = 0.70, similar to accuracy, therefore, we can say that the classes are balanced between them.

➡️ If weighted = accuracy, then classes are usually not very unbalanced.

Conclusion
We found a critical point for possible future improvement:

1. Critical point for improvement, for recall in the class of sick people, positive for attack. We had a recall = 62%, meaning it failed to correctly predict 38% of the sick people, which can be fatal.

1.1 Suggested solution: improve the balancing of sick people, that is, increase the number of sick people in the model's training sample. As for the patterns of sick people, we need to confirm if we have more attributes that can better explain class 1 (sick people).  

# Otimizar o número de vizinhos com GridSearchCV

# Feature Scaling Pipeline

### Step 1 — StandardScaler
- Normalizes all features
- Assumes all are numerical
- Sets mean = 0 and standard deviation = 1

### Step 2 — KNeighborsClassifier
- Highly scale-sensitive model
- Distances dominate the decision

#### When is this pipeline appropriate?

- Fully numeric dataset
- PCA, KNN, K-Means, SVM
- Clean data without categorical variables
- Educational or exploratory notebooks

##### When is this pipeline inadequate?

- Dataset with categorical columns
- Mixed data (num + cat)
- Production
- Advanced feature engineering

##### In this  case:

>> It wouldn't be sufficient

>  It would break the semantics of categorical variables

>  It would generate errors or incorrect learning





In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# Feature Scaling Pipeline -> This pipeline is a didactic example of a feature scaling application, suitable only when all variables are numeric.
pipeline = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("knn", KNeighborsClassifier())
])

# Hiper parametros
param_grid = {
    "knn__n_neighbors": [3, 5, 7, 9, 11],
    "knn__weights": ["uniform", "distance"],
    "knn__metric": ["euclidean", "manhattan"]
}

# otimização de vizinhos
grid = GridSearchCV(
    pipeline,
    param_grid,
    cv=5
)

grid.fit(X_train_scaled, y_train)

print("Melhores parâmetros:", grid.best_params_)
print(f"Melhor accuracy (validação): {grid.best_score_:.3f} ")

# best_model = grid.best_estimator_
# y_pred = best_model.predict(X_test_scaled)

# accuracy_score: percentagem de classificações corretas
test_acc = accuracy_score(y_test, y_pred)
print(f"Accuracy no teste: {test_acc:.3f}")

# Comparar o desempenho antes e depois da otimização.

# Análise de KNN (baseado no pdf 06-FT03)
## knowledge
``` knn = KNeighborsClassifier(n_neighbors=5) ```

Na primeira aplicação com hiper parametros padrão e número de clusters igual 5, obtivemos:
TP (True Positive): 59 do total de 587 amostras
TN (True Negative): 16 do total de 587 amostras
FP (False Positive): 19 do total de 587 amostras
FN (False Negative): 31 do total de 587 amostras

ou seja mais de 50% do nosso modelo treinou corretamente. Em contra partida, tivemos 31 linhas da amostra classificado incorretamente, no caso do problema em questão pode ser fatal, vamos considerar como falha na deteção da doença pelo modelo.

Com relação a outras métricas:
1. Accuracy: obtivemos *0.755*, podemos considerar  bom para o modelo conseguiu predizer 70 % dos casos na amostra, porém é importate ajustar para conseguir adicionar os 30% restantes como TP.

Pessoas saudáveis:
2. Na Precisão de posotivos e está certo tivemos 73 %

3. recall
Recall = Verdadeiros Positivos / (Verdadeiros Positivos + Falsos Negativos)
O que significa que todos os exemplos que realmente eram classe 0, pessoas saudáveis, o modelo conseguiu identificar 76%. deixando apenas 24% como falsos doentes.

4. F-score
F1 = 2 * (precision * recall) / (precision + recall)
o que significa que essa métrica mede o equilíbrio. Evita que um modelo com recall alto e precision baixa (ou vice-versa) pareça bom.
em resumo, precisamos ter as duas métricas balanceadas para de facto ter um modelo com boa predição.

Médias:
Macro Avg = 0.69 pode representar um modelo moderado.
Weighted Avg = 0.70, semelhante a accuracy, logo, podemos dizer que as classes estão balanceadas entre elas.
➡️ Se weighted = accuracy, normalmente as classes não estão muito desbalanceadas.

conclusão
Encrontramos um ponto crítico de possível melhoria para futura:

1. Ponto crítico de melhoria, para os recall em classe de pessoas doentes, positivas para ataque. tivemos recall = 62& ou seja deixou de predizer corretamente 38% dos doentes, o que pode ser fatal. 
1.1 Sugestão de solução: melhorar o balanceamento de pessoas doentes, ou seja, aumentar a quantidade de doentes na amostra de treinamento do modelo. Já para deixar os padrões de pessoas doentes, precisamos confirmar se temos mais atributos que podem explicam melhor a classe 1(pessoas doentes).

# CONTEXTO PARA explicação:
TP (True Positive): O modelo previu 1 e o rótulo real era 1 → previsão correta de positivo.

TN (True Negative): O modelo previu 0 e o rótulo real era 0 → previsão correta de negativo.

FP (False Positive): O modelo previu 1 mas o rótulo real era 0 → "falso alarme".

FN (False Negative): O modelo previu 0 mas o rótulo real era 1 → "falha na deteção".


| Real \ Previsto | 0 (Previsto) | 1 (Previsto) |
|-----------------|--------------|--------------|
| 0 (Real)        | TN           | FP           |
| 1 (Real)        | FN           | TP           |


# resultado do classification_report
Relatório de Classificação:

classe 0 => Saudável (falso)

classe 1 => doente (positivo)

# Sem oneHotEncoder

| Classe        | Precision | Recall | F1-Score | Support |
|---------------|-----------|--------|----------|---------|
| **0**         | 0.73      | 0.76   | 0.75     | 68      |
| **1**         | 0.66      | 0.62   | 0.64     | 50      |
| **Accuracy**  | —         | —      | 0.70     | 118     |
| **Macro Avg** | 0.70      | 0.69   | 0.69     | 118     |
| **Weighted Avg** | 0.70  | 0.70   | 0.70     | 118     |


# Com oneHotEncoder
| Classe        | Precision | Recall | F1-Score | Support |
|---------------|-----------|--------|----------|---------|
| **0**         | 0.97      | 0.87   | 0.91     | 68      |
| **1**         | 0.84      | 0.96   | 0.90     | 50      |
| **Accuracy**  | —         | —      | 0.91     | 118     |
| **Macro Avg** | 0.90      | 0.91   | 0.91     | 118     |
| **Weighted Avg** | 0.91  | 0.91   | 0.91     | 118     |


# Prever a condição de um novo paciente
*com valores fictícios

# Entendendo os campos:
| Campo           | Descrição                                                    | Exemplo | Tipo        | Por extenso / Significado                                  |
|-----------------|--------------------------------------------------------------|---------|-------------|-------------------------------------------------------------|
| Age             | Idade do paciente                                            | 55      | numérica    | 55 anos                                                     |
| Sex             | Sexo do paciente                                             | M       | categórica  | M = masculino, F = feminino                                 |
| ChestPainType   | Tipo de dor no peito                                         | ATA     | categórica  | ATA = angina atípica; ASY = assintomático; TA = típica; NAP = não anginosa |
| RestingBP       | Pressão arterial em repouso (mm Hg)                           | 130     | numérica    | 130 mm Hg                                                   |
| Cholesterol     | Colesterol sérico total (mg/dl)                               | 245     | numérica    | 245 mg/dl                                                   |
| FastingBS       | Glicemia em jejum > 120 mg/dl?                                | 0       | categórica  | 0 = normal; 1 = alta (acima de 120 mg/dl)                   |
| RestingECG      | Resultado do eletrocardiograma em repouso                     | ST      | categórica  | Normal / Anomalia ST-T / Hipertrofia Ventricular Esquerda  |
| MaxHR           | Frequência cardíaca máxima atingida                           | 150     | numérica    | 150 bpm                                                     |
| ExerciseAngina  | Angina induzida por exercício                                 | N       | categórica  | N = não apresentou; Y = apresentou angina                   |
| Oldpeak         | Depressão do segmento ST por exercício                        | 1.0     | numérica    | Depressão ST de 1.0 mm                                      |
| ST_Slope        | Inclinação do segmento ST no pico do exercício                | Down    | categórica  | Up = subida; Flat = plano; Down = descida                   |
| HeartDisease    | Presença de doença cardíaca (TARGET)                          | 1       | categórica  | 1 = possui doença cardíaca; 0 = não possui                  |



# Reaplicar o modelo filtrando as colunas originais e usando as colunas tratadas dos outliers

# knowledge
* Entre pessoas experientes em Data Stewardship (com foco em governança + explicabilidade + robustez), a opção mais recomendada costuma ser:

✅ Usar _capped + _outlier_flag e NÃO usar as colunas originais (não capped) no modelo.
(equivalente à tua Opção A, mas “sem duplicar” a mesma variável duas vezes.)

Por quê essa é a mais defendida?

- Robustez: o modelo não “explode” por valores extremos.
- Explicabilidade: a flag diz claramente “esse caso era extremo”.
- Governança: fica transparente o que foi tratado e quando (útil para auditoria e stakeholders).
- Menos ruído: evitar manter original + capped juntos reduz colinearidade e confusão na interpretação.

Quando eu NÃO recomendaria essa opção?
- Se o outlier é um evento raro porém super informativo (ex.: fraude, picos reais) e você quer que o modelo “sinta” a magnitude total. Aí você pode manter o original também — mas isso é uma decisão consciente, não padrão.

In [105]:
# Recomendado: usar _capped + _outlier_flag e remover as colunas originais contínuas
target = "HeartDisease"
orig_continuous = ["RestingBP", "Cholesterol", "MaxHR", "Oldpeak"]

capped_cols = [c for c in df_encoded.columns if c.endswith("_capped")]
flag_cols  = [c for c in df_encoded.columns if c.endswith("_outlier_flag")]

# base = tudo que não é target, não é original contínua, não é capped/flag
base_cols = [
    c for c in df_encoded.columns
    if c != target
    and c not in orig_continuous
    and not c.endswith("_capped")
    and not c.endswith("_outlier_flag")
]

X = df_encoded[base_cols + capped_cols + flag_cols]
y = df_encoded[target]


In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay

knn = KNeighborsClassifier(n_neighbors=11)
knn.fit(X_train_scaled, y_train)

In [None]:
# classificação 
y_train.value_counts()

In [None]:
print(y_train.value_counts())
print(y_test.value_counts())

In [None]:
# predict: previsões com os dados de teste
y_pred = knn.predict(X_test_scaled)
cm = confusion_matrix(y_test, y_pred)

disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap="Blues")    
plt.title("Matriz de Confusão")
plt.show()

| Real \ Previsto | 0 (Previsto) | 1 (Previsto) |
|-----------------|--------------|--------------|
| 0 (Real)        | TN           | FP           |
| 1 (Real)        | FN           | TP           |


In [None]:
print("Matriz de Confusão:\n", cm)

In [None]:
print("\nRelatório de Classificação:\n" , classification_report(y_test, y_pred))

# OTIMIZAÇÃO GRID SEARCH CV

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

#pipeline
pipeline = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("knn", KNeighborsClassifier())
])

# Hiper parametros
param_grid = {
    "knn__n_neighbors": [3, 5, 7, 9, 11],
    "knn__weights": ["uniform", "distance"],
    "knn__metric": ["euclidean", "manhattan"]
}

# otimização de vizinhos
grid = GridSearchCV(
    pipeline,
    param_grid,
    cv=5
)

grid.fit(X_train_scaled, y_train)

print("Melhores parâmetros:", grid.best_params_)
print(f"Melhor accuracy (validação): {grid.best_score_:.3f} ")

# best_model = grid.best_estimator_
# y_pred = best_model.predict(X_test_scaled)

# accuracy_score: percentagem de classificações corretas
test_acc = accuracy_score(y_test, y_pred)
print(f"Accuracy no teste: {test_acc:.3f}")

# NOVA PREDIÇÃO

In [None]:
X_train.columns

In [None]:
X_train.head(1)

In [None]:
df_encoded.columns

In [None]:
import pandas as pd


# 0) Novo paciente (formato original)
novo_paciente = pd.DataFrame([{
    "Age": 58,
    "Sex": "M",
    "ChestPainType": "ATA",
    "RestingBP": 138,
    "Cholesterol": 240,
    "FastingBS": 0,
    "RestingECG": "ST",
    "MaxHR": 160,
    "ExerciseAngina": "N",
    "Oldpeak": 1.4,
    "ST_Slope": "Flat"
}])


# 1) One-hot encoding (usar o MESMO encoder já fitado)
df_categorical = novo_paciente.select_dtypes(exclude=["number"])
encoded_arr = encoder.transform(df_categorical)

encoded_df_new = pd.DataFrame(
    encoded_arr,
    columns=encoder.get_feature_names_out(df_categorical.columns),
    index=novo_paciente.index
)


# 2) Numéricas + capped/flags (usar os MESMOS limites iqr_limits do treino)
df_numeric = novo_paciente.select_dtypes(include=["number"]).copy()

for col in ["RestingBP", "Cholesterol", "MaxHR", "Oldpeak"]:
    low, high = iqr_limits[col]
    df_numeric[f"{col}_outlier_flag"] = ((df_numeric[col] < low) | (df_numeric[col] > high)).astype(int)
    df_numeric[f"{col}_capped"] = df_numeric[col].clip(lower=low, upper=high)

df_numeric_final = df_numeric[[
    "Age", "FastingBS",
    "RestingBP_capped", "Cholesterol_capped", "MaxHR_capped", "Oldpeak_capped",
    "RestingBP_outlier_flag", "Cholesterol_outlier_flag", "MaxHR_outlier_flag", "Oldpeak_outlier_flag"
]]


# 3) Montar X do novo paciente e alinhar com as colunas do treino
novo_X = pd.concat([df_numeric_final, encoded_df_new], axis=1)

# alinhar com as colunas que o modelo espera (X_train.columns)
novo_X = novo_X.reindex(columns=X_train.columns, fill_value=0)


# 4) Escalar com o MESMO scaler (alinhar com as colunas do scaler)
if hasattr(scaler, "feature_names_in_"):
    cols_scaler = list(scaler.feature_names_in_)
else:
    cols_scaler = list(X_train.columns)  # fallback: use a ordem do treino

novo_X_for_scaler = novo_X.reindex(columns=cols_scaler, fill_value=0)
novo_X_scaled = scaler.transform(novo_X_for_scaler)


# 5) Predição com o modelo treinado (GridSearchCV)
pred_novo = grid.predict(novo_X_scaled)

print(pred_novo[0])
print(f"Risco previsto: {'Doente' if pred_novo[0] == 1 else 'Saudável'}")
