# Algoritmos clásicos

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.linear_model import LogisticRegression
import pandas as pd
from clasificadorVIH import ClasificadorVIH

# Project-local classifier instance; its symptoms_analysis(i) method is called
# once per document index when the indicator-score table is built.
clasificador = ClasificadorVIH()

## Puntuaciones de los indicadores

In [None]:
# Build the indicator-score feature table: one row per document index, with the
# overall score followed by the eight group values returned by
# clasificador.symptoms_analysis(i).
N_DOCUMENTS = 100  # document indices 0 .. N_DOCUMENTS - 1 are analysed
N_CLASS_0 = 50     # number of leading indices that carry label 0

column_names = ['Score', 'Group1', 'Group2', 'Group3', 'Group4', 'Group5','Group6', 'Group7', 'Group8']
data = {name: [] for name in column_names}
for i in range(N_DOCUMENTS):
    g1, g2, g3, g4, g5, g6, g7, g8, score = clasificador.symptoms_analysis(i)
    # Groups 4 and 5 are stored exactly as returned; for every other group only
    # element [1] of the returned value is kept.
    row = (score, g1[1], g2[1], g3[1], g4, g5, g6[1], g7[1], g8[1])
    for name, value in zip(column_names, row):
        data[name].append(value)

# Create the DataFrame from the collected columns.
df_X = pd.DataFrame(data)

# Ground-truth labels. Previously y_true was used here without ever being
# defined, so the notebook failed with NameError on a fresh kernel.
# NOTE(review): assumes the same convention as get_text_and_labels() further
# down (indices 0-49 -> 0, indices 50-99 -> 1); floats are used so the printed
# class labels stay 0.0 / 1.0 as in the recorded outputs. Confirm against the
# original label source.
y_true = pd.Series([0.0] * N_CLASS_0 + [1.0] * (N_DOCUMENTS - N_CLASS_0))
assert len(y_true) == len(df_X), "features and labels must have one entry per document"

# Show the first rows and the shapes as a sanity check.
print(df_X.head())
print(df_X.shape)
print(y_true.shape)

   Score  Group1  Group2  Group3  Group4  Group5  Group6  Group7  Group8
0   9.20     0.0     0.0     0.0    10.8     1.3     2.5     0.0     0.0
1   5.20     0.0     0.0     0.0     2.8     1.3     2.5     0.0     0.0
2   7.65     0.0     0.0     0.0     3.3     1.3     4.7     0.0     0.0
3   8.30     0.0     0.0     0.0     2.8     1.3     5.6     0.0     0.0
4   2.70     4.7     0.0     2.5     2.8     1.3     7.3     5.8     4.3
(100, 9)
(100,)


In [None]:
# Stratified 80/20 hold-out split of the indicator features: stratify=y_true
# keeps the class proportions of y_true in both subsets.
X_train, X_test, y_train, y_test = train_test_split(
    df_X,
    y_true,
    test_size=0.2,
    stratify=y_true,
    random_state=42,
)

# Print the relative class frequencies of each subset to check the balance.
print(
    "Distribución en el conjunto de entrenamiento:",
    pd.Series(y_train).value_counts(normalize=True),
    sep="\n",
)
print(
    "Distribución en el conjunto de prueba:",
    pd.Series(y_test).value_counts(normalize=True),
    sep="\n",
)

Distribución en el conjunto de entrenamiento:
0.0    0.5
1.0    0.5
Name: proportion, dtype: float64
Distribución en el conjunto de prueba:
1.0    0.5
0.0    0.5
Name: proportion, dtype: float64


### Regresión logística

In [None]:
# cross_val_predict is imported from sklearn.model_selection, its public
# location. sklearn.calibration only exposes it as a side effect of its own
# internal imports, which is not a stable API.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import StratifiedKFold, cross_val_predict, cross_val_score

# Logistic regression on the indicator-score features (df_X).
logistic_model = LogisticRegression(random_state=42, max_iter=1000)

# 10-fold stratified cross-validation so every fold keeps the class proportions.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
scores_logistic = cross_val_score(logistic_model, df_X, y_true, cv=skf, scoring="accuracy")

print("Métricas de cross-validation:", scores_logistic)
print("Media de cross-validation:", scores_logistic.mean())

# Out-of-fold predictions for every sample, using the same splitter.
y_pred = cross_val_predict(logistic_model, df_X, y_true, cv=skf)

# Confusion matrix of the out-of-fold predictions.
cm_logistic = confusion_matrix(y_true, y_pred)
print("Confusion Matrix:")
print(cm_logistic)

# Per-class precision / recall / F1 report.
print("Classification Report:")
print(classification_report(y_true, y_pred))

# For a binary problem the flattened confusion matrix is ordered TN, FP, FN, TP.
TN, FP, FN, TP = cm_logistic.ravel()

# Recall, precision and F1 for the class in the second row/column of the matrix.
recall = TP / (TP + FN)
precision = TP / (TP + FP)
f1_score = 2 * (precision * recall) / (precision + recall)

print(f"Recall: {recall:.4f}")
print(f"Precision: {precision:.4f}")
print(f"F1-score: {f1_score:.4f}")
print(f"Accuracy: {accuracy_score(y_true, y_pred):.4f}")


Métricas de cross-validation: [0.9 0.7 0.7 0.7 0.6 0.7 0.5 0.9 0.7 0.8]
Media de cross-validation: 0.72
Confusion Matrix:
[[39 11]
 [17 33]]
Classification Report:
              precision    recall  f1-score   support

         0.0       0.70      0.78      0.74        50
         1.0       0.75      0.66      0.70        50

    accuracy                           0.72       100
   macro avg       0.72      0.72      0.72       100
weighted avg       0.72      0.72      0.72       100

Recall: 0.6600
Precision: 0.7500
F1-score: 0.7021
Accuracy: 0.7200


### Árboles de decisión

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Decision tree on the indicator-score features (df_X), evaluated with
# 10-fold stratified cross-validation so each fold keeps the class proportions.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
tree_model = DecisionTreeClassifier(random_state=42)

# Per-fold accuracy and its mean.
scores_tree = cross_val_score(
    estimator=tree_model, X=df_X, y=y_true, scoring="accuracy", cv=skf
)
print("Métricas de cross-validation:", scores_tree)
print("Media de cross-validation:", scores_tree.mean())

# Out-of-fold predictions for every sample, using the same splitter.
y_pred = cross_val_predict(estimator=tree_model, X=df_X, y=y_true, cv=skf)

# Confusion matrix and per-class report of those predictions.
cm_tree = confusion_matrix(y_true, y_pred)
print("Confusion Matrix:", cm_tree, sep="\n")
print("Classification Report:", classification_report(y_true, y_pred), sep="\n")

# Rows of the binary confusion matrix are [TN, FP] and [FN, TP].
(TN, FP), (FN, TP) = cm_tree

# Recall, precision and F1 for the class in the second row/column of the matrix.
recall = TP / (TP + FN)
precision = TP / (TP + FP)
f1_score = 2 * (precision * recall) / (precision + recall)

print(
    f"Recall: {recall:.4f}",
    f"Precision: {precision:.4f}",
    f"F1-score: {f1_score:.4f}",
    f"Accuracy: {accuracy_score(y_true, y_pred):.4f}",
    sep="\n",
)


Métricas de cross-validation: [0.9 0.7 0.8 0.8 0.3 0.7 0.5 0.6 0.5 0.5]
Media de cross-validation: 0.6300000000000001
Confusion Matrix:
[[33 17]
 [20 30]]
Classification Report:
              precision    recall  f1-score   support

         0.0       0.62      0.66      0.64        50
         1.0       0.64      0.60      0.62        50

    accuracy                           0.63       100
   macro avg       0.63      0.63      0.63       100
weighted avg       0.63      0.63      0.63       100

Recall: 0.6000
Precision: 0.6383
F1-score: 0.6186
Accuracy: 0.6300


### Máquinas de soporte vectorial

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Support vector classifier (default hyperparameters, fixed random_state) on the
# indicator-score features (df_X), evaluated with 10-fold stratified
# cross-validation so each fold keeps the class proportions.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
svm_model = SVC(random_state=42)

# Per-fold accuracy and its mean.
scores_svm = cross_val_score(
    estimator=svm_model, X=df_X, y=y_true, scoring="accuracy", cv=skf
)
print("Métricas de cross-validation:", scores_svm)
print("Media de cross-validation:", scores_svm.mean())

# Out-of-fold predictions for every sample, using the same splitter.
y_pred = cross_val_predict(estimator=svm_model, X=df_X, y=y_true, cv=skf)

# Confusion matrix and per-class report of those predictions.
cm_svm = confusion_matrix(y_true, y_pred)
print("Confusion Matrix:", cm_svm, sep="\n")
print("Classification Report:", classification_report(y_true, y_pred), sep="\n")

# Rows of the binary confusion matrix are [TN, FP] and [FN, TP].
(TN, FP), (FN, TP) = cm_svm

# Recall, precision and F1 for the class in the second row/column of the matrix.
recall = TP / (TP + FN)
precision = TP / (TP + FP)
f1_score = 2 * (precision * recall) / (precision + recall)

print(
    f"Recall: {recall:.4f}",
    f"Precision: {precision:.4f}",
    f"F1-score: {f1_score:.4f}",
    f"Accuracy: {accuracy_score(y_true, y_pred):.4f}",
    sep="\n",
)


Métricas de cross-validation: [0.9 0.6 0.7 0.7 0.6 0.6 0.6 0.9 0.7 0.8]
Media de cross-validation: 0.71
Confusion Matrix:
[[37 13]
 [16 34]]
Classification Report:
              precision    recall  f1-score   support

         0.0       0.70      0.74      0.72        50
         1.0       0.72      0.68      0.70        50

    accuracy                           0.71       100
   macro avg       0.71      0.71      0.71       100
weighted avg       0.71      0.71      0.71       100

Recall: 0.6800
Precision: 0.7234
F1-score: 0.7010
Accuracy: 0.7100


## Texto de documentos

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from text_processor import TextProcessor


def get_text_and_labels(folder='datasets', split_index=50, n_documents=100):
    """Load the preprocessed document texts together with their binary labels.

    Each document is read by index through TextProcessor.get_text_from_file and
    passed through preprocess_text. Documents whose index is below
    ``split_index`` are labelled 0; the remaining ones are labelled 1.

    Args:
        folder: First argument forwarded to ``get_text_from_file``.
        split_index: First document index that receives label 1.
        n_documents: Number of documents to read (indices 0 .. n_documents - 1).

    Returns:
        A ``(texts, y_true)`` tuple of two parallel lists: the preprocessed
        texts and their integer labels, in index order.
    """
    text_processor = TextProcessor()
    texts = []
    y_true = []
    # One pass over all indices replaces the two copy-pasted per-class loops.
    for i in range(n_documents):
        text = text_processor.get_text_from_file(folder, i)
        texts.append(preprocess_text(text))
        y_true.append(0 if i < split_index else 1)

    return texts, y_true

# Text preprocessing hook (adjust this function as needed).
def preprocess_text(text):
    """Return ``text`` converted to lower case."""
    return text.lower()

# Load the preprocessed texts with their labels, and wrap the labels in a
# pandas Series for easier handling downstream.
texts, y_true = get_text_and_labels()
y_true = pd.Series(y_true)

# Turn the texts into dense TF-IDF vectors (max_features can be tuned).
# NOTE(review): the vectorizer is fitted on every document, so cross-validation
# run on X_tfidf uses vocabulary/IDF statistics that include the held-out
# folds. Putting the vectorizer and the model in a Pipeline would avoid that.
vectorizer = TfidfVectorizer(max_features=10000)
X_tfidf = vectorizer.fit_transform(texts).toarray()

# Stratified 80/20 hold-out split: stratify=y_true keeps the class proportions
# of y_true in both subsets.
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf,
    y_true,
    test_size=0.2,
    stratify=y_true,
    random_state=42,
)

In [None]:
# Logistic regression (default hyperparameters) on the TF-IDF features,
# evaluated with 10-fold stratified cross-validation.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
logreg_model = LogisticRegression()

# Per-fold accuracy and its mean.
scores_logreg = cross_val_score(
    estimator=logreg_model, X=X_tfidf, y=y_true, scoring="accuracy", cv=skf
)
print("Métricas de cross-validation:", scores_logreg)
print("Media de cross-validation:", scores_logreg.mean())

# Out-of-fold predictions for every sample, using the same splitter.
y_pred = cross_val_predict(estimator=logreg_model, X=X_tfidf, y=y_true, cv=skf)

# Confusion matrix and per-class report of those predictions.
cm_logreg = confusion_matrix(y_true, y_pred)
print("Confusion Matrix:", cm_logreg, sep="\n")
print("Classification Report:", classification_report(y_true, y_pred), sep="\n")

# Rows of the binary confusion matrix are [TN, FP] and [FN, TP].
(TN, FP), (FN, TP) = cm_logreg

# Recall, precision and F1 for the class in the second row/column of the matrix.
recall = TP / (TP + FN)
precision = TP / (TP + FP)
f1_score = 2 * (precision * recall) / (precision + recall)

print(
    f"Recall: {recall:.4f}",
    f"Precision: {precision:.4f}",
    f"F1-score: {f1_score:.4f}",
    f"Accuracy: {accuracy_score(y_true, y_pred):.4f}",
    sep="\n",
)

Métricas de cross-validation: [0.6 0.8 0.8 1.  1.  0.7 0.9 0.9 0.9 0.8]
Media de cross-validation: 0.8400000000000001
Confusion Matrix:
[[46  4]
 [12 38]]
Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.92      0.85        50
           1       0.90      0.76      0.83        50

    accuracy                           0.84       100
   macro avg       0.85      0.84      0.84       100
weighted avg       0.85      0.84      0.84       100

Recall: 0.7600
Precision: 0.9048
F1-score: 0.8261
Accuracy: 0.8400


In [None]:
# Decision tree on the TF-IDF features, evaluated with 10-fold stratified
# cross-validation so each fold keeps the class proportions.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
decision_tree_model = DecisionTreeClassifier(random_state=42)

# Per-fold accuracy and its mean.
scores_dt = cross_val_score(
    estimator=decision_tree_model, X=X_tfidf, y=y_true, scoring="accuracy", cv=skf
)
print("Métricas de cross-validation:", scores_dt)
print("Media de cross-validation:", scores_dt.mean())

# Out-of-fold predictions for every sample, using the same splitter.
y_pred = cross_val_predict(estimator=decision_tree_model, X=X_tfidf, y=y_true, cv=skf)

# Confusion matrix and per-class report of those predictions.
cm_dt = confusion_matrix(y_true, y_pred)
print("Confusion Matrix:", cm_dt, sep="\n")
print("Classification Report:", classification_report(y_true, y_pred), sep="\n")

# Rows of the binary confusion matrix are [TN, FP] and [FN, TP].
(TN, FP), (FN, TP) = cm_dt

# Recall, precision and F1 for the class in the second row/column of the matrix.
recall = TP / (TP + FN)
precision = TP / (TP + FP)
f1_score = 2 * (precision * recall) / (precision + recall)

print(
    f"Recall: {recall:.4f}",
    f"Precision: {precision:.4f}",
    f"F1-score: {f1_score:.4f}",
    f"Accuracy: {accuracy_score(y_true, y_pred):.4f}",
    sep="\n",
)


Métricas de cross-validation: [0.8 0.5 0.7 1.  1.  0.9 1.  0.8 0.8 0.7]
Media de cross-validation: 0.82
Confusion Matrix:
[[41  9]
 [ 9 41]]
Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.82      0.82        50
           1       0.82      0.82      0.82        50

    accuracy                           0.82       100
   macro avg       0.82      0.82      0.82       100
weighted avg       0.82      0.82      0.82       100

Recall: 0.8200
Precision: 0.8200
F1-score: 0.8200
Accuracy: 0.8200


In [None]:
# Support vector classifier (default hyperparameters, fixed random_state) on the
# TF-IDF features, evaluated with 10-fold stratified cross-validation.
# NOTE(review): svm_model, scores_svm and cm_svm rebind the names assigned by
# the indicator-feature SVM cell, so those earlier results are no longer
# reachable after this cell runs. The other two TF-IDF models use distinct
# names (logreg_model, decision_tree_model); consider doing the same here.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
svm_model = SVC(random_state=42)

# Per-fold accuracy and its mean.
scores_svm = cross_val_score(
    estimator=svm_model, X=X_tfidf, y=y_true, scoring="accuracy", cv=skf
)
print("Métricas de cross-validation:", scores_svm)
print("Media de cross-validation:", scores_svm.mean())

# Out-of-fold predictions for every sample, using the same splitter.
y_pred = cross_val_predict(estimator=svm_model, X=X_tfidf, y=y_true, cv=skf)

# Confusion matrix and per-class report of those predictions.
cm_svm = confusion_matrix(y_true, y_pred)
print("Confusion Matrix:", cm_svm, sep="\n")
print("Classification Report:", classification_report(y_true, y_pred), sep="\n")

# Rows of the binary confusion matrix are [TN, FP] and [FN, TP].
(TN, FP), (FN, TP) = cm_svm

# Recall, precision and F1 for the class in the second row/column of the matrix.
recall = TP / (TP + FN)
precision = TP / (TP + FP)
f1_score = 2 * (precision * recall) / (precision + recall)

print(
    f"Recall: {recall:.4f}",
    f"Precision: {precision:.4f}",
    f"F1-score: {f1_score:.4f}",
    f"Accuracy: {accuracy_score(y_true, y_pred):.4f}",
    sep="\n",
)

Métricas de cross-validation: [0.6 0.7 0.8 1.  1.  0.7 0.9 0.9 0.9 0.8]
Media de cross-validation: 0.8300000000000001
Confusion Matrix:
[[46  4]
 [13 37]]
Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.92      0.84        50
           1       0.90      0.74      0.81        50

    accuracy                           0.83       100
   macro avg       0.84      0.83      0.83       100
weighted avg       0.84      0.83      0.83       100

Recall: 0.7400
Precision: 0.9024
F1-score: 0.8132
Accuracy: 0.8300
