### Tarea 3

In [36]:
import numpy as np
import zipfile
import tarfile
import os
from tqdm import tqdm 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import KFold, cross_val_predict
from scipy.sparse import vstack
from sklearn.model_selection import GridSearchCV

Se debe descaragr el archivo "Datasets.zip" en el mismo directorio de el notebook

## Extracción Dataset

In [2]:
compressed_file = "Datasets.zip"
with zipfile.ZipFile(compressed_file, 'r') as zip_ref:
    folder_name = os.path.splitext(compressed_file)[0]  # Remove the ".zip" extension
    target_folder = os.path.join(folder_name)
    
    if not os.path.exists(target_folder):
        # Create the folder within the target directory
        os.mkdir(target_folder)

    
        # Extract all files to the target folder
        zip_ref.extractall(target_folder)

print("Extracción completada")

Extracción completada


In [3]:
compressed_file = "Datasets\\20news-18828.tar.gz"
folder_name = os.path.splitext(os.path.splitext(compressed_file)[0])[0]  # Remove the ".tar.gz" extension

if not os.path.exists(folder_name):
    os.mkdir(folder_name)

# Extract all files from the TAR.GZ archive to the target folder without creating an additional subfolder
with tarfile.open(compressed_file, 'r:gz') as tar_ref:
    members = tar_ref.getmembers()
    tar_ref.extractall(path=folder_name, members=members)

print("Extracción completada")

Extracción completada


## Separación train-test-val

In [4]:
# Define the path to the 20N dataset folder
dataset_folder = "Datasets/20news-18828/20news-18828"

# Load and preprocess the text data
data = []
labels = []
for category in os.listdir(dataset_folder):
    category_path = os.path.join(dataset_folder, category)
    if os.path.isdir(category_path):
        for file_name in os.listdir(category_path):
            file_path = os.path.join(category_path, file_name)
            with open(file_path, "r", encoding="utf-8", errors="ignore") as file:
                content = file.read()
                data.append(content)
                labels.append(category)

# Split the dataset into training, validation, and test sets
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.3, random_state=13)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.33, random_state=13)

In [5]:
(np.unique(labels))

array(['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc',
       'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',
       'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles',
       'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt',
       'sci.electronics', 'sci.med', 'sci.space',
       'soc.religion.christian', 'talk.politics.guns',
       'talk.politics.mideast', 'talk.politics.misc',
       'talk.religion.misc'], dtype='<U24')

## Creación de Representaciones

In [6]:
# Define vectorizers for "tf" and "tfidf" representations
tf_vectorizer = CountVectorizer(max_features=5000, stop_words="english")
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words="english")

## Separación de dataset

In [7]:
# Fit and transform the vectorizers on training data
X_train_tf = tf_vectorizer.fit_transform(X_train)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

In [8]:
# Transform validation and test data
X_val_tf = tf_vectorizer.transform(X_val)
X_val_tfidf = tfidf_vectorizer.transform(X_val)
X_test_tf = tf_vectorizer.transform(X_test)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

## Entrenamiento & Métricas

In [9]:
# Initialize and train Naive Bayes (MultinomialNB) and Logistic Regression (LogisticRegression) classifiers
nb_classifier = MultinomialNB()
lr_classifier = LogisticRegression(max_iter=500)

nb_classifier.fit(X_train_tf, y_train)
lr_classifier.fit(X_train_tf, y_train)

In [10]:
# Evaluate classifiers on validation data
val_preds_nb = nb_classifier.predict(X_val_tf)
val_preds_lr = lr_classifier.predict(X_val_tf)

print("Validation Results (tf representation):")
print("Naive Bayes Accuracy:", accuracy_score(y_val, val_preds_nb))
print("Logistic Regression Accuracy:", accuracy_score(y_val, val_preds_lr))

# Evaluate classifiers on test data
test_preds_nb = nb_classifier.predict(X_test_tf)
test_preds_lr = lr_classifier.predict(X_test_tf)

print("\nTest Results (tf representation):")
print("Naive Bayes Accuracy:", accuracy_score(y_test, test_preds_nb))
print("Logistic Regression Accuracy:", accuracy_score(y_test, test_preds_lr))

Validation Results (tf representation):
Naive Bayes Accuracy: 0.811046511627907
Logistic Regression Accuracy: 0.8535940803382663

Test Results (tf representation):
Naive Bayes Accuracy: 0.8016085790884718
Logistic Regression Accuracy: 0.8605898123324397


In [11]:
# Initialize and train Naive Bayes (MultinomialNB) and Logistic Regression (LogisticRegression) classifiers
nb_classifier_tfidf = MultinomialNB()
lr_classifier_tfidf = LogisticRegression(max_iter=500)

nb_classifier.fit(X_train_tfidf, y_train)
lr_classifier.fit(X_train_tfidf, y_train)

In [12]:
# Evaluate classifiers on validation data
val_preds_nb = nb_classifier.predict(X_val_tfidf)
val_preds_lr = lr_classifier.predict(X_val_tfidf)

print("Validation Results (tfidf representation):")
print("Naive Bayes Accuracy:", accuracy_score(y_val, val_preds_nb))
print("Logistic Regression Accuracy:", accuracy_score(y_val, val_preds_lr))

# Evaluate classifiers on test data
test_preds_nb = nb_classifier.predict(X_test_tfidf)
test_preds_lr = lr_classifier.predict(X_test_tfidf)

print("\nTest Results (tfidf representation):")
print("Naive Bayes Accuracy:", accuracy_score(y_test, test_preds_nb))
print("Logistic Regression Accuracy:", accuracy_score(y_test, test_preds_lr))

Validation Results (tfidf representation):
Naive Bayes Accuracy: 0.846723044397463
Logistic Regression Accuracy: 0.8736786469344608

Test Results (tfidf representation):
Naive Bayes Accuracy: 0.8557640750670241
Logistic Regression Accuracy: 0.8729222520107238


## Cross-Validation

La validación cruzada es una técnica utilizada para evaluar el que tan bien generaliza un modelo de aprendizaje automático. Es particularmente útil cuando se dispone de una cantidad limitada de datos. La estrategia consiste en dividir el conjunto de datos en múltiples subconjuntos (folds), entrenar el modelo en algunos de estos subconjuntos y probarlo en los subconjuntos restantes. Este proceso se repite varias veces y las métricas de rendimiento se promedian en estas iteraciones. Los principales objetivos de la validación cruzada son los siguientes:

- Evaluar el Rendimiento del Modelo: La validación cruzada ayuda a estimar qué tan bien un modelo generalizará a datos no vistos, proporcionando una evaluación más robusta en comparación con una sola división de entrenamiento y prueba.

- Ajuste de Hiperparámetros: Puede ayudar en la búsqueda de hiperparámetros evaluando diferentes combinaciones de hiperparámetros en múltiples folds, lo que ayuda a seleccionar el mejor conjunto de hiperparámetros.

- Evitar el Sobreajuste: La validación cruzada ayuda a detectar el sobreajuste. Si un modelo tiene un buen desempeño en los datos de entrenamiento pero un mal desempeño en los datos de validación o prueba, es posible que esté sobreajustando los datos de entrenamiento.

- Optimizar la Selección del Modelo: Ayuda a comparar diferentes modelos y seleccionar aquel que tenga el mejor rendimiento en promedio entre los pliegues.

Así es cómo se puede comparar Naive Bayes (NB) y Regresión Logística (LR) utilizando una validación cruzada de K pliegues con conjuntos de entrenamiento y validación, y realizar la búsqueda de hiperparámetros.

In [31]:
# Define the number of folds for cross-validation
n_folds = 10
kf = KFold(n_splits=n_folds, shuffle=True, random_state=13)
# Initialize classifiers
nb_classifier = MultinomialNB()
lr_classifier = LogisticRegression(max_iter=500)

# Combine training and validation sets
X_combined = vstack((X_train_tfidf, X_val_tfidf))
y_combined = np.concatenate((y_train, y_val))

In [41]:
# Define hyperparameters for LR grid search 
lr_params = {
    'penalty':['l1', 'l2', 'elasticnet', None], # Regularization type
    'C': [0.01, 0.1, 1.0],  # Regularization strength
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],  # Solver algorithms
}

# Initialize LR classifier for grid search
lr_grid_search = GridSearchCV(estimator=lr_classifier, param_grid=lr_params, cv=kf, scoring='f1_macro')
lr_grid_search.fit(X_combined, y_combined)

# Get the best LR model after hyperparameter tuning
best_lr_classifier = lr_grid_search.best_estimator_

KeyboardInterrupt: 

In [None]:
# Report precision, recall, and F1 scores for the best Logistic Regression model
lr_preds = best_lr_classifier.predict(X_combined)
print("Logistic Regression Classification Report (Combined Training+Validation Set):")
print(classification_report(y_combined, lr_preds))

Logistic Regression Classification Report (Combined Training+Validation Set):
                          precision    recall  f1-score   support

             alt.atheism       0.95      0.93      0.94       718
           comp.graphics       0.88      0.91      0.89       875
 comp.os.ms-windows.misc       0.89      0.90      0.89       874
comp.sys.ibm.pc.hardware       0.88      0.90      0.89       898
   comp.sys.mac.hardware       0.95      0.92      0.94       850
          comp.windows.x       0.94      0.92      0.93       880
            misc.forsale       0.91      0.91      0.91       870
               rec.autos       0.94      0.96      0.95       887
         rec.motorcycles       0.98      0.97      0.98       905
      rec.sport.baseball       0.98      0.98      0.98       897
        rec.sport.hockey       0.99      0.99      0.99       912
               sci.crypt       0.99      0.97      0.98       896
         sci.electronics       0.91      0.93      0.92       8

In [None]:
# Hyperparameter search for Naive Bayes (alpha parameter)
nb_param_grid = {'alpha': [0.1, 0.5, 1.0, 2.0]}
nb_grid_search = GridSearchCV(nb_classifier, nb_param_grid, cv=kf, scoring='f1_macro')
nb_grid_search.fit(X_combined, y_combined)
best_nb_classifier = nb_grid_search.best_estimator_

In [None]:
# Perform 10-fold cross-validation with the best Naive Bayes classifier
nb_cv_preds = cross_val_predict(best_nb_classifier, X_combined, y_combined, cv=kf)

# Report precision, recall, and F1 scores for Naive Bayes
print("Naive Bayes Classification Report (Training Set + Validation Set):")
print(classification_report(y_combined, nb_cv_preds))

Naive Bayes Classification Report (Training Set + Validation Set):
                          precision    recall  f1-score   support

             alt.atheism       0.86      0.88      0.87       718
           comp.graphics       0.73      0.81      0.77       875
 comp.os.ms-windows.misc       0.78      0.78      0.78       874
comp.sys.ibm.pc.hardware       0.75      0.79      0.77       898
   comp.sys.mac.hardware       0.86      0.84      0.85       850
          comp.windows.x       0.85      0.85      0.85       880
            misc.forsale       0.83      0.83      0.83       870
               rec.autos       0.89      0.89      0.89       887
         rec.motorcycles       0.92      0.93      0.93       905
      rec.sport.baseball       0.96      0.94      0.95       897
        rec.sport.hockey       0.96      0.97      0.97       912
               sci.crypt       0.96      0.93      0.95       896
         sci.electronics       0.82      0.80      0.81       885
        

LR supera NB en términos de precisión, macro-average F1-score, y weighted-average F1-score en el set de entrenamiento y validación. 
esto indica que, en promedio, LR es mejor clasificando documentos en sus respectivas categorías