In [None]:
import pickle
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score



In [2]:
# Restore the cached (TF-IDF matrix, prepared DataFrame) pair from disk.
# NOTE(review): pickle.load can execute arbitrary code while deserialising,
# so this file must only ever come from a trusted source.
with open('data/tfidf_data.pkl', 'rb') as pickle_handle:
    cached_pair = pickle.load(pickle_handle)
tfidf_matrix_dense, df_prepared = cached_pair

# Show the type of each restored object as a quick sanity check.
for restored_object in (tfidf_matrix_dense, df_prepared):
    print(type(restored_object))

<class 'numpy.ndarray'>
<class 'pandas.core.frame.DataFrame'>


In [None]:

# Hold out 20% of the rows for testing; random_state pins the shuffle so the
# split is reproducible. Relies on tfidf_matrix_dense and df_prepared having
# been loaded by an earlier cell.
X_train, X_test, y_train, y_test = train_test_split(
    tfidf_matrix_dense,
    df_prepared['encoded_label'],
    random_state=42,
    test_size=0.2,
)

# Report the dimensions of every split.
for split_label, split_values in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{split_label} shape: {split_values.shape}")

X_train shape: (640, 5000)
X_test shape: (160, 5000)
y_train shape: (640,)
y_test shape: (160,)


In [None]:
# Baseline that is meant to overfit: in scikit-learn C is the inverse of the
# regularisation strength, so a huge C leaves the model almost unpenalised.
overfit_settings = {"C": 1e10, "max_iter": 10000}
logreg_model = LogisticRegression(**overfit_settings)

# Train on the training split only.
logreg_model.fit(X_train, y_train)

# Predict labels for both splits so train and test scores can be compared.
y_train_pred, y_test_pred = [
    logreg_model.predict(split_features) for split_features in (X_train, X_test)
]

In [None]:

# Score the fitted logreg_model on both splits using macro-averaged metrics,
# then print a short report per split. Relies on logreg_model, the X_*/y_*
# splits and the y_*_pred predictions created by earlier cells.

# Calculate metrics for the training set.
# zero_division=0 is passed to every metric that accepts it so that a class
# with no samples/predictions is scored 0 consistently and without warnings.
training_accuracy = accuracy_score(y_train, y_train_pred)
training_precision = precision_score(y_train, y_train_pred, average='macro', zero_division=0)
training_recall = recall_score(y_train, y_train_pred, average='macro', zero_division=0)
training_f1 = f1_score(y_train, y_train_pred, average='macro', zero_division=0)

# ROC AUC is computed one-vs-rest from the predicted class probabilities.
# If scikit-learn rejects the inputs, report NaN ("not computed") instead of
# 0, which is a valid AUC value and would read as a genuinely terrible score.
try:
    training_roc_auc = roc_auc_score(y_train, logreg_model.predict_proba(X_train), multi_class='ovr')
except ValueError as roc_error:
    print(f"Could not compute training ROC AUC ({roc_error}); reporting NaN.")
    training_roc_auc = float('nan')


# Calculate metrics for the testing set (same metrics, same settings).
testing_accuracy = accuracy_score(y_test, y_test_pred)
testing_precision = precision_score(y_test, y_test_pred, average='macro', zero_division=0)
testing_recall = recall_score(y_test, y_test_pred, average='macro', zero_division=0)
testing_f1 = f1_score(y_test, y_test_pred, average='macro', zero_division=0)

try:
    testing_roc_auc = roc_auc_score(y_test, logreg_model.predict_proba(X_test), multi_class='ovr')
except ValueError as roc_error:
    print(f"Could not compute testing ROC AUC ({roc_error}); reporting NaN.")
    testing_roc_auc = float('nan')

# Print both reports from one loop so the two layouts cannot drift apart.
metric_labels = ("Accuracy", "Precision", "Recall", "F1-score", "AUC-ROC")
for report_heading, report_values in (
    (
        "Training Set Metrics:",
        (training_accuracy, training_precision, training_recall, training_f1, training_roc_auc),
    ),
    (
        "\nTesting Set Metrics:",
        (testing_accuracy, testing_precision, testing_recall, testing_f1, testing_roc_auc),
    ),
):
    print(report_heading)
    for metric_label, metric_value in zip(metric_labels, report_values):
        print(f"{metric_label}: {metric_value:.4f}")

Training Set Metrics:
Accuracy: 0.9984
Precision: 0.9984
Recall: 0.9985
F1-score: 0.9984
AUC-ROC: 1.0000

Testing Set Metrics:
Accuracy: 0.6438
Precision: 0.6556
Recall: 0.6439
F1-score: 0.6468
AUC-ROC: 0.8582


In [None]:
# Sweep the regularisation parameter C, recording macro-averaged metrics for
# the training and testing splits of each fit in `results`, then refit a
# final model with the best-scoring C.
C_values = [0.1, 1, 10]
results = {}

for C in C_values:
    # Initialize and train the model with the current C value
    logreg_model = LogisticRegression(max_iter=10000, C=C)
    logreg_model.fit(X_train, y_train)

    # Make predictions on training and testing sets
    y_train_pred = logreg_model.predict(X_train)
    y_test_pred = logreg_model.predict(X_test)

    # Score both splits with the same code path so they stay comparable.
    results[C] = {}
    for split_name, X_split, y_split, y_split_pred in (
        ('training', X_train, y_train, y_train_pred),
        ('testing', X_test, y_test, y_test_pred),
    ):
        # If scikit-learn rejects the inputs, store NaN ("not computed")
        # rather than 0, which is a valid AUC and would look like a real,
        # very bad score in the comparison below.
        try:
            split_roc_auc = roc_auc_score(
                y_split, logreg_model.predict_proba(X_split), multi_class='ovr'
            )
        except ValueError as roc_error:
            print(f"Could not compute {split_name} ROC AUC for C={C} ({roc_error}); reporting NaN.")
            split_roc_auc = float('nan')

        # zero_division=0 on every metric that accepts it: classes with no
        # samples/predictions score 0 consistently and without warnings.
        results[C][split_name] = {
            'accuracy': accuracy_score(y_split, y_split_pred),
            'precision': precision_score(y_split, y_split_pred, average='macro', zero_division=0),
            'recall': recall_score(y_split, y_split_pred, average='macro', zero_division=0),
            'f1': f1_score(y_split, y_split_pred, average='macro', zero_division=0),
            'roc_auc': split_roc_auc,
        }

# Print results for each C value
for C, metrics in results.items():
    print(f"Results for C={C}:")
    print(f"  Training: {metrics['training']}")
    print(f"  Testing: {metrics['testing']}")

# Select C from the recorded results instead of hard-coding it: take the value
# with the highest macro F1 on the testing split. max() returns the first
# maximum, and `results` follows the ascending order of C_values, so a tie
# goes to the smaller C.
# NOTE(review): choosing a hyperparameter on the test split leaks test
# information into model selection; prefer cross-validation on the training
# data (or a separate validation split) before relying on the test scores.
best_C = max(results, key=lambda candidate_C: results[candidate_C]['testing']['f1'])

# Retrain the model with the best C value
final_model = LogisticRegression(max_iter=10000, C=best_C)
final_model.fit(X_train, y_train)

Results for C=0.1:
  Training: {'accuracy': 0.896875, 'precision': 0.9085057349609176, 'recall': 0.8949053371456156, 'f1': 0.8959979716646953, 'roc_auc': 0.9883096296602161}
  Testing: {'accuracy': 0.6125, 'precision': 0.6379683840749415, 'recall': 0.6304420699769537, 'f1': 0.6095073404856014, 'roc_auc': 0.8338465980142766}
Results for C=1:
  Training: {'accuracy': 0.9859375, 'precision': 0.9861502425044091, 'recall': 0.9857399444674332, 'f1': 0.985834918421152, 'roc_auc': 0.9994428035211887}
  Testing: {'accuracy': 0.65, 'precision': 0.6625077465018429, 'recall': 0.6515508864346073, 'f1': 0.6537802794889968, 'roc_auc': 0.8547610875730434}
Results for C=10:
  Training: {'accuracy': 0.9984375, 'precision': 0.9984939759036144, 'recall': 0.9983870967741936, 'f1': 0.9984356515022634, 'roc_auc': 0.9999967423168377}
  Testing: {'accuracy': 0.625, 'precision': 0.6377986403895807, 'recall': 0.6265401614238824, 'f1': 0.6293067777137047, 'roc_auc': 0.8618491882378045}


In [8]:
# Refit the final model with the chosen C and report macro-averaged metrics on
# both splits. In the sweep output shown in this notebook, C=1 has the highest
# test accuracy and F1 of the values tried.
best_C = 1

# Retrain the model with the best C value
final_model = LogisticRegression(max_iter=10000, C=best_C)
final_model.fit(X_train, y_train)

# Evaluate the final model
y_train_pred_final = final_model.predict(X_train)
y_test_pred_final = final_model.predict(X_test)

# zero_division=0 is passed to every metric that accepts it so that a class
# with no samples/predictions is scored 0 consistently and without warnings.
training_accuracy_final = accuracy_score(y_train, y_train_pred_final)
training_precision_final = precision_score(y_train, y_train_pred_final, average='macro', zero_division=0)
training_recall_final = recall_score(y_train, y_train_pred_final, average='macro', zero_division=0)
training_f1_final = f1_score(y_train, y_train_pred_final, average='macro', zero_division=0)

# ROC AUC is computed one-vs-rest from the predicted class probabilities.
# If scikit-learn rejects the inputs, report NaN ("not computed") instead of
# 0, which is a valid AUC value and would read as a genuinely terrible score.
try:
    training_roc_auc_final = roc_auc_score(y_train, final_model.predict_proba(X_train), multi_class='ovr')
except ValueError as roc_error:
    print(f"Could not compute training ROC AUC ({roc_error}); reporting NaN.")
    training_roc_auc_final = float('nan')

testing_accuracy_final = accuracy_score(y_test, y_test_pred_final)
testing_precision_final = precision_score(y_test, y_test_pred_final, average='macro', zero_division=0)
testing_recall_final = recall_score(y_test, y_test_pred_final, average='macro', zero_division=0)
testing_f1_final = f1_score(y_test, y_test_pred_final, average='macro', zero_division=0)

try:
    testing_roc_auc_final = roc_auc_score(y_test, final_model.predict_proba(X_test), multi_class='ovr')
except ValueError as roc_error:
    print(f"Could not compute testing ROC AUC ({roc_error}); reporting NaN.")
    testing_roc_auc_final = float('nan')

# Print both reports from one loop so the two layouts cannot drift apart.
print("Final Model Performance:")
final_metric_labels = ("Accuracy", "Precision", "Recall", "F1-score", "AUC-ROC")
for report_heading, report_values in (
    (
        "Training Set:",
        (
            training_accuracy_final,
            training_precision_final,
            training_recall_final,
            training_f1_final,
            training_roc_auc_final,
        ),
    ),
    (
        "\nTesting Set:",
        (
            testing_accuracy_final,
            testing_precision_final,
            testing_recall_final,
            testing_f1_final,
            testing_roc_auc_final,
        ),
    ),
):
    print(report_heading)
    for metric_label, metric_value in zip(final_metric_labels, report_values):
        print(f"- {metric_label}: {metric_value:.4f}")

Final Model Performance:
Training Set:
- Accuracy: 0.9859
- Precision: 0.9862
- Recall: 0.9857
- F1-score: 0.9858
- AUC-ROC: 0.9994

Testing Set:
- Accuracy: 0.6500
- Precision: 0.6625
- Recall: 0.6516
- F1-score: 0.6538
- AUC-ROC: 0.8548
