# Baseline Model: TF-IDF + Logistic Regression

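This notebook trains and evaluates a baseline text classifier: TF-IDF features (unigrams and bigrams) fed into a Logistic Regression model. It predicts one of three labels (`high`, `medium`, `low`), reports accuracy and macro-F1 on the validation and test splits, and saves the fitted vectorizer and model for later reuse.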


In [7]:
import sys
import os
# Add project root to Python path.
# The project root is taken to be the parent of the current working directory,
# i.e. the notebook is assumed to run from a direct subdirectory of the project
# so that the `src` package imported below becomes importable.
project_root = os.path.dirname(os.getcwd())
# Keep exactly one copy of the root, at the front of sys.path. A plain
# sys.path.insert(0, ...) would add another duplicate entry every time this
# cell is re-run in the same kernel.
sys.path[:] = [project_root] + [entry for entry in sys.path if entry != project_root]

import numpy as np
import pandas as pd
import joblib
from sklearn.linear_model import LogisticRegression
from src.data_utils import load_text_classification_data
from src.text_preprocess import basic_clean
from src.features import build_tfidf_vectorizer
from src.evaluate import evaluate_classification

# Reaching this line means every import above resolved, including the
# project-local `src.*` modules that depend on the sys.path tweak.
print("Imports successful!")


Imports successful!


## 1. Load Data


In [8]:
# Read each split through the shared loader. Only the label mappings returned
# by the 'train' call are kept; those returned for 'val' and 'test' are discarded.
(train_texts, train_labels, label2id, id2label) = load_text_classification_data('train')
(val_texts, val_labels, _, _) = load_text_classification_data('val')
(test_texts, test_labels, _, _) = load_text_classification_data('test')

# Report split sizes, the label encoding, and how often each label id
# occurs in the training labels.
print(
    f"Training samples: {len(train_texts)}",
    f"Validation samples: {len(val_texts)}",
    f"Test samples: {len(test_texts)}",
    f"Label mapping: {label2id}",
    f"Label distribution (train): {np.bincount(train_labels)}",
    sep="\n",
)


Training samples: 19782
Validation samples: 4239
Test samples: 4240
Label mapping: {'high': 0, 'low': 1, 'medium': 2}
Label distribution (train): [7698 4043 8041]


## 2. Preprocess Text


In [9]:
# Run the shared cleaning function over every document of each split,
# producing new *_clean lists and leaving the raw texts untouched.
train_texts_clean = list(map(basic_clean, train_texts))
val_texts_clean = list(map(basic_clean, val_texts))
test_texts_clean = list(map(basic_clean, test_texts))

# Preview at most the first 200 characters of the first cleaned training document.
print(f"Sample cleaned text: {train_texts_clean[0][:200]}...")


Sample cleaned text: enhance investment strategy with machine learning hello customer support team i am reaching out to explore the use of machine learning algorithms in enhancing our investment portfolios by leveraging r...


## 3. Build TF-IDF Vectorizer


In [10]:
# Configure the vectorizer: unigrams + bigrams, terms must appear in at least
# 5 documents (min_df), vocabulary capped at 50,000 features.
tfidf_vectorizer = build_tfidf_vectorizer(
    max_features=50000,
    ngram_range=(1, 2),
    min_df=5,
)

# The vocabulary is learned from the training split only; validation and test
# are merely projected onto it (val first, then test).
print("Fitting TF-IDF vectorizer...")
X_train = tfidf_vectorizer.fit_transform(train_texts_clean)
X_val, X_test = (
    tfidf_vectorizer.transform(docs)
    for docs in (val_texts_clean, test_texts_clean)
)

# All three matrices should share the same number of columns.
print(
    f"TF-IDF matrix shape (train): {X_train.shape}",
    f"TF-IDF matrix shape (val): {X_val.shape}",
    f"TF-IDF matrix shape (test): {X_test.shape}",
    sep="\n",
)


Fitting TF-IDF vectorizer...
TF-IDF matrix shape (train): (19782, 24463)
TF-IDF matrix shape (val): (4239, 24463)
TF-IDF matrix shape (test): (4240, 24463)


## 4. Train Logistic Regression


In [11]:
# Train Logistic Regression
print("Training Logistic Regression...")
# class_weight='balanced' reweights classes inversely to their frequency; the
# training label counts printed when loading the data are uneven across classes.
# max_iter is raised to 1000 to give the solver room to converge on the
# high-dimensional sparse TF-IDF matrix.
# n_jobs is intentionally not passed: with the default solver on this 3-class
# problem scikit-learn fits a single multinomial model, so n_jobs=-1 bought no
# parallelism, and recent scikit-learn releases deprecate the argument
# (confirm against the project's pinned scikit-learn version).
logreg = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)
logreg.fit(X_train, train_labels)

print("Training completed!")


Training Logistic Regression...
Training completed!


## 5. Evaluate on Validation Set


In [12]:
# Score the held-out validation split with the fitted classifier.
val_pred = logreg.predict(X_val)
val_results = evaluate_classification(val_labels, val_pred)

# Headline metrics first, then the per-class report
# (its rows are keyed by integer label id).
print(
    "Validation Results:",
    f"Accuracy: {val_results['accuracy']:.4f}",
    f"F1 Macro: {val_results['f1_macro']:.4f}",
    "\nClassification Report:",
    val_results['report'],
    sep="\n",
)


Validation Results:
Accuracy: 0.6530
F1 Macro: 0.6471

Classification Report:
              precision    recall  f1-score   support

           0       0.70      0.67      0.68      1615
           1       0.55      0.68      0.61       855
           2       0.67      0.63      0.65      1769

    accuracy                           0.65      4239
   macro avg       0.64      0.66      0.65      4239
weighted avg       0.66      0.65      0.65      4239



## 6. Evaluate on Test Set


In [13]:
# Score the test split with the same fitted classifier used for validation.
test_pred = logreg.predict(X_test)
test_results = evaluate_classification(test_labels, test_pred)

# Same layout as the validation summary: headline metrics, then the
# per-class report (rows keyed by integer label id).
print(
    "Test Results:",
    f"Accuracy: {test_results['accuracy']:.4f}",
    f"F1 Macro: {test_results['f1_macro']:.4f}",
    "\nClassification Report:",
    test_results['report'],
    sep="\n",
)


Test Results:
Accuracy: 0.6436
F1 Macro: 0.6363

Classification Report:
              precision    recall  f1-score   support

           0       0.69      0.67      0.68      1604
           1       0.54      0.65      0.59       876
           2       0.67      0.62      0.64      1760

    accuracy                           0.64      4240
   macro avg       0.63      0.65      0.64      4240
weighted avg       0.65      0.64      0.65      4240



## 7. Save Models


In [14]:
# Create model directory if it doesn't exist.
# The path is anchored on project_root (computed in the first code cell) instead
# of a cwd-relative '../src/model', so the artifacts land in the same place even
# if the working directory changed since the notebook started.
model_dir = os.path.join(project_root, 'src', 'model')
os.makedirs(model_dir, exist_ok=True)

# Save models. The classifier and the vectorizer are persisted together because
# the classifier was trained on this vectorizer's feature matrix.
# A single filename -> object mapping drives both the dump and the summary
# below, so the printed names cannot drift from what was written.
artifacts = {
    'baseline_logreg.joblib': logreg,
    'tfidf_vectorizer.joblib': tfidf_vectorizer,
}
for filename, artifact in artifacts.items():
    joblib.dump(artifact, os.path.join(model_dir, filename))

print("Models saved successfully!")
for filename in artifacts:
    print(f"- {filename}")


Models saved successfully!
- baseline_logreg.joblib
- tfidf_vectorizer.joblib
