In [1]:
import sys
sys.path.append("..")

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer   
from src.models.text_classifier import TextClassifier

# Toy sentiment corpus. Each review is written next to its label
# (1 = positive, 0 = negative); zip() then transposes the pairs into the
# two parallel lists used by the rest of the script.
texts, labels = (
    list(column)
    for column in zip(
        ("This movie is fantastic and I love it!", 1),
        ("I hate this film, it's terrible.", 0),
        ("The acting was superb, a truly great experience.", 1),
        ("What a waste of time, absolutely boring.", 0),
        ("Highly recommend this, a masterpiece.", 1),
        ("Could not finish watching, so bad.", 0),
    )
)
# Partition the corpus into train and test sets. test_size=0.2 requests a
# 20% hold-out, stratify=labels asks for the split to follow the label
# distribution, and the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)
print(f"Train: {len(X_train)}  Test: {len(X_test)}")

# Build scikit-learn's TF-IDF vectorizer. The custom token pattern matches
# either a run of word characters or a single character that is neither a
# word character nor whitespace, so punctuation marks become tokens too.
vectorizer = TfidfVectorizer(
    norm="l2",  # L2-normalise each document vector
    token_pattern=r"\w+|[^\w\s]",
    lowercase=True,
)

# Hand the vectorizer to the project's TextClassifier, then run the usual
# fit -> predict -> evaluate sequence on the two splits.
clf = TextClassifier(vectorizer)
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
metrics = clf.evaluate(y_test, pred)

# Metric table: one line per entry of `metrics`, with the name padded to a
# 10-character column and the value shown with four decimal places.
print("\n=== EVALUATION RESULTS ===")
for metric_name, metric_value in metrics.items():
    print(f"{metric_name:10}: {metric_value:.4f}")

# Per-sample comparison of the expected and predicted labels. A truthy
# label is displayed as POSITIVE and a falsy one as NEGATIVE.
print("\nPredictions vs True labels:")
for sample, expected, predicted in zip(X_test, y_test, pred):
    print(f"Text: {sample}")
    expected_tag = "POSITIVE" if expected else "NEGATIVE"
    predicted_tag = "POSITIVE" if predicted else "NEGATIVE"
    print(f"   True: {expected_tag} | Pred: {predicted_tag}\n")

Train: 4  Test: 2

=== EVALUATION RESULTS ===
accuracy  : 0.0000
precision : 0.0000
recall    : 0.0000
f1        : 0.0000

Predictions vs True labels:
Text: This movie is fantastic and I love it!
   True: POSITIVE | Pred: NEGATIVE

Text: Could not finish watching, so bad.
   True: NEGATIVE | Pred: POSITIVE

