In [6]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report


In [7]:
# Build a small synthetic sentiment dataset: review texts are drawn with
# replacement from three positive and three negative template sentences.
# The generator is seeded so that a fresh kernel reproduces the same texts.
SEED = 42
rng = np.random.default_rng(SEED)
n_per_class = 50

good = [
    "Excellent product quality.",
    "Very satisfied, works great.",
    "Highly recommend this item."
]
bad = [
    "Poor quality, very disappointed.",
    "Does not work as advertised.",
    "Waste of money."
]

# All 'good' samples come first, then all 'bad' ones, so `labels` lines up
# with `texts` position by position. `.tolist()` yields plain Python strings.
texts = (
    rng.choice(good, size=n_per_class).tolist()
    + rng.choice(bad, size=n_per_class).tolist()
)
labels = ['good'] * n_per_class + ['bad'] * n_per_class

# Shuffle the rows once (seeded) so the two classes are interleaved, and
# renumber the index. Built in a single assignment so re-running the cell
# always produces the same frame.
df = (
    pd.DataFrame({'Text': texts, 'Label': labels})
    .sample(frac=1, random_state=SEED)
    .reset_index(drop=True)
)

In [8]:
# Two-stage text classifier: TF-IDF features feeding a logistic regression.
pipeline = Pipeline(
    steps=[
        # Lower-case the text, drop English stop words, and cap the
        # vocabulary at 300 terms.
        (
            "tfidf",
            TfidfVectorizer(
                stop_words="english",
                lowercase=True,
                max_features=300,
            ),
        ),
        # Logistic regression fitted with the liblinear solver.
        ("clf", LogisticRegression(solver="liblinear")),
    ]
)


In [9]:
# Hold out 25% of the rows for evaluation; the fixed random_state keeps the
# partition the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    df["Text"],
    df["Label"],
    random_state=42,
    test_size=0.25,
)


In [12]:
# Fit the pipeline on the training split and predict the held-out split.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Take the reported classes (and their order) from the fitted model instead
# of a hard-coded name list: a hand-written `target_names` is only correct
# while it happens to match the sorted label order, and it would mislabel
# rows or fail on a length mismatch if the label set ever changed. With
# string labels the report uses the labels themselves as row names.
print(classification_report(y_test, y_pred, labels=pipeline.classes_))


              precision    recall  f1-score   support

         bad       1.00      1.00      1.00         8
        good       1.00      1.00      1.00        17

    accuracy                           1.00        25
   macro avg       1.00      1.00      1.00        25
weighted avg       1.00      1.00      1.00        25



In [None]:
def text_preprocess_vectorize(texts, vectorizer):