In [None]:
!pip install pandas scikit-learn

In [None]:
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import joblib

# Text classifier: TF-IDF features computed from the raw titles, fed into a
# logistic regression. class_weight='balanced' reweights the classes inversely
# to their frequency in the training labels.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression(class_weight='balanced'))
])

# The dataset is read from ../data/data.csv relative to the current working directory.
cwd = os.getcwd()
data_dir = os.path.join(os.path.dirname(cwd), "data")
data = pd.read_csv(os.path.join(data_dir, "data.csv"))

# Encode the "t"/"f" flag as 1.0/0.0. Any other (or missing) value maps to NaN.
data["label"] = data["is_positive"].map({"t": 1, "f": 0}).astype("float")

# Rows with a missing title or an unmapped label cannot be vectorised or fitted
# (scikit-learn raises ValueError on NaN documents/targets), so drop them here
# and report how many were removed instead of failing later with an opaque error.
df = data[["title", "label"]].dropna()
n_dropped = len(data) - len(df)
if n_dropped:
    print(f"Dropped {n_dropped} row(s) with a missing title or unrecognised is_positive value")

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(df["title"], df["label"], test_size=0.2, random_state=42)
pipeline.fit(X_train, y_train)

In [None]:
# Score the held-out split: print the per-class report first, then overall accuracy.
y_pred = pipeline.predict(X_test)
report = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
print(report)
print('Accuracy:', accuracy)

In [None]:
# Persist the fitted pipeline to ../data/models/ relative to the current working directory.
joblib_file = os.path.join(os.path.dirname(os.getcwd()), "data", "models", "tf-idf-logistic.joblib")
# joblib.dump does not create missing parent directories, so make sure the
# target folder exists; exist_ok=True keeps re-runs of this cell idempotent.
os.makedirs(os.path.dirname(joblib_file), exist_ok=True)
joblib.dump(pipeline, joblib_file)