In [4]:
import os
import joblib
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Train a TF-IDF + logistic-regression sentiment classifier and persist it.
#
# Steps: load the CSV, drop unusable rows, optionally down-sample, encode the
# string labels as integer indices, fit the pipeline on a stratified 70/30
# split, and dump the fitted pipeline with joblib.

# Configuration
SAMPLE_SIZE = 10000    # upper bound on the number of rows used
SEED = 42              # shared seed for sampling and the train/test split
MAX_FEATURES = 5000    # vocabulary cap for the TF-IDF vectorizer
MAX_ITER = 1000        # solver iteration cap (scikit-learn's default is 100)
x_col = 'sentence'
y_col = 'sentiment'

# Load your dataset
df = pd.read_csv("./dataset/combined_sentiment_data.csv")  # Replace with actual path

# Drop rows with a missing text or label. A NaN label mixed with string labels
# makes sorted() below raise TypeError, and a NaN document makes
# TfidfVectorizer raise ValueError. Done before sampling so SAMPLE_SIZE counts
# usable rows only; with no missing values the sampled rows are unchanged.
df = df.dropna(subset=[x_col, y_col])

# Sample if large
if len(df) > SAMPLE_SIZE:
    df = df.sample(n=SAMPLE_SIZE, random_state=SEED)

# Encode labels: index = position of the label in the sorted unique labels.
# NOTE(review): this mapping is not saved with the model, so whoever loads the
# pipeline must rebuild it to decode predictions -- consider persisting
# label_list next to the pipeline.
label_list = sorted(df[y_col].unique().tolist())
label_to_index = {label: idx for idx, label in enumerate(label_list)}
df['label'] = df[y_col].map(label_to_index)

# Split (stratified so both partitions keep the class proportions)
X_train, X_test, y_train, y_test = train_test_split(
    df[x_col], df['label'], test_size=0.3, stratify=df['label'], random_state=SEED
)

# Create and train pipeline
pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer(max_features=MAX_FEATURES, stop_words='english', ngram_range=(1, 2))),
    # Raise the iteration cap so the solver is not stopped before convergence
    # on sparse TF-IDF features.
    ('classifier', LogisticRegression(max_iter=MAX_ITER))
])

pipeline.fit(X_train, y_train)

# Save pipeline
model_path = './save_models/sentiment_pipeline.pkl'
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(pipeline, model_path)


['./save_models/sentiment_pipeline.pkl']