# 📘 Pełny pipeline: Klasyfikacja sentymentu po polsku (PolEmo 2.0)

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
import joblib

In [2]:
# 📥 Wczytanie danych
def load_dataset(path: str) -> pd.DataFrame:
    """Read a labelled text file into a DataFrame.

    Each usable line has the form ``<text> __label__<tag>``. The line is
    split at the LAST occurrence of ``" __label__"`` so that any earlier
    occurrence of the marker stays part of the text.

    Lines that do not contain the ``" __label__"`` separator (blank lines,
    or a tag with no preceding space/text) are skipped rather than raising
    ``IndexError``.

    Args:
        path: Path of a UTF-8 encoded file with one example per line.

    Returns:
        A DataFrame with two string columns: ``text`` (whitespace-stripped
        text before the separator) and ``label`` (``"__label__"`` followed
        by the stripped tag).
    """
    separator = " __label__"
    texts, labels = [], []
    with open(path, encoding='utf-8') as f:
        for line in f:
            # rpartition yields an empty middle element when the separator
            # is missing, which lets us skip malformed lines safely.
            text, found, tag = line.rpartition(separator)
            if not found:
                continue
            texts.append(text.strip())
            labels.append("__label__" + tag.strip())
    return pd.DataFrame({"text": texts, "label": labels})

# Raw `__label__...` tags as they appear in the data files, paired with the
# readable sentiment name used in the rest of the pipeline.
_LABEL_PAIRS = (
    ("__label__meta_plus_m", "positive"),
    ("__label__meta_minus_m", "negative"),
    ("__label__meta_zero", "neutral"),
    ("__label__meta_amb", "ambiguous"),
)
label_map = dict(_LABEL_PAIRS)

# Load the three splits in train / dev / test order.
train_df, dev_df, test_df = map(
    load_dataset,
    ("all.text.train.txt", "all.text.dev.txt", "all.text.test.txt"),
)

# Add a readable `sentiment` column to each split, then remove (in place)
# rows whose label is not in `label_map` as well as rows mapped to
# "ambiguous".
for df in (train_df, dev_df, test_df):
    df["sentiment"] = df["label"].map(label_map)
    unusable = df["sentiment"].isna() | (df["sentiment"] == "ambiguous")
    df.drop(index=df.index[unusable], inplace=True)

In [3]:
# 🔠 Wektoryzacja
# Target labels for each split.
y_train, y_dev, y_test = (
    frame["sentiment"] for frame in (train_df, dev_df, test_df)
)

# The TF-IDF vocabulary (capped at 5000 features) is fitted on the training
# texts only; dev and test texts are transformed with that fitted vocabulary.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec = vectorizer.fit_transform(train_df["text"])
X_dev_vec, X_test_vec = (
    vectorizer.transform(frame["text"]) for frame in (dev_df, test_df)
)

In [4]:
# 🤖 Trening i ewaluacja
# Fit a multinomial Naive Bayes classifier on the training features
# (`fit` returns the estimator itself, so the call can be chained).
model = MultinomialNB().fit(X_train_vec, y_train)

# Score the held-out test split and print per-class metrics.
y_pred = model.predict(X_test_vec)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

    negative       0.86      0.99      0.92       339
     neutral       1.00      0.88      0.94       118
    positive       0.97      0.81      0.89       227

    accuracy                           0.91       684
   macro avg       0.94      0.89      0.91       684
weighted avg       0.92      0.91      0.91       684



In [5]:
# 💾 Zapis modelu i wektoryzatora
# Persist the classifier first, then the fitted vectorizer; both files are
# needed later to score new text.
for artifact, filename in (
    (model, "sentiment_model.pkl"),
    (vectorizer, "tfidf_vectorizer.pkl"),
):
    joblib.dump(artifact, filename)

['tfidf_vectorizer.pkl']

In [6]:
# 🧠 Wczytanie i predykcja nowego tekstu
# Restore the classifier and the vectorizer from disk (in that order).
# NOTE: joblib.load unpickles its input -- only load files you trust.
model, vectorizer = map(
    joblib.load,
    ("sentiment_model.pkl", "tfidf_vectorizer.pkl"),
)

def predict_sentiment(text):
    """Return the predicted sentiment label for a single piece of text.

    Uses the module-level ``vectorizer`` to turn ``text`` into features and
    the module-level ``model`` to classify them.

    Args:
        text: The text to classify.

    Returns:
        The label predicted by ``model`` for ``text``.
    """
    features = vectorizer.transform([text])
    # A one-element batch goes in, so exactly one prediction comes out.
    (predicted,) = model.predict(features)
    return predicted

# Przykład
sample_review = "Obsługa była bardzo miła, a pokój czysty i nowoczesny."
print(predict_sentiment(sample_review))

positive
