In [7]:
import pandas as pd

# Relative parquet path of each split inside the dataset repository.
splits = {
    'train': 'data/train-00000-of-00001.parquet',
    'validation': 'data/validation-00000-of-00001.parquet',
    'test': 'data/test-00000-of-00001.parquet',
}

# Load the train and test splits from their hf:// URLs; the validation
# split is listed above but is not read here.
df_train = pd.read_parquet(f"hf://datasets/cirimus/super-emotion/{splits['train']}")
df_test = pd.read_parquet(f"hf://datasets/cirimus/super-emotion/{splits['test']}")

In [8]:
import neattext.functions as nfx

def preprocess_text(text):
    """Strip stopwords and special characters from one piece of text.

    Args:
        text: The raw text to clean. ``None`` and float NaN (the usual
            pandas marker for a missing cell) are treated as empty text.

    Returns:
        The cleaned string, or ``""`` when ``text`` is missing.
    """
    # A missing value is not a string, so skip the neattext helpers for it
    # and keep the output aligned one-to-one with the input rows.
    # NaN is the only float that compares unequal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = nfx.remove_stopwords(text)  # Remove common words like 'is', 'the', etc.
    text = nfx.remove_special_characters(text)  # Remove punctuation
    return text

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

# Clean every document up front; the label columns are used as-is.
X_train = list(map(preprocess_text, df_train["text"]))
y_train = df_train["label"]
X_test = list(map(preprocess_text, df_test["text"]))
y_test = df_test["label"]

# TF-IDF features feeding a logistic regression that uses the liblinear solver.
# NOTE(review): newer scikit-learn releases reportedly warn about liblinear
# with more than two classes — confirm against the installed version.
emotion_model = Pipeline(
    steps=[
        ("tfidf", TfidfVectorizer()),
        ("clf", LogisticRegression(solver="liblinear")),
    ]
)

# Fit the vectorizer and the classifier on the cleaned training text.
emotion_model.fit(X_train, y_train)

def label_to_emotion(label):
    """Translate an integer class label into its emotion name.

    Args:
        label: Integer class id; valid ids are 0 through 6.

    Returns:
        The emotion name stored at position ``label``.

    Raises:
        IndexError: If ``label`` is negative or greater than 6.
    """
    emotions = ("neutral", "surprise", "fear", "sadness", "joy", "anger", "love")
    # Python would quietly resolve a negative index from the end of the
    # sequence (-1 -> "love"); reject it rather than return a wrong emotion.
    if label < 0:
        raise IndexError(f"emotion label out of range: {label}")
    return emotions[label]

def predict_emotion(text):
    """Return the emotion name the trained model predicts for ``text``.

    The text is cleaned with ``preprocess_text``, passed to
    ``emotion_model`` as a one-element batch, and the first predicted
    label is converted to its name with ``label_to_emotion``.
    """
    cleaned = preprocess_text(text)
    # predict() takes a batch, so wrap the single document in a list.
    predicted_labels = emotion_model.predict([cleaned])
    return label_to_emotion(predicted_labels[0])


In [19]:
from sklearn.metrics import accuracy_score

# Accuracy of the model's predictions on the cleaned test text versus the test labels.
accuracy_score(y_true=df_test["label"], y_pred=emotion_model.predict(X_test))

0.8041555673947789

In [20]:
# Try the end-to-end helper on a sample sentence.
predict_emotion(text="my name is xyz")

'neutral'