In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


Load Dataset
- Load the dataset from a .txt file and convert it into a DataFrame.

In [2]:
# train.txt has no header row and separates the sentence from its label with a
# semicolon, so the two column names are supplied explicitly.
df = pd.read_csv("train.txt", sep=";", header=None, names=["text", "emotion"])


Encode Emotion Labels
- Convert categorical emotion labels into numeric form.

In [3]:
# Distinct labels, in the order pandas' unique() returns them for the column.
unique_emotions = df["emotion"].unique()

# Number the labels 0, 1, 2, ... following that order.
emotion_numbers = {label: code for code, label in enumerate(unique_emotions)}

# Replace every label in the column with its integer code.
df["emotion"] = df["emotion"].map(emotion_numbers)


In [4]:
# Convert Text to Lowercase
# The vectorized .str accessor replaces the per-row lambda; unlike the lambda,
# it leaves missing values as-is instead of raising AttributeError on them.
df["text"] = df["text"].str.lower()

In [5]:
# Remove Punctuation
import string

# Deletion table for str.translate, built once when the cell runs rather than
# on every call (the function is invoked once per row of the dataset).
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def remove_punc(txt):
    """Return ``txt`` with every ASCII punctuation character removed.

    The characters removed are exactly those listed in ``string.punctuation``.
    Letters, digits, whitespace and non-ASCII symbols are left untouched.
    """
    return txt.translate(_PUNCTUATION_TABLE)

# Strip punctuation from every row, overwriting the text column.
df["text"] = df["text"].map(remove_punc)


In [6]:
# Remove Numbers
def remove_numbers(txt):
    """Return ``txt`` without any character for which ``str.isdigit`` is true.

    All other characters, including the whitespace around a removed number,
    are kept in their original order.
    """
    return "".join(char for char in txt if not char.isdigit())

# Drop digit characters from every row, overwriting the text column.
df["text"] = df["text"].map(remove_numbers)


In [7]:
# Remove Links
def remove_links(txt):
    """Drop whitespace-separated tokens that look like links.

    A token is discarded when it begins with "http" or "www". The surviving
    tokens are re-joined with single spaces, so runs of whitespace collapse and
    leading/trailing whitespace disappears.
    """
    link_prefixes = ("http", "www")
    kept_tokens = [
        token for token in txt.split() if not token.startswith(link_prefixes)
    ]
    return " ".join(kept_tokens)

# Remove link-like tokens from every row, overwriting the text column.
df["text"] = df["text"].map(remove_links)


In [8]:
# Remove Emojis
def remove_emojis(txt):
    """Return ``txt`` restricted to its ASCII characters.

    Emojis are removed as a consequence, but so is every other non-ASCII
    character (accented letters, typographic quotes, non-Latin scripts).
    """
    return "".join(filter(str.isascii, txt))

# Keep only ASCII characters in every row, overwriting the text column.
df["text"] = df["text"].map(remove_emojis)


Stopword Removal
- Remove common English stopwords using NLTK.

In [9]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


In [10]:
# Download NLTK Resources
# Fetch the NLTK data packages used by the stopword-removal step. Packages that
# are already installed are reported as up-to-date rather than re-downloaded
# (see the log output of this cell).
# NOTE(review): "punkt" / "punkt_tab" are presumably the tokenizer data needed
# by word_tokenize; which of the two is actually read depends on the installed
# NLTK version -- confirm before dropping either.
nltk.download("punkt")
nltk.download("punkt_tab")
nltk.download("stopwords")  # corpus read by stopwords.words("english")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ksony\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\ksony\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ksony\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [11]:
# Stopword Cleaning Function
# A set gives constant-time membership tests inside remove().
stop_words = set(stopwords.words("english"))


def remove(txt):
    """Return ``txt`` with English stopwords removed.

    The text is split with NLTK's ``word_tokenize``; tokens present in
    ``stop_words`` are dropped and the remaining tokens are joined with single
    spaces. Matching is an exact, case-sensitive comparison against the set.
    """
    return " ".join(
        token for token in word_tokenize(txt) if token not in stop_words
    )


# Remove stopwords from every row, overwriting the text column.
df["text"] = df["text"].apply(remove)


In [12]:
# Train–Test Split
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle, and therefore the split, repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df["text"], df["emotion"], test_size=0.20, random_state=42
)


In [13]:
# Vectorization
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer


Model Training & Evaluation

In [14]:
# Bag of Words + Naive Bayes
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Learn the vocabulary from the training split only, then encode both splits
# as token-count matrices with that same vocabulary.
bow_vectorizer = CountVectorizer()
X_train_bow = bow_vectorizer.fit_transform(X_train)
X_test_bow = bow_vectorizer.transform(X_test)

# fit() returns the estimator itself, so construction and training are chained.
nb_model = MultinomialNB().fit(X_train_bow, y_train)

# Accuracy on the held-out split.
pred_bow = nb_model.predict(X_test_bow)
print(accuracy_score(y_test, pred_bow))


0.7684375


In [15]:
# TF-IDF + Naive Bayes
# Fit the TF-IDF weights on the training split only and reuse them to encode
# the test split.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# fit() returns the estimator itself, so construction and training are chained.
nb2_model = MultinomialNB().fit(X_train_tfidf, y_train)

# Accuracy on the held-out split.
y_pred = nb2_model.predict(X_test_tfidf)
print(accuracy_score(y_test, y_pred))


0.66125


In [16]:
# TF-IDF + Logistic Regression
from sklearn.linear_model import LogisticRegression

# Train on the TF-IDF features produced by the previous cell; max_iter is
# raised to 1000 to give the solver more iterations to converge.
logistic_model = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)

# Accuracy on the held-out split.
log_pred = logistic_model.predict(X_test_tfidf)
print(accuracy_score(y_test, log_pred))


0.860625


In [17]:
# One-time setup (uncomment to run): %pip install streamlit

Note: you may need to restart the kernel to use updated packages.


In [18]:
import pickle

# Save the TF-IDF vectorizer
# The `with` block closes the file -- flushing the pickle to disk -- even if
# dump() raises; a bare open(...) left the handle open.
with open("tfidf_vectorizer.pkl", "wb") as vectorizer_file:
    pickle.dump(tfidf_vectorizer, vectorizer_file)

# Save the Logistic Regression model
with open("logistic_model.pkl", "wb") as model_file:
    pickle.dump(logistic_model, model_file)

# NOTE(review): the label mapping `emotion_numbers` is not saved in this cell;
# whatever loads these files needs it to turn predicted integers back into
# emotion names -- confirm it is persisted elsewhere.
