In [5]:
import pandas as pd

# Load the training, testing, and validation datasets
train_data = pd.read_csv(r'/Users/kaustubhmestri/Projects/MoodSense/data/test.txt', sep=';', names=['text','emotion'])
test_data = pd.read_csv(r'/Users/kaustubhmestri/Projects/MoodSense/data/train.txt', sep=';', names=['text','emotion'])
val_data = pd.read_csv(r'/Users/kaustubhmestri/Projects/MoodSense/data/val.txt', sep=';', names=['text','emotion'])

# Check the first few rows
print(train_data.head())


                                                text  emotion
0  im feeling rather rotten so im not very ambiti...  sadness
1          im updating my blog because i feel shitty  sadness
2  i never make her separate from me because i do...  sadness
3  i left with my bouquet of red and yellow tulip...      joy
4    i was feeling a little vain when i did this one  sadness


In [6]:
import nltk
import re

# Download stopwords for text cleaning
nltk.download('stopwords')
from nltk.corpus import stopwords

# Function to clean the text
def clean_text(text):
    text = text.lower()  # lowercase text
    text = re.sub(r'[^a-z\s]', '', text)  # remove non-alphabetic characters
    return text

# Apply cleaning function to the text data
train_data['cleaned_text'] = train_data['text'].apply(clean_text)
test_data['cleaned_text'] = test_data['text'].apply(clean_text)
val_data['cleaned_text'] = val_data['text'].apply(clean_text)

# Check cleaned data
print(train_data.head())


                                                text  emotion  \
0  im feeling rather rotten so im not very ambiti...  sadness   
1          im updating my blog because i feel shitty  sadness   
2  i never make her separate from me because i do...  sadness   
3  i left with my bouquet of red and yellow tulip...      joy   
4    i was feeling a little vain when i did this one  sadness   

                                        cleaned_text  
0  im feeling rather rotten so im not very ambiti...  
1          im updating my blog because i feel shitty  
2  i never make her separate from me because i do...  
3  i left with my bouquet of red and yellow tulip...  
4    i was feeling a little vain when i did this one  


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kaustubhmestri/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(stop_words=stopwords.words('english'), max_features=5000)

# Fit and transform the training data
X_train_tfidf = tfidf.fit_transform(train_data['cleaned_text'])

# Transform the test and validation data
X_test_tfidf = tfidf.transform(test_data['cleaned_text'])
X_val_tfidf = tfidf.transform(val_data['cleaned_text'])

# Check the shape of the transformed data
print(X_train_tfidf.shape)
print(X_test_tfidf.shape)
print(X_val_tfidf.shape)

(2000, 4645)
(16000, 4645)
(2000, 4645)


In [8]:
from sklearn.preprocessing import LabelEncoder

# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Fit the encoder on the training labels and transform them
y_train = label_encoder.fit_transform(train_data['emotion'])
y_test = label_encoder.transform(test_data['emotion'])
y_val = label_encoder.transform(val_data['emotion'])

# Check label encoding
print(label_encoder.classes_)


['anger' 'fear' 'joy' 'love' 'sadness' 'surprise']


In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Initialize Logistic Regression model
model = LogisticRegression(max_iter=1000)

# Train the model
model.fit(X_train_tfidf, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test_tfidf)

# Evaluate the model
print("Accuracy on Test Data:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))


Accuracy on Test Data: 0.645875
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.35      0.50      2159
           1       0.83      0.27      0.41      1937
           2       0.58      0.94      0.71      5362
           3       0.90      0.08      0.15      1304
           4       0.69      0.84      0.76      4666
           5       0.73      0.01      0.03       572

    accuracy                           0.65     16000
   macro avg       0.77      0.42      0.43     16000
weighted avg       0.71      0.65      0.59     16000



In [11]:
import joblib

# Save the trained model
joblib.dump(model, 'sentiment_model.pkl')

# Save the TF-IDF vectorizer
joblib.dump(tfidf, 'tfidf.pkl')

# Save label encoder (optional but useful for inference)
joblib.dump(label_encoder, 'label_encoder.pkl')


['label_encoder.pkl']