In [6]:
import pandas as pd

# Load the training, testing, and validation datasets
train_data = pd.read_csv(r'D:\MoodSense\data\train.txt', sep=';', names=['text','emotion'])
test_data = pd.read_csv(r'D:\MoodSense\data\test.txt', sep=';', names=['text','emotion'])
val_data = pd.read_csv(r'D:\MoodSense\data\val.txt', sep=';', names=['text','emotion'])

# Check the first few rows
print(train_data.head())


                                                text  emotion
0                            i didnt feel humiliated  sadness
1  i can go from feeling so hopeless to so damned...  sadness
2   im grabbing a minute to post i feel greedy wrong    anger
3  i am ever feeling nostalgic about the fireplac...     love
4                               i am feeling grouchy    anger


In [8]:
import nltk
import re

# Download stopwords for text cleaning
nltk.download('stopwords')
from nltk.corpus import stopwords

# Function to clean the text
def clean_text(text):
    text = text.lower()  # lowercase text
    text = re.sub(r'[^a-z\s]', '', text)  # remove non-alphabetic characters
    return text

# Apply cleaning function to the text data
train_data['cleaned_text'] = train_data['text'].apply(clean_text)
test_data['cleaned_text'] = test_data['text'].apply(clean_text)
val_data['cleaned_text'] = val_data['text'].apply(clean_text)

# Check cleaned data
print(train_data.head())


                                                text  emotion  \
0                            i didnt feel humiliated  sadness   
1  i can go from feeling so hopeless to so damned...  sadness   
2   im grabbing a minute to post i feel greedy wrong    anger   
3  i am ever feeling nostalgic about the fireplac...     love   
4                               i am feeling grouchy    anger   

                                        cleaned_text  
0                            i didnt feel humiliated  
1  i can go from feeling so hopeless to so damned...  
2   im grabbing a minute to post i feel greedy wrong  
3  i am ever feeling nostalgic about the fireplac...  
4                               i am feeling grouchy  


[nltk_data] Downloading package stopwords to C:\Users\Prerna
[nltk_data]     Patriwar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(stop_words=stopwords.words('english'), max_features=5000)

# Fit and transform the training data
X_train_tfidf = tfidf.fit_transform(train_data['cleaned_text'])

# Transform the test and validation data
X_test_tfidf = tfidf.transform(test_data['cleaned_text'])
X_val_tfidf = tfidf.transform(val_data['cleaned_text'])

# Check the shape of the transformed data
print(X_train_tfidf.shape)
print(X_test_tfidf.shape)
print(X_val_tfidf.shape)

(16000, 5000)
(2000, 5000)
(2000, 5000)


In [13]:
from sklearn.preprocessing import LabelEncoder

# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Fit the encoder on the training labels and transform them
y_train = label_encoder.fit_transform(train_data['emotion'])
y_test = label_encoder.transform(test_data['emotion'])
y_val = label_encoder.transform(val_data['emotion'])

# Check label encoding
print(label_encoder.classes_)


['anger' 'fear' 'joy' 'love' 'sadness' 'surprise']


In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Initialize Logistic Regression model
model = LogisticRegression(max_iter=1000)

# Train the model
model.fit(X_train_tfidf, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test_tfidf)

# Evaluate the model
print("Accuracy on Test Data:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))


Accuracy on Test Data: 0.87
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.84      0.86       275
           1       0.88      0.81      0.84       224
           2       0.85      0.95      0.90       695
           3       0.81      0.62      0.70       159
           4       0.90      0.92      0.91       581
           5       0.87      0.50      0.63        66

    accuracy                           0.87      2000
   macro avg       0.86      0.77      0.81      2000
weighted avg       0.87      0.87      0.87      2000



In [15]:
import joblib

# Save the trained model
joblib.dump(model, 'sentiment_model.pkl')

# Save the TF-IDF vectorizer
joblib.dump(tfidf, 'tfidf.pkl')

# Save label encoder (optional but useful for inference)
joblib.dump(label_encoder, 'label_encoder.pkl')


['label_encoder.pkl']