In [1]:
import string
from functools import lru_cache
from pathlib import Path

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB

In [2]:
# Fetch the NLTK resources that preprocess_text relies on.
# 'punkt_tab' is the tokenizer data that newer NLTK releases look up for
# word_tokenize; 'punkt' is kept so older releases keep working.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\chand\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\chand\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Translation table that deletes every character in string.punctuation.
# Built once here instead of on every preprocess_text call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


@lru_cache(maxsize=1)
def _english_stop_words():
    """Return NLTK's English stop-word list as a frozenset, loaded only once."""
    return frozenset(stopwords.words('english'))


def preprocess_text(text):
    """Normalize one piece of text into a space-separated string of tokens.

    Steps, in order: lowercase, delete punctuation, tokenize with
    ``word_tokenize``, drop English stop words, re-join with single spaces.

    Args:
        text: The raw input string.

    Returns:
        The remaining tokens joined by a single space (empty string when no
        token survives).
    """
    # Lowercase the text and remove punctuation
    text = text.lower().translate(_PUNCTUATION_TABLE)
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stopwords (the set is cached, so the corpus is read only once)
    # NOTE(review): punctuation is already gone at this point, so a contraction
    # such as "don't" arrives here as "dont" -- confirm whether such tokens are
    # meant to be filtered as stop words.
    stop_words = _english_stop_words()
    return ' '.join(token for token in tokens if token not in stop_words)


In [4]:
def load_data(file_path, preprocess=None, encoding='utf-8'):
    """Read a ``text;label`` file into parallel lists of texts and labels.

    Each non-blank line is split on its LAST ';', so a semicolon inside the
    text itself does not break parsing. Blank lines are skipped.

    Args:
        file_path: Path of the file to read (anything ``open`` accepts).
        preprocess: Callable applied to each text before it is stored.
            Defaults to ``preprocess_text``.
        encoding: Text encoding used to read the file. Given explicitly so the
            result does not depend on the platform's default encoding.

    Returns:
        A ``(data, labels)`` tuple of two lists of equal length.

    Raises:
        ValueError: If a non-blank line contains no ';' separator.
    """
    if preprocess is None:
        # Resolved at call time so a custom preprocess never needs preprocess_text.
        preprocess = preprocess_text

    data = []
    labels = []
    with open(file_path, 'r', encoding=encoding) as file:
        for line_number, line in enumerate(file, start=1):
            line = line.strip()
            if not line:
                continue  # tolerate empty lines, e.g. a trailing newline at EOF
            text, separator, label = line.rpartition(';')
            if not separator:
                raise ValueError(
                    f"{file_path}: line {line_number} has no ';' separator: {line!r}"
                )
            data.append(preprocess(text))
            labels.append(label)
    return data, labels



In [7]:
# Load data
# All three splits are read from one directory; edit DATA_DIR to point at
# your own copy of the dataset.
DATA_DIR = Path(r"C:\Users\chand\Documents\SENtiment Analysis\archive")

train_data, train_labels = load_data(DATA_DIR / "train.txt")
val_data, val_labels = load_data(DATA_DIR / "val.txt")
test_data, test_labels = load_data(DATA_DIR / "test.txt")


In [8]:
# Feature extraction
# The vectorizer is fitted on the training split only; the validation and test
# splits are then transformed with that same fitted vectorizer.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_data)
X_val, X_test = (vectorizer.transform(split) for split in (val_data, test_data))


In [9]:
# Train model
# fit() returns the estimator itself, so construction and training are chained;
# the bare name on the last line keeps the fitted model as the cell's output.
model = MultinomialNB().fit(X_train, train_labels)
model

In [10]:
# Evaluate on validation set
val_predictions = model.predict(X_val)
print("Validation Set Performance:")
# zero_division=0 still reports 0.00 for a class that is never predicted, but
# does so without raising UndefinedMetricWarning.
print(classification_report(val_labels, val_predictions, zero_division=0))


Validation Set Performance:
              precision    recall  f1-score   support

       anger       0.98      0.33      0.49       275
        fear       0.95      0.25      0.40       212
         joy       0.65      0.98      0.78       704
        love       1.00      0.05      0.10       178
     sadness       0.66      0.94      0.77       550
    surprise       1.00      0.01      0.02        81

    accuracy                           0.68      2000
   macro avg       0.87      0.43      0.43      2000
weighted avg       0.77      0.68      0.61      2000



In [11]:
# Evaluate on test set
test_predictions = model.predict(X_test)
print("Test Set Performance:")
# The model never predicts 'surprise' on this split, so its precision is
# undefined. zero_division=0 reports it as 0.00 explicitly instead of emitting
# UndefinedMetricWarning for each affected metric.
print(classification_report(test_labels, test_predictions, zero_division=0))

Test Set Performance:
              precision    recall  f1-score   support

       anger       0.95      0.29      0.45       275
        fear       0.91      0.30      0.45       224
         joy       0.65      0.98      0.78       695
        love       1.00      0.05      0.10       159
     sadness       0.68      0.91      0.78       581
    surprise       0.00      0.00      0.00        66

    accuracy                           0.68      2000
   macro avg       0.70      0.42      0.43      2000
weighted avg       0.73      0.68      0.62      2000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [15]:
def predict_sentiment(text, model, vectorizer):
    """Return the label that ``model`` predicts for a single raw text.

    The text is cleaned with ``preprocess_text``, transformed by ``vectorizer``
    as a one-document batch, and passed to ``model.predict``; the first (and
    only) entry of the prediction is returned.

    Args:
        text: Raw input string to classify.
        model: Object exposing ``predict``.
        vectorizer: Object exposing ``transform``; per the original comment it
            should be the same vectorizer used for the training features.

    Returns:
        The first element of ``model.predict``'s result.
    """
    # Wrap the cleaned text in a list: transform is given a batch of documents.
    features = vectorizer.transform([preprocess_text(text)])
    return model.predict(features)[0]

# Classify one ad-hoc sentence and show the input next to the predicted label.
new_text = "i hate yellow color"
predicted_sentiment = predict_sentiment(new_text, model, vectorizer)
print(f"Text: '{new_text}'", f"Predicted sentiment: {predicted_sentiment}", sep="\n")

Text: 'i hate yellow color'
Predicted sentiment: sadness
