In [1]:
# Ensure necessary libraries are installed
!pip install numpy pandas scikit-learn joblib nltk



In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib
from nltk.tokenize import word_tokenize
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [3]:
# Download NLTK data
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\salsa\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\salsa\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [4]:
# Function to preprocess text
def to_lowercase(text):
    return text.lower()

def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

def tokenize(text):
    return word_tokenize(text)

def remove_stopwords(words):
    stop_words = set(stopwords.words('english'))
    return [word for word in words if word not in stop_words]

def stem_words(words):
    ps = PorterStemmer()
    return [ps.stem(word) for word in words]

In [5]:
# Load dataset for emotion detection
train_path = 'train.txt'
val_path = 'val.txt'
test_path = 'test.txt'

train_data = pd.read_csv(train_path, delimiter='\t', header=None, names=['Text', 'Label'])
val_data = pd.read_csv(val_path, delimiter='\t', header=None, names=['Text', 'Label'])
test_data = pd.read_csv(test_path, delimiter='\t', header=None, names=['Text', 'Label'])

In [6]:
# Handle missing values
train_data.dropna(inplace=True)
val_data.dropna(inplace=True)
test_data.dropna(inplace=True)

In [7]:
# Ensure no empty documents
train_data = train_data[train_data['Text'].str.strip() != '']
val_data = val_data[val_data['Text'].str.strip() != '']
test_data = test_data[test_data['Text'].str.strip() != '']

In [8]:
# Combine training and validation data
X_train = pd.concat([train_data['Text'], val_data['Text']])
y_train = pd.concat([train_data['Label'], val_data['Label']])
X_test = test_data['Text']
y_test = test_data['Label']

In [11]:
# Preprocess text data
def preprocess(text_series):
    processed_texts = []
    for text in text_series:
        text = to_lowercase(text)
        text = remove_punctuation(text)
        words = tokenize(text)
        words = remove_stopwords(words)
        words = stem_words(words)
        processed_text = ' '.join(words)
        if processed_text.strip() == '':
            processed_text = 'emptydoc'  # placeholder for empty documents
        processed_texts.append(processed_text)
    return processed_texts

X_train = preprocess(X_train)
X_test = preprocess(X_test)

In [13]:
# Log to check if preprocessing results in empty documents
print("Sample processed training texts:")
print(X_train[:5])
print("Sample processed test texts:")
print(X_test[:5])

Sample processed training texts:
[]
Sample processed test texts:
[]


In [16]:
# Ensure there are no empty documents after preprocessing
if '' in X_train or '' in X_test:
    raise ValueError("Empty documents found after preprocessing")

In [30]:
# Convert text data to numerical data using CountVectorizer
vectorizer = CountVectorizer(stop_words=None, min_df=1)
X_test_counts = vectorizer.transform(X_test)
X_train_counts = vectorizer.fit_transform(X_train)

NotFittedError: Vocabulary not fitted or provided

In [25]:
# Train Naive Bayes model
model = MultinomialNB()
model.fit(X_train_counts, y_train)

In [26]:
# Predict on test data
y_pred = model.predict(X_test_counts)

NameError: name 'X_test_counts' is not defined

In [None]:
# Evaluate model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)


In [None]:
# Save the trained model and vectorizer
joblib.dump(model, 'naive_bayes_model.pkl')
joblib.dump(vectorizer, 'count_vectorizer.pkl')

In [None]:
# Print evaluation results
print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)
print('Confusion Matrix:')
print(conf_matrix)