In [1]:
# Import necessary libraries
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import nltk

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# Sample dataset
data = [
    ("I love NLP", "Positive"),
    ("I HATE THIS TECHNOLOGY", "Negative"),
    ("It's okay, nothing special", "Neutral")
]

# Separate sentences and labels
sentences, labels = zip(*data)

# Initialize stopwords for English
stop_words = set(stopwords.words('english'))

# Preprocessing function
def preprocess(text):
    # Tokenize the text and convert it to lowercase
    tokens = word_tokenize(text.lower())
    # Remove stopwords and non-alphanumeric tokens
    filtered_tokens = [word for word in tokens if word.isalnum() and word not in stop_words]
    # Join filtered tokens back into a string
    return ' '.join(filtered_tokens)

# Preprocess all sentences
preprocessed_sentences = [preprocess(sentence) for sentence in sentences]

# Feature extraction using TF-IDF Vectorizer
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(preprocessed_sentences)
y = np.array(labels)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model training using Multinomial Naive Bayes
model = MultinomialNB()
model.fit(X_train, y_train)

# Model prediction
y_pred = model.predict(X_test)

# Evaluation
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Output the results
print(f"Accuracy: {accuracy * 100:.2f}%")
print("Classification Report:")
print(classification_rep)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Accuracy: 0.00%
Classification Report:
              precision    recall  f1-score   support

    Negative       0.00      0.00      0.00       0.0
    Positive       0.00      0.00      0.00       1.0

    accuracy                           0.00       1.0
   macro avg       0.00      0.00      0.00       1.0
weighted avg       0.00      0.00      0.00       1.0



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
