**Sentiment Annalysis Using Complement Naive Bayes**

---



In [16]:
import string
import pandas as pd
from sklearn.metrics import accuracy_score
import numpy as np
from collections import defaultdict
from sklearn.metrics import accuracy_score
import re
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix
# Function to tokenize and clean the text
def tokenize(text):
    text = text.lower() # Convert to lowercase
    text = re.sub(r'\W+', ' ', text) # Remove punctuation
    tokens = text.split() # Split into tokens
    return tokens

# Function to count word frequencies
def count_words(data):
    word_counts = defaultdict(lambda: {'neutral': 0, 'positive': 0, 'negative': 0})
    for text, sentiment in data:
        tokens = tokenize(text)
        for token in tokens:
            word_counts[token][sentiment] += 1
    return word_counts

# Function to calculate word probabilities
def word_probabilities(word_counts, total_neutral, total_pos, total_neg, smoothing=1):
    probabilities = defaultdict(dict)
    for word in word_counts:
        probabilities[word]['neutral'] = \
          (total_neutral - word_counts[word]['neutral'] + smoothing) / (total_neutral + smoothing)
        probabilities[word]['positive'] = \
          (total_pos - word_counts[word]['positive'] + smoothing) / (total_pos + smoothing)
        probabilities[word]['negative'] = \
          (total_neg - word_counts[word]['negative'] + smoothing) / (total_neg + smoothing)
    return probabilities

# Function to classify a new text
def classify(text, word_probs, prior_neutral, prior_pos, prior_neg):
    text_tokens = tokenize(text)
    neutral_prob = prior_neutral
    pos_prob = prior_pos
    neg_prob = prior_neg
    for token in text_tokens:
        if token in word_probs:
            neutral_prob *= word_probs[token]['neutral']
            pos_prob *= word_probs[token]['positive']
            neg_prob *= word_probs[token]['negative']
    probabilities = {'neutral': neutral_prob, 'positive': pos_prob, 'negative': neg_prob}
    return max(probabilities, key=probabilities.get)

# Read the data
train_df = pd.read_csv('/content/test.csv', encoding='unicode_escape')

# Preprocess the data
train_data = train_df[['text', 'sentiment']].dropna().values.tolist()

# Count total samples for each sentiment
total_neutral = sum(1 for _, sentiment in train_data if sentiment == 'neutral')
total_pos = sum(1 for _, sentiment in train_data if sentiment == 'positive')
total_neg = sum(1 for _, sentiment in train_data if sentiment == 'negative')

# Train the classifier
word_counts = count_words(train_data)
word_probs = word_probabilities(word_counts, total_neutral, total_pos, total_neg)
prior_neutral = total_neutral / len(train_data)
prior_pos = total_pos / len(train_data)
prior_neg = total_neg / len(train_data)

# Test the classifier
test_df = pd.read_csv('/content/test.csv', encoding='unicode_escape')
test_data = test_df[['text', 'sentiment']].dropna().values.tolist()

predictions = [classify(text, word_probs, prior_neutral, prior_pos, prior_neg) for text, _ in test_data]
true_labels = [sentiment for _, sentiment in test_data]



# Compute confusion matrix
conf_matrix = confusion_matrix(true_labels, predictions, labels=['neutral', 'positive', 'negative'])

# Print confusion matrix
print("Confusion Matrix:")
print(conf_matrix)

# Calculate accuracy
accuracy = accuracy_score(true_labels, predictions)
print("Accuracy:", accuracy)





Confusion Matrix:
[[1412   18    0]
 [1103    0    0]
 [ 989   12    0]]
Accuracy: 0.39954725523486134
