# Email Sentiment Analysis using VADER and Naive Bayes

## Import Libraries

In [1]:
import nltk
import re
import csv
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.classify import NaiveBayesClassifier

# Fetch the NLTK data this notebook relies on. nltk.download() reports
# "already up-to-date" and skips the transfer for packages that are present,
# so re-running this cell is cheap.
# 'punkt_tab' is requested in addition to 'punkt' because newer NLTK releases
# load the tokenizer models used by word_tokenize from that package. On older
# releases that do not know the id, the call reports an error and returns
# False (default raise_on_error=False) instead of raising.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'vader_lexicon'):
    nltk.download(nltk_resource)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dgoya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dgoya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\dgoya\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

## Load dataset file

In [2]:
# Load the email texts from the CSV file: the first column of every row is
# taken as one message.
# NOTE(review): if emails.csv starts with a header row, that header is loaded
# as a message too - confirm against the file.
data_path = 'emails.csv'
messages = []
# newline='' is what the csv module asks for, so that line breaks embedded in
# quoted fields are handed to the reader untouched.
# NOTE(review): the file is still decoded with the platform default encoding;
# pass encoding=... here once the encoding of emails.csv is confirmed.
with open(data_path, 'r', newline='') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        # csv.reader yields an empty list for a blank line; row[0] would raise
        # IndexError there, so such rows are skipped.
        if row:
            messages.append(row[0])

## Cleaning the dataset

In [3]:
# Cache for the English stop-word set. It is filled on the first clean_text()
# call so the stop-word corpus is read once instead of once per message.
_ENGLISH_STOP_WORDS = None


def clean_text(text):
    """Normalise a raw message into a space-separated string of kept tokens.

    Steps, in order: strip ``http...`` links, drop words of one to three
    characters, strip punctuation, lowercase, tokenize with ``word_tokenize``
    and discard NLTK English stop words.

    Args:
        text: The raw message text (a ``str``).

    Returns:
        The surviving tokens joined by single spaces.
    """
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        # Loop-invariant: build the set once and reuse it for every message.
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))

    text = re.sub(r'http\S+', '', text)  # remove hyperlinks
    # NOTE(review): this drops every 1-3 character word, which includes short
    # sentiment cues such as "not", "bad" or "sad", before the text is scored.
    text = re.sub(r'\b\w{1,3}\b', '', text)  # remove short words
    text = re.sub(r'[^\w\s]', '', text)  # remove punctuation
    text = text.lower()  # convert to lowercase
    tokens = word_tokenize(text)  # tokenize text
    filtered_tokens = [token for token in tokens if token not in _ENGLISH_STOP_WORDS]
    return ' '.join(filtered_tokens)  # join tokens back into string

## Initialize VADER sentiment analyzer

In [4]:
# Shared VADER analyzer; its polarity_scores() output supplies both the
# features and the labels in the cells below.
analyzer = SentimentIntensityAnalyzer()

In [5]:
# Build the (features, label) pairs that NaiveBayesClassifier.train consumes.
# Each message is cleaned, scored with VADER, and labelled from the compound
# score: >= 0.05 is positive, <= -0.05 is negative, anything between is neutral.
featuresets = []
for message in messages:
    cleaned_message = clean_text(message)
    scores = analyzer.polarity_scores(cleaned_message)
    compound = scores['compound']

    if compound >= 0.05:
        label = 'positive'
    elif compound <= -0.05:
        label = 'negative'
    else:
        label = 'neutral'

    features = {
        'positive_score': scores['pos'],
        'negative_score': scores['neg'],
        'neutral_score': scores['neu'],
        'compound_score': compound,
    }
    featuresets.append((features, label))

## Train Naive Bayes Classifier

In [6]:
# Fit the Naive Bayes model on every (features, label) pair in featuresets.
classifier = NaiveBayesClassifier.train(featuresets)

# Run one unseen message through the same clean -> VADER -> features steps
# and ask the classifier for its label.
new_message = "Hi John, just wanted to touch base with you about the project we discussed last week. i also have surprised for you too"
cleaned_new_message = clean_text(new_message)
scores = analyzer.polarity_scores(cleaned_new_message)
features = dict(
    positive_score=scores['pos'],
    negative_score=scores['neg'],
    neutral_score=scores['neu'],
    compound_score=scores['compound'],
)
classification = classifier.classify(features)

## Print sentiment scores and classification

In [7]:
# Report the four VADER scores of the test message, then the classifier's label.
for caption, score_key in (
    ("Positive score:", 'pos'),
    ("Negative score:", 'neg'),
    ("Neutral score:", 'neu'),
    ("Compound score:", 'compound'),
):
    print(caption, scores[score_key])
print("Naive Bayes classification:", classification)

Positive score: 0.174
Negative score: 0.0
Neutral score: 0.826
Compound score: 0.2263
Naive Bayes classification: positive


In [2]:
import nltk
import re
import csv
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.classify import NaiveBayesClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.model_selection import train_test_split

# Fetch the NLTK data this cell relies on. nltk.download() reports
# "already up-to-date" and skips the transfer for packages that are present,
# so re-running is cheap.
# 'punkt_tab' is requested in addition to 'punkt' because newer NLTK releases
# load the tokenizer models used by word_tokenize from that package. On older
# releases that do not know the id, the call reports an error and returns
# False (default raise_on_error=False) instead of raising.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'vader_lexicon'):
    nltk.download(nltk_resource)

# Load the email texts from the CSV file: the first column of every row is
# taken as one message.
# NOTE(review): if emails.csv starts with a header row, that header is loaded
# as a message too - confirm against the file.
data_path = 'emails.csv'
messages = []
# newline='' is what the csv module asks for, so that line breaks embedded in
# quoted fields are handed to the reader untouched.
# NOTE(review): the file is still decoded with the platform default encoding;
# pass encoding=... here once the encoding of emails.csv is confirmed.
with open(data_path, 'r', newline='') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        # csv.reader yields an empty list for a blank line; row[0] would raise
        # IndexError there, so such rows are skipped.
        if row:
            messages.append(row[0])

# Cache for the English stop-word set. It is filled on the first clean_text()
# call so the stop-word corpus is read once instead of once per message.
_ENGLISH_STOP_WORDS = None


def clean_text(text):
    """Normalise a raw message into a space-separated string of kept tokens.

    Steps, in order: strip ``http...`` links, drop words of one to three
    characters, strip punctuation, lowercase, tokenize with ``word_tokenize``
    and discard NLTK English stop words.

    Args:
        text: The raw message text (a ``str``).

    Returns:
        The surviving tokens joined by single spaces.
    """
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        # Loop-invariant: build the set once and reuse it for every message.
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))

    text = re.sub(r'http\S+', '', text)  # remove hyperlinks
    # NOTE(review): this drops every 1-3 character word, which includes short
    # sentiment cues such as "not", "bad" or "sad", before the text is scored.
    text = re.sub(r'\b\w{1,3}\b', '', text)  # remove short words
    text = re.sub(r'[^\w\s]', '', text)  # remove punctuation
    text = text.lower()  # convert to lowercase
    tokens = word_tokenize(text)  # tokenize text
    filtered_tokens = [token for token in tokens if token not in _ENGLISH_STOP_WORDS]
    return ' '.join(filtered_tokens)  # join tokens back into string

# One VADER analyzer is reused for every message.
analyzer = SentimentIntensityAnalyzer()

# Build two parallel lists: featuresets[i] holds the four VADER scores of
# message i and labels[i] the sentiment class derived from its compound score
# (>= 0.05 positive, <= -0.05 negative, otherwise neutral).
featuresets = []
labels = []
for message in messages:
    cleaned_message = clean_text(message)
    scores = analyzer.polarity_scores(cleaned_message)
    compound = scores['compound']

    features = dict(
        positive_score=scores['pos'],
        negative_score=scores['neg'],
        neutral_score=scores['neu'],
        compound_score=compound,
    )
    featuresets.append(features)

    labels.append(
        'positive' if compound >= 0.05
        else 'negative' if compound <= -0.05
        else 'neutral'
    )

# Hold out 20% of the messages for evaluation; the fixed random_state keeps
# the split the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    featuresets,
    labels,
    test_size=0.2,
    random_state=42,
)

# The NLTK trainer takes (features, label) pairs, so re-pair the training split.
training_pairs = list(zip(X_train, y_train))
classifier = NaiveBayesClassifier.train(training_pairs)

# Predict a label for every held-out feature dict.
y_pred = classifier.classify_many(X_test)

# Report the evaluation metrics.
# NOTE(review): each label was derived from the same compound score that is
# passed in as a feature, so these numbers measure agreement with the VADER
# thresholds rather than with independently annotated sentiment.
print('Confusion Matrix:\n', confusion_matrix(y_test, y_pred))
print('Classification Report:\n', classification_report(y_test, y_pred))
print('Accuracy:', accuracy_score(y_test, y_pred))


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dgoya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dgoya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\dgoya\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Confusion Matrix:
 [[162   0   2]
 [  3 436   0]
 [  9   2 501]]
Classification Report:
               precision    recall  f1-score   support

    negative       0.93      0.99      0.96       164
     neutral       1.00      0.99      0.99       439
    positive       1.00      0.98      0.99       512

    accuracy                           0.99      1115
   macro avg       0.97      0.99      0.98      1115
weighted avg       0.99      0.99      0.99      1115

Accuracy: 0.9856502242152466
