In [9]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import string

# Step 1: Data Loading and Initial Exploration
file_path = "C:/Users/shiva/Downloads/Naive Bayes and Text Mining/Naive Bayes and Text Mining/blogs_categories.csv"
df = pd.read_csv(file_path)

# Displaying the first few rows of the dataframe and checking its structure
print("First few rows of the dataset:")
print(df.head())
print("\nData structure:")
print(df.info())

# Step 2: Data Preprocessing
def preprocess_text(text):
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Convert text to lowercase
    text = text.lower()
    return text

# Apply text preprocessing
df['clean_text'] = df['Data'].apply(preprocess_text)

# Step 3: Feature Extraction (TF-IDF)
# Define custom stop words including English stop words and any additional ones specific to the task
custom_stopwords = list(ENGLISH_STOP_WORDS)
# Define TF-IDF vectorizer with custom preprocessed text and custom stop words
tfidf_vectorizer = TfidfVectorizer(stop_words=custom_stopwords)

# Fit and transform the preprocessed text data into TF-IDF features
tfidf_features = tfidf_vectorizer.fit_transform(df['clean_text'])

# Display the TF-IDF features
print("\nTF-IDF Features:")
print(tfidf_features)


First few rows of the dataset:
   Unnamed: 0                                               Data       Labels
0           0  Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:49...  alt.atheism
1           1  Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:51...  alt.atheism
2           2  Newsgroups: alt.atheism\nPath: cantaloupe.srv....  alt.atheism
3           3  Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:51...  alt.atheism
4           4  Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:51...  alt.atheism

Data structure:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19997 entries, 0 to 19996
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  19997 non-null  int64 
 1   Data        19997 non-null  object
 2   Labels      19997 non-null  object
dtypes: int64(1), object(2)
memory usage: 468.8+ KB
None

TF-IDF Features:
  (0, 210183)	0.015038450321698189
  (0, 66915)	0.03030618023425967
  (0, 136860)	0.01027370970551676

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# Step 1: Split the data into training and test sets
X = tfidf_features
y = df['Labels']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 2: Implement a Naive Bayes classifier
naive_bayes_classifier = MultinomialNB()

# Step 3: Train the model on the training set
naive_bayes_classifier.fit(X_train, y_train)

# Step 4: Make predictions on the test set
predictions = naive_bayes_classifier.predict(X_test)

# Step 5: Evaluate the model
print("Classification Report:")
print(classification_report(y_test, predictions))


Classification Report:
                          precision    recall  f1-score   support

             alt.atheism       0.73      0.79      0.76       173
           comp.graphics       0.87      0.91      0.89       179
 comp.os.ms-windows.misc       0.93      0.88      0.91       226
comp.sys.ibm.pc.hardware       0.85      0.85      0.85       204
   comp.sys.mac.hardware       0.88      0.96      0.92       205
          comp.windows.x       0.97      0.94      0.96       186
            misc.forsale       0.90      0.79      0.84       190
               rec.autos       0.92      0.95      0.93       203
         rec.motorcycles       1.00      0.97      0.98       218
      rec.sport.baseball       0.99      0.98      0.99       192
        rec.sport.hockey       0.97      0.99      0.98       203
               sci.crypt       0.91      0.98      0.95       200
         sci.electronics       0.94      0.89      0.91       227
                 sci.med       1.00      0.95      0

In [3]:
import pandas as pd

# Load the dataset
file_path = "C:/Users/shiva/Downloads/Naive Bayes and Text Mining/Naive Bayes and Text Mining/blogs_categories.csv"
df = pd.read_csv(file_path)

# Now you can proceed with the sentiment analysis code
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Step 1: Initialize the VADER sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

# Step 2: Define a function to perform sentiment analysis on each blog post
def analyze_sentiment(text):
    sentiment_score = analyzer.polarity_scores(text)
    compound_score = sentiment_score['compound']
    if compound_score >= 0.05:
        return 'Positive'
    elif compound_score <= -0.05:
        return 'Negative'
    else:
        return 'Neutral'

# Step 3: Apply sentiment analysis to the blog posts and create a new column for sentiment
df['Sentiment'] = df['Data'].apply(analyze_sentiment)

# Step 4: Examine the distribution of sentiments across different categories
sentiment_distribution = df.groupby(['Labels', 'Sentiment']).size().unstack(fill_value=0)
print("Sentiment Distribution across Different Categories:")
print(sentiment_distribution)




Sentiment Distribution across Different Categories:
Sentiment                 Negative  Neutral  Positive
Labels                                               
alt.atheism                    366        7       627
comp.graphics                  112       44       844
comp.os.ms-windows.misc        180       42       778
comp.sys.ibm.pc.hardware       191       17       792
comp.sys.mac.hardware          231       46       723
comp.windows.x                 216       41       743
misc.forsale                   129       65       806
rec.autos                      299       22       679
rec.motorcycles                285       18       697
rec.sport.baseball             212       36       752
rec.sport.hockey               248       12       740
sci.crypt                      304        6       690
sci.electronics                183       29       788
sci.med                        318       21       661
sci.space                      255       18       727
soc.religion.christian        

In [4]:

pip install vaderSentiment


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Load the dataset
df = pd.read_csv("C:/Users/shiva/Downloads/Naive Bayes and Text Mining/Naive Bayes and Text Mining/blogs_categories.csv")

# Step 1: Preprocess Text Data
# Assuming 'Data' column contains the text data
# Here, we'll use TF-IDF vectorization
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['Data'])
y = df['Labels']

# Step 2: Split Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Train Model
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)

# Step 4: Make Predictions
y_pred = nb_classifier.predict(X_test)

# Step 5: Evaluate Performance
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print("Naive Bayes Classifier Performance:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

# Step 6: Reflect on Sentiment Analysis
analyzer = SentimentIntensityAnalyzer()
sentiments = []

for text in df['Data']:
    sentiment = analyzer.polarity_scores(text)
    sentiments.append(sentiment)

# Step 7: Analyze Sentiment Distribution
# You can further analyze the sentiments list to examine the distribution across different categories and summarize your findings.


Naive Bayes Classifier Performance:
Accuracy: 0.89175
Precision: 0.8929008949579103
Recall: 0.89175
F1-score: 0.8911821637029316


In [None]:
To evaluate the performance of the Naive Bayes classifier and discuss the results, let's analyze the metrics and reflect on the sentiment analysis results:

### Evaluation of Naive Bayes Classifier:

1. **Accuracy:** It measures the overall correctness of the classifier's predictions.
2. **Precision:** It indicates the proportion of correctly predicted instances among all instances classified as positive by the model.
3. **Recall:** It represents the proportion of correctly predicted instances among all actual positive instances.
4. **F1-score:** It is the harmonic mean of precision and recall, providing a balance between the two metrics.

### Discussion of Model Performance:

After evaluating the classifier using these metrics, we can interpret the results as follows:

- **Accuracy:** The accuracy score gives an overall view of how well the model performs across all classes. A higher accuracy score indicates better performance.
- **Precision and Recall:** Precision and recall are important when dealing with imbalanced datasets or when certain classes are more critical than others. We should consider both precision and recall to understand the classifier's ability to correctly identify instances of each class.
- **Challenges:** Challenges encountered during the classification process might include handling imbalanced datasets, selecting appropriate features, and optimizing hyperparameters to improve model performance.

### Reflection on Sentiment Analysis Results:

- The sentiment analysis results provide insights into the emotional tone of the blog posts.
- By examining the sentiment distribution across different categories, we can identify trends or patterns in the sentiment expressed in the blog posts.
- Understanding the sentiment of blog posts can help in various applications such as understanding customer feedback, monitoring public opinion, or analyzing social media content.

In summary, evaluating the performance of the Naive Bayes classifier provides insights into its effectiveness in classifying blog posts. Reflecting on sentiment analysis results helps in understanding the emotional context of the content, which can be valuable for various applications.