# -------------- NLP & Naive Bayes -------------

# 1. Data Exploration and Preprocessing


In [33]:
import pandas as pd
import re
import nltk
import warnings
warnings.filterwarnings("ignore")
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from nltk.sentiment.vader import SentimentIntensityAnalyzer


In [34]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('vader_lexicon')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pbade\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\pbade\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\pbade\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [35]:
# Load the dataset
try:
    df = pd.read_csv("blogs.csv")
except FileNotFoundError:
    print("Error: 'blogs.csv' not found. Please ensure the file is in the correct directory.")
    # Create a dummy DataFrame for illustration if the file isn't found, 
    # but the user should ensure the correct file is used.
    data = {'Data': ["This is a great blog post about religion.", 
                     "A very negative article on politics and government.",
                     "Neutral news on science is often boring.",
                     "Technology advances are amazing and positive."],
            'Labels': ['alt.atheism', 'talk.politics.misc', 'sci.space', 'comp.graphics']}
    df = pd.DataFrame(data)


print("--- Data Exploration ---")
print("\nFirst 5 rows of the dataset:")
print(df.head())

--- Data Exploration ---

First 5 rows of the dataset:
                                                Data       Labels
0  Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...  alt.atheism
1  Newsgroups: alt.atheism\nPath: cantaloupe.srv....  alt.atheism
2  Path: cantaloupe.srv.cs.cmu.edu!das-news.harva...  alt.atheism
3  Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...  alt.atheism
4  Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:53...  alt.atheism


In [36]:
print("\nDataset Info:")
df.info()


Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Data    2000 non-null   object
 1   Labels  2000 non-null   object
dtypes: object(2)
memory usage: 31.4+ KB


In [37]:
print("\nCategory Distribution (Target Variable):")
print(df['Labels'].value_counts())


Category Distribution (Target Variable):
Labels
alt.atheism                 100
comp.graphics               100
talk.politics.misc          100
talk.politics.mideast       100
talk.politics.guns          100
soc.religion.christian      100
sci.space                   100
sci.med                     100
sci.electronics             100
sci.crypt                   100
rec.sport.hockey            100
rec.sport.baseball          100
rec.motorcycles             100
rec.autos                   100
misc.forsale                100
comp.windows.x              100
comp.sys.mac.hardware       100
comp.sys.ibm.pc.hardware    100
comp.os.ms-windows.misc     100
talk.religion.misc          100
Name: count, dtype: int64


In [38]:
# Check for missing values
print("\nMissing values per column:")
print(df.isnull().sum())


Missing values per column:
Data      0
Labels    0
dtype: int64


In [39]:
# Drop rows with missing values (if any, although based on info, it looks clean)
df.dropna(inplace=True)

In [40]:
# Initialize Lemmatizer and Stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """
    Cleans, tokenizes, removes stopwords, and lemmatizes the input text.
    """
    # 1. Convert to lowercase
    text = text.lower()
    
    # 2. Remove non-alphanumeric characters and punctuation (keeping only letters)
    text = re.sub(r'[^a-z\s]', '', text) 
    
    # 3. Tokenize (split into words)
    words = text.split()
    
    # 4. Remove stopwords and Lemmatize
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words and len(word) > 1]
    
    # 5. Join back into a single string
    return " ".join(words)

In [41]:
# Apply preprocessing to the 'Data' column
print("\nStarting text preprocessing (cleaning, tokenizing, stopword removal, lemmatization)...")
df['Cleaned_Data'] = df['Data'].apply(preprocess_text)
print("Preprocessing complete.")


Starting text preprocessing (cleaning, tokenizing, stopword removal, lemmatization)...
Preprocessing complete.


In [42]:
print("\nOriginal vs Cleaned Data Example:")
print("Original:", df['Data'].iloc[0][:100], "...")
print("Cleaned:", df['Cleaned_Data'].iloc[0][:100], "...")


Original vs Cleaned Data Example:
Original: Path: cantaloupe.srv.cs.cmu.edu!magnesium.club.cc.cmu.edu!news.sei.cmu.edu!cis.ohio-state.edu!zaphod ...
Cleaned: path cantaloupesrvcscmuedumagnesiumclubcccmuedunewsseicmueducisohiostateeduzaphodmpsohiostateeduhowl ...


In [43]:
# Perform Feature Extraction using TF-IDF
X = df['Cleaned_Data']
y = df['Labels']

In [44]:
# Initialize TF-IDF Vectorizer
# max_features is used to limit the number of features (words) to the most frequent ones
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

In [45]:
# Fit and transform the cleaned text data
X_tfidf = tfidf_vectorizer.fit_transform(X)


In [46]:
print("\nTF-IDF Feature Matrix Shape:", X_tfidf.shape)
print("--- End of Data Exploration and Preprocessing ---")

print("\n" + "="*50 + "\n")



TF-IDF Feature Matrix Shape: (2000, 5000)
--- End of Data Exploration and Preprocessing ---




# 2. Naive Bayes Model for Text Classification


In [49]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.25, random_state=42, stratify=y)

print("--- Naive Bayes Model Training ---")
print(f"Training set size: {X_train.shape[0]} samples")
print(f"Test set size: {X_test.shape[0]} samples")

--- Naive Bayes Model Training ---
Training set size: 1500 samples
Test set size: 500 samples


In [50]:
# Initialize the Multinomial Naive Bayes classifier
# MultinomialNB is suitable for classification with discrete features (like word counts or TF-IDF scores)
nb_classifier = MultinomialNB()


In [51]:
# Train the model
nb_classifier.fit(X_train, y_train)


In [52]:
# Make predictions on the test set
y_pred = nb_classifier.predict(X_test)


In [53]:
print("Model Training Complete.")
print("--- End of Naive Bayes Model Training ---")

print("\n" + "="*50 + "\n")


Model Training Complete.
--- End of Naive Bayes Model Training ---




# 3. Sentiment Analysis

In [54]:
# Initialize the VADER Sentiment Analyzer
# VADER (Valence Aware Dictionary and sEntiment Reasoner) is a lexicon and rule-based sentiment analysis tool 
# that is specifically attuned to sentiments expressed in social media.
sia = SentimentIntensityAnalyzer()

def get_sentiment(text):
    """
    Analyzes sentiment using VADER and returns 'Positive', 'Negative', or 'Neutral'.
    """
    # Use the original 'Data' column for sentiment analysis as cleaning 
    # and lemmatization might alter the natural flow/context/emojis needed by VADER
    vs = sia.polarity_scores(text)
    
    # Determine the overall sentiment based on the 'compound' score
    if vs['compound'] >= 0.05:
        return 'Positive'
    elif vs['compound'] <= -0.05:
        return 'Negative'
    else:
        return 'Neutral'

print("--- Sentiment Analysis ---")
# Apply sentiment analysis to the original 'Data' column
df['Sentiment'] = df['Data'].apply(get_sentiment)

print("\nSentiment analysis complete. Adding 'Sentiment' column to DataFrame.")

print("\nSentiment Distribution across all blog posts:")
print(df['Sentiment'].value_counts())


--- Sentiment Analysis ---

Sentiment analysis complete. Adding 'Sentiment' column to DataFrame.

Sentiment Distribution across all blog posts:
Sentiment
Positive    1334
Negative     631
Neutral       35
Name: count, dtype: int64


In [57]:
# Examine the distribution of sentiments across different categories
sentiment_category_distribution = df.groupby('Labels')['Sentiment'].value_counts(normalize=True).mul(100).unstack(fill_value=0)

print("\nSentiment Distribution (%) by Blog Category:")
print(sentiment_category_distribution.sort_values(by='Negative', ascending=False).round(2))

print("\nSummary of Sentiment Findings:")
# Quick summary based on the table (e.g., which category is most/least positive)
most_negative_category = sentiment_category_distribution['Negative'].idxmax()
most_positive_category = sentiment_category_distribution['Positive'].idxmax()

print(f"- The category with the highest percentage of **Negative** sentiment is: **{most_negative_category}**.")
print(f"- The category with the highest percentage of **Positive** sentiment is: **{most_positive_category}**.")
print("--- End of Sentiment Analysis ---")

print("\n" + "="*50 + "\n")


Sentiment Distribution (%) by Blog Category:
Sentiment                 Negative  Neutral  Positive
Labels                                               
talk.politics.mideast         69.0      0.0      31.0
talk.politics.guns            67.0      2.0      31.0
talk.politics.misc            50.0      0.0      50.0
alt.atheism                   42.0      1.0      57.0
sci.med                       38.0      1.0      61.0
talk.religion.misc            36.0      0.0      64.0
sci.space                     32.0      3.0      65.0
rec.motorcycles               30.0      2.0      68.0
sci.crypt                     29.0      0.0      71.0
soc.religion.christian        29.0      0.0      71.0
rec.sport.hockey              28.0      1.0      71.0
rec.autos                     27.0      1.0      72.0
rec.sport.baseball            27.0      1.0      72.0
comp.sys.mac.hardware         24.0      3.0      73.0
comp.os.ms-windows.misc       24.0      2.0      74.0
comp.sys.ibm.pc.hardware      21.0  

# 4. Evaluation

In [58]:
print("--- Naive Bayes Model Evaluation ---")

# 1. Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"\nAccuracy of Naive Bayes Classifier: {accuracy:.4f}")

# 2. Detailed Classification Report (Precision, Recall, F1-Score)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# 3. Confusion Matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

--- Naive Bayes Model Evaluation ---

Accuracy of Naive Bayes Classifier: 0.8620

Classification Report:
                          precision    recall  f1-score   support

             alt.atheism       0.82      0.72      0.77        25
           comp.graphics       0.88      0.88      0.88        25
 comp.os.ms-windows.misc       0.87      0.80      0.83        25
comp.sys.ibm.pc.hardware       0.59      0.76      0.67        25
   comp.sys.mac.hardware       0.91      0.80      0.85        25
          comp.windows.x       0.88      0.84      0.86        25
            misc.forsale       0.89      0.96      0.92        25
               rec.autos       0.92      0.92      0.92        25
         rec.motorcycles       0.96      0.92      0.94        25
      rec.sport.baseball       1.00      1.00      1.00        25
        rec.sport.hockey       1.00      1.00      1.00        25
               sci.crypt       0.93      1.00      0.96        25
         sci.electronics       0.87 

# Discussion
--- Discussion and Reflection ---

- **Model Performance Discussion:**
The model achieved an accuracy of {accuracy:.4f}. The performance metrics (Precision, Recall, F1-Score) from the classification report show how well the model performed for each category.
The **Precision** measures the proportion of positive identifications that were actually correct. The **Recall** measures the proportion of actual positives that were correctly identified. The **F1-Score** is the harmonic mean of Precision and Recall.
Typically, classification performance in text datasets like this can vary significantly by category, especially if some categories have much less data or similar-sounding content to others.

- **Challenges Encountered:**

 
**Text Preprocessing:** Choosing the right combination of cleaning steps (e.g., removing email headers, specific non-text data) is crucial. The **lemmatization** step was essential but time-consuming.

  
**Feature Sparsity:** TF-IDF creates a very high-dimensional, sparse feature matrix. The `max_features` parameter was used to manage this, but a different value might yield better results.

**Class Imbalance:** If the categories are not perfectly balanced (checked in the exploration phase), the model might perform poorly on minority classes. Stratified splitting helped mitigate this.

- **Sentiment Analysis Reflection:**
The sentiment analysis, performed using VADER, provides insight into the emotional tone of the blog posts. The analysis revealed varying sentiment distributions across categories. For example, 'talk.politics.misc' or 'alt.atheism' might be expected to have higher negative sentiment due to controversial topics, while categories like 'comp.graphics' or 'sci.space' might lean more neutral or positive.
This sentiment data can be used to better understand the nature of discussion within each category, complementing the topic classification.
