In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
# Download NLTK resources
# 'punkt' / 'punkt_tab' back nltk.word_tokenize, 'stopwords' backs the English
# stopword list, and 'vader_lexicon' backs SentimentIntensityAnalyzer.
nltk.download('punkt')
# Recent NLTK releases load the tokenizer model from 'punkt_tab' instead of the
# pickled 'punkt' package; without it nltk.word_tokenize raises LookupError.
# Both are fetched so the notebook runs on older and newer NLTK versions.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('vader_lexicon')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [None]:
# Load dataset
# Colab-specific helper used to expose Google Drive to this runtime.
from google.colab import drive
# Mount Google Drive at /content/drive/ (must happen before any read from it).
drive.mount('/content/drive/')
# NOTE(review): the CSV is read via a relative path, i.e. from the current
# working directory rather than from the mount point above — confirm where
# 'blogs_categories.csv' actually lives.
df=pd.read_csv('blogs_categories.csv')
# Bare last expression: the notebook renders the DataFrame as the cell output.
df

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


Unnamed: 0.1,Unnamed: 0,Data,Labels
0,0,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:49...,alt.atheism
1,1,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:51...,alt.atheism
2,2,Newsgroups: alt.atheism\nPath: cantaloupe.srv....,alt.atheism
3,3,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:51...,alt.atheism
4,4,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:51...,alt.atheism
...,...,...,...
19992,19992,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:54...,talk.religion.misc
19993,19993,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:54...,talk.religion.misc
19994,19994,Xref: cantaloupe.srv.cs.cmu.edu talk.religion....,talk.religion.misc
19995,19995,Xref: cantaloupe.srv.cs.cmu.edu talk.religion....,talk.religion.misc


In [None]:
# Exploratory Data Analysis
print("Dataset Shape:", df.shape)
print("Categories:", df['Labels'].unique())
print("\nSample Data:")
print(df.head())

Dataset Shape: (19997, 3)
Categories: ['alt.atheism' 'comp.graphics' 'comp.os.ms-windows.misc'
 'comp.sys.ibm.pc.hardware' 'comp.sys.mac.hardware' 'comp.windows.x'
 'misc.forsale' 'rec.autos' 'rec.motorcycles' 'rec.sport.baseball'
 'rec.sport.hockey' 'sci.crypt' 'sci.electronics' 'sci.med' 'sci.space'
 'soc.religion.christian' 'talk.politics.guns' 'talk.politics.mideast'
 'talk.politics.misc' 'talk.religion.misc']

Sample Data:
   Unnamed: 0                                               Data       Labels
0           0  Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:49...  alt.atheism
1           1  Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:51...  alt.atheism
2           2  Newsgroups: alt.atheism\nPath: cantaloupe.srv....  alt.atheism
3           3  Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:51...  alt.atheism
4           4  Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:51...  alt.atheism


In [None]:
# Preprocessing
def preprocess_text(text):
    """Normalise one raw document before vectorisation.

    The text is lowercased and every character that is neither a word
    character nor whitespace is removed. Underscores are kept because the
    regex class ``\\w`` matches them.

    Parameters
    ----------
    text : str or None or float
        Raw document. ``None`` and float NaN (how pandas represents a
        missing cell) are treated as an empty document. Any other
        non-string value is converted with ``str()`` first.

    Returns
    -------
    str
        The cleaned text; ``''`` for missing input.
    """
    # Missing cells have no .lower(); return an empty document instead of
    # raising AttributeError halfway through a DataFrame.apply().
    # `text != text` is only true for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)
    # Convert text to lowercase
    text = text.lower()
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    return text

In [None]:
# Tokenization and removing stopwords
# English stopwords, held in a set so each membership check is a hash lookup.
stop_words = set(stopwords.words('english'))

def tokenize_and_remove_stopwords(text):
    """Split ``text`` into tokens and drop those found in ``stop_words``.

    Tokenisation is delegated to ``nltk.word_tokenize``. Comparison against
    the stopword set is exact (case-sensitive), so callers should lowercase
    the text first if they want case-insensitive filtering.

    Returns a list of the remaining tokens in their original order.
    """
    kept = []
    for token in nltk.word_tokenize(text):
        if token in stop_words:
            continue
        kept.append(token)
    return kept

In [None]:
# Applying preprocessing and tokenization
# Cleaned text: one lowercased, punctuation-free string per document.
df['Processed_Text'] = df['Data'].apply(preprocess_text)
# Token lists derived from the cleaned text, with stopwords filtered out.
df['Tokenized_Text'] = df['Processed_Text'].apply(tokenize_and_remove_stopwords)

# Show the first rows, now including the two derived columns.
print("\nProcessed Data:", df.head(), sep="\n")


Processed Data:
   Unnamed: 0                                               Data       Labels  \
0           0  Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:49...  alt.atheism   
1           1  Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:51...  alt.atheism   
2           2  Newsgroups: alt.atheism\nPath: cantaloupe.srv....  alt.atheism   
3           3  Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:51...  alt.atheism   
4           4  Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:51...  alt.atheism   

                                      Processed_Text  \
0  xref cantaloupesrvcscmuedu altatheism49960 alt...   
1  xref cantaloupesrvcscmuedu altatheism51060 alt...   
2  newsgroups altatheism\npath cantaloupesrvcscmu...   
3  xref cantaloupesrvcscmuedu altatheism51120 alt...   
4  xref cantaloupesrvcscmuedu altatheism51121 soc...   

                                      Tokenized_Text  
0  [xref, cantaloupesrvcscmuedu, altatheism49960,...  
1  [xref, cantaloupesrvcscmuedu, altatheism51060,

In [None]:
# Feature Extraction using TF-IDF
# Target labels for the classifier.
y = df['Labels']

# Learn vocabulary and IDF weights from the cleaned text and vectorise it in
# one pass; X has one row per document and one column per term.
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df['Processed_Text'])

print("\nTF-IDF Features:", X.shape, sep="\n")



TF-IDF Features:
(19997, 260706)


# After loading the dataset, we perform exploratory data analysis (EDA) to understand its structure and content. We then preprocess the data by cleaning the text, tokenizing, and removing stopwords, and perform feature extraction using TF-IDF for text classification with Naive Bayes.


# We perform feature extraction using TF-IDF (Term Frequency-Inverse Document Frequency) to convert the preprocessed text data into numerical features that can be used by the Naive Bayes model. The code prints the shape of the TF-IDF matrix, indicating the number of documents (rows) and the number of unique terms (columns).

# This completes the data exploration, preprocessing, and feature extraction steps

In [None]:
# Splitting data into training and test sets
# Split the cleaned *text* rather than a TF-IDF matrix fitted on every document.
# Fitting the vectorizer on the whole corpus lets test documents influence the
# vocabulary and IDF weights (data leakage) and inflates the reported scores.
# test_size and random_state are unchanged from the original split.
text_train, text_test, y_train, y_test = train_test_split(
    df['Processed_Text'], y, test_size=0.2, random_state=42
)

# Fit TF-IDF on the training documents only, then apply that fitted vocabulary
# to the held-out documents. `tfidf` now refers to the train-fitted vectorizer,
# which is the one to use for transforming any new text for this model.
tfidf = TfidfVectorizer()
X_train = tfidf.fit_transform(text_train)
X_test = tfidf.transform(text_test)

In [None]:
# Naive Bayes Model
# Multinomial Naive Bayes with default hyperparameters.
naive_bayes_model = MultinomialNB()
# Train on the training split; as the cell's last expression, the fit() return
# value is also what the notebook displays.
naive_bayes_model.fit(X_train, y_train)

In [None]:
# Predictions
# Predicted label for each held-out document; compared against y_test below.
y_pred = naive_bayes_model.predict(X_test)

In [None]:
# Model Evaluation
# Overall fraction of test documents whose predicted label matches the truth.
print("\nAccuracy:", accuracy_score(y_test, y_pred))
# Per-class precision / recall / F1 table, printed under its heading.
print(
    "\nClassification Report:",
    classification_report(y_test, y_pred),
    sep="\n",
)


Accuracy: 0.88075

Classification Report:
                          precision    recall  f1-score   support

             alt.atheism       0.66      0.80      0.73       173
           comp.graphics       0.87      0.89      0.88       179
 comp.os.ms-windows.misc       0.94      0.84      0.88       226
comp.sys.ibm.pc.hardware       0.82      0.85      0.84       204
   comp.sys.mac.hardware       0.89      0.94      0.92       205
          comp.windows.x       0.98      0.92      0.95       186
            misc.forsale       0.92      0.70      0.80       190
               rec.autos       0.89      0.94      0.91       203
         rec.motorcycles       1.00      0.94      0.97       218
      rec.sport.baseball       0.99      0.98      0.99       192
        rec.sport.hockey       0.98      0.99      0.98       203
               sci.crypt       0.83      0.98      0.90       200
         sci.electronics       0.95      0.86      0.91       227
                 sci.med       1

#We have split the TF-IDF features (X) and labels (y) into training and test sets using the train_test_split function from scikit-learn.

#We have initialized a Multinomial Naive Bayes classifier and trained it on the training set using the fit method.

#We have made predictions on the test set using the predict method.

#Finally, we have evaluated the model's performance by calculating accuracy and generating a classification report.

In [None]:
# Initialize the VADER sentiment analyzer
# Single shared analyzer instance; get_sentiment() reads it as a global.
sid = SentimentIntensityAnalyzer()

In [None]:
# Function to get sentiment for each blog
def get_sentiment(text):
    """Label ``text`` as 'positive', 'negative' or 'neutral'.

    The decision uses the 'compound' entry of ``sid.polarity_scores(text)``:
    at or above 0.05 is positive, at or below -0.05 is negative, and
    anything in between is neutral.
    """
    compound = sid.polarity_scores(text)['compound']
    if compound >= 0.05:
        return 'positive'
    if compound <= -0.05:
        return 'negative'
    return 'neutral'

In [None]:
# Apply sentiment analysis to each blog post
# Score the raw 'Data' column (not 'Processed_Text') and store one label per row.
# NOTE(review): the sample rows shown earlier start with header lines such as
# 'Xref:' / 'Newsgroups:' / 'Path:', so that header text is scored along with
# the message body — confirm this is intended.
df['Sentiment'] = df['Data'].apply(get_sentiment)

In [None]:
# Display the distribution of sentiments across different categories
# Count posts per (category, sentiment) pair, then pivot the sentiment level
# into columns; pairs that never occur are filled with 0.
sentiment_distribution = (
    df.groupby(['Labels', 'Sentiment'])
      .size()
      .unstack(fill_value=0)
)
print(
    "\nSentiment Distribution Across Categories:",
    sentiment_distribution,
    sep="\n",
)


Sentiment Distribution Across Categories:
Sentiment                 negative  neutral  positive
Labels                                               
alt.atheism                    379        8       613
comp.graphics                  123       47       830
comp.os.ms-windows.misc        204       45       751
comp.sys.ibm.pc.hardware       218       19       763
comp.sys.mac.hardware          243       47       710
comp.windows.x                 226       40       734
misc.forsale                   126       70       804
rec.autos                      304       28       668
rec.motorcycles                311       20       669
rec.sport.baseball             228       39       733
rec.sport.hockey               259       16       725
sci.crypt                      300        7       693
sci.electronics                178       36       786
sci.med                        346       22       632
sci.space                      271       21       708
soc.religion.christian         260     

In [None]:
# Summary of findings
# Per-category breakdown: total posts plus count and share of each sentiment.
print("\nSummary of Findings:")
for category in df['Labels'].unique():
    # Row of sentiment counts for this category.
    category_sentiments = sentiment_distribution.loc[category]
    total_posts = category_sentiments.sum()
    print(f"\nCategory: {category}")
    print(f"Total Posts: {total_posts}")
    for sentiment in ('positive', 'negative', 'neutral'):
        # A sentiment that never occurs anywhere in the data has no column
        # after unstack(); .get() reports it as 0 instead of raising KeyError.
        count = category_sentiments.get(sentiment, 0)
        print(f"{sentiment.capitalize()} Posts: {count} ({(count / total_posts) * 100:.2f}%)")


Summary of Findings:

Category: alt.atheism
Total Posts: 1000
Positive Posts: 613 (61.30%)
Negative Posts: 379 (37.90%)
Neutral Posts: 8 (0.80%)

Category: comp.graphics
Total Posts: 1000
Positive Posts: 830 (83.00%)
Negative Posts: 123 (12.30%)
Neutral Posts: 47 (4.70%)

Category: comp.os.ms-windows.misc
Total Posts: 1000
Positive Posts: 751 (75.10%)
Negative Posts: 204 (20.40%)
Neutral Posts: 45 (4.50%)

Category: comp.sys.ibm.pc.hardware
Total Posts: 1000
Positive Posts: 763 (76.30%)
Negative Posts: 218 (21.80%)
Neutral Posts: 19 (1.90%)

Category: comp.sys.mac.hardware
Total Posts: 1000
Positive Posts: 710 (71.00%)
Negative Posts: 243 (24.30%)
Neutral Posts: 47 (4.70%)

Category: comp.windows.x
Total Posts: 1000
Positive Posts: 734 (73.40%)
Negative Posts: 226 (22.60%)
Neutral Posts: 40 (4.00%)

Category: misc.forsale
Total Posts: 1000
Positive Posts: 804 (80.40%)
Negative Posts: 126 (12.60%)
Neutral Posts: 70 (7.00%)

Category: rec.autos
Total Posts: 1000
Positive Posts: 668 (66.

# We have initialized the VADER sentiment analyzer.

#We have defined a function to get the sentiment (positive, negative, or neutral) for each blog post based on its compound polarity score.

#We have applied sentiment analysis to each blog post and added the sentiment as a new column in the DataFrame.

#We have displayed the distribution of sentiments across different categories by grouping and counting the occurrences of each sentiment within each category.

#We have summarized the findings by calculating the percentage of positive, negative, and neutral posts within each category.

#This approach allows us to analyze the sentiments expressed in the blog posts and examine how they vary across different categories.

In [None]:
# Calculate evaluation metrics
# Overall accuracy on the held-out split.
accuracy = accuracy_score(y_test, y_pred)
# Precision, recall and F1 are each computed with average='weighted', so they
# share one call pattern.
precision, recall, f1 = (
    metric(y_test, y_pred, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)

In [None]:
# Display evaluation metrics
# One line per metric, each rounded to four decimal places.
print(
    "\nEvaluation Metrics:",
    f"Accuracy: {accuracy:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall: {recall:.4f}",
    f"F1-score: {f1:.4f}",
    sep="\n",
)


Evaluation Metrics:
Accuracy: 0.8808
Precision: 0.8843
Recall: 0.8808
F1-score: 0.8799


# Performance of the Model:
**Accuracy:** The accuracy of the model tells us how often the classifier is correct. It gives a general idea of the model's performance.

**Precision:** Precision measures the ratio of true positive predictions to the total number of positive predictions. It indicates how precise the classifier is when it predicts a positive label.

**Recall:** Recall measures the ratio of true positive predictions to the total number of actual positive instances. It indicates the ability of the classifier to find all positive instances.

**F1-score:** The F1-score is the harmonic mean of precision and recall. It provides a balance between precision and recall.


# Challenges Encountered:

**Imbalanced Data:** If the distribution of categories is highly imbalanced, the model may become biased towards the majority class. Techniques such as oversampling, undersampling, or using class weights can help address this issue.

**Ambiguity in Text:** Some blog posts may contain ambiguous or sarcastic language, making it challenging for the model to accurately classify them. Preprocessing techniques and more advanced models may help mitigate this challenge.

**Choosing Optimal Parameters:** Selecting the optimal parameters for the Naive Bayes classifier and the TF-IDF vectorizer can significantly impact the model's performance. This process may require experimentation and tuning.

# Reflection on Sentiment Analysis Results:
The sentiment analysis results provide insights into the overall sentiment expressed in the blog posts across different categories.

By examining the distribution of sentiments within each category, we can identify trends and patterns in the sentiment of the blog posts.

Understanding the sentiment of the blog posts can help in identifying trends, detecting sentiment shifts over time, and gauging audience reactions to different topics or categories.



Overall, the Naive Bayes classifier's performance and the sentiment analysis results can provide valuable insights into the content and sentiment of the blog posts, enabling better understanding and analysis of the data.