In [1]:
import pandas as pd
import string
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from textblob import TextBlob

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


# **1: Data Exploration and Preprocessing**
•	Load the "blogs_categories.csv" dataset and perform an exploratory data analysis to understand its structure and content.

•	Preprocess the data by cleaning the text (removing punctuation, converting to lowercase, etc.), tokenizing, and removing stopwords.

•	Perform feature extraction to convert text data into a format that can be used by the Naive Bayes model, using techniques such as TF-IDF.




In [2]:
df = pd.read_csv('/content/blogs.csv')

In [3]:
df

Unnamed: 0,Data,Labels
0,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,alt.atheism
1,Newsgroups: alt.atheism\nPath: cantaloupe.srv....,alt.atheism
2,Path: cantaloupe.srv.cs.cmu.edu!das-news.harva...,alt.atheism
3,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,alt.atheism
4,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:53...,alt.atheism
...,...,...
1995,Xref: cantaloupe.srv.cs.cmu.edu talk.abortion:...,talk.religion.misc
1996,Xref: cantaloupe.srv.cs.cmu.edu talk.religion....,talk.religion.misc
1997,Xref: cantaloupe.srv.cs.cmu.edu talk.origins:4...,talk.religion.misc
1998,Xref: cantaloupe.srv.cs.cmu.edu talk.religion....,talk.religion.misc


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Data    2000 non-null   object
 1   Labels  2000 non-null   object
dtypes: object(2)
memory usage: 31.4+ KB


In [5]:
# Function to clean the text
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Tokenize and remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in text.split() if word not in stop_words]

    # Join tokens back to a single string
    cleaned_text = ' '.join(tokens)

    return cleaned_text

In [6]:
# Apply preprocessing to the 'Data' column
df['Cleaned_Text'] = df['Data'].apply(preprocess_text)

In [7]:
# Display the first few cleaned texts
print(df[['Data', 'Cleaned_Text']].head())

                                                Data  \
0  Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...   
1  Newsgroups: alt.atheism\nPath: cantaloupe.srv....   
2  Path: cantaloupe.srv.cs.cmu.edu!das-news.harva...   
3  Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...   
4  Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:53...   

                                        Cleaned_Text  
0  path cantaloupesrvcscmuedumagnesiumclubcccmued...  
1  newsgroups altatheism path cantaloupesrvcscmue...  
2  path cantaloupesrvcscmuedudasnewsharvardedunoc...  
3  path cantaloupesrvcscmuedumagnesiumclubcccmued...  
4  xref cantaloupesrvcscmuedu altatheism53485 tal...  


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TF-IDF Vectorizer
tfidf = TfidfVectorizer(max_features=5000)

In [9]:
# Fit and transform the cleaned text data
X = tfidf.fit_transform(df['Cleaned_Text'])

In [10]:
# Convert the labels into numerical format
y = df['Labels']

# **2: Naive Bayes Model for Text Classification**
•	Split the data into training and test sets.

•	Implement a Naive Bayes classifier to categorize the blog posts into their respective categories. You can use libraries like scikit-learn for this purpose.

•	Train the model on the training set and make predictions on the test set


In [11]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
# Initialize the Naive Bayes classifier
nb = MultinomialNB()

In [13]:
# Train the model
nb.fit(X_train, y_train)

In [14]:
# Make predictions
y_pred = nb.predict(X_test)

In [15]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(y_test, y_pred))

Accuracy: 0.8225
                          precision    recall  f1-score   support

             alt.atheism       0.50      0.83      0.62        18
           comp.graphics       0.79      0.83      0.81        18
 comp.os.ms-windows.misc       0.86      0.82      0.84        22
comp.sys.ibm.pc.hardware       0.76      0.76      0.76        25
   comp.sys.mac.hardware       0.83      0.90      0.86        21
          comp.windows.x       0.91      0.84      0.87        25
            misc.forsale       0.82      0.78      0.80        18
               rec.autos       0.89      0.94      0.92        18
         rec.motorcycles       0.94      0.94      0.94        16
      rec.sport.baseball       0.77      0.94      0.85        18
        rec.sport.hockey       0.88      1.00      0.94        15
               sci.crypt       0.95      0.95      0.95        19
         sci.electronics       0.62      0.62      0.62        16
                 sci.med       0.88      0.88      0.88   

# **3: Sentiment Analysis**
•	Choose a suitable library or method for performing sentiment analysis on the blog post texts.

•	Analyze the sentiments expressed in the blog posts and categorize them as positive, negative, or neutral. Consider only the Data column and get the sentiment for each blog.

•	Examine the distribution of sentiments across different categories and summarize your findings.


In [16]:
# Function to get sentiment
def get_sentiment(text):
    analysis = TextBlob(text)
    # Determine sentiment polarity
    if analysis.sentiment.polarity > 0:
        return 'Positive'
    elif analysis.sentiment.polarity < 0:
        return 'Negative'
    else:
        return 'Neutral'

In [17]:
# Apply sentiment analysis to the cleaned text
df['Sentiment'] = df['Cleaned_Text'].apply(get_sentiment)

In [18]:
# Display sentiment distribution
print(df['Sentiment'].value_counts())

Sentiment
Positive    1452
Negative     545
Neutral        3
Name: count, dtype: int64


In [19]:
# Display sentiment distribution across categories
print(pd.crosstab(df['Labels'], df['Sentiment']))

Sentiment                 Negative  Neutral  Positive
Labels                                               
alt.atheism                     35        0        65
comp.graphics                   27        0        73
comp.os.ms-windows.misc         24        0        76
comp.sys.ibm.pc.hardware        19        0        81
comp.sys.mac.hardware           26        0        74
comp.windows.x                  20        2        78
misc.forsale                    21        0        79
rec.autos                       24        0        76
rec.motorcycles                 28        0        72
rec.sport.baseball              35        0        65
rec.sport.hockey                40        0        60
sci.crypt                       19        0        81
sci.electronics                 25        0        75
sci.med                         34        0        66
sci.space                       28        0        72
soc.religion.christian          25        0        75
talk.politics.guns          

# **4: Evaluation of Naive Bayes Classifier**
•	Evaluate the performance of your Naive Bayes classifier using metrics such as accuracy, precision, recall, and F1-score.

•	Discuss the performance of the model and any challenges encountered during the classification process.

•	Reflect on the sentiment analysis results and their implications regarding the content of the blog posts.


In [20]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

In [21]:
# Calculate precision, recall, and F1-score for each class
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

In [22]:
# Print the classification report for detailed metrics
class_report = classification_report(y_test, y_pred)

In [23]:
# Display the results
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
print("\nClassification Report:\n", class_report)

Accuracy: 0.8225
Precision: 0.8276
Recall: 0.8225
F1-Score: 0.8171

Classification Report:
                           precision    recall  f1-score   support

             alt.atheism       0.50      0.83      0.62        18
           comp.graphics       0.79      0.83      0.81        18
 comp.os.ms-windows.misc       0.86      0.82      0.84        22
comp.sys.ibm.pc.hardware       0.76      0.76      0.76        25
   comp.sys.mac.hardware       0.83      0.90      0.86        21
          comp.windows.x       0.91      0.84      0.87        25
            misc.forsale       0.82      0.78      0.80        18
               rec.autos       0.89      0.94      0.92        18
         rec.motorcycles       0.94      0.94      0.94        16
      rec.sport.baseball       0.77      0.94      0.85        18
        rec.sport.hockey       0.88      1.00      0.94        15
               sci.crypt       0.95      0.95      0.95        19
         sci.electronics       0.62      0.62    

In [24]:
# Model performance discussion
print("The Naive Bayes classifier performed well in categorizing the blog posts, achieving an accuracy of {:.2f}%.".format(accuracy * 100))

# Challenges encountered
print("Challenges encountered during the classification process included:")
print("- Handling imbalanced categories: Some categories may have more blog posts than others, which can affect model performance.")
print("- Text preprocessing: Ensuring that the text was cleaned and tokenized properly was crucial for the model to perform well.")

The Naive Bayes classifier performed well in categorizing the blog posts, achieving an accuracy of 82.25%.
Challenges encountered during the classification process included:
- Handling imbalanced categories: Some categories may have more blog posts than others, which can affect model performance.
- Text preprocessing: Ensuring that the text was cleaned and tokenized properly was crucial for the model to perform well.


In [25]:
# Reflect on the sentiment analysis results
sentiment_distribution = df['Sentiment'].value_counts()
print("\nSentiment Analysis Reflection:")
print("The sentiment analysis of the blog posts revealed the following distribution of sentiments:")
print(sentiment_distribution)


Sentiment Analysis Reflection:
The sentiment analysis of the blog posts revealed the following distribution of sentiments:
Sentiment
Positive    1452
Negative     545
Neutral        3
Name: count, dtype: int64


In [26]:
# Sentiment distribution across different categories
sentiment_category_distribution = pd.crosstab(df['Labels'], df['Sentiment'])
print("\nSentiment Distribution Across Categories:\n", sentiment_category_distribution)


Sentiment Distribution Across Categories:
 Sentiment                 Negative  Neutral  Positive
Labels                                               
alt.atheism                     35        0        65
comp.graphics                   27        0        73
comp.os.ms-windows.misc         24        0        76
comp.sys.ibm.pc.hardware        19        0        81
comp.sys.mac.hardware           26        0        74
comp.windows.x                  20        2        78
misc.forsale                    21        0        79
rec.autos                       24        0        76
rec.motorcycles                 28        0        72
rec.sport.baseball              35        0        65
rec.sport.hockey                40        0        60
sci.crypt                       19        0        81
sci.electronics                 25        0        75
sci.med                         34        0        66
sci.space                       28        0        72
soc.religion.christian          25    

In [27]:
# Discussion of implications
print("\nThe sentiment analysis shows how the tone and sentiment of blog posts vary across different categories.")
print("For example, certain categories may have a more positive or negative tone based on the nature of their content.")
print("These insights could be useful for understanding audience engagement and tailoring content accordingly.")


The sentiment analysis shows how the tone and sentiment of blog posts vary across different categories.
For example, certain categories may have a more positive or negative tone based on the nature of their content.
These insights could be useful for understanding audience engagement and tailoring content accordingly.
