## Data Exploration and Preprocessing

In [60]:
# import the pakages
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from textblob import TextBlob
from sklearn.metrics import classification_report, accuracy_score

In [2]:
#load the dataset
data = pd.read_csv('blogs.csv')

In [3]:
data

Unnamed: 0,Data,Labels
0,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,alt.atheism
1,Newsgroups: alt.atheism\nPath: cantaloupe.srv....,alt.atheism
2,Path: cantaloupe.srv.cs.cmu.edu!das-news.harva...,alt.atheism
3,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,alt.atheism
4,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:53...,alt.atheism
...,...,...
1995,Xref: cantaloupe.srv.cs.cmu.edu talk.abortion:...,talk.religion.misc
1996,Xref: cantaloupe.srv.cs.cmu.edu talk.religion....,talk.religion.misc
1997,Xref: cantaloupe.srv.cs.cmu.edu talk.origins:4...,talk.religion.misc
1998,Xref: cantaloupe.srv.cs.cmu.edu talk.religion....,talk.religion.misc


In [4]:
data.head()

Unnamed: 0,Data,Labels
0,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,alt.atheism
1,Newsgroups: alt.atheism\nPath: cantaloupe.srv....,alt.atheism
2,Path: cantaloupe.srv.cs.cmu.edu!das-news.harva...,alt.atheism
3,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,alt.atheism
4,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:53...,alt.atheism


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Data    2000 non-null   object
 1   Labels  2000 non-null   object
dtypes: object(2)
memory usage: 31.4+ KB


In [6]:
## Exploratory Data Analysis (EDA)

In [8]:
# missing values

In [7]:
data.isnull().sum()

Data      0
Labels    0
dtype: int64

In [9]:
# Unique categories

In [10]:
data['Labels'].unique()

array(['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc',
       'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',
       'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles',
       'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt',
       'sci.electronics', 'sci.med', 'sci.space',
       'soc.religion.christian', 'talk.politics.guns',
       'talk.politics.mideast', 'talk.politics.misc',
       'talk.religion.misc'], dtype=object)

In [11]:
## Data Cleaning

In [13]:
# Download stopwords

In [17]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\me\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
# Function to clean text

In [19]:
def clean_text(text):
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = text.lower()  # Convert to lowercase
    text = ' '.join(word for word in text.split() if word not in stop_words)  # Remove stopwords
    return text

In [21]:
data['cleaned_data'] = data['Data'].apply(clean_text)

In [22]:
data['cleaned_data']

0       path cantaloupesrvcscmuedumagnesiumclubcccmued...
1       newsgroups altatheism path cantaloupesrvcscmue...
2       path cantaloupesrvcscmuedudasnewsharvardedunoc...
3       path cantaloupesrvcscmuedumagnesiumclubcccmued...
4       xref cantaloupesrvcscmuedu altatheism53485 tal...
                              ...                        
1995    xref cantaloupesrvcscmuedu talkabortion120945 ...
1996    xref cantaloupesrvcscmuedu talkreligionmisc837...
1997    xref cantaloupesrvcscmuedu talkorigins41030 ta...
1998    xref cantaloupesrvcscmuedu talkreligionmisc836...
1999    xref cantaloupesrvcscmuedu sciskeptic43561 tal...
Name: cleaned_data, Length: 2000, dtype: object

In [23]:
## Feature Extraction

In [25]:
vectorizer = TfidfVectorizer()
x = vectorizer.fit_transform(data['cleaned_data'])
y = data['Labels']

In [26]:
x

<2000x56713 sparse matrix of type '<class 'numpy.float64'>'
	with 268471 stored elements in Compressed Sparse Row format>

In [27]:
y

0              alt.atheism
1              alt.atheism
2              alt.atheism
3              alt.atheism
4              alt.atheism
               ...        
1995    talk.religion.misc
1996    talk.religion.misc
1997    talk.religion.misc
1998    talk.religion.misc
1999    talk.religion.misc
Name: Labels, Length: 2000, dtype: object

## Naive Bayes Model for Text Classification

In [29]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [31]:
X_train

<1600x56713 sparse matrix of type '<class 'numpy.float64'>'
	with 218792 stored elements in Compressed Sparse Row format>

In [32]:
X_test

<400x56713 sparse matrix of type '<class 'numpy.float64'>'
	with 49679 stored elements in Compressed Sparse Row format>

In [33]:
y_train

968          rec.sport.baseball
240     comp.os.ms-windows.misc
819             rec.motorcycles
692                misc.forsale
420       comp.sys.mac.hardware
                 ...           
1130                  sci.crypt
1294            sci.electronics
860             rec.motorcycles
1459                  sci.space
1126                  sci.crypt
Name: Labels, Length: 1600, dtype: object

In [34]:
y_test

1860          talk.politics.misc
353     comp.sys.ibm.pc.hardware
1333                     sci.med
905           rec.sport.baseball
1289             sci.electronics
                  ...           
965           rec.sport.baseball
1284             sci.electronics
1739       talk.politics.mideast
261      comp.os.ms-windows.misc
535               comp.windows.x
Name: Labels, Length: 400, dtype: object

In [35]:
# train and test the data
X_train, X_test, y_train, y_test

(<1600x56713 sparse matrix of type '<class 'numpy.float64'>'
 	with 218792 stored elements in Compressed Sparse Row format>,
 <400x56713 sparse matrix of type '<class 'numpy.float64'>'
 	with 49679 stored elements in Compressed Sparse Row format>,
 968          rec.sport.baseball
 240     comp.os.ms-windows.misc
 819             rec.motorcycles
 692                misc.forsale
 420       comp.sys.mac.hardware
                  ...           
 1130                  sci.crypt
 1294            sci.electronics
 860             rec.motorcycles
 1459                  sci.space
 1126                  sci.crypt
 Name: Labels, Length: 1600, dtype: object,
 1860          talk.politics.misc
 353     comp.sys.ibm.pc.hardware
 1333                     sci.med
 905           rec.sport.baseball
 1289             sci.electronics
                   ...           
 965           rec.sport.baseball
 1284             sci.electronics
 1739       talk.politics.mideast
 261      comp.os.ms-windows.misc
 535 

In [36]:
# Training the Naive Bayes Classifier

In [38]:
# Initialize the model

In [39]:
model = MultinomialNB()

In [40]:
model

In [41]:
# Train the model

In [42]:
model.fit(X_train, y_train)

In [43]:
# Make predictions

In [44]:
y_pred = model.predict(X_test)

In [45]:
y_pred

array(['talk.politics.misc', 'comp.sys.ibm.pc.hardware', 'sci.med',
       'rec.sport.baseball', 'sci.electronics', 'sci.electronics',
       'rec.sport.baseball', 'talk.politics.mideast', 'alt.atheism',
       'sci.med', 'alt.atheism', 'sci.electronics', 'sci.crypt',
       'rec.sport.baseball', 'comp.sys.ibm.pc.hardware',
       'comp.os.ms-windows.misc', 'rec.autos', 'comp.graphics',
       'talk.politics.guns', 'talk.politics.misc',
       'comp.sys.mac.hardware', 'alt.atheism', 'alt.atheism',
       'rec.sport.hockey', 'alt.atheism', 'sci.crypt',
       'talk.politics.misc', 'rec.sport.baseball', 'rec.autos',
       'alt.atheism', 'talk.politics.misc', 'rec.sport.hockey',
       'comp.windows.x', 'rec.sport.hockey', 'rec.sport.hockey',
       'comp.sys.mac.hardware', 'sci.med', 'sci.electronics',
       'rec.sport.hockey', 'comp.os.ms-windows.misc', 'sci.electronics',
       'soc.religion.christian', 'comp.os.ms-windows.misc',
       'talk.politics.misc', 'soc.religion.christian',

## Sentiment Analysis

In [47]:
pip install textblob

Defaulting to user installation because normal site-packages is not writeable
Collecting textblob
  Downloading textblob-0.18.0.post0-py3-none-any.whl.metadata (4.5 kB)
Downloading textblob-0.18.0.post0-py3-none-any.whl (626 kB)
   ---------------------------------------- 0.0/626.3 kB ? eta -:--:--
    --------------------------------------- 10.2/626.3 kB ? eta -:--:--
   -- ------------------------------------ 41.0/626.3 kB 653.6 kB/s eta 0:00:01
   ----------------- ---------------------- 276.5/626.3 kB 2.4 MB/s eta 0:00:01
   ------------------------------------ --- 573.4/626.3 kB 4.0 MB/s eta 0:00:01
   ---------------------------------------- 626.3/626.3 kB 3.9 MB/s eta 0:00:00
Installing collected packages: textblob
Successfully installed textblob-0.18.0.post0
Note: you may need to restart the kernel to use updated packages.


In [48]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\me\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\me\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [49]:
from textblob import TextBlob

# Example function to analyze sentiment
def analyze_sentiment(text):
    analysis = TextBlob(text)
    if analysis.sentiment.polarity > 0:
        return 'positive'
    elif analysis.sentiment.polarity < 0:
        return 'negative'
    else:
        return 'neutral'

# Example usage
sample_text = "I love programming in Python!"
sentiment = analyze_sentiment(sample_text)
print(f"Sentiment: {sentiment}")

Sentiment: positive


In [53]:
pip install vaderSentiment

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [51]:
pip install transformers


Defaulting to user installation because normal site-packages is not writeable
Collecting transformers
  Downloading transformers-4.45.2-py3-none-any.whl.metadata (44 kB)
     ---------------------------------------- 0.0/44.4 kB ? eta -:--:--
     --------- ------------------------------ 10.2/44.4 kB ? eta -:--:--
     -------------------------- ----------- 30.7/44.4 kB 325.1 kB/s eta 0:00:01
     -------------------------------------- 44.4/44.4 kB 363.1 kB/s eta 0:00:00
Collecting huggingface-hub<1.0,>=0.23.2 (from transformers)
  Downloading huggingface_hub-0.26.1-py3-none-any.whl.metadata (13 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.4.5-cp312-none-win_amd64.whl.metadata (3.9 kB)
Collecting tokenizers<0.21,>=0.20 (from transformers)
  Downloading tokenizers-0.20.1-cp312-none-win_amd64.whl.metadata (6.9 kB)
Downloading transformers-4.45.2-py3-none-any.whl (9.9 MB)
   ---------------------------------------- 0.0/9.9 MB ? eta -:--:--
   - --------



In [54]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Initialize VADER sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

# Function to analyze sentiment using VADER
def analyze_sentiment_vader(text):
    score = analyzer.polarity_scores(text)
    if score['compound'] > 0.05:
        return 'positive'
    elif score['compound'] < -0.05:
        return 'negative'
    else:
        return 'neutral'

# Example usage
sample_text = "I love programming in Python!"
sentiment = analyze_sentiment_vader(sample_text)
print(f"Sentiment: {sentiment}")

Sentiment: positive


In [56]:
# Function to analyze sentiment
def analyze_sentiment(text):
    analysis = TextBlob(text)
    if analysis.sentiment.polarity > 0:
        return 'positive'
    elif analysis.sentiment.polarity < 0:
        return 'negative'
    else:
        return 'neutral'

# Apply sentiment analysis
data['sentiment'] = data['Data'].apply(analyze_sentiment)

In [58]:
data['sentiment']

0       positive
1       negative
2       positive
3       positive
4       positive
          ...   
1995    positive
1996    positive
1997    positive
1998    positive
1999    positive
Name: sentiment, Length: 2000, dtype: object

In [59]:
sentiment_distribution = data.groupby('Labels')['sentiment'].value_counts(normalize=True).unstack()
print(sentiment_distribution)

sentiment                 negative  positive
Labels                                      
alt.atheism                   0.23      0.77
comp.graphics                 0.24      0.76
comp.os.ms-windows.misc       0.22      0.78
comp.sys.ibm.pc.hardware      0.20      0.80
comp.sys.mac.hardware         0.24      0.76
comp.windows.x                0.27      0.73
misc.forsale                  0.16      0.84
rec.autos                     0.17      0.83
rec.motorcycles               0.26      0.74
rec.sport.baseball            0.29      0.71
rec.sport.hockey              0.34      0.66
sci.crypt                     0.19      0.81
sci.electronics               0.19      0.81
sci.med                       0.29      0.71
sci.space                     0.27      0.73
soc.religion.christian        0.13      0.87
talk.politics.guns            0.30      0.70
talk.politics.mideast         0.22      0.78
talk.politics.misc            0.22      0.78
talk.religion.misc            0.14      0.86


## Evaluation

In [61]:
# Evaluation metrics

In [62]:
#print the score pred
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))

                          precision    recall  f1-score   support

             alt.atheism       0.53      0.94      0.68        18
           comp.graphics       0.65      0.83      0.73        18
 comp.os.ms-windows.misc       0.95      0.86      0.90        22
comp.sys.ibm.pc.hardware       0.95      0.76      0.84        25
   comp.sys.mac.hardware       0.87      0.95      0.91        21
          comp.windows.x       1.00      0.80      0.89        25
            misc.forsale       0.92      0.67      0.77        18
               rec.autos       0.89      0.89      0.89        18
         rec.motorcycles       0.88      0.88      0.88        16
      rec.sport.baseball       0.80      0.89      0.84        18
        rec.sport.hockey       0.83      1.00      0.91        15
               sci.crypt       0.82      0.95      0.88        19
         sci.electronics       0.68      0.81      0.74        16
                 sci.med       0.94      0.88      0.91        17
         