In [17]:
import os

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Load the dataset
# The CSV location defaults to the path the notebook was originally run with;
# set the BLOGS_CSV environment variable to point at the file on another machine
# instead of editing this cell.
DATA_PATH = os.environ.get(
    "BLOGS_CSV",
    r"/Users/rahulpoojith/Documents/Excelr Datasets/Machine Learning Datasets/blogs.csv",
)
data = pd.read_csv(DATA_PATH)
data

Unnamed: 0,Data,Labels
0,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,alt.atheism
1,Newsgroups: alt.atheism\nPath: cantaloupe.srv....,alt.atheism
2,Path: cantaloupe.srv.cs.cmu.edu!das-news.harva...,alt.atheism
3,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,alt.atheism
4,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:53...,alt.atheism
...,...,...
1995,Xref: cantaloupe.srv.cs.cmu.edu talk.abortion:...,talk.religion.misc
1996,Xref: cantaloupe.srv.cs.cmu.edu talk.religion....,talk.religion.misc
1997,Xref: cantaloupe.srv.cs.cmu.edu talk.origins:4...,talk.religion.misc
1998,Xref: cantaloupe.srv.cs.cmu.edu talk.religion....,talk.religion.misc


In [18]:

# Preprocess the text (example)
# Lower-case every post, then delete each character that is neither a word
# character nor whitespace. The pattern is a raw string so that \w and \s reach
# the regex engine as written instead of being parsed as (invalid) string
# escape sequences by Python.
# NOTE(review): punctuation is deleted, not replaced by a space, so dotted
# names are glued into a single token (e.g. "alt.atheism" -> "altatheism").
data['Data'] = data['Data'].str.lower().str.replace(r'[^\w\s]', '', regex=True)

data['Data']

0       path cantaloupesrvcscmuedumagnesiumclubcccmued...
1       newsgroups altatheism\npath cantaloupesrvcscmu...
2       path cantaloupesrvcscmuedudasnewsharvardedunoc...
3       path cantaloupesrvcscmuedumagnesiumclubcccmued...
4       xref cantaloupesrvcscmuedu altatheism53485 tal...
                              ...                        
1995    xref cantaloupesrvcscmuedu talkabortion120945 ...
1996    xref cantaloupesrvcscmuedu talkreligionmisc837...
1997    xref cantaloupesrvcscmuedu talkorigins41030 ta...
1998    xref cantaloupesrvcscmuedu talkreligionmisc836...
1999    xref cantaloupesrvcscmuedu sciskeptic43561 tal...
Name: Data, Length: 2000, dtype: object

In [19]:
# Feature extraction
# Targets are the newsgroup names; features are TF-IDF weights computed from the
# cleaned post text with the vectorizer's default settings.
# NOTE(review): the vocabulary and IDF weights are learned from every post,
# including the ones that the next cell assigns to the test split.
y = data['Labels']
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(raw_documents=data['Data'])



In [20]:
# Split the data
# Hold out 20% of the rows for evaluation. The seed is fixed so that the
# partition -- and therefore every metric computed from it -- is the same on
# each run of the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [21]:
# Create and train the model
# Multinomial Naive Bayes with its default settings, fitted on the training
# split only.
nb = MultinomialNB()
nb.fit(X=X_train, y=y_train)



In [22]:
# Show the held-out labels; the row labels are the original DataFrame indices
# of the posts that were selected for the test split.
print(y_test)

201     comp.os.ms-windows.misc
403       comp.sys.mac.hardware
281     comp.os.ms-windows.misc
633                misc.forsale
1307                    sci.med
                 ...           
476       comp.sys.mac.hardware
557              comp.windows.x
647                misc.forsale
1243            sci.electronics
1439                  sci.space
Name: Labels, Length: 400, dtype: object


In [23]:
# Make predictions
# Predicted newsgroup for every row of the held-out feature matrix.
y_pred = nb.predict(X=X_test)


In [24]:

# Evaluate the model
# Accuracy is computed directly; precision, recall and F1 are per-class scores
# combined with 'weighted' averaging.
weighted_avg = {'average': 'weighted'}
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    scorer(y_test, y_pred, **weighted_avg)
    for scorer in (precision_score, recall_score, f1_score)
)

# Report the four scores, one per line.
for metric_label, metric_value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1-score:", f1),
):
    print(metric_label, metric_value)

Accuracy: 0.71
Precision: 0.8230880523167806
Recall: 0.71
F1-score: 0.7230251383756001


In [25]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [26]:
# Build the VADER analyzer. Its constructor loads the 'vader_lexicon' resource
# and raises LookupError when that resource has not been downloaded yet, so
# fetch it on demand and retry. This keeps the cell working on a fresh
# environment instead of relying on a download done outside the notebook.
try:
    sia = SentimentIntensityAnalyzer()
except LookupError:
    nltk.download('vader_lexicon')
    sia = SentimentIntensityAnalyzer()

In [28]:
# Label every post as positive / negative / neutral from its VADER compound
# score and store the result in data['Sentiment'].
# NOTE(review): data['Data'] has already been lower-cased and stripped of
# punctuation by the preprocessing cell and still contains the message headers,
# so the analyzer never sees the posts' original capitalisation or punctuation.
COMPOUND_THRESHOLD = 0.05  # compound scores within +/- this band count as neutral

sentiment_labels = []
for index, text in data['Data'].items():
    # Analyze the sentiment using the SentimentIntensityAnalyzer
    sentiment = sia.polarity_scores(text)

    # Categorize the sentiment based on the compound score
    if sentiment['compound'] > COMPOUND_THRESHOLD:
        sentiment_category = 'positive'
    elif sentiment['compound'] < -COMPOUND_THRESHOLD:
        sentiment_category = 'negative'
    else:
        sentiment_category = 'neutral'

    sentiment_labels.append(sentiment_category)

# Add the whole column in one assignment rather than one scalar .loc write per
# row. `index`, `text`, `sentiment` and `sentiment_category` stay bound to the
# last post processed, which the inspection cells below rely on.
data['Sentiment'] = sentiment_labels

In [29]:
# `text` is the variable left bound by the sentiment loop, i.e. the content of
# the last post that was scored.
text

'xref cantaloupesrvcscmuedu sciskeptic43561 talkpoliticsmisc179100 talkreligionmisc84001 misclegal61038\npath cantaloupesrvcscmuedurochesterudelgatechhowlandrestonansnetusccsutexaseduuunetirismbvlabwpafbafmilblackbirdafitafmilafterlifetecsun1descartesriggs\nfrom riggsdescartesetlarmymil bill riggs\nnewsgroups sciskeptictalkpoliticsmisctalkreligionmiscmisclegal\nsubject re whos next  mormons and jews\nmessageid 2017tecsun1tecarmymil\ndate 24 apr 93 001920 gmt\nreferences 1993apr200519029472gwwmichedu 2003tecsun1tecarmymil gerry93apr21132149onioncmuedu\nsender newstecsun1tecarmymil\nfollowupto sciskeptictalkpoliticsmisctalkreligionmiscmisclegal\norganization lnk corporation riverdale md\nlines 85\nnntppostinghost descartestecarmymil\n\n\n\ti was slightly surprised to see the guns of roston open up on me here\nbut gerry has his posting record and i have mine although im usually more \npolite than gerry is im not afraid of controversy either if this looks like\nan argument that he started 

In [30]:
# `sentiment` is left over from the sentiment loop: the polarity-score dict of
# the last post that was scored.
sentiment

{'neg': 0.124, 'neu': 0.793, 'pos': 0.083, 'compound': -0.9894}

In [31]:
 # Category stored for the last row the sentiment loop visited (`index` is the
 # loop variable left bound after the loop finished).
 data.loc[index, 'Sentiment']

'negative'

In [33]:
# Number of posts per sentiment category, most frequent category first.
sentiment_counts = data.loc[:, 'Sentiment'].value_counts()

In [34]:
# Display the per-category post counts computed in the previous cell.
sentiment_counts

Sentiment
positive    1309
negative     636
neutral       55
Name: count, dtype: int64

In [35]:
# Print a heading followed by the per-category counts, each on its own line.
print("Sentiment Distribution:", sentiment_counts, sep="\n")

Sentiment Distribution:
Sentiment
positive    1309
negative     636
neutral       55
Name: count, dtype: int64
