In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [4]:
df=pd.read_csv('/content/blogs.csv')

In [5]:
df.head()

Unnamed: 0,Data,Labels
0,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,alt.atheism
1,Newsgroups: alt.atheism\nPath: cantaloupe.srv....,alt.atheism
2,Path: cantaloupe.srv.cs.cmu.edu!das-news.harva...,alt.atheism
3,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,alt.atheism
4,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:53...,alt.atheism


In [6]:
df.shape

(2000, 2)

In [7]:
df.describe()

Unnamed: 0,Data,Labels
count,2000,2000
unique,2000,20
top,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,alt.atheism
freq,1,100


In [8]:
df.columns

Index(['Data', 'Labels'], dtype='object')

In [9]:

# Data Exploration and Preprocessing
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

nltk.download('punkt')
nltk.download('stopwords')

print("Data Shape:", df.shape)
print("Data Columns:", df.columns)
print("Data Head:")
print(df.head())

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Data Shape: (2000, 2)
Data Columns: Index(['Data', 'Labels'], dtype='object')
Data Head:
                                                Data       Labels
0  Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...  alt.atheism
1  Newsgroups: alt.atheism\nPath: cantaloupe.srv....  alt.atheism
2  Path: cantaloupe.srv.cs.cmu.edu!das-news.harva...  alt.atheism
3  Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...  alt.atheism
4  Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:53...  alt.atheism


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [10]:
# Remove punctuation and convert to lowercase
df['Data'] = df['Data'].apply(lambda x: x.lower())

# Tokenize the text data
df['Data'] = df['Data'].apply(word_tokenize)

# Remove stopwords
stop_words = set(stopwords.words('english'))
df['Data'] = df['Data'].apply(lambda x: [word for word in x if word not in stop_words])

In [11]:
# Join the tokenized words back into a string
from sklearn.feature_extraction.text import TfidfVectorizer

df['Data'] = df['Data'].apply(lambda x: ' '.join(x))



In [13]:
# Create a TF-IDF vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()


In [14]:
# Convert the text data to TF-IDF format
tfidf = TfidfVectorizer(max_features=5000, stop_words=None) # Remove stop_words argument
X = tfidf.fit_transform(df['Data']).toarray()
y = df['Labels']

In [15]:
# Split the data into training and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
# Naive Bayes Model for Text Classification
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [17]:
# Evaluate the performance of the classifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.8025
Classification Report:
                          precision    recall  f1-score   support

             alt.atheism       0.58      0.83      0.68        18
           comp.graphics       0.74      0.78      0.76        18
 comp.os.ms-windows.misc       0.81      1.00      0.90        22
comp.sys.ibm.pc.hardware       0.67      0.88      0.76        25
   comp.sys.mac.hardware       0.80      0.57      0.67        21
          comp.windows.x       0.92      0.48      0.63        25
            misc.forsale       0.78      0.78      0.78        18
               rec.autos       0.71      0.94      0.81        18
         rec.motorcycles       0.93      0.81      0.87        16
      rec.sport.baseball       0.84      0.89      0.86        18
        rec.sport.hockey       0.68      1.00      0.81        15
               sci.crypt       0.83      1.00      0.90        19
         sci.electronics       0.90      0.56      0.69        16
                 sci.med       0.88

In [18]:
# Sentiment Analysis
from textblob import TextBlob
def sentiment_analyzer(text):
    blob = TextBlob(text)
    if blob.sentiment.polarity > 0:
        return 'positive'
    elif blob.sentiment.polarity < 0:
        return 'negative'
    else:
        return 'neutral'

df['Sentiment'] = df['Data'].apply(sentiment_analyzer)

In [19]:

# Examine the distribution of sentiments across different categories
print("Sentiment Distribution:")
print(df.groupby('Labels')['Sentiment'].value_counts())

Sentiment Distribution:
Labels                    Sentiment
alt.atheism               positive     79
                          negative     21
comp.graphics             positive     76
                          negative     24
comp.os.ms-windows.misc   positive     81
                          negative     19
comp.sys.ibm.pc.hardware  positive     84
                          negative     16
comp.sys.mac.hardware     positive     81
                          negative     19
comp.windows.x            positive     74
                          negative     26
misc.forsale              positive     81
                          negative     19
rec.autos                 positive     83
                          negative     17
rec.motorcycles           positive     78
                          negative     22
rec.sport.baseball        positive     74
                          negative     26
rec.sport.hockey          positive     68
                          negative     32
sci.crypt       