In [14]:
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Ensure required NLTK resources are downloaded
nltk.download('punkt')
nltk.download('stopwords')

# Load dataset
df = pd.read_csv(r"C:\Users\91808\OneDrive\Pictures\Desktop\assignment\blogs.csv")

# Basic info
print(" Dataset Info:")
print(df.info(), "\n")
print(" Sample Records:")
print(df.head())

# Define text preprocessing function
def clean_text(text):
    # Remove punctuation and convert to lowercase
    text = text.translate(str.maketrans("", "", string.punctuation)).lower()
    
    # Tokenize
    tokens = word_tokenize(text)
    
    # Remove stopwords
    stop_words = set(stopwords.words("english"))
    filtered_tokens = [word for word in tokens if word not in stop_words]
    
    return " ".join(filtered_tokens)

# Apply preprocessing
df['cleaned_text'] = df['Data'].astype(str).apply(clean_text)

# Review cleaned text
print("\n Cleaned Text Examples:")
print(df['cleaned_text'].head())

      

 Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Data    2000 non-null   object
 1   Labels  2000 non-null   object
dtypes: object(2)
memory usage: 31.4+ KB
None 

 Sample Records:
                                                Data       Labels
0  Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...  alt.atheism
1  Newsgroups: alt.atheism\nPath: cantaloupe.srv....  alt.atheism
2  Path: cantaloupe.srv.cs.cmu.edu!das-news.harva...  alt.atheism
3  Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...  alt.atheism
4  Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:53...  alt.atheism


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\91808\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\91808\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - 'C:\\Users\\91808/nltk_data'
    - 'C:\\ProgramData\\anaconda3\\nltk_data'
    - 'C:\\ProgramData\\anaconda3\\share\\nltk_data'
    - 'C:\\ProgramData\\anaconda3\\lib\\nltk_data'
    - 'C:\\Users\\91808\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


In [26]:
import re
import string
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Clean text function using regex and scikit-learn stopwords
def clean_text(text):
    # Lowercase
    text = text.lower()
    
    # Remove punctuation and numbers
    text = re.sub(r'[^a-z\s]', '', text)
    
    # Tokenize manually (split by whitespace)
    tokens = text.split()
    
    # Remove stopwords
    tokens = [word for word in tokens if word not in ENGLISH_STOP_WORDS]
    
    return " ".join(tokens)

In [27]:
# Apply to your blog Data column
df['cleaned_text'] = df['Data'].astype(str).apply(clean_text)

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create TF-IDF features
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['cleaned_text'])

# Define target
y = df['Labels']

In [29]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# Train model
model = MultinomialNB()
model.fit(X_train, y_train)

# Predict
predictions = model.predict(X_test)

# Evaluate
print(f"Model Accuracy: {accuracy_score(y_test, predictions):.4f}")
print("\nClassification Report:\n")
print(classification_report(y_test, predictions))

Model Accuracy: 0.8650

Classification Report:

                          precision    recall  f1-score   support

             alt.atheism       0.81      0.73      0.77        30
           comp.graphics       0.86      0.83      0.85        30
 comp.os.ms-windows.misc       0.74      0.87      0.80        30
comp.sys.ibm.pc.hardware       0.70      0.77      0.73        30
   comp.sys.mac.hardware       0.88      0.77      0.82        30
          comp.windows.x       0.96      0.83      0.89        30
            misc.forsale       0.93      0.83      0.88        30
               rec.autos       0.90      0.93      0.92        30
         rec.motorcycles       1.00      0.97      0.98        30
      rec.sport.baseball       0.97      1.00      0.98        30
        rec.sport.hockey       0.94      1.00      0.97        30
               sci.crypt       0.91      1.00      0.95        30
         sci.electronics       0.89      0.80      0.84        30
                 sci.med   

In [31]:
!pip install textblob


Defaulting to user installation because normal site-packages is not writeable
Collecting textblob
  Downloading textblob-0.19.0-py3-none-any.whl.metadata (4.4 kB)
Downloading textblob-0.19.0-py3-none-any.whl (624 kB)
   ---------------------------------------- 0.0/624.3 kB ? eta -:--:--
   ---------------------------------------- 0.0/624.3 kB ? eta -:--:--
   ---------------------------------------- 0.0/624.3 kB ? eta -:--:--
   ---------------------------------------- 0.0/624.3 kB ? eta -:--:--
   ---------------------------------------- 0.0/624.3 kB ? eta -:--:--
   ---------------------------------------- 0.0/624.3 kB ? eta -:--:--
   ---------------------------------------- 0.0/624.3 kB ? eta -:--:--
   ---------------------------------------- 0.0/624.3 kB ? eta -:--:--
   ---------------------------------------- 0.0/624.3 kB ? eta -:--:--
   ---------------------------------------- 0.0/624.3 kB ? eta -:--:--
   ---------------------------------------- 0.0/624.3 kB ? eta -:--:--
  

In [32]:
from textblob import TextBlob

# Define function to classify sentiment
def get_sentiment(text):
    polarity = TextBlob(text).sentiment.polarity
    if polarity > 0.1:
        return "Positive"
    elif polarity < -0.1:
        return "Negative"
    else:
        return "Neutral"

# Apply sentiment function
df['Sentiment'] = df['Data'].astype(str).apply(get_sentiment)

# View sentiment distribution
print(df['Sentiment'].value_counts())


Sentiment
Neutral     1081
Positive     782
Negative     137
Name: count, dtype: int64
