In [None]:
!pip install pandas scikit-learn nltk 

In [31]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import time


from nltk import pos_tag
import re
from nltk.stem import WordNetLemmatizer as lemmatizer
from spellchecker import SpellChecker


nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)

True

In [30]:
file_path = 'News_Category_Dataset_v3.json'
df2 = pd.read_json("News_Category_Dataset_v3.json", lines=True)
df2['text'] = df2['headline'] + " " + df2['short_description']
df2 = df2[['category', 'text']]

In [29]:
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    tokens = word_tokenize(text.lower())

    tokens = re.sub(r'[^a-z\s]', '', text)
    tokens = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    tokens = re.sub(r'<.*?>', '', text)
    tokens = re.sub(r'(.)\1{2,}', r'\1\1', text)
    tokens = [
        token if pos_tag([token])[0][1] != 'NNP'
        else token for token in tokens
        if token.isalpha() and token not in stop_words
        ]
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    spell = SpellChecker()
    tokens = [spell.correction(token) for token in tokens]

    return ' '.join(tokens)


In [26]:
X_train, X_test, y_train, y_test = train_test_split(df2['processed_text'], df2['category'], test_size=0.6)

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000)),
    ('classifier', MultinomialNB())
])

In [27]:
print("Training the model...")
start_time = time.time()
pipeline.fit(X_train, y_train)
end_time = time.time()
training_time = end_time - start_time
print(f"Training completed in {training_time:.2f} seconds ({training_time/60:.2f} minutes)")

print("Making predictions...")
y_pred = pipeline.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}\n")

print("Classification Report:")
print(classification_report(y_test, y_pred, zero_division=1))

Training the model...
Training completed in 2.70 seconds (0.04 minutes)
Making predictions...
Accuracy: 0.4916

Classification Report:
                precision    recall  f1-score   support

          ARTS       0.44      0.02      0.04       876
ARTS & CULTURE       1.00      0.00      0.01       768
  BLACK VOICES       0.59      0.07      0.12      2729
      BUSINESS       0.49      0.25      0.33      3606
       COLLEGE       0.50      0.00      0.00       696
        COMEDY       0.63      0.19      0.29      3254
         CRIME       0.55      0.43      0.48      2157
CULTURE & ARTS       1.00      0.00      0.01       651
       DIVORCE       0.88      0.39      0.54      2035
     EDUCATION       1.00      0.00      0.01       613
 ENTERTAINMENT       0.44      0.76      0.55     10346
   ENVIRONMENT       0.98      0.05      0.09       857
         FIFTY       1.00      0.00      0.00       834
  FOOD & DRINK       0.59      0.66      0.62      3856
     GOOD NEWS       1.0