In [1]:
!pip install pandas scikit-learn nltk

Collecting numpy>=1.22.4
  Using cached numpy-2.1.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.3 MB)
Collecting click
  Using cached click-8.1.7-py3-none-any.whl (97 kB)
Installing collected packages: numpy, click
Successfully installed click-8.1.7 numpy-2.1.1


In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import time

nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)

True

In [9]:
file_path = 'News_Category_Dataset_v3.json'
df2 = pd.read_json(file_path, lines=True)
df2['text'] = df2['headline'] + " " + df2['short_description']
df2 = df2[['category', 'text']]

In [10]:
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    tokens = [token for token in tokens if token.isalpha() and token not in stop_words]
    return ' '.join(tokens)

df2['processed_text'] = df2['text'].apply(preprocess_text)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(df2['processed_text'], df2['category'], test_size=0.6)

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000)),
    ('classifier', MultinomialNB()),
])

In [13]:
print("Training the model...")
start_time = time.time()
pipeline.fit(X_train, y_train)
end_time = time.time()
training_time = end_time - start_time
print(f"Training completed in {training_time:.2f} seconds ({training_time/60:.2f} minutes)")

print("Making predictions...")
y_pred = pipeline.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}\n")

print("Classification Report:")
print(classification_report(y_test, y_pred, zero_division=1))

Training the model...
Training completed in 2.57 seconds (0.04 minutes)
Making predictions...
Accuracy: 0.4934

Classification Report:
                precision    recall  f1-score   support

          ARTS       0.50      0.02      0.03       887
ARTS & CULTURE       1.00      0.00      0.00       804
  BLACK VOICES       0.64      0.06      0.11      2842
      BUSINESS       0.48      0.24      0.32      3563
       COLLEGE       0.40      0.00      0.01       678
        COMEDY       0.68      0.19      0.30      3252
         CRIME       0.53      0.44      0.49      2116
CULTURE & ARTS       1.00      0.00      0.01       635
       DIVORCE       0.87      0.43      0.57      2004
     EDUCATION       0.67      0.00      0.01       608
 ENTERTAINMENT       0.44      0.76      0.56     10440
   ENVIRONMENT       1.00      0.04      0.08       877
         FIFTY       1.00      0.00      0.00       837
  FOOD & DRINK       0.57      0.68      0.62      3770
     GOOD NEWS       0.6