In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn import metrics
import numpy as np
from tensorflow.keras.datasets import reuters
from sklearn.metrics import classification_report, accuracy_score
from tensorflow.keras.preprocessing.text import Tokenizer

In [9]:
(X_train, y_train), (X_test, y_test) = reuters.load_data(num_words=10000)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/reuters.npz
[1m2110848/2110848[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 0us/step


In [12]:
word_index = reuters.get_word_index()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/reuters_word_index.json
[1m550378/550378[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1us/step


In [14]:
def decode_review(text):
    reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
    return ' '.join([reverse_word_index.get(i - 3, '?') for i in text])

# Decode all reviews
X_train_text = [decode_review(x) for x in X_train]
X_test_text = [decode_review(x) for x in X_test]

# Create a pipeline with TF-IDF Vectorizer and Naive Bayes classifier
model = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('nb', MultinomialNB())
])

# Fit the model
model.fit(X_train_text, y_train)

# Predict on the test set
y_pred = model.predict(X_test_text)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.6589492430988424

Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00        12
           1       0.62      0.71      0.66       105
           2       0.00      0.00      0.00        20
           3       0.81      0.90      0.85       813
           4       0.52      0.96      0.67       474
           5       0.00      0.00      0.00         5
           6       0.00      0.00      0.00        14
           7       0.00      0.00      0.00         3
           8       0.00      0.00      0.00        38
           9       1.00      0.08      0.15        25
          10       0.00      0.00      0.00        30
          11       0.62      0.64      0.63        83
          12       0.00      0.00      0.00        13
          13       1.00      0.03      0.05        37
          14       0.00      0.00      0.00         2
          15       0.00      0.00      0.00         9
          16       0.69    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [15]:
# Step 1: Import libraries
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn import metrics

# Step 2: Load the inbuilt dataset
newsgroups = fetch_20newsgroups(subset='train', categories=None, shuffle=True, random_state=42)

# Step 3: Define the pipeline with TF-IDF and Multinomial Naive Bayes
text_clf = Pipeline([
    ('tfidf', TfidfVectorizer()),     # Vectorize the text data using TF-IDF
    ('clf', MultinomialNB()),         # Use Naive Bayes for classification
])

# Step 4: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(newsgroups.data, newsgroups.target, test_size=0.2, random_state=42)

# Step 5: Train the model
text_clf.fit(X_train, y_train)

# Step 6: Make predictions on the test set
y_pred = text_clf.predict(X_test)

# Step 7: Evaluate the model
print(metrics.classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.94      0.76      0.84        97
           1       0.85      0.81      0.83       104
           2       0.91      0.75      0.82       115
           3       0.66      0.80      0.72       123
           4       0.96      0.79      0.86       126
           5       0.89      0.90      0.89       106
           6       0.89      0.73      0.80       109
           7       0.90      0.91      0.91       139
           8       0.93      0.93      0.93       122
           9       0.92      0.96      0.94       102
          10       0.94      0.97      0.95       108
          11       0.78      1.00      0.88       125
          12       0.90      0.79      0.84       114
          13       0.99      0.89      0.94       119
          14       0.95      0.95      0.95       127
          15       0.49      0.96      0.65       122
          16       0.87      0.96      0.91       121
          17       0.89    