In [10]:
from sklearn.datasets import fetch_20newsgroups

# Download only the training subset of the 20 Newsgroups dataset
newsgroups_train = fetch_20newsgroups(subset='train')

# Print the number of data points in the training dataset
print(f"Number of data points in the training dataset: {len(newsgroups_train.data)}")

# Get unique classes (target names)
class_labels = newsgroups_train.target_names

print(class_labels)

# Create a dictionary to store one sample text from each class
sample_texts = {}

# Loop over each class and get one sample text
for label_index, label in enumerate(class_labels):
    # Find the first sample in the dataset that belongs to the current class
    for i, target in enumerate(newsgroups_train.target):
        if target == label_index:
            sample_texts[label] = newsgroups_train.data[i]
            break

# Print class labels and sample text
for label, sample in sample_texts.items():
    print(f"Class Label: {label}")
    print("Sample Text:")
    print(sample[:500])  # Print the first 500 characters of the sample
    print("-" * 80)


Number of data points in the training dataset: 11314
['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
Class Label: alt.atheism
Sample Text:
From: mathew <mathew@mantis.co.uk>
Subject: Re: <Political Atheists?
Organization: Mantis Consultants, Cambridge. UK.
X-Newsreader: rusnews v1.01
Lines: 22

kmr4@po.CWRU.edu (Keith M. Ryan) writes:
> ( I am almost sure that Zyklon-B is immediate and painless method of 
> death. If not, insert soem other form. )
> 
>         And, ethnic and minority groups have been killed, mutilated and 
> exterminated through out history, so I guess it was not unusual.
> 
>         So, you would agree that the 
--------------

In [6]:
from sklearn.datasets import fetch_20newsgroups

# Download the training subset of the 20 Newsgroups dataset
newsgroups_train = fetch_20newsgroups(subset='train')

# Print the number of data points in the training dataset
print(f"Number of data points in the training dataset: {len(newsgroups_train.data)}")

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Download the training dataset
newsgroups_train = fetch_20newsgroups(subset='train')

# Preprocess the data using TF-IDF vectorizer
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(newsgroups_train.data)
y = newsgroups_train.target

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Naive Bayes classifier
clf = MultinomialNB()
clf.fit(X_train, y_train)

# Predict on the validation set
y_pred = clf.predict(X_val)

# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f"Validation Accuracy: {accuracy:.4f}")

Number of data points in the training dataset: 11314
Validation Accuracy: 0.8833


In [23]:
def create_binary_targets(dataset):
    """
    Remaps the 20 newsgroups targets into two binary classes based on the assignment.
    
    Class 1 (Tech & Science) -> 0
    Class 2 (Others)         -> 1
    
    Args:
        dataset: The loaded scikit-learn newsgroups dataset object.
        
    Returns:
        A NumPy array of binary labels (0 or 1).
    """
    
    # Define the groupings from the assignment
    class_1_names = {
        'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware',
        'comp.sys.mac.hardware', 'comp.windows.x', 'sci.crypt', 'sci.electronics',
        'sci.med', 'sci.space'
    }

    # Get the list of 20 string names
    original_target_names = dataset.target_names
    # Get the list of 11,314 numbers (0-19)
    original_indices = dataset.target

    # Create an empty list to store our new binary (0 or 1) labels
    y_binary_list = []

    # Loop through every single index (0-19) in the original dataset
    for index in original_indices:
        # Get the *actual string name* for that document's index
        class_name = original_target_names[index]
        
        # Now, we compare the string name directly
        if class_name in class_1_names:
            # It's a Tech/Science class, so we append 0
            y_binary_list.append(0)
        else:
            # It's any other class, so we append 1
            y_binary_list.append(1)

    # Convert our Python list into a NumPy array
    return np.array(y_binary_list)

In [29]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# 1. Download the training dataset
newsgroups_train = fetch_20newsgroups(subset='train')

# 2. Create the binary labels by calling "create_binary_targets" helper function
y_binary = create_binary_targets(newsgroups_train)

# 3. Preprocess the data using TF-IDF vectorizer
# We use the original text data
X = newsgroups_train.data
vectorizer = TfidfVectorizer(stop_words='english')
X_vec = vectorizer.fit_transform(X)

# 4. Split the data into training and validation sets
# We use our new 'y_binary' as the target
X_train, X_val, y_train, y_val = train_test_split(X_vec, y_binary, test_size=0.2, random_state=72)

# 5. Train a Naive Bayes classifier
clf_nb = MultinomialNB()
clf_nb.fit(X_train, y_train)

# 6. Predict on the validation set
y_pred_nb = clf_nb.predict(X_val)

# 7. Calculate accuracy and other metrics
accuracy_nb = accuracy_score(y_val, y_pred_nb)
print("--- Multinomial Naive Bayes Results ---")
print(f"Validation Accuracy: {accuracy_nb:.4f}\n")

# 0 = Tech & Science, 1 = Sports/Politics/Misc
print(classification_report(y_val, y_pred_nb, target_names=['Class 1 (Tech/Sci)', 'Class 2 (Others)']))

--- Multinomial Naive Bayes Results ---
Validation Accuracy: 0.9452

                    precision    recall  f1-score   support

Class 1 (Tech/Sci)       0.95      0.93      0.94      1066
  Class 2 (Others)       0.94      0.96      0.95      1197

          accuracy                           0.95      2263
         macro avg       0.95      0.94      0.94      2263
      weighted avg       0.95      0.95      0.95      2263

