In [2]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, confusion_matrix
import pandas as pd

# Initial System Implementation
# fetch_20newsgroups provides news articles that have already been classified (labelled).
# We treat each news article as if it were an email; the principles of multi-class classification are the same.
# First we map the individual labels to top-level categories.
# We then train a classifier to predict/assign a single top-level category.

# This provides us with a fully working example of the initial multi-class classification system suggested in the CA Brief

# The second step will be to implement a chained multi-output, multi-label classification system.

# Top-level categories and the newsgroups that roll up into each one.
# Only the newsgroups listed here are requested from the dataset below.
top_level_groups = {
    'technology': (
        'comp.graphics',
        'comp.os.ms-windows.misc',
        'comp.sys.ibm.pc.hardware',
        'comp.sys.mac.hardware',
        'comp.windows.x',
    ),
    'science': (
        'sci.crypt',
        'sci.electronics',
        'sci.med',
        'sci.space',
    ),
    'sport': (
        'rec.autos',
        'rec.motorcycles',
        'rec.sport.baseball',
        'rec.sport.hockey',
    ),
    'politics': (
        'talk.politics.guns',
        'talk.politics.mideast',
        'talk.politics.misc',
    ),
    'religion': (
        'alt.atheism',
        'soc.religion.christian',
        'talk.religion.misc',
    ),
}

# Flatten the groups into a newsgroup -> top-level category lookup
category_mapping = {
    newsgroup: top_level
    for top_level, newsgroups in top_level_groups.items()
    for newsgroup in newsgroups
}

# Fetch the selected newsgroups with headers, footers and quoted replies removed
data = fetch_20newsgroups(
    subset='all',
    categories=category_mapping.keys(),
    remove=('headers', 'footers', 'quotes'),
)

# Resolve each numeric target once, then look every document up by its target index
top_level_by_index = [category_mapping[name] for name in data.target_names]
new_targets = [top_level_by_index[index] for index in data.target]

# Hold out 25% of the documents for evaluation (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    data.data,
    new_targets,
    test_size=0.25,
    random_state=42,
)

# TF-IDF features: the vocabulary is learned from the training split only
# and then reused unchanged to transform the test split
vectorizer = TfidfVectorizer(stop_words='english', max_features=10000)
X_train_transformed = vectorizer.fit_transform(X_train)
X_test_transformed = vectorizer.transform(X_test)

# Fit the Naive Bayes classifier on the training features
model = MultinomialNB().fit(X_train_transformed, y_train)

# Score the held-out documents: per-class metrics first, then the confusion matrix
y_pred = model.predict(X_test_transformed)
print(
    classification_report(y_test, y_pred),
    confusion_matrix(y_test, y_pred),
    sep='\n',
)

              precision    recall  f1-score   support

    politics       0.84      0.77      0.80       649
    religion       0.88      0.74      0.81       611
     science       0.84      0.74      0.79       985
       sport       0.88      0.89      0.88       993
  technology       0.80      0.97      0.87      1230

    accuracy                           0.84      4468
   macro avg       0.85      0.82      0.83      4468
weighted avg       0.84      0.84      0.84      4468

[[ 497   34   39   46   33]
 [  58  453   40   24   36]
 [  25   14  733   45  168]
 [   8    7   30  884   64]
 [   3    4   27    8 1188]]


In [3]:
# Recover each document's newsgroup name, then roll it up to its top-level category
original_labels = [data.target_names[index] for index in data.target]
mapped_labels = [category_mapping[newsgroup] for newsgroup in original_labels]

# One row per document: the text plus both label granularities
df = pd.DataFrame({'text': data.data}).assign(
    original_category=original_labels,
    mapped_category=mapped_labels,
)

# Write the labelled documents to CSV, leaving out the index column
df.to_csv('mapped_newsgroups_with_original.csv', index=False)

# Show the first rows as a quick check of the result
print(df.head())


                                                text   original_category  \
0  \nA(>  Can anyone tell me if a bloodcount of 4...             sci.med   
1                                                 \n    rec.sport.hockey   
2  \n\n  Currently, there is a bill before the Te...  talk.politics.guns   
3  \n    Nationwide, the immunization rate among ...  talk.politics.misc   
4  \n\n\nWell then given your definition of "best...  rec.sport.baseball   

  mapped_category  
0         science  
1           sport  
2        politics  
3        politics  
4           sport  


In [None]:
# TODO: Implement a chained multi-output, multi-label classification system

