In [1]:
import os
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import numpy as np

In [None]:
folder_path = '/Users/mac/Documents/code/data mining/Assignment/Assignment2/NB/dataset-news'  

# create corpus and labels
corpus = []
labels = []

# read files
for filename in os.listdir(folder_path):
    file_path = os.path.join(folder_path, filename)
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
        corpus.append(content)

        # extract label from filename
        match = re.match(r'^[A-Za-z]+', filename)
        if match:
            label = match.group()
        else:
            label = 'unknown'  
        labels.append(label)

labels[1] 

'pol'

In [9]:
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(corpus)
X = X.toarray()
print(X.shape)

(800, 14551)


In [11]:
gnb = GaussianNB()
gnb.fit(X, labels)

y_pred_gnb = gnb.predict(X)
print ("Number of mislabeled points out of a total %d points : %d" % \
(X.shape[0],(labels != y_pred_gnb).sum()))

Number of mislabeled points out of a total 800 points : 27


In [12]:
mnb = MultinomialNB(alpha=1.0)
mnb.fit(X, labels)

y_pred_mnb = mnb.predict(X)
print ("Number of mislabeled points out of a total %d points : %d" % \
(X.shape[0],(labels != y_pred_mnb).sum()))

Number of mislabeled points out of a total 800 points : 32


In [13]:
mnb_clf = MultinomialNB()
gnb_clf = GaussianNB()

kf = KFold(n_splits=10, shuffle=True, random_state=42)

mnb_accuracy = []
gnb_accuracy = []

# encode labels
le = LabelEncoder()
y_encoded = le.fit_transform(labels)

for train_index, test_index in kf.split(X):
    # split the data
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y_encoded[train_index], y_encoded[test_index]
    
    # multinomial
    mnb_clf.fit(X_train, y_train)
    y_pred_mnb = mnb_clf.predict(X_test)
    acc_mnb = accuracy_score(y_test, y_pred_mnb)
    mnb_accuracy.append(acc_mnb)
    
    # gaussian
    gnb_clf.fit(X_train, y_train)
    y_pred_gnb = gnb_clf.predict(X_test)
    acc_gnb = accuracy_score(y_test, y_pred_gnb)
    gnb_accuracy.append(acc_gnb)

# compute the average accuracy
print("Multinomial Naive Bayes Average Accuracy: {:.4f} (+/- {:.4f})".format(
    np.mean(mnb_accuracy), np.std(mnb_accuracy)))
print("Gaussian Naive Bayes Average Accuracy: {:.4f} (+/- {:.4f})".format(
    np.mean(gnb_accuracy), np.std(gnb_accuracy)))

Multinomial Naive Bayes Average Accuracy: 0.7913 (+/- 0.0426)
Gaussian Naive Bayes Average Accuracy: 0.7363 (+/- 0.0308)


In [14]:
# Question 2
# encode labels
le = LabelEncoder()
y = le.fit_transform(labels)

kf = KFold(n_splits=10, shuffle=True, random_state=42)
random_accuracy = []

for train_index, test_index in kf.split(X):
    y_test = y[test_index]
    num_classes = len(le.classes_)
    np.random.seed(42)  # set random seed for reproducibility
    y_pred_random = np.random.choice(num_classes, size=len(y_test))
    acc = accuracy_score(y_test, y_pred_random)
    random_accuracy.append(acc)

print("Random Classifier Average Accuracy: {:.4f} (+/- {:.4f})".format(
    np.mean(random_accuracy), np.std(random_accuracy)))

Random Classifier Average Accuracy: 0.1113 (+/- 0.0360)


In [15]:
# Qustion 3
vectorizer = CountVectorizer()
X_1 = vectorizer.fit_transform(corpus)
X_1 = X_1.toarray()
print(X_1.shape)

(800, 14842)


In [16]:
mnb_clf = MultinomialNB()

kf = KFold(n_splits=10, shuffle=True, random_state=42)

mnb_accuracy = []

# encode labels
le = LabelEncoder()
y_encoded = le.fit_transform(labels)

for train_index, test_index in kf.split(X_1):
    # split the data
    X_train, X_test = X_1[train_index], X_1[test_index]
    y_train, y_test = y_encoded[train_index], y_encoded[test_index]
    
    # multinomial
    mnb_clf.fit(X_train, y_train)
    y_pred_mnb = mnb_clf.predict(X_test)
    acc_mnb = accuracy_score(y_test, y_pred_mnb)
    mnb_accuracy.append(acc_mnb)

# compute the average accuracy
print("Multinomial Naive Bayes Average Accuracy: {:.4f} (+/- {:.4f})".format(
    np.mean(mnb_accuracy), np.std(mnb_accuracy)))

Multinomial Naive Bayes Average Accuracy: 0.7450 (+/- 0.0384)


In [None]:
# Question 4

# encode labels
le = LabelEncoder()
y_encoded = le.fit_transform(labels)

# define the labels for each task
task1_labels = ['pol', 'hockey', 'mac'] # “use of guns”, “hockey”, “Mac hardware”
task2_labels = ['mac', 'ibm', 'electronics'] # “Mac hardware”, “IBM hardware”, “electronics”

# filter the documents for each task
task1_indices = [i for i, label in enumerate(labels) if label in task1_labels]
X_task1 = X[task1_indices]
y_task1 = y_encoded[task1_indices]

# filter the documents for each task
task2_indices = [i for i, label in enumerate(labels) if label in task2_labels]
X_task2 = X[task2_indices]
y_task2 = y_encoded[task2_indices]

# cross validation
def cross_validate(X, y):
    kf = KFold(n_splits=10, shuffle=True, random_state=42)
    accuracy = []

    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        clf = MultinomialNB()
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        accuracy.append(acc)

    return np.mean(accuracy), np.std(accuracy)

# task1 accuracy
task1_accuracy, task1_std = cross_validate(X_task1, y_task1)
print("Task 1 Average Accuracy: {:.4f} (+/- {:.4f})".format(task1_accuracy, task1_std))

# task2 accuracy
task2_accuracy, task2_std = cross_validate(X_task2, y_task2)
print("Task 2 Average Accuracy: {:.4f} (+/- {:.4f})".format(task2_accuracy, task2_std))

# analyze the results
if task1_accuracy > task2_accuracy:
    print("Task 1 is easier than Task 2.")
else:
    print("Task 2 is easier than Task 1.")

Task 1 Average Accuracy: 0.9367 (+/- 0.0433)
Task 2 Average Accuracy: 0.7600 (+/- 0.0593)
Task 1 is easier than Task 2.
