In [None]:
from nltk.corpus import brown
import nltk
import random
import re
# Collect one (word tokens, category) pair for every file of every Brown
# category, then shuffle the pairs in place.
documents = []
for category in brown.categories():
    for fileid in brown.fileids(category):
        tokens = list(brown.words(fileid))
        documents.append((tokens, category))
random.shuffle(documents)

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer


def clean_text_list(text_list):
    """Normalise every text in ``text_list`` and return the cleaned strings.

    Each text is processed as follows: every character that is not an ASCII
    letter is replaced by a space, the text is lower-cased and split on
    whitespace, English stop words are removed, the remaining words are
    lemmatised with ``WordNetLemmatizer`` (default settings), and the words
    are joined back together with single spaces.

    Args:
        text_list: Iterable of strings, e.g. a list of texts or an open text
            file (which yields one string per line).

    Returns:
        A list with one cleaned string per input text, in the same order.
        An entry is ``''`` when nothing survives the cleaning.
    """
    texts = list(text_list)
    if not texts:
        # Nothing to clean, so do not load the NLTK resources at all.
        return []

    # These do not depend on the individual text: build them once instead of
    # once per text.
    non_letters = re.compile('[^a-zA-Z]')
    stopwords_set = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    cleaned_text_list = []
    for text in texts:
        words = non_letters.sub(' ', text).lower().split()
        words = [
            lemmatizer.lemmatize(word)
            for word in words
            if word not in stopwords_set
        ]
        # Join the cleaned words back into a single string.
        cleaned_text_list.append(' '.join(words))

    return cleaned_text_list


In [None]:
# Number of words used as classifier features.
VOCABULARY_SIZE = 2000

# Frequency of every distinct lower-cased token in the Brown corpus.
all_words = nltk.FreqDist(w.lower() for w in brown.words())

# Vocabulary = the VOCABULARY_SIZE most frequent words that survive cleaning.
# - Iterating a FreqDist yields words in first-seen order, so
#   list(all_words)[:2000] picked the first 2000 distinct words of the corpus;
#   most_common() yields them from most to least frequent.
# - clean_text_list returns a new list and leaves its argument untouched, so
#   its result has to be used. A word whose cleaned form is '' (stop word,
#   punctuation, digits) is skipped; the original token is kept as the feature
#   so that it still matches the raw document tokens.
word_features = []
for word, _count in all_words.most_common():
    if len(word_features) == VOCABULARY_SIZE:
        break
    if clean_text_list([word])[0]:
        word_features.append(word)
def document_features(document, vocabulary=None):
    """Build the boolean bag-of-words feature dict for one document.

    Args:
        document: Iterable of word tokens (strings).
        vocabulary: Optional iterable of feature words. Defaults to the
            module-level ``word_features``.

    Returns:
        A dict mapping ``'contains(<word>)'`` to ``True`` when the word occurs
        in the document (ignoring case) and ``False`` otherwise, with one
        entry per vocabulary word.
    """
    if vocabulary is None:
        vocabulary = word_features

    # The vocabulary is built from lower-cased tokens, so the document tokens
    # are lower-cased too; otherwise "The" would never match "the".
    document_words = {token.lower() for token in document}

    return {
        'contains({})'.format(word): (word.lower() in document_words)
        for word in vocabulary
    }

In [None]:
# Extract features for every document once; this is the expensive step.
featuresets = [(document_features(d), c) for (d, c) in documents]

# Labelled features of ALL documents, used further down to train the final
# classifier. The content is identical to featuresets, so copy the list
# instead of running the feature extraction a second time.
train_features = list(featuresets)

# Hold out the first 100 documents for testing and train on the rest.
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)

brown: The Brown Corpus, a general corpus of texts in English. Its 15 categories are adventure, belles_lettres, editorial, fiction, government, hobbies, humor, learned, lore, mystery, news, religion, reviews, romance, and science_fiction.

In [None]:
# Accuracy of the trained classifier on the held-out test set, followed by
# the five features it found most informative.
holdout_accuracy = nltk.classify.accuracy(classifier, test_set)
print(holdout_accuracy)
classifier.show_most_informative_features(n=5)


The accuracy isn't high enough, only 0.68 here. (Because of the random shuffle, the accuracy may differ between runs.) Therefore I am going to enlarge the sample size and try another way
to improve the accuracy. Since this corpus has more than two categories, a cross-validation method should perform well.

In [None]:



# NOTE(review): this binds the module-level name `classifier` to the
# NaiveBayesClassifier class itself, replacing the trained classifier created
# in the earlier cell. cross_validation below assigns its own local
# `classifier`, so it does not read this binding.
classifier = nltk.NaiveBayesClassifier

def cross_validation(num_folds, documents):
    """Estimate Naive Bayes accuracy with k-fold cross-validation.

    The documents are split, in their given order, into ``num_folds``
    consecutive folds. Each fold is used once as the test set while a
    classifier is trained on all the other folds.

    Args:
        num_folds: Number of folds; must be at least 2 and at most
            ``len(documents)``.
        documents: List of ``(tokens, category)`` pairs. The list is not
            shuffled here, so it should already be in random order.

    Returns:
        The mean of the per-fold accuracies as a float.

    Raises:
        ValueError: If ``num_folds`` is outside ``2..len(documents)``.
    """
    total = len(documents)
    if not 2 <= num_folds <= total:
        raise ValueError(
            'num_folds must be between 2 and the number of documents '
            '({}), got {}'.format(total, num_folds)
        )

    # A document's features do not depend on the fold, so extract them once
    # instead of once per fold.
    labelled_feats = [
        (document_features(doc), category) for (doc, category) in documents
    ]

    accuracies = []
    for i in range(num_folds):
        # Balanced boundaries: every document lands in exactly one test fold,
        # even when the document count is not a multiple of num_folds. When it
        # is a multiple, these equal i * (total // num_folds).
        start = i * total // num_folds
        stop = (i + 1) * total // num_folds
        test_feats = labelled_feats[start:stop]
        train_feats = labelled_feats[:start] + labelled_feats[stop:]
        classifier = nltk.NaiveBayesClassifier.train(train_feats)
        accuracies.append(nltk.classify.accuracy(classifier, test_feats))

    return sum(accuracies) / num_folds

# Run 10-fold cross-validation over the Brown documents and report the mean
# accuracy as a percentage.
# Author's observation: across several runs, this was the only one in which
# the cross-validated accuracy came out below the single-split Naive Bayes
# accuracy.
num_folds = 10
avg_accuracy = cross_validation(num_folds=num_folds, documents=documents)
print(f'Average accuracy over {num_folds}-fold cross-validation: {avg_accuracy:.2%}')

Here we define a function cross_validation() that takes as input the number of folds to use, as well as the list of labeled documents. The function does not shuffle the documents itself; it relies on the list having already been shuffled when it was built. It divides the list into num_folds subsets of equal size. For each fold, the function trains a Naive Bayes classifier on the training set (all the other folds) and evaluates its accuracy on the test set (the current fold). It then computes the average accuracy over all folds and returns this value.

Now that the classifier is built, I will show some examples of applying it to newspaper articles. It can also be applied to a whole corpus in the same way.

In [None]:
import os

# Folder that contains the dataset (one text file per article).
path = "/Users/apple/Downloads/archive (1)/entertainment"

# Sorted so that the corpus order does not depend on the file system.
files = sorted(os.listdir(path))

# Read every regular file in the folder exactly once.
document_corpus = []
for file_name in files:
    file_path = os.path.join(path, file_name)
    # Test the full path: a bare file name would be resolved against the
    # current working directory, not against `path`.
    if os.path.isdir(file_path):
        continue
    with open(file_path, encoding="latin-1") as f:
        # Keep the decoded text (str). Re-encoding it to bytes would make the
        # documents unusable with the regex-based cleaning defined above.
        document_corpus.append(f.read())

# Report what was loaded without dumping the whole corpus into the output.
print(f"Loaded {len(document_corpus)} documents from {path}")
if document_corpus:
    print(document_corpus[0][:200])

In [None]:
def read_lines(file_path):
    """Return the lines of a latin-1 encoded text file as a list of strings.

    The file is closed before returning. Each line keeps its trailing
    newline, exactly as when iterating over the open file.
    """
    with open(file_path, "r", encoding="latin-1") as handle:
        return handle.readlines()


# Load the example articles as lists of lines rather than open file handles:
# a handle is never closed here and is exhausted after one pass, so re-running
# a later cell would silently see an empty document.
example1 = read_lines("/Users/apple/Downloads/archive (1)/entertainment/entertainment_1.txt")
example2 = read_lines("/Users/apple/Downloads/archive (1)/entertainment/entertainment_2.txt")
example3 = read_lines("/Users/apple/Downloads/archive (1)/entertainment/entertainment_3.txt")

In [None]:
# Final classifier, trained on the features of all Brown documents.
classifier = nltk.NaiveBayesClassifier.train(train_features)


def classify_text(lines):
    """Return the Brown category predicted for one text.

    Args:
        lines: Iterable of strings making up the text, e.g. the lines of a
            file.

    Returns:
        The category label chosen by ``classifier``.
    """
    # clean_text_list returns one cleaned string per line, but
    # document_features tests membership of individual words, so the cleaned
    # lines are split back into word tokens first.
    tokens = [token for line in clean_text_list(lines) for token in line.split()]
    return classifier.classify(document_features(tokens))


classification1 = classify_text(example1)
print(classification1)

classification2 = classify_text(example2)
print(classification2)

classification3 = classify_text(example3)
print(classification3)