# In [None]:
import nltk
from nltk.corpus import reuters
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Download the Reuters corpus through NLTK
nltk.download('reuters')

# List every category available in the Reuters corpus
all_categories = reuters.categories()
print("All Possible Categories:")
print(all_categories)

# Build a small labelled subset: at most 50 documents from each chosen category.
# NOTE(review): a Reuters document may be filed under several categories, so the
# same text could appear here with more than one label -- confirm this is acceptable.
categories = ['earn', 'acq', 'nat-gas']
documents = [(reuters.raw(fileid), category)
             for category in categories
             for fileid in reuters.fileids(category)[:50]]

# Shuffle with a dedicated, seeded generator: the document order (and therefore
# the seeded train/test split performed later) is identical on every run, and
# the global `random` state is left untouched.
import random
random.Random(42).shuffle(documents)

# Separate features (raw text) and labels (one of the names in `categories`)
X = [text for text, _ in documents]
y = [label for _, label in documents]

# Split the raw texts BEFORE vectorizing or tuning, so that nothing derived
# from the held-out documents (vocabulary, hyperparameters, fitted weights)
# can leak into the models that are later scored on them.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Learn the vocabulary from the training texts only, then reuse it unchanged
# for the held-out texts.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

# Full corpus encoded with the training vocabulary. Not used for fitting;
# kept only so that any later code expecting `X_vec` keeps working.
X_vec = vectorizer.transform(X)

# Train a Multinomial Naive Bayes classifier with parameter tuning.
# The 5-fold search runs on the training split only; GridSearchCV then refits
# the best configuration on that same split (refit=True is the default).
nb_classifier = MultinomialNB()
nb_param_grid = {'alpha': [0.1, 0.5, 1.0, 1.5, 2.0]}
nb_grid_search = GridSearchCV(nb_classifier, nb_param_grid, cv=5)
nb_grid_search.fit(X_train, y_train)
best_nb_classifier = nb_grid_search.best_estimator_

# Train a Logistic Regression classifier with parameter tuning
lr_classifier = LogisticRegression()
lr_param_grid = {'C': [0.1, 0.5, 1.0, 1.5, 2.0], 'max_iter': [100, 200, 300]}
lr_grid_search = GridSearchCV(lr_classifier, lr_param_grid, cv=5)
lr_grid_search.fit(X_train, y_train)
best_lr_classifier = lr_grid_search.best_estimator_

# Train a Random Forest classifier with parameter tuning.
# A fixed random_state keeps the forest (and the grid-search ranking) repeatable.
rf_classifier = RandomForestClassifier(random_state=42)
rf_param_grid = {'n_estimators': [50, 100, 150], 'max_depth': [None, 10, 20]}
rf_grid_search = GridSearchCV(rf_classifier, rf_param_grid, cv=5)
rf_grid_search.fit(X_train, y_train)
best_rf_classifier = rf_grid_search.best_estimator_

# Collect the tuned models under human-readable names (insertion order is the
# order in which results are printed)
classifiers = dict([
    ('Multinomial Naive Bayes', best_nb_classifier),
    ('Logistic Regression', best_lr_classifier),
    ('Random Forest', best_rf_classifier),
])

# Score each model on the test split
for name in classifiers:
    model = classifiers[name]
    y_pred = model.predict(X_test)

    # Overall accuracy, shown as a percentage with two decimals
    accuracy = accuracy_score(y_test, y_pred)
    print(f"\n{name} Accuracy: {accuracy:.2%}")

    # Per-class precision / recall / F1 breakdown
    report = classification_report(y_test, y_pred)
    print(f"Classification Report for {name}:\n", report)

# Hand-written sample texts used as a quick sanity check of the tuned models
earnings_article = """
TechCorp announced record-breaking earnings in the third quarter of this year. The company's profits soared by 20%, driven by strong sales of their latest products.
"""

acquisitions_article = """
In a major move, Company XYZ has acquired ABC Corp, marking a significant consolidation in the industry. The acquisition is expected to strengthen their market presence and drive future growth.
"""

# NOTE: despite the variable name, this sample's text is about natural gas
sugar_article = """
Natural gas price goes up.
"""

# A very short snippet
test_article = '''UK is importing'''

# The order of this list is the order in which predictions are printed
news_articles = [earnings_article, acquisitions_article, sugar_article, test_article]

# Encode the samples with the already-fitted vectorizer so the feature columns
# match what the classifiers were trained on
X_test_vec = vectorizer.transform(news_articles)

# For every tuned model, predict one label per sample and print it next to the text
for name, model in classifiers.items():
    predicted_labels = model.predict(X_test_vec)

    print(f"\n{name} Predicted Labels:")
    for idx, article in enumerate(news_articles):
        label = predicted_labels[idx]
        print(f"Article:\n{article}\nPredicted Label: {label}\n{'='*50}\n")
