In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.metrics import accuracy_score

# Step 1: the libraries used below are imported at the top of this cell.

# Step 2: fetch the complete corpus (train + test) with headers, footers and
# quoted replies removed from every post.
newsgroups = fetch_20newsgroups(
    subset="all",
    remove=("headers", "footers", "quotes"),
)

# Steps 3-4: fetch the full train split first, then the full test split.
train_data, test_data = (
    fetch_20newsgroups(subset=split) for split in ("train", "test")
)

# Step 5: show every category name of the full training split.
print(f"Target Labels: {train_data.target_names}")

# Step 6: the three categories the rest of this cell works with.
categories = ["alt.atheism", "comp.graphics", "sci.space"]

# Steps 7-8: fetch train and test again, restricted to those categories.
train_subset, test_subset = (
    fetch_20newsgroups(subset=split, categories=categories)
    for split in ("train", "test")
)

# Step 9: category names of the restricted training split.
print(f"New Training Set Labels: {train_subset.target_names}")

# Step 10: index 4 is the fifth article of the restricted training split.
print(f"5th Training Article:\n {train_subset.data[4]}")

# Step 11: number of documents and number of target entries.
print(f"Train Data Shape: {len(train_subset.data)}")
print(f"Train Target Shape: {len(train_subset.target)}")

# Step 12: only the first five file paths are shown.
print("Training Set Filenames:", train_subset.filenames[:5])

# Steps 13 and 15: learn the vocabulary from the training posts only, then
# encode both splits as term-count matrices over that vocabulary.
count_vectorizer = CountVectorizer()
X_train_counts = count_vectorizer.fit_transform(train_subset.data)
X_test_counts = count_vectorizer.transform(test_subset.data)

# Step 14: fit a Bernoulli Naive Bayes classifier (fit() returns the estimator).
bernoulli_nb = BernoulliNB().fit(X_train_counts, train_subset.target)

# Steps 16-17: predict the test labels and report the accuracy.
y_pred = bernoulli_nb.predict(X_test_counts)
accuracy = accuracy_score(y_true=test_subset.target, y_pred=y_pred)
print(f"BernoulliNB Accuracy: {accuracy}")

# Step 18: same pipeline with TF-IDF features and a Multinomial Naive Bayes model.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(train_subset.data)
X_test_tfidf = tfidf_vectorizer.transform(test_subset.data)
multinomial_nb = MultinomialNB().fit(X_train_tfidf, train_subset.target)

# Step 19: accuracy of the TF-IDF model on the test split.
y_pred_tfidf = multinomial_nb.predict(X_test_tfidf)
accuracy_tfidf = accuracy_score(y_true=test_subset.target, y_pred=y_pred_tfidf)
print(f"MultinomialNB Accuracy: {accuracy_tfidf}")

# Step 20: repeat the TF-IDF / MultinomialNB run with English stop words
# excluded from the vocabulary.
tfidf_vectorizer_stopwords = TfidfVectorizer(stop_words="english")
X_train_tfidf_sw = tfidf_vectorizer_stopwords.fit_transform(train_subset.data)
X_test_tfidf_sw = tfidf_vectorizer_stopwords.transform(test_subset.data)
multinomial_nb_sw = MultinomialNB().fit(X_train_tfidf_sw, train_subset.target)

y_pred_tfidf_sw = multinomial_nb_sw.predict(X_test_tfidf_sw)
accuracy_tfidf_sw = accuracy_score(
    y_true=test_subset.target,
    y_pred=y_pred_tfidf_sw,
)
print(f"MultinomialNB Accuracy (with stopwords removed): {accuracy_tfidf_sw}")


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Read the train split first, then the test split, from their CSV exports.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("/mnt/data/nlp_train.csv", "/mnt/data/nlp_test.csv")
)

# Every distinct category present in the training file.
print(f"Unique target labels: {train_df['category'].unique()}")

# Keep only the rows that belong to the three categories of interest.
categories = ["alt.atheism", "comp.graphics", "sci.space"]
train_df = train_df.loc[train_df["category"].isin(categories)]
test_df = test_df.loc[test_df["category"].isin(categories)]

# Categories that remain in the training rows after filtering.
print(f"Filtered target labels: {train_df['category'].unique()}")

# Position 4 is the fifth remaining training row.
print("5th article text:", train_df["text"].iloc[4])

# (rows, columns) of each filtered frame.
print(f"Training set shape: {train_df.shape}")
print(f"Testing set shape: {test_df.shape}")

# File names are shown only when the CSV carries a 'filename' column.
if "filename" in train_df:
    print("Training filenames:", train_df["filename"].head())

# pandas.read_csv parses empty cells as NaN, and the scikit-learn text
# vectorizers refuse NaN documents. Replace missing text with an empty string
# so that empty articles are encoded as all-zero rows instead of crashing.
train_text = train_df['text'].fillna('')
test_text = test_df['text'].fillna('')

# Convert text to numerical format using CountVectorizer. The vocabulary is
# learned from the training text only and reused for the test text.
vectorizer = CountVectorizer()
X_train_counts = vectorizer.fit_transform(train_text)
X_test_counts = vectorizer.transform(test_text)

# Targets are the category-name strings themselves; no numeric encoding is
# applied here (the classifiers below are fitted on these string labels).
y_train = train_df['category']
y_test = test_df['category']

# Train Bernoulli Naive Bayes model on the count features
bnb = BernoulliNB()
bnb.fit(X_train_counts, y_train)

# Predict target labels for test set
y_pred = bnb.predict(X_test_counts)

# Find accuracy score
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy using BernoulliNB:", accuracy)

# Use TfidfVectorizer instead of CountVectorizer
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(train_text)
X_test_tfidf = tfidf_vectorizer.transform(test_text)

# Train Multinomial Naive Bayes model on the TF-IDF features
mnb = MultinomialNB()
mnb.fit(X_train_tfidf, y_train)

# Predict and evaluate accuracy
y_pred_tfidf = mnb.predict(X_test_tfidf)
tfidf_accuracy = accuracy_score(y_test, y_pred_tfidf)
print("Accuracy using MultinomialNB with TF-IDF:", tfidf_accuracy)

# Repeat the count-based BernoulliNB run with English stop words excluded
# from the vocabulary.
vectorizer_stop = CountVectorizer(stop_words='english')
X_train_counts_stop = vectorizer_stop.fit_transform(train_text)
X_test_counts_stop = vectorizer_stop.transform(test_text)

bnb_stop = BernoulliNB()
bnb_stop.fit(X_train_counts_stop, y_train)
y_pred_stop = bnb_stop.predict(X_test_counts_stop)
stopword_accuracy = accuracy_score(y_test, y_pred_stop)
print("Accuracy using BernoulliNB with stopwords removed:", stopword_accuracy)


In [None]:
import pandas as pd

# Locations of the uploaded CSV files
train_file_path = "/mnt/data/newsgroups_train.csv"
test_file_path = "/mnt/data/newsgroups_test.csv"

# Read the CSV files
train_data = pd.read_csv(train_file_path)
test_data = pd.read_csv(test_file_path)

# Display basic information about the datasets.
# info() writes its summary itself, but head() only returns a DataFrame; an
# expression in the middle of a cell is not echoed, so the previews have to be
# printed explicitly to be visible.
train_data.info()
test_data.info()
print(train_data.head())
print(test_data.head())

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Step 1: Import required libraries (done in the import lines of this cell)

# Step 2: Prepare subsets of categories
selected_categories = ["alt.atheism", "comp.graphics", "sci.space"]
train_subset = train_data[train_data["category"].isin(selected_categories)]
test_subset = test_data[test_data["category"].isin(selected_categories)]

# pandas.read_csv parses empty cells as NaN. NaN cannot be sliced or
# tokenised, so missing article text is replaced by an empty string before it
# is used anywhere below.
train_text = train_subset["text"].fillna("")
test_text = test_subset["text"].fillna("")

# Step 3: Collect the distinct target values of the filtered training set
target_labels = train_subset["target"].unique()

# Step 4: Collect the distinct category names of the filtered training set
target_names = train_subset["category"].unique()

# Step 5: Collect the text of the 5th filtered training article (position 4)
fifth_article = train_text.iloc[4]

# Step 6: Collect the (rows, columns) shape of both filtered frames
train_shape = train_subset.shape
test_shape = test_subset.shape

# Step 7: Training set filenames are skipped; this cell does not read a
# filename column.

# Steps 8 and 10: Convert text into count features. The vocabulary is learned
# from the training text and reused for the test text; the test matrix is
# computed once and shared by the prediction step.
count_vectorizer = CountVectorizer()
X_train_counts = count_vectorizer.fit_transform(train_text)
X_test_counts = count_vectorizer.transform(test_text)

# Step 9: Train using BernoulliNB
bnb = BernoulliNB()
bnb.fit(X_train_counts, train_subset["target"])

# Step 11: Predict target labels for the testing set
bnb_predictions = bnb.predict(X_test_counts)

# Step 12: Find accuracy score on the test set
bnb_accuracy = accuracy_score(test_subset["target"], bnb_predictions)

# Step 13: Use TfidfVectorizer instead of CountVectorizer and train MultinomialNB
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(train_text)
X_test_tfidf = tfidf_vectorizer.transform(test_text)

mnb = MultinomialNB()
mnb.fit(X_train_tfidf, train_subset["target"])

# Step 14: Predict target labels using MultinomialNB
mnb_predictions = mnb.predict(X_test_tfidf)

# Step 15: Find test set accuracy for MultinomialNB
mnb_accuracy = accuracy_score(test_subset["target"], mnb_predictions)

# Step 16: Repeat the count-based BernoulliNB run with English stop words
# excluded from the vocabulary
count_vectorizer_stopwords = CountVectorizer(stop_words='english')
X_train_counts_sw = count_vectorizer_stopwords.fit_transform(train_text)
X_test_counts_sw = count_vectorizer_stopwords.transform(test_text)

bnb_sw = BernoulliNB()
bnb_sw.fit(X_train_counts_sw, train_subset["target"])
bnb_sw_predictions = bnb_sw.predict(X_test_counts_sw)
bnb_sw_accuracy = accuracy_score(test_subset["target"], bnb_sw_predictions)

# Gather everything collected above into one summary. Nothing in this block
# prints; the summary is shown because it is the last expression of the cell.
results = {
    "Target Labels": target_labels,
    "Target Names": target_names,
    "5th Article": fifth_article[:500],  # Limiting to 500 characters for display
    "Train Shape": train_shape,
    "Test Shape": test_shape,
    "BernoulliNB Accuracy": bnb_accuracy,
    "MultinomialNB Accuracy": mnb_accuracy,
    "BernoulliNB Accuracy (with stopwords removed)": bnb_sw_accuracy,
}
results


In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.metrics import accuracy_score

# Load the train and test splits of 20 Newsgroups, restricted to three
# categories and with headers, footers and quoted replies removed.
categories = ['alt.atheism', 'comp.graphics', 'sci.space']
newsgroups_train = fetch_20newsgroups(subset='train', categories=categories, remove=('headers', 'footers', 'quotes'))
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories, remove=('headers', 'footers', 'quotes'))

# Print the category names of the loaded training split
target_labels = list(newsgroups_train.target_names)
print("Target Labels:", target_labels)

# Print the names of the categories that actually occur in the training
# targets. `target` holds integer codes that index into `target_names`, so the
# distinct codes are translated to names to match the printed label.
present_codes = np.unique(newsgroups_train.target)
print("Training Target Names:", [newsgroups_train.target_names[code] for code in present_codes])

# Print 5th article text (index 4), truncated to 500 characters
print("\n5th Training Article:\n", newsgroups_train.data[4][:500], "...")  # Limited display

# Print the number of documents and of target entries in each split
print("\nTrain Data Shape:", len(newsgroups_train.data), "Train Target Shape:", len(newsgroups_train.target))
print("Test Data Shape:", len(newsgroups_test.data), "Test Target Shape:", len(newsgroups_test.target))

# Convert text data to numerical format using CountVectorizer. The vocabulary
# is learned from the training posts; the test posts are encoded once with the
# same vocabulary and that matrix is reused for prediction below.
count_vectorizer = CountVectorizer()
X_train_counts = count_vectorizer.fit_transform(newsgroups_train.data)
X_test_counts = count_vectorizer.transform(newsgroups_test.data)

# Train Naive Bayes classifier using BernoulliNB
bnb = BernoulliNB()
bnb.fit(X_train_counts, newsgroups_train.target)

# Predict target labels for test set
y_pred_bnb = bnb.predict(X_test_counts)

# Find accuracy score on test set using BernoulliNB
accuracy_bnb = accuracy_score(newsgroups_test.target, y_pred_bnb)
print("\nAccuracy using BernoulliNB:", accuracy_bnb)

# Use TfidfVectorizer instead of CountVectorizer
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(newsgroups_train.data)
X_test_tfidf = tfidf_vectorizer.transform(newsgroups_test.data)

# Train Naive Bayes classifier using MultinomialNB
mnb = MultinomialNB()
mnb.fit(X_train_tfidf, newsgroups_train.target)

# Predict target labels for test set
y_pred_mnb = mnb.predict(X_test_tfidf)

# Find accuracy score using MultinomialNB
accuracy_mnb = accuracy_score(newsgroups_test.target, y_pred_mnb)
print("Accuracy using MultinomialNB:", accuracy_mnb)

# Repeat the TF-IDF / MultinomialNB run with English stop words excluded from
# the vocabulary
tfidf_vectorizer_sw = TfidfVectorizer(stop_words='english')
X_train_tfidf_sw = tfidf_vectorizer_sw.fit_transform(newsgroups_train.data)
X_test_tfidf_sw = tfidf_vectorizer_sw.transform(newsgroups_test.data)

# Train MultinomialNB with stopwords removed
mnb_sw = MultinomialNB()
mnb_sw.fit(X_train_tfidf_sw, newsgroups_train.target)

# Predict test labels with stopwords removed
y_pred_mnb_sw = mnb_sw.predict(X_test_tfidf_sw)

# Find accuracy with stopwords removed
accuracy_mnb_sw = accuracy_score(newsgroups_test.target, y_pred_mnb_sw)
print("Accuracy using MultinomialNB (without stopwords):", accuracy_mnb_sw)
