<a href="https://colab.research.google.com/github/sajeebkhan-gpt/Ai-and-data-science-roadmap/blob/main/News_Classification_with_bbc_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

!pip install pandas scikit-learn nltk matplotlib


In [None]:
import pandas as pd


# Read the dataset from a CSV file located next to the notebook.
df = pd.read_csv("bbc-text.csv")

# Show the first rows, followed by the number of rows per category label.
preview_rows = df.head()
print(preview_rows)

category_counts = df["category"].value_counts()
print(category_counts)


In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string


# Fetch the NLTK data packages this notebook relies on, in a fixed order.
for resource in ("punkt", "stopwords", "wordnet", "punkt_tab"):
    nltk.download(resource)

# Shared helpers used by preprocess(): a set of English stop words for
# fast membership tests, and a single reusable lemmatizer instance.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

def preprocess(text):
    """Normalize raw text into a space-separated string of lemmatized tokens.

    The text is lowercased and tokenized with ``nltk.word_tokenize``. Tokens
    found in the module-level ``stop_words`` set and tokens made up entirely
    of ASCII punctuation are dropped; the remaining tokens are lemmatized
    with the module-level ``lemmatizer`` and joined with single spaces.

    Args:
        text: The raw document as a string.

    Returns:
        The cleaned tokens joined by single spaces.
    """

    def is_punctuation(token):
        # `token in string.punctuation` would be a substring test against one
        # long string, so multi-character tokens such as "``", "''", "--" or
        # "..." would never match. Check every character instead.
        return all(ch in string.punctuation for ch in token)

    # Lowercase, then tokenize
    tokens = nltk.word_tokenize(text.lower())
    # Remove stopwords & punctuation
    kept = [t for t in tokens if t not in stop_words and not is_punctuation(t)]
    # Lemmatize and re-join
    return " ".join(lemmatizer.lemmatize(t) for t in kept)

# Clean every document in the "text" column and store the result alongside it.
df["clean_text"] = df["text"].map(preprocess)

# Preview the frame, now including the new column.
preview_rows = df.head()
print(preview_rows)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Inputs are the cleaned documents; targets are the category labels.
X, y = df["clean_text"], df["category"]

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Learn the TF-IDF vocabulary from the training split only, then reuse the
# fitted vectorizer to encode the held-out split.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Train a multinomial Naive Bayes classifier on the TF-IDF training matrix.
# fit() returns the estimator itself, so construction and fitting can be chained.
model = MultinomialNB().fit(X_train_tfidf, y_train)

# Predict labels for the held-out documents.
y_pred = model.predict(X_test_tfidf)

# Report overall accuracy followed by the per-class metrics.
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)

report_text = classification_report(y_test, y_pred)
print("\nClassification Report:\n", report_text)


Accuracy: 0.9809782608695652

Classification Report:
                precision    recall  f1-score   support

     business       0.98      0.98      0.98        89
entertainment       0.99      0.99      0.99        72
     politics       0.97      0.97      0.97        58
        sport       0.99      1.00      0.99        82
         tech       0.98      0.97      0.98        67

     accuracy                           0.98       368
    macro avg       0.98      0.98      0.98       368
 weighted avg       0.98      0.98      0.98       368



In [None]:
def predict_news(text):
    """Predict the category label for a single piece of text.

    The text is cleaned with ``preprocess``, encoded with the fitted
    module-level ``vectorizer``, and classified by the module-level ``model``.

    Args:
        text: The raw text to classify.

    Returns:
        The first (and only) element of the model's prediction array.
    """
    # The vectorizer expects an iterable of documents, hence the one-item list.
    features = vectorizer.transform([preprocess(text)])
    return model.predict(features)[0]

# Try the classifier on two hand-written example sentences.
for headline in (
    "The government announced new economic policies.",
    "Manchester United won their latest football match.",
):
    print(predict_news(headline))


business
sport


In [None]:

# Interactive demo: keep classifying whatever the user types until they quit.
while True:
    try:
        text = input("Enter a news text (or type 'exit' to quit): ")
    except (EOFError, KeyboardInterrupt):
        # The input stream was closed or the user interrupted the prompt:
        # leave the loop cleanly instead of ending the cell with a traceback.
        break

    # Ignore surrounding whitespace so that e.g. "exit " still quits.
    text = text.strip()
    if text.lower() == "exit":
        break
    if not text:
        # Nothing was typed; ask again rather than classifying empty text.
        continue

    print("Predicted Category:", predict_news(text))


Predicted Category: sport
Predicted Category: politics
Predicted Category: sport
