In [14]:
import os

import pandas as pd
from sklearn.ensemble import VotingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder

In [15]:
# Location of the news-article CSV. Set the NEWS_DATA_PATH environment
# variable to run the notebook on another machine; when it is unset the
# original local path is used, so the existing setup keeps working unchanged.
NEWS_DATA_PATH = os.environ.get(
    "NEWS_DATA_PATH",
    r"C:\Users\DELL\news-article-categories.csv",
)
news_data = pd.read_csv(NEWS_DATA_PATH)

In [17]:
# Print the frame's summary: row count, column names, non-null counts, dtypes.
news_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6877 entries, 0 to 6876
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   category  6877 non-null   object
 1   title     6877 non-null   object
 2   body      6872 non-null   object
dtypes: object(3)
memory usage: 161.3+ KB


In [18]:
# Preview the first five rows to eyeball the category/title/body columns.
news_data.head(n=5)

Unnamed: 0,category,title,body
0,ARTS & CULTURE,Modeling Agencies Enabled Sexual Predators For...,"In October 2017, Carolyn Kramer received a dis..."
1,ARTS & CULTURE,Actor Jeff Hiller Talks “Bright Colors And Bol...,This week I talked with actor Jeff Hiller abou...
2,ARTS & CULTURE,New Yorker Cover Puts Trump 'In The Hole' Afte...,The New Yorker is taking on President Donald T...
3,ARTS & CULTURE,Man Surprises Girlfriend By Drawing Them In Di...,"Kellen Hickey, a 26-year-old who lives in Huds..."
4,ARTS & CULTURE,This Artist Gives Renaissance-Style Sculptures...,There’s something about combining the traditio...


In [19]:
# Count the missing values in each column.
news_data.isna().sum()

category    0
title       0
body        5
dtype: int64

In [20]:
# The article body is the only model input, so rows without one are dropped.
# .copy() makes the result an independent frame: without it pandas flags the
# filtered result as a possible copy and the later column assignment on
# news_data emits SettingWithCopyWarning.
news_data = news_data.dropna(subset=['body']).copy()

In [21]:
# Re-count missing values per column to confirm none remain in `body`.
news_data.isna().sum()

category    0
title       0
body        0
dtype: int64

In [22]:
# Learn the category-name -> integer-id mapping, then store the ids in a new
# column. The fitted encoder is kept because its classes_ attribute is needed
# later to turn ids back into readable category names.
label_encoder = LabelEncoder().fit(news_data['category'])
news_data['category_encoded'] = label_encoder.transform(news_data['category'])

In [23]:
# Hold out 20% of the articles for evaluation. Stratifying on the encoded
# label keeps the category mix the same in both splits, and the fixed seed
# makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    news_data['body'],
    news_data['category_encoded'],
    test_size=0.2,
    stratify=news_data['category_encoded'],
    random_state=42,
)

In [24]:
# Convert article text into TF-IDF features. The vectorizer (English stop
# words removed, vocabulary capped at 5000 terms) is fitted on the training
# split only; the test split is transformed with that same fitted vocabulary.
tfidf = TfidfVectorizer(
    stop_words='english',
    max_features=5000,
)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [25]:
# Report the (rows, features) shape of each TF-IDF matrix as a sanity check.
for _caption, _matrix in (
    ("Training TF-IDF shape:", X_train_tfidf),
    ("Testing TF-IDF shape:", X_test_tfidf),
):
    print(_caption, _matrix.shape)

Training TF-IDF shape: (5497, 5000)
Testing TF-IDF shape: (1375, 5000)


In [26]:
# Multinomial naive Bayes with the library's default settings; used as the
# 'naive_bayes' member of the voting ensemble.
nb_model = MultinomialNB()

In [27]:
# Logistic regression, used as the 'logistic_regression' ensemble member.
# The solver is allowed up to 1000 iterations and the seed is fixed for
# repeatability.
# NOTE(review): the iteration cap was presumably raised so the solver
# converges on the 5000-feature TF-IDF input -- confirm.
logistic_model = LogisticRegression(max_iter=1000, random_state=42)

In [28]:
# Neural-network member of the ensemble: one hidden layer of 128 units,
# training capped at 10 iterations, fixed seed.
# NOTE(review): a cap of 10 iterations is very low and will likely end
# training before convergence (scikit-learn reports this with a
# ConvergenceWarning) -- confirm the cap is an intentional speed trade-off.
nn_model = MLPClassifier(hidden_layer_sizes=(128,), max_iter=10, random_state=42)

In [29]:
# Combine the three classifiers into a single voting ensemble. voting='soft'
# makes the ensemble decide from the members' predicted class probabilities
# rather than from their hard label votes.
ensemble_model = VotingClassifier(
    voting='soft',
    estimators=[
        ('naive_bayes', nb_model),
        ('logistic_regression', logistic_model),
        ('neural_network', nn_model),
    ],
)

In [30]:
# Fit the voting ensemble on the training TF-IDF features and encoded labels.
# fit() is left as the cell's last expression, so its return value is what
# the notebook displays for this cell.
print("Training the ensemble model...")
ensemble_model.fit(X_train_tfidf, y_train)

Training the ensemble model...




In [31]:
# Predict an encoded category id for every held-out article.
print("Making predictions on the test data...")
y_pred = ensemble_model.predict(X_test_tfidf)

Making predictions on the test data...


In [32]:
# Fraction of held-out articles whose predicted category matches the truth.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Ensemble Model Accuracy:", accuracy)

Ensemble Model Accuracy: 0.7941818181818182


In [33]:
# Per-category precision/recall/F1 on the held-out set, labelled with the
# original category names recovered from the fitted encoder.
# sep="" matters: print's default separator would put a space in front of the
# report, pushing its header row one column to the right of the data rows.
print(
    "\nClassification Report:\n",
    classification_report(y_test, y_pred, target_names=label_encoder.classes_),
    sep="",
)


Classification Report:
                 precision    recall  f1-score   support

ARTS & CULTURE       0.76      0.91      0.83       201
      BUSINESS       0.72      0.68      0.70       100
        COMEDY       0.83      0.67      0.74        75
         CRIME       0.78      0.78      0.78        60
     EDUCATION       0.81      0.85      0.83        98
 ENTERTAINMENT       0.80      0.81      0.81       100
   ENVIRONMENT       0.82      0.81      0.81       100
         MEDIA       0.82      0.70      0.75        70
      POLITICS       0.74      0.78      0.76       100
      RELIGION       0.88      0.88      0.88       100
       SCIENCE       0.88      0.83      0.85        70
        SPORTS       0.88      0.88      0.88       100
          TECH       0.79      0.73      0.76       101
         WOMEN       0.70      0.65      0.67       100

      accuracy                           0.79      1375
     macro avg       0.80      0.78      0.79      1375
  weighted avg       