In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report

In [10]:
# Load Dataset
# Location of the dataset file; update the path below if the file lives elsewhere.
# The default points at a JSON file (one object per line); load_data() below
# falls back to CSV parsing, so a CSV export of the same data also works.
path = '/content/News_Category_Dataset_v3.json'  # or the .csv path if the file was converted

def load_data(path):
    """Load the news dataset and return its ``category`` and ``headline`` columns.

    The reader is chosen from the file extension: ``.json``/``.jsonl`` files are
    parsed as JSON Lines (one object per line) and ``.csv`` files as CSV. A parse
    error from the matching reader is therefore raised as-is instead of being
    hidden behind a second attempt with the wrong reader. Any other extension
    keeps the permissive behaviour: try JSON Lines first and fall back to CSV
    when that raises ``ValueError``.

    Parameters
    ----------
    path : str or path-like
        Location of the dataset file.

    Returns
    -------
    pandas.DataFrame
        Exactly two columns, ``category`` and ``headline``; rows with a missing
        value in either column are removed.

    Raises
    ------
    ValueError
        If a ``.json``/``.jsonl`` file cannot be parsed as JSON Lines.
    KeyError
        If the file has no ``category`` or ``headline`` column.
    """
    lowered = str(path).lower()
    if lowered.endswith(('.json', '.jsonl')):
        # If JSON file (one object per line)
        df = pd.read_json(path, lines=True)
    elif lowered.endswith('.csv'):
        df = pd.read_csv(path)
    else:
        # Unknown extension: sniff the format the permissive way.
        try:
            df = pd.read_json(path, lines=True)
        except ValueError:
            df = pd.read_csv(path)

    # Keep only necessary columns; fail with an explicit message when the
    # file does not have them.
    required = ['category', 'headline']
    missing = [column for column in required if column not in df.columns]
    if missing:
        raise KeyError(f"{path}: missing required column(s) {missing}")
    return df[required].dropna()

# Main pipeline
if __name__ == '__main__':
    # 1. Load: read the dataset, then pull out the headline text (features)
    #    and the category (labels) as arrays
    data = load_data(path)
    X, y = data['headline'].values, data['category'].values

    # 2. Train/Test Split: hold out 20% of the rows with a fixed seed;
    #    stratify=y passes the labels so the split is stratified on category
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )



In [11]:
# 3. Build Pipeline
# TF-IDF features over the text feed a multinomial logistic-regression classifier.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(
        lowercase=True,
        stop_words='english',
        max_df=0.8,          # ignore terms present in more than 80% of documents
        min_df=5,            # ignore terms present in fewer than 5 documents
        ngram_range=(1, 2)   # unigrams and bigrams
    )),
    ('clf', LogisticRegression(
        # `multi_class='multinomial'` is intentionally not passed: the parameter
        # is deprecated since scikit-learn 1.5, and with the lbfgs solver a
        # multiclass target is already fitted with the multinomial loss
        # (the default behaviour since scikit-learn 0.22).
        # `n_jobs` is also omitted: per the scikit-learn docs it only
        # parallelizes one-vs-rest fits, so it had no effect on this model.
        solver='lbfgs',
        max_iter=200,
        C=1.0,
        random_state=42
    ))
])

In [12]:
 # 4. Train
# Fit the TF-IDF vectorizer and the classifier on the training split only;
# the test split is not touched until the evaluation step.
print("Training model...")
pipeline.fit(X_train, y_train)




Training model...




In [13]:
# 5. Evaluate
print("Evaluating model on test set...")
y_pred = pipeline.predict(X_test)

# 5a. Overall accuracy
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc:.4f}")

# 5b. Precision, Recall, F1 (macro average: every class counts equally,
#     regardless of how many test rows it has)
precision, recall, f1, _ = precision_recall_fscore_support(
    y_test, y_pred, average='macro'
)
print(f"Macro Precision: {precision:.4f}")
print(f"Macro Recall:    {recall:.4f}")
print(f"Macro F1-score:  {f1:.4f}\n")

# 5c. Detailed classification report
# `labels` is passed explicitly so each report row is tied to the class it
# names. With `target_names` alone, classification_report derives the label set
# from y_test/y_pred and raises ValueError when that set does not have the same
# number of entries as pipeline.classes_.
print("Classification Report:")
print(classification_report(
    y_test,
    y_pred,
    labels=pipeline.classes_,
    target_names=pipeline.classes_,
    digits=4
))

Evaluating model on test set...
Accuracy: 0.5809
Macro Precision: 0.5637
Macro Recall:    0.3924
Macro F1-score:  0.4347

Classification Report:
                precision    recall  f1-score   support

          ARTS     0.4138    0.1589    0.2297       302
ARTS & CULTURE     0.4000    0.1045    0.1657       268
  BLACK VOICES     0.5480    0.3675    0.4399       917
      BUSINESS     0.5112    0.4391    0.4724      1198
       COLLEGE     0.5141    0.3188    0.3935       229
        COMEDY     0.6367    0.4204    0.5064      1080
         CRIME     0.5460    0.5253    0.5354       712
CULTURE & ARTS     0.7778    0.2279    0.3525       215
       DIVORCE     0.8223    0.6146    0.7034       685
     EDUCATION     0.4952    0.2562    0.3377       203
 ENTERTAINMENT     0.5660    0.7691    0.6521      3473
   ENVIRONMENT     0.6667    0.2007    0.3085       289
         FIFTY     0.5455    0.1286    0.2081       280
  FOOD & DRINK     0.6205    0.6782    0.6481      1268
     GOOD NEWS