In [None]:
Theory:

In supervised learning, we use labeled data to train a model.
We combine the headline + short description to extract TF-IDF features.
Then we train a Multinomial Naive Bayes classifier, which assigns each news article to one of five categories: Politics, Sports, Business, Technology, or Crime.

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Load the News Category dataset (JSON Lines: one article per line).
# Replace the path below with the location of your own copy.
df = pd.read_json(r"C:\Users\LENOVO\News_Dataset\News_Category_Dataset_v3.json", lines=True)

# Keep only the five categories this classifier targets.
# The explicit .copy() makes the filtered frame independent of the full one,
# so the column assignments below do not raise pandas' SettingWithCopyWarning.
df = df[df['category'].isin(['POLITICS', 'SPORTS', 'BUSINESS', 'TECH', 'CRIME'])].copy()

# Combine headline + short_description into a single text field
df['text'] = df['headline'] + " " + df['short_description']

# Encode category labels as integers in order of first appearance,
# so categories[i] is the name of integer label i.
categories = df['category'].unique()
label_map = {cat: i for i, cat in enumerate(categories)}
df['category_label'] = df['category'].map(label_map)

# Split data into train and test (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['category_label'], test_size=0.2, random_state=42
)

# TF-IDF Vectorization: the vocabulary and IDF weights are learned from the
# training split only, then reused unchanged on the test split.
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Train Multinomial Naive Bayes classifier
nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf, y_train)

# Predictions
y_pred = nb_model.predict(X_test_tfidf)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
# Pass the integer labels explicitly so each row of the report is tied to its
# category name through label_map instead of relying on the implicit ordering
# of whichever labels happen to occur in y_test / y_pred.
print(classification_report(
    y_test,
    y_pred,
    labels=list(label_map.values()),
    target_names=list(categories),
))


Accuracy: 0.838746656476882
              precision    recall  f1-score   support

        TECH       0.80      0.25      0.38       424
      SPORTS       0.93      0.71      0.81      1008
    POLITICS       0.84      0.97      0.90      7109
       CRIME       0.82      0.60      0.69       725
    BUSINESS       0.78      0.49      0.61      1202

    accuracy                           0.84     10468
   macro avg       0.84      0.61      0.68     10468
weighted avg       0.84      0.84      0.82     10468



In [None]:
Theory / Explanation

News Article Classification (Supervised Learning)

In supervised learning, we use labeled data.

We train the model to assign each article to a category: Politics, Sports, Business, Technology, Crime.

Steps:

Load the dataset (Kaggle News Category Dataset)

Data preprocessing: convert text to lowercase, remove punctuation, remove stopwords

Feature extraction: TF-IDF

Train/Test split

Model training (Multinomial Naive Bayes)

Prediction & evaluation

In [None]:
 Step 1: Import Libraries

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [None]:
 Step 2: Load Dataset

In [1]:
import pandas as pd
import json

file_path = r"C:\Users\LENOVO\News_Dataset\News_Category_Dataset_v3.json"

# The file is read as JSON Lines: every line is parsed as its own JSON object.
with open(file_path, 'r', encoding='utf-8') as f:
    data = [json.loads(line) for line in f]

# Build the DataFrame from the parsed records
df = pd.DataFrame(data)

# Preview the first 5 rows
print(df.head())


                                                link  \
0  https://www.huffpost.com/entry/covid-boosters-...   
1  https://www.huffpost.com/entry/american-airlin...   
2  https://www.huffpost.com/entry/funniest-tweets...   
3  https://www.huffpost.com/entry/funniest-parent...   
4  https://www.huffpost.com/entry/amy-cooper-lose...   

                                            headline   category  \
0  Over 4 Million Americans Roll Up Sleeves For O...  U.S. NEWS   
1  American Airlines Flyer Charged, Banned For Li...  U.S. NEWS   
2  23 Of The Funniest Tweets About Cats And Dogs ...     COMEDY   
3  The Funniest Tweets From Parents This Week (Se...  PARENTING   
4  Woman Who Called Cops On Black Bird-Watcher Lo...  U.S. NEWS   

                                   short_description               authors  \
0  Health experts said it is too early to predict...  Carla K. Johnson, AP   
1  He was subdued by passengers and crew when he ...        Mary Papenfuss   
2  "Until you have a dog y

In [None]:
Theory:
Clustering (Unsupervised Learning)

Unlike supervised learning, clustering does not use labels, so the category column is ignored here.

We extract text features using TF-IDF.

Then we apply KMeans clustering to group similar articles.

Each cluster reveals a hidden topic (e.g., corruption, elections, finance).

In [2]:
# Step 1: Libraries
import pandas as pd
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# Step 2: Dataset Load (JSON Lines: each line is parsed as one JSON object)
file_path = r"C:\Users\LENOVO\News_Dataset\News_Category_Dataset_v3.json"
with open(file_path, 'r', encoding='utf-8') as f:
    data = [json.loads(line) for line in f]

df = pd.DataFrame(data)
print("Dataset loaded:", df.shape)
# A bare `df.head()` in the middle of a cell is evaluated and then discarded;
# only the last expression of a cell is rendered, so display() is required
# to actually show the preview.
display(df.head())

# Step 3: Merge Headline + Short Description for clustering
df['text'] = df['headline'] + " " + df['short_description']

# Step 4: TF-IDF Vectorization
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X = vectorizer.fit_transform(df['text'])

# Step 5: KMeans Clustering (unsupervised)
num_clusters = 5  # you can change the number of clusters
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
df['cluster'] = kmeans.fit_predict(X)

# Step 6: Display sample articles per cluster
for i in range(num_clusters):
    print(f"\n--- Cluster {i} ---")
    # First five articles assigned to cluster i, in original row order
    display(df.loc[df['cluster'] == i, ['headline', 'short_description']].head(5))


Dataset loaded: (209527, 6)

--- Cluster 0 ---


Unnamed: 0,headline,short_description
0,Over 4 Million Americans Roll Up Sleeves For O...,Health experts said it is too early to predict...
1,"American Airlines Flyer Charged, Banned For Li...",He was subdued by passengers and crew when he ...
2,23 Of The Funniest Tweets About Cats And Dogs ...,"""Until you have a dog you don't understand wha..."
4,Woman Who Called Cops On Black Bird-Watcher Lo...,Amy Cooper accused investment firm Franklin Te...
5,Cleaner Was Dead In Belk Bathroom For 4 Days B...,The 63-year-old woman was seen working at the ...



--- Cluster 1 ---


Unnamed: 0,headline,short_description
15,Russian Cosmonaut Valery Polyakov Who Broke Re...,Polyakov's record-breaking trip to outer space...
24,‘Beautiful And Sad At The Same Time’: Ukrainia...,An annual celebration took on a different feel...
47,Viola Davis Feared A Heart Attack During 'The ...,The Oscar winner said she worked out for five ...
58,Thousands Of Minnesota Nurses Launch 3-day Str...,They're pressing for salary increases they say...
71,Last Reactor At Ukraine's Zaporizhzhia Nuclear...,"The plant, one of the 10 biggest atomic power ..."



--- Cluster 2 ---


Unnamed: 0,headline,short_description
3,The Funniest Tweets From Parents This Week (Se...,"""Accidentally put grown-up toothpaste on my to..."
101,Harry Styles Say He Feels Like He Has 'No Idea...,"The ""As It Was"" singer said he also doesn’t se..."
147,A Nebraska Man Conducted An Active Shooter Dri...,A man hired to organize what looked like a mas...
173,"Jerry Allison, Buddy Holly's Drummer, Dead At 82",The Rock & Roll Hall of Fame member also co-wr...
201,LGBTQ Pride Events Will Have Priority Access T...,"An extra 50,000 vaccine doses will be made ava..."



--- Cluster 3 ---


Unnamed: 0,headline,short_description
46,Chick-Fil-A Worker Saves Woman With Baby From ...,Mykel Gordon rushed to the woman's aid and hel...
50,"Cardi B Donates $100,000 To Her Old Middle Sch...",The Grammy-winning rapper surprised star-struc...
127,Lisa Loeb Paved Her Own Way In The '90s — A Ra...,The ‘Stay (I Missed You)’ artist reflects on p...
138,This Breastfeeding Condition Makes Moms Feel D...,Moms with dysphoric milk-ejection reflex (D-ME...
188,Defense In Parkland School Shooter’s Trial Set...,Nikolas Cruz's attorneys on Monday will presen...



--- Cluster 4 ---


Unnamed: 0,headline,short_description
73,"US, Trump Team Propose Names For Arbiter In Ma...",The Justice Department proposed two retired ju...
106,Trump-Endorsed Wisconsin Gubernatorial Candida...,Tim Michels calls on supporters to go after me...
128,Rep. Jim Jordan's Judiciary Twitter Account Fi...,The 36-page filing described how Trump and his...
130,Republicans Ignore Trump's Hoarding Of Classif...,The GOP is continuing to contort itself in def...
131,‘Fox & Friends’ Host Questions Why Trump Kept ...,"""These are the biggest secrets in the world,"" ..."


In [3]:
# Step 1: Import Libraries
import pandas as pd
import json
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Step 2: Load Dataset
# The file is read line by line; each line is parsed as one JSON record.
file_path = r"C:\Users\LENOVO\News_Dataset\News_Category_Dataset_v3.json"
data = []
with open(file_path, 'r', encoding='utf-8') as f:
    for line in f:
        data.append(json.loads(line))
df = pd.DataFrame(data)
print("Dataset loaded:", df.shape)
print(df.head())

# Step 3: Preprocess (headline and short_description joined by a single space)
headline_with_space = df['headline'] + ' '
df['text'] = headline_with_space + df['short_description']

# Step 4: Train-Test Split (80/20, stratified on the category column)
category_column = df['category']
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    category_column,
    test_size=0.2,
    random_state=42,
    stratify=category_column,
)
print("Train-Test split done:", X_train.shape, X_test.shape)

# Step 5: TF-IDF Vectorization (unigrams + bigrams, vocabulary capped for speed)
# The vectorizer is fitted on the training texts only and reused for the test texts.
vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)
print("TF-IDF transformation done:", X_train_tfidf.shape)

# Step 6: Logistic Regression Classifier
model = LogisticRegression(
    max_iter=500,
    class_weight='balanced',
    n_jobs=-1,
)
print("Training started...")
model.fit(X_train_tfidf, y_train)
print("Training completed!")

# Step 7: Predictions on the held-out split
y_pred = model.predict(X_test_tfidf)

# Step 8: Evaluation
test_accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred, zero_division=0)
print("\nAccuracy:", test_accuracy)
print("\nClassification Report:\n")
print(report_text)

# Step 9: Test Prediction Example
# The example goes through the same fitted vectorizer before being classified.
example_text = ["The government announced a new policy to boost economic growth."]
example_tfidf = vectorizer.transform(example_text)
predicted_category = model.predict(example_tfidf)[0]
print("\nPredicted Category for example text:", predicted_category)


Dataset loaded: (209527, 6)
                                                link  \
0  https://www.huffpost.com/entry/covid-boosters-...   
1  https://www.huffpost.com/entry/american-airlin...   
2  https://www.huffpost.com/entry/funniest-tweets...   
3  https://www.huffpost.com/entry/funniest-parent...   
4  https://www.huffpost.com/entry/amy-cooper-lose...   

                                            headline   category  \
0  Over 4 Million Americans Roll Up Sleeves For O...  U.S. NEWS   
1  American Airlines Flyer Charged, Banned For Li...  U.S. NEWS   
2  23 Of The Funniest Tweets About Cats And Dogs ...     COMEDY   
3  The Funniest Tweets From Parents This Week (Se...  PARENTING   
4  Woman Who Called Cops On Black Bird-Watcher Lo...  U.S. NEWS   

                                   short_description               authors  \
0  Health experts said it is too early to predict...  Carla K. Johnson, AP   
1  He was subdued by passengers and crew when he ...        Mary Papenfuss  

In [None]:
News_Classification_and_Clustering

In [4]:
# Step 1: Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.cluster import KMeans

# Step 2: Load labeled dataset for supervised model
# Relative path: resolved against the notebook's current working directory.
df_supervised = pd.read_json("News_Dataset/News_Category_Dataset_v3.json", lines=True)

# Check first 5 rows
print(df_supervised.head())

# Step 3: Train/test split (headlines as input, category as target)
X_train, X_test, y_train, y_test = train_test_split(
    df_supervised['headline'], df_supervised['category'], test_size=0.2, random_state=42
)

# Step 4: Build and train supervised model (category prediction)
# The pipeline applies TF-IDF and then Naive Bayes, so raw strings can be
# passed straight to fit() and predict().
supervised_model = make_pipeline(TfidfVectorizer(stop_words='english', max_features=5000), MultinomialNB())
supervised_model.fit(X_train, y_train)

# Step 5: Load recent unlabeled news for unsupervised clustering
# Example: list of recent news headlines
news_articles = [
    "Gaza students who won scholarships to UK face anxious wait for evacuation",
    "Fleetwood wins Tour Championship for first PGA Tour title",
    "Four journalists among 15 dead in Israeli strike on hospital, Gaza officials say",
    "Get ready for fracking, Reform UK tells energy firms",
    "A cut-off finger ended her comfortable family life. Now she's hiding from US officials"
]

# Step 6: Unsupervised clustering
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000, ngram_range=(1,2))
X_tfidf = vectorizer.fit_transform(news_articles)
# Use fewer clusters than headlines: with one cluster per headline every
# article ends up alone in its own cluster and nothing is grouped. The min()
# also keeps KMeans from raising if the headline list is shortened, because
# it requires n_samples >= n_clusters.
num_clusters = min(3, len(news_articles))
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
kmeans.fit(X_tfidf)
clusters = kmeans.labels_

# Step 7: User article
user_article = "Gaza hospitals face new attacks as tensions rise in the region."

# Supervised category prediction
predicted_category = supervised_model.predict([user_article])[0]

# Unsupervised cluster assignment: reuse the vectorizer fitted on
# news_articles so the user article lives in the same feature space
# as the cluster centroids.
user_vector = vectorizer.transform([user_article])
predicted_cluster = kmeans.predict(user_vector)[0]

print("Supervised category prediction:", predicted_category)
print("Unsupervised cluster/topic assignment:", predicted_cluster)


                                                link  \
0  https://www.huffpost.com/entry/covid-boosters-...   
1  https://www.huffpost.com/entry/american-airlin...   
2  https://www.huffpost.com/entry/funniest-tweets...   
3  https://www.huffpost.com/entry/funniest-parent...   
4  https://www.huffpost.com/entry/amy-cooper-lose...   

                                            headline   category  \
0  Over 4 Million Americans Roll Up Sleeves For O...  U.S. NEWS   
1  American Airlines Flyer Charged, Banned For Li...  U.S. NEWS   
2  23 Of The Funniest Tweets About Cats And Dogs ...     COMEDY   
3  The Funniest Tweets From Parents This Week (Se...  PARENTING   
4  Woman Who Called Cops On Black Bird-Watcher Lo...  U.S. NEWS   

                                   short_description               authors  \
0  Health experts said it is too early to predict...  Carla K. Johnson, AP   
1  He was subdued by passengers and crew when he ...        Mary Papenfuss   
2  "Until you have a dog y