<a href="https://colab.research.google.com/github/rahmamohax/Elevvo-Tasks/blob/master/News%20Category%20Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Task 2: News Category Classification



### Loading the Train and Test Datasets

In [None]:
import kagglehub
import os
import pandas as pd

# Download the latest version of the AG News dataset from Kaggle
path = kagglehub.dataset_download("amananandrai/ag-news-classification-dataset")

# Read the train and test CSV files from the download location
df, df_test = (
    pd.read_csv(os.path.join(path, csv_name))
    for csv_name in ("train.csv", "test.csv")
)
df.head()

Unnamed: 0,Class Index,Title,Description
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli..."
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...
3,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...
4,3,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco..."


### Combining Title & Description into One Column

In [None]:
# Build a single "text" column on each split, then remove the two source columns
for frame in (df, df_test):
    frame["text"] = frame["Title"] + " " + frame["Description"]
    frame.drop(columns=["Title", "Description"], inplace=True)

In [None]:
# Preview the first rows of the training frame after the Title/Description merge
df.head()

Unnamed: 0,Class Index,text
0,3,Wall St. Bears Claw Back Into the Black (Reute...
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters...
3,3,Iraq Halts Oil Exports from Main Southern Pipe...
4,3,"Oil prices soar to all-time record, posing new..."


### Download NLTK Resources

In [None]:
import nltk
from nltk.corpus import stopwords

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
# Fetch the NLTK data packages used by the preprocessing step:
#   punkt / punkt_tab - tokenizer data for word_tokenize (presumably both are
#                       fetched to cover different NLTK releases - confirm)
#   stopwords         - stop-word lists read via stopwords.words('english')
#   wordnet           - lexical database used by WordNetLemmatizer
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

### Preprocessing Text

In [None]:
from functools import lru_cache


@lru_cache(maxsize=1)
def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, built on first use only."""
    return frozenset(stopwords.words('english'))


@lru_cache(maxsize=1)
def _shared_lemmatizer():
    """Return one WordNetLemmatizer instance that is reused by every call."""
    return WordNetLemmatizer()


def preprocess_text(text):
    """Clean a single document for vectorization.

    Steps, in order: lowercase, delete every character that is not a-z or
    whitespace, tokenize, drop English stop words, lemmatize each remaining
    token, and join the tokens back together with single spaces.

    Args:
        text: The document to clean, as a ``str``.

    Returns:
        The cleaned tokens joined by single spaces.
    """
    text = text.lower()
    # NOTE(review): non-letters are deleted rather than replaced by a space, so
    # "short-sellers" becomes "shortsellers" - confirm this is intended.
    text = re.sub(r'[^a-z\s]', '', text)
    # Previously stopwords.words('english') was called once per token and a new
    # lemmatizer was built per document; both are now created once and cached.
    stop_words = _english_stopwords()
    lemmatizer = _shared_lemmatizer()
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in word_tokenize(text)
        if word not in stop_words
    )

In [None]:
# Drop rows that have no text in either split
df, df_test = (frame.dropna(subset=['text']) for frame in (df, df_test))

# Replace each remaining document with its cleaned form
for frame in (df, df_test):
    frame['text'] = frame['text'].apply(preprocess_text)

### Identifying the News Category for Each Class Index

In [None]:
# Class Index legend: 1 = World, 2 = Sports, 3 = Business, 4 = Sci/Tech.
def getcode(num):
    """Translate a numeric class index into its category name.

    Returns 'Unknown' for any value that does not equal 1, 2, 3 or 4.
    """
    labelled_codes = (
        (1, 'World'),
        (2, 'Sports'),
        (3, 'Business'),
        (4, 'Sci/Tech'),
    )
    for code, label in labelled_codes:
        if num == code:
            return label
    return 'Unknown'

In [None]:
# Spot-check the cleaned text of the row labelled 0
df['text'][0]

'wall st bear claw back black reuters reuters shortsellers wall street dwindlingband ultracynics seeing green'

### Vectorize text

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# The 'text' columns were already run through preprocess_text in the cleaning
# cell, so it is not passed again as `preprocessor=`; doing so would tokenize
# and lemmatize every document a second time.
# NOTE: any raw (uncleaned) text must go through preprocess_text before it is
# handed to tfidf_vectorizer.transform.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

# NOTE(review): .toarray() builds dense matrices (120000 x 5000 for the training
# split per the printed shape); consider keeping the sparse output if memory is tight.
X_train = tfidf_vectorizer.fit_transform(df['text']).toarray()
X_test = tfidf_vectorizer.transform(df_test['text']).toarray()

# Targets are the raw 'Class Index' labels
y_train = df['Class Index']
y_test = df_test['Class Index']

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(120000, 5000) (120000,)
(7600, 5000) (7600,)


### Import Models

In [None]:
# Models
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, f1_score

In [None]:
# Logistic Regression: fit on the training features, then score on the test split
lr_model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
lr_y_pred = lr_model.predict(X_test)

# Weighted F1 is kept in a variable so it remains available after this cell
f1_score_lr = f1_score(y_test, lr_y_pred, average='weighted')

print("Model: Logistic Regression")
print("Accuracy:", accuracy_score(y_test, lr_y_pred))
print(classification_report(y_test, lr_y_pred))
print("F1 Score:", f1_score_lr)

Model: Logistic Regression
Accuracy: 0.9063157894736842
              precision    recall  f1-score   support

           1       0.92      0.90      0.91      1900
           2       0.95      0.98      0.96      1900
           3       0.88      0.87      0.87      1900
           4       0.88      0.88      0.88      1900

    accuracy                           0.91      7600
   macro avg       0.91      0.91      0.91      7600
weighted avg       0.91      0.91      0.91      7600

F1 Score: 0.9060979902357494


In [None]:
# Model: Random Forest
# random_state is fixed so the bootstrap samples and feature subsets, and
# therefore the reported metrics, are reproducible from run to run.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

rf_model.fit(X_train, y_train)
rf_y_pred = rf_model.predict(X_test)

print("Model: Random Forest")
print("Accuracy:", accuracy_score(y_test, rf_y_pred))
print(classification_report(y_test, rf_y_pred))
# Weighted F1 is kept in a variable so it remains available after this cell
f1_score_rf = f1_score(y_test, rf_y_pred, average='weighted')
print("F1 Score:", f1_score_rf)

Model: Random Forest
Accuracy: 0.8859210526315789
              precision    recall  f1-score   support

           1       0.90      0.88      0.89      1900
           2       0.92      0.96      0.94      1900
           3       0.86      0.85      0.85      1900
           4       0.86      0.85      0.86      1900

    accuracy                           0.89      7600
   macro avg       0.89      0.89      0.89      7600
weighted avg       0.89      0.89      0.89      7600

F1 Score: 0.8854626573953938


In [None]:
# Model: SVC (linear support vector classifier)
# SVC(kernel='linear') has a fit time that grows at least quadratically with the
# number of samples, which is impractical for the 120000 training rows here.
# LinearSVC trains a linear SVM that scales to this size. It is not an identical
# model: by default it uses squared hinge loss and a one-vs-rest scheme.
from sklearn.svm import LinearSVC

svc_model = LinearSVC(random_state=42)

svc_model.fit(X_train, y_train)
svc_y_pred = svc_model.predict(X_test)

print("Model: SVC")
print("Accuracy:", accuracy_score(y_test, svc_y_pred))
print(classification_report(y_test, svc_y_pred))
# Weighted F1 is kept in a variable so it remains available after this cell
f1_score_svc = f1_score(y_test, svc_y_pred, average='weighted')
print("F1 Score:", f1_score_svc)

### Apply XGB & LGBM Classifiers (Optional)

### XGB Classifier Model

In [None]:
from xgboost import XGBClassifier

# XGBoost requires class labels encoded as 0..num_class-1, but 'Class Index'
# runs from 1 to 4. Train on labels shifted down by one and shift the
# predictions back up so they are comparable with y_test.
XGB_model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
XGB_model.fit(X_train, y_train - 1)
y_pred_xgb = XGB_model.predict(X_test) + 1
print("Model: XGBoost")
print("Accuracy:", accuracy_score(y_test, y_pred_xgb))
print(classification_report(y_test, y_pred_xgb))

### LGBM Classifier Model

In [None]:
from lightgbm import LGBMClassifier

# LightGBM with default hyperparameters: fit, predict, then report test metrics
LGBM_model = LGBMClassifier().fit(X_train, y_train)
y_pred_lgbm = LGBM_model.predict(X_test)

print("Model: LightGBM")
print("Accuracy:", accuracy_score(y_test, y_pred_lgbm))
print(classification_report(y_test, y_pred_lgbm))


### Word Visualizations

In [None]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud
# Draw one word cloud per news category.
# Class indices run from 1 to 4 (see getcode), so iterate over exactly those.
# The previous range(0, 4) started at a non-existent class 0, whose text is
# empty, and never reached class 4.
for i in range(1, 5):
    # Concatenate every cleaned document belonging to the current class
    text = ' '.join(df[df['Class Index'] == i]['text'])
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)
    plt.figure(figsize=(10, 6))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(f'Word Cloud for {getcode(i)} Category')
    plt.show()