In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer


from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier



from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

import warnings
warnings.filterwarnings('ignore')

# Load Preprocessed Data

In [2]:
df=pd.read_csv('../DATASET/PreprocessedData.csv')

In [3]:
df.sample(5)

Unnamed: 0,title,Published_on,News_agency,URL,summary,Country,language,tag,year,Merged
12855,Batesville School District receives award for ...,2024-02-19 15:32:00+00:00,guardonline.com,https://news.google.com/rss/articles/CBMilwFod...,Batesville School District receives award for ...,RU,en,Education,2024,batesville school district receives award cons...
9872,Northland Symphony Orchestra presents 'Symphon...,2024-02-19 22:28:00+00:00,mycouriertribune.com,https://news.google.com/rss/articles/CBMilwFod...,Northland Symphony Orchestra presents 'Symphon...,JP,en,Entertainment,2024,northland symphony orchestra present symphonic...
13821,Indonesia Says 71 Election Workers Died From E...,2024-02-19 10:26:21+00:00,Bloomberg,https://news.google.com/rss/articles/CBMiamh0d...,Indonesia Says 71 Election Workers Died From E...,BR,en,Politics,2024,indonesia election worker exhaustion bloomberg...
15309,Sixteenth Finance Commission holds its first m...,2024-02-14 11:22:41+00:00,Mint,https://news.google.com/rss/articles/CBMiaWh0d...,Sixteenth Finance Commission holds its first m...,IN,en,Finance,2024,sixteenth finance commission hold first meetin...
14901,"Well Aware of the Problems, This Nurse Offers ...",2024-02-19 18:14:19+00:00,UVA Today,https://news.google.com/rss/articles/CBMiU2h0d...,"Well Aware of the Problems, This Nurse Offers ...",IN,en,Health,2024,aware problem nurse offer health today aware p...


# Splitting into training and testing data

In [4]:
X_train,X_test,y_train,y_test = train_test_split(df['Merged'], df.tag, test_size=0.3, random_state=42,stratify=df['tag'])


# Random Forest Classifier

In [5]:
# combining two different vectorization method 
rfc = Pipeline([('vect', CountVectorizer()),
                 ('tfidf', TfidfTransformer()),
                 ('model', RandomForestClassifier(n_estimators=1000,
                                                  criterion="gini"))])

rfc_model = rfc.fit(X_train, y_train)
rfc_pred = rfc_model.predict(X_test)
print("accuracy:", round(accuracy_score(y_test, rfc_pred)*100,2), "\b%")
print("Classification Report:\n\n", classification_report(y_test, rfc_pred))

accuracy: 96.55 %
Classification Report:

                precision    recall  f1-score   support

     Business       0.98      0.93      0.95       386
    Education       0.98      0.97      0.97       387
Entertainment       0.93      0.99      0.96       392
  Environment       0.98      0.97      0.97       385
      Finance       0.96      0.96      0.96       383
       Health       0.99      0.97      0.98       380
       Nation       0.97      0.98      0.98       377
     Politics       0.95      0.95      0.95       371
      Science       0.96      0.97      0.97       381
       Sports       0.94      0.96      0.95       384
   Technology       0.99      0.96      0.98       381
        World       0.97      0.95      0.96       398

     accuracy                           0.97      4605
    macro avg       0.97      0.97      0.97      4605
 weighted avg       0.97      0.97      0.97      4605



In [6]:
# using countvectorizer only vectorization method 
rfc_vect = Pipeline([('vect', CountVectorizer()),
                 ('model', RandomForestClassifier(n_estimators=1000,
                                                  criterion="gini"))])

rfc_vect_model = rfc_vect.fit(X_train, y_train)
rfc_vect_pred = rfc_vect_model.predict(X_test)
print("accuracy:", round(accuracy_score(y_test, rfc_vect_pred)*100,2), "\b%")
print("Classification Report:\n\n", classification_report(y_test, rfc_vect_pred))

accuracy: 96.61 %
Classification Report:

                precision    recall  f1-score   support

     Business       0.98      0.94      0.96       386
    Education       0.98      0.97      0.98       387
Entertainment       0.93      0.99      0.96       392
  Environment       0.97      0.97      0.97       385
      Finance       0.96      0.96      0.96       383
       Health       0.99      0.97      0.98       380
       Nation       0.97      0.98      0.98       377
     Politics       0.95      0.96      0.96       371
      Science       0.97      0.97      0.97       381
       Sports       0.94      0.96      0.95       384
   Technology       0.99      0.96      0.98       381
        World       0.97      0.95      0.96       398

     accuracy                           0.97      4605
    macro avg       0.97      0.97      0.97      4605
 weighted avg       0.97      0.97      0.97      4605



In [7]:
# using TFidf vectorization method 
tfidf_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),  # TF-IDF vectorizer
    ('classifier', RandomForestClassifier(random_state=42))  # Example classifier
])

rfc_tfidf_model = tfidf_pipeline.fit(X_train, y_train)
rfc_tfidf_pred = tfidf_pipeline.predict(X_test)

print("accuracy:", round(accuracy_score(y_test, rfc_tfidf_pred)*100,2), "\b%")
print("Classification Report:\n\n", classification_report(y_test, rfc_tfidf_pred))

accuracy: 96.46 %
Classification Report:

                precision    recall  f1-score   support

     Business       0.98      0.93      0.95       386
    Education       0.97      0.97      0.97       387
Entertainment       0.93      0.99      0.96       392
  Environment       0.98      0.97      0.98       385
      Finance       0.96      0.96      0.96       383
       Health       0.99      0.97      0.98       380
       Nation       0.97      0.98      0.98       377
     Politics       0.95      0.95      0.95       371
      Science       0.95      0.97      0.96       381
       Sports       0.93      0.96      0.95       384
   Technology       0.99      0.96      0.98       381
        World       0.97      0.95      0.96       398

     accuracy                           0.96      4605
    macro avg       0.97      0.96      0.96      4605
 weighted avg       0.97      0.96      0.96      4605



# Naive Bayes

In [None]:
nb_vect = Pipeline([('vect',CountVectorizer()),
                           ('model', MultinomialNB(alpha=1.0,
                                       fit_prior=True,
                                       class_prior=None))])

nb_vect_model = nb_vect.fit(X_train, y_train)

nb_vect_pred = nb_vect_model.predict(X_test)

print("Accuracy: ", round(accuracy_score(y_test, nb_vect_pred)*100,2), "\b%")
print("Classification Report:\n\n", classification_report(y_test, nb_vect_pred))

In [9]:
nb_tfidf = Pipeline([('tfidf', TfidfVectorizer()),
                           ('model', MultinomialNB(alpha=1.0,
                                       fit_prior=True,
                                       class_prior=None))])

nb_tfidf_model = nb_tfidf.fit(X_train, y_train)

nb_tfidf_pred = nb_tfidf_model.predict(X_test)

print("Accuracy: ", round(accuracy_score(y_test, nb_tfidf_pred)*100,2), "\b%")
print("Classification Report:\n\n", classification_report(y_test, nb_tfidf_pred))


Accuracy:  93.38 %
Classification Report:

                precision    recall  f1-score   support

     Business       0.93      0.88      0.90       386
    Education       0.92      0.94      0.93       387
Entertainment       0.96      0.93      0.95       392
  Environment       0.93      0.95      0.94       385
      Finance       0.91      0.93      0.92       383
       Health       0.94      0.95      0.95       380
       Nation       0.93      0.94      0.94       377
     Politics       0.91      0.92      0.92       371
      Science       0.96      0.94      0.95       381
       Sports       0.94      0.94      0.94       384
   Technology       0.95      0.95      0.95       381
        World       0.93      0.92      0.92       398

     accuracy                           0.93      4605
    macro avg       0.93      0.93      0.93      4605
 weighted avg       0.93      0.93      0.93      4605



In [10]:
nb = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
                           ('model', MultinomialNB(alpha=1.0,
                                       fit_prior=True,
                                       class_prior=None))])

nb_model = nb.fit(X_train, y_train)

nb_pred = nb_model.predict(X_test)

print("Accuracy: ", round(accuracy_score(y_test, nb_pred)*100,2), "\b%")
print("Classification Report:\n\n", classification_report(y_test, nb_pred))

Accuracy:  93.38 %
Classification Report:

                precision    recall  f1-score   support

     Business       0.93      0.88      0.90       386
    Education       0.92      0.94      0.93       387
Entertainment       0.96      0.93      0.95       392
  Environment       0.93      0.95      0.94       385
      Finance       0.91      0.93      0.92       383
       Health       0.94      0.95      0.95       380
       Nation       0.93      0.94      0.94       377
     Politics       0.91      0.92      0.92       371
      Science       0.96      0.94      0.95       381
       Sports       0.94      0.94      0.94       384
   Technology       0.95      0.95      0.95       381
        World       0.93      0.92      0.92       398

     accuracy                           0.93      4605
    macro avg       0.93      0.93      0.93      4605
 weighted avg       0.93      0.93      0.93      4605



# SVM

In [11]:
sv = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('model', svm.SVC(kernel='linear',
                                 C=1.0,
                                 max_iter=5000))])

sv_model = sv.fit(X_train, y_train)

sv_pred = sv_model.predict(X_test)

print("Accuracy: ", round(accuracy_score(y_test, sv_pred)*100,2), "\b%")
print("Classification Report:\n\n", classification_report(y_test, sv_pred))


Accuracy:  96.46 %
Classification Report:

                precision    recall  f1-score   support

     Business       0.97      0.94      0.96       386
    Education       0.98      0.96      0.97       387
Entertainment       0.97      0.98      0.98       392
  Environment       0.97      0.97      0.97       385
      Finance       0.97      0.95      0.96       383
       Health       0.98      0.97      0.97       380
       Nation       0.97      0.98      0.97       377
     Politics       0.92      0.96      0.94       371
      Science       0.97      0.97      0.97       381
       Sports       0.94      0.97      0.95       384
   Technology       0.99      0.96      0.98       381
        World       0.96      0.95      0.96       398

     accuracy                           0.96      4605
    macro avg       0.96      0.96      0.96      4605
 weighted avg       0.97      0.96      0.96      4605



In [12]:
sv = Pipeline([('vect', CountVectorizer()),
              # ('tfidf', TfidfTransformer()),
               
               ('model', svm.SVC(kernel='linear',
                                 C=1.0,
                                 max_iter=5000))])

sv_model = sv.fit(X_train, y_train)

sv_pred = sv_model.predict(X_test)

print("Accuracy: ", round(accuracy_score(y_test, sv_pred)*100,2), "\b%")
print("Classification Report:\n\n", classification_report(y_test, sv_pred))

Accuracy:  96.35 %
Classification Report:

                precision    recall  f1-score   support

     Business       0.97      0.94      0.95       386
    Education       0.98      0.96      0.97       387
Entertainment       0.97      0.98      0.98       392
  Environment       0.97      0.97      0.97       385
      Finance       0.97      0.95      0.96       383
       Health       0.98      0.96      0.97       380
       Nation       0.97      0.98      0.98       377
     Politics       0.89      0.97      0.93       371
      Science       0.97      0.97      0.97       381
       Sports       0.93      0.97      0.95       384
   Technology       0.99      0.96      0.98       381
        World       0.96      0.95      0.96       398

     accuracy                           0.96      4605
    macro avg       0.96      0.96      0.96      4605
 weighted avg       0.96      0.96      0.96      4605



In [13]:
sv_tfidf = Pipeline([('tfidf', TfidfTransformer()),
                            ('model', svm.SVC(kernel='linear',
                                 C=1.0,
                                 max_iter=5000))])

sv_tfidf_model = nb.fit(X_train, y_train)

sv_tfidf_pred = nb_model.predict(X_test)

print("Accuracy: ", round(accuracy_score(y_test, sv_tfidf_pred)*100,2), "\b%")
print("Classification Report:\n\n", classification_report(y_test, sv_tfidf_pred))

Accuracy:  93.38 %
Classification Report:

                precision    recall  f1-score   support

     Business       0.93      0.88      0.90       386
    Education       0.92      0.94      0.93       387
Entertainment       0.96      0.93      0.95       392
  Environment       0.93      0.95      0.94       385
      Finance       0.91      0.93      0.92       383
       Health       0.94      0.95      0.95       380
       Nation       0.93      0.94      0.94       377
     Politics       0.91      0.92      0.92       371
      Science       0.96      0.94      0.95       381
       Sports       0.94      0.94      0.94       384
   Technology       0.95      0.95      0.95       381
        World       0.93      0.92      0.92       398

     accuracy                           0.93      4605
    macro avg       0.93      0.93      0.93      4605
 weighted avg       0.93      0.93      0.93      4605



# Logistic Regression

In [14]:
lr = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('model', LogisticRegression(penalty='l2',
                                            C=1,
                                            solver='saga',
                                            multi_class='multinomial',
                                            max_iter=1000,
                                            class_weight='balanced'))])

lr_model = lr.fit(X_train, y_train)

lr_pred = lr_model.predict(X_test)

print("Accuracy: ", round(accuracy_score(y_test, lr_pred)*100,2), "\b%")
print("Classification Report:\n\n", classification_report(y_test, lr_pred))

Accuracy:  96.24 %
Classification Report:

                precision    recall  f1-score   support

     Business       0.96      0.93      0.94       386
    Education       0.97      0.96      0.96       387
Entertainment       0.97      0.97      0.97       392
  Environment       0.97      0.97      0.97       385
      Finance       0.95      0.95      0.95       383
       Health       0.97      0.97      0.97       380
       Nation       0.96      0.97      0.97       377
     Politics       0.91      0.97      0.94       371
      Science       0.96      0.97      0.97       381
       Sports       0.96      0.96      0.96       384
   Technology       0.99      0.97      0.98       381
        World       0.97      0.96      0.96       398

     accuracy                           0.96      4605
    macro avg       0.96      0.96      0.96      4605
 weighted avg       0.96      0.96      0.96      4605



# XG Boost

In [15]:
gb = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('model', GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,max_depth=5, random_state=0))])

gb_model = gb.fit(X_train, y_train)

gb_pred = gb_model.predict(X_test)

print("Accuracy: ", round(accuracy_score(y_test, gb_pred)*100,2), "\b%")
print("Classification Report:\n\n", classification_report(y_test, gb_pred))

Accuracy:  94.64 %
Classification Report:

                precision    recall  f1-score   support

     Business       0.95      0.93      0.94       386
    Education       0.97      0.94      0.96       387
Entertainment       0.98      0.94      0.96       392
  Environment       0.94      0.94      0.94       385
      Finance       0.96      0.95      0.96       383
       Health       0.97      0.95      0.96       380
       Nation       0.96      0.97      0.97       377
     Politics       0.78      0.97      0.86       371
      Science       0.96      0.95      0.96       381
       Sports       0.98      0.91      0.95       384
   Technology       0.98      0.94      0.96       381
        World       0.96      0.95      0.95       398

     accuracy                           0.95      4605
    macro avg       0.95      0.95      0.95      4605
 weighted avg       0.95      0.95      0.95      4605



# K-Nearest Neighbour

In [16]:
kn = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('model', KNeighborsClassifier(n_neighbors=5))])

kn_model = kn.fit(X_train, y_train)

kn_pred = kn_model.predict(X_test)

print("Accuracy: ", round(accuracy_score(y_test, kn_pred)*100,2), "\b%")
print("Classification Report:\n\n", classification_report(y_test, kn_pred,))

Accuracy:  89.19 %
Classification Report:

                precision    recall  f1-score   support

     Business       0.87      0.84      0.85       386
    Education       0.88      0.89      0.89       387
Entertainment       0.92      0.89      0.91       392
  Environment       0.89      0.90      0.89       385
      Finance       0.90      0.89      0.90       383
       Health       0.91      0.90      0.90       380
       Nation       0.89      0.90      0.90       377
     Politics       0.89      0.89      0.89       371
      Science       0.89      0.94      0.91       381
       Sports       0.91      0.86      0.89       384
   Technology       0.87      0.92      0.89       381
        World       0.90      0.88      0.89       398

     accuracy                           0.89      4605
    macro avg       0.89      0.89      0.89      4605
 weighted avg       0.89      0.89      0.89      4605



# Decision Tree

In [17]:
dtc = Pipeline([('vect', CountVectorizer()),
                 ('tfidf', TfidfTransformer()),
                 ('model', DecisionTreeClassifier(criterion= 'gini',
                                           max_depth = 1000, 
                                           splitter='best', 
                                           random_state=42))])
dtc_model = dtc.fit(X_train, y_train)

dtc_pred = dtc_model.predict(X_test)

print("Accuracy: ", round(accuracy_score(y_test, dtc_pred)*100,2), "\b%")
print("Classification Report:\n\n", classification_report(y_test, dtc_pred))

Accuracy:  95.74 %
Classification Report:

                precision    recall  f1-score   support

     Business       0.97      0.93      0.95       386
    Education       0.96      0.97      0.96       387
Entertainment       0.97      0.95      0.96       392
  Environment       0.97      0.97      0.97       385
      Finance       0.96      0.96      0.96       383
       Health       0.98      0.96      0.97       380
       Nation       0.97      0.98      0.98       377
     Politics       0.96      0.93      0.95       371
      Science       0.95      0.97      0.96       381
       Sports       0.88      0.97      0.92       384
   Technology       0.97      0.96      0.96       381
        World       0.98      0.93      0.96       398

     accuracy                           0.96      4605
    macro avg       0.96      0.96      0.96      4605
 weighted avg       0.96      0.96      0.96      4605



# Saving Results in DataFrame 

In [18]:
res = []
res = pd.DataFrame(res)
res["models"] = ["SVM-tfidf", "SVM", "Naive Bayes", "Logistic Regression", "KNN", "Decision Tree", "Random Forest"]
res["accuracy"] = [round(accuracy_score(y_test, sv_tfidf_pred)*100,2), round(accuracy_score(y_test, sv_pred)*100,2), round(accuracy_score(y_test, nb_pred)*100,2), round(accuracy_score(y_test, lr_pred)*100,2), round(accuracy_score(y_test, kn_pred)*100,2), round(accuracy_score(y_test, dtc_pred)*100,2), round(accuracy_score(y_test, rfc_pred)*100,2)]
res = res.sort_values("accuracy", ascending=False).reset_index(drop=True)
res.to_csv('../DATASET/model_results.csv', index=False)
res

Unnamed: 0,models,accuracy
0,Random Forest,96.55
1,SVM,96.35
2,Logistic Regression,96.24
3,Decision Tree,95.74
4,SVM-tfidf,93.38
5,Naive Bayes,93.38
6,KNN,89.19


In [None]:
res = pd.read_csv("../DATASET/model_results.csv")
user_input = input("Enter a news to predict its tags: ")
res["predicted"] = [sv_tfidf_model.predict([user_input])[0], sv_model.predict([user_input])[0], nb_model.predict([user_input])[0], lr_model.predict([user_input])[0], kn_model.predict([user_input])[0], dtc_model.predict([user_input])[0], rfc_model.predict([user_input])[0]]
res