In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier



from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

import warnings
warnings.filterwarnings('ignore')

# Load Preprocessed Data

In [2]:
# Read the preprocessed news dataset from disk into a DataFrame.
df = pd.read_csv(filepath_or_buffer='DATASET/PreprocessedData.csv')

In [3]:
# Preview five randomly chosen rows (unseeded, so the rows differ on every run).
df.sample(n=5)

Unnamed: 0.1,Unnamed: 0,title,Published_on,News_agency,URL,summary,Country,language,tag,Merged
1135,1135,Qué son pestañas inactivas en Google Chrome y ...,"Sun, 18 Feb 2024 12:56:00 GMT",infobae,https://news.google.com/rss/articles/CBMiZ2h0d...,Qué son pestañas inactivas en Google Chrome y ...,US,es,Technology,pestañas inactivas google chrome activarlas in...
2355,2355,Crise imminente : l'Amazonie au bord du point ...,"Sun, 18 Feb 2024 15:00:00 GMT",Tameteo.com,https://news.google.com/rss/articles/CBMimAFod...,Crise imminente : l'Amazonie au bord du point ...,CA,fr,Science,crise imminente lamazonie point basculement es...
1202,1202,"Nieta de ""El Chapo"" Guzmán, Frida Guzmán, es c...","Mon, 19 Feb 2024 00:08:00 GMT",Marca USA,https://news.google.com/rss/articles/CBMiSGh0d...,"Nieta de ""El Chapo"" Guzmán, Frida Guzmán, es c...",US,es,Entertainment,nieta chapo guzmán frida guzmán captada cantan...
2194,2194,L'aviez-vous remarqué dans Google Maps sur And...,"Sun, 18 Feb 2024 13:10:01 GMT",Génération NT,https://news.google.com/rss/articles/CBMiSmh0d...,L'aviez-vous remarqué dans Google Maps sur And...,CA,fr,Technology,laviezvous remarqué google android génération ...
1634,1634,How to build a lot of housing that isn't ugly ...,"Sat, 17 Feb 2024 11:00:00 GMT",The Globe and Mail,https://news.google.com/rss/articles/CBMiXGh0d...,How to build a lot of housing that isn't ugly ...,CA,en,Business,build housing globe build housing globe


# Splitting into training and testing data

In [4]:
# Hold out 30% of the rows for evaluation: the 'Merged' text column is the
# feature and 'tag' is the label. The seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['Merged'],
    df['tag'],
    test_size=0.3,
    random_state=42,
)


# Random Forest Classifier

In [5]:
# Token counts -> TF-IDF weights -> random forest, wrapped in one pipeline so
# the same text transformation is applied at fit and at predict time.
rfc = Pipeline([('vect', CountVectorizer()),
                 ('tfidf', TfidfTransformer()),
                 ('model', RandomForestClassifier(n_estimators=5000,
                                                  criterion="gini",
                                                  # Seed the bootstrap/feature sampling so a
                                                  # Restart & Run All reproduces the same score
                                                  # (same seed as the train/test split).
                                                  random_state=42,
                                                  # Trees are independent, so build them on all
                                                  # available cores; results do not depend on this.
                                                  n_jobs=-1))])

# Pipeline.fit returns the pipeline itself, so rfc_model is the fitted rfc.
rfc_model = rfc.fit(X_train, y_train)
rfc_pred = rfc_model.predict(X_test)
print("accuracy:", round(accuracy_score(y_test, rfc_pred)*100,2), "\b%")
print("Classification Report:\n\n", classification_report(y_test, rfc_pred))

accuracy: 66.82 %
Classification Report:

                precision    recall  f1-score   support

     Business       0.60      0.40      0.48        86
Entertainment       0.77      0.63      0.69       147
       Health       0.90      0.68      0.77       111
       Nation       0.62      0.60      0.61       107
      Science       0.41      0.89      0.56       109
       Sports       0.88      0.79      0.83       120
   Technology       0.78      0.69      0.74       118
        World       0.74      0.60      0.67        91

     accuracy                           0.67       889
    macro avg       0.71      0.66      0.67       889
 weighted avg       0.72      0.67      0.68       889



# Naive Bayes

In [6]:
# Multinomial Naive Bayes on TF-IDF weighted token counts.
nb = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('model', MultinomialNB(alpha=1.0, fit_prior=True, class_prior=None)),
])

# Pipeline.fit returns the pipeline itself, so nb_model is the fitted nb.
nb_model = nb.fit(X_train, y_train)
nb_pred = nb_model.predict(X_test)

# Report held-out accuracy as a percentage, then the per-class metrics.
nb_accuracy = round(accuracy_score(y_test, nb_pred)*100, 2)
print("Accuracy: ", nb_accuracy, "\b%")
print("Classification Report:\n\n", classification_report(y_test, nb_pred))

Accuracy:  75.59 %
Classification Report:

                precision    recall  f1-score   support

     Business       0.80      0.37      0.51        86
Entertainment       0.86      0.82      0.84       147
       Health       0.90      0.79      0.84       111
       Nation       0.54      0.73      0.62       107
      Science       0.75      0.84      0.79       109
       Sports       0.92      0.91      0.92       120
   Technology       0.61      0.81      0.70       118
        World       0.82      0.62      0.70        91

     accuracy                           0.76       889
    macro avg       0.78      0.74      0.74       889
 weighted avg       0.78      0.76      0.75       889



# SVM

In [7]:
# Linear-kernel support vector classifier on TF-IDF weighted token counts.
sv = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('model', svm.SVC(kernel='linear', C=1.0, max_iter=5000)),
])

# Pipeline.fit returns the pipeline itself, so sv_model is the fitted sv.
sv_model = sv.fit(X_train, y_train)
sv_pred = sv_model.predict(X_test)

# Report held-out accuracy as a percentage, then the per-class metrics.
sv_accuracy = round(accuracy_score(y_test, sv_pred)*100, 2)
print("Accuracy: ", sv_accuracy, "\b%")
print("Classification Report:\n\n", classification_report(y_test, sv_pred))

Accuracy:  74.35 %
Classification Report:

                precision    recall  f1-score   support

     Business       0.56      0.55      0.55        86
Entertainment       0.81      0.76      0.79       147
       Health       0.92      0.76      0.83       111
       Nation       0.53      0.73      0.61       107
      Science       0.71      0.82      0.76       109
       Sports       0.90      0.89      0.90       120
   Technology       0.77      0.75      0.76       118
        World       0.80      0.62      0.70        91

     accuracy                           0.74       889
    macro avg       0.75      0.73      0.74       889
 weighted avg       0.76      0.74      0.75       889



# Logistic Regression

In [8]:
# L2-regularised logistic regression on TF-IDF weighted token counts.
# - multi_class is no longer passed: the parameter is deprecated in recent
#   scikit-learn releases, and the default already fits a multinomial model
#   for a multiclass target with the 'saga' solver.
# - 'saga' shuffles the data while optimising, so it is seeded to make the
#   fitted model (and the reported score) repeatable.
# - class_weight='balanced' reweights classes inversely to their frequency.
lr = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('model', LogisticRegression(penalty='l2',
                                            C=1,
                                            solver='saga',
                                            max_iter=1000,
                                            class_weight='balanced',
                                            random_state=42))])

# Pipeline.fit returns the pipeline itself, so lr_model is the fitted lr.
lr_model = lr.fit(X_train, y_train)

lr_pred = lr_model.predict(X_test)

print("Accuracy: ", round(accuracy_score(y_test, lr_pred)*100,2), "\b%")
print("Classification Report:\n\n", classification_report(y_test, lr_pred))

Accuracy:  75.48 %
Classification Report:

                precision    recall  f1-score   support

     Business       0.58      0.59      0.59        86
Entertainment       0.81      0.78      0.79       147
       Health       0.92      0.78      0.84       111
       Nation       0.57      0.68      0.62       107
      Science       0.73      0.88      0.80       109
       Sports       0.88      0.89      0.89       120
   Technology       0.79      0.71      0.75       118
        World       0.76      0.65      0.70        91

     accuracy                           0.75       889
    macro avg       0.75      0.75      0.75       889
 weighted avg       0.76      0.75      0.76       889



# Gradient Boosting

In [9]:
# scikit-learn gradient-boosted trees on TF-IDF weighted token counts.
gb = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('model', GradientBoostingClassifier(
        n_estimators=100,
        learning_rate=1.0,
        max_depth=5,
        random_state=0,
    )),
])

# Pipeline.fit returns the pipeline itself, so gb_model is the fitted gb.
gb_model = gb.fit(X_train, y_train)
gb_pred = gb_model.predict(X_test)

# Report held-out accuracy as a percentage, then the per-class metrics.
gb_accuracy = round(accuracy_score(y_test, gb_pred)*100, 2)
print("Accuracy: ", gb_accuracy, "\b%")
print("Classification Report:\n\n", classification_report(y_test, gb_pred))

Accuracy:  59.96 %
Classification Report:

                precision    recall  f1-score   support

     Business       0.56      0.37      0.45        86
Entertainment       0.78      0.54      0.64       147
       Health       0.71      0.60      0.65       111
       Nation       0.53      0.49      0.51       107
      Science       0.74      0.63      0.68       109
       Sports       0.83      0.68      0.75       120
   Technology       0.38      0.82      0.52       118
        World       0.62      0.59      0.61        91

     accuracy                           0.60       889
    macro avg       0.64      0.59      0.60       889
 weighted avg       0.65      0.60      0.61       889



# K-Nearest Neighbour

In [10]:
# 5-nearest-neighbours classifier on TF-IDF weighted token counts.
kn = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('model', KNeighborsClassifier(n_neighbors=5)),
])

# Pipeline.fit returns the pipeline itself, so kn_model is the fitted kn.
kn_model = kn.fit(X_train, y_train)
kn_pred = kn_model.predict(X_test)

# Report held-out accuracy as a percentage, then the per-class metrics.
kn_accuracy = round(accuracy_score(y_test, kn_pred)*100, 2)
print("Accuracy: ", kn_accuracy, "\b%")
print("Classification Report:\n\n", classification_report(y_test, kn_pred))

Accuracy:  58.83 %
Classification Report:

                precision    recall  f1-score   support

     Business       0.39      0.52      0.45        86
Entertainment       0.67      0.62      0.65       147
       Health       0.62      0.66      0.64       111
       Nation       0.48      0.52      0.50       107
      Science       0.51      0.61      0.56       109
       Sports       0.78      0.72      0.75       120
   Technology       0.68      0.57      0.62       118
        World       0.59      0.42      0.49        91

     accuracy                           0.59       889
    macro avg       0.59      0.58      0.58       889
 weighted avg       0.60      0.59      0.59       889



# Decision Tree

In [11]:
# Single decision tree (Gini impurity, seeded) on TF-IDF weighted token counts.
dtc = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('model', DecisionTreeClassifier(
        criterion='gini',
        max_depth=1000,
        splitter='best',
        random_state=42,
    )),
])

# Pipeline.fit returns the pipeline itself, so dtc_model is the fitted dtc.
dtc_model = dtc.fit(X_train, y_train)
dtc_pred = dtc_model.predict(X_test)

# Report held-out accuracy as a percentage, then the per-class metrics.
dtc_accuracy = round(accuracy_score(y_test, dtc_pred)*100, 2)
print("Accuracy: ", dtc_accuracy, "\b%")
print("Classification Report:\n\n", classification_report(y_test, dtc_pred))

Accuracy:  60.18 %
Classification Report:

                precision    recall  f1-score   support

     Business       0.37      0.33      0.35        86
Entertainment       0.59      0.50      0.54       147
       Health       0.79      0.66      0.72       111
       Nation       0.56      0.50      0.53       107
      Science       0.63      0.68      0.65       109
       Sports       0.78      0.65      0.71       120
   Technology       0.48      0.80      0.60       118
        World       0.69      0.67      0.68        91

     accuracy                           0.60       889
    macro avg       0.61      0.60      0.60       889
 weighted avg       0.62      0.60      0.60       889



# Saving Results in DataFrame 

In [12]:
# Test-set predictions keyed by the display name used in the summary table.
# (The gradient boosting predictions, gb_pred, are not part of this summary.)
predictions_by_model = {
    "SVM": sv_pred,
    "Naive Bayes": nb_pred,
    "Logistic Regression": lr_pred,
    "KNN": kn_pred,
    "Decision Tree": dtc_pred,
    "Random Forest": rfc_pred,
}

# One row per model with its accuracy in percent, best model first.
res = (
    pd.DataFrame({
        "models": list(predictions_by_model),
        "accuracy": [
            round(accuracy_score(y_test, pred)*100, 2)
            for pred in predictions_by_model.values()
        ],
    })
    .sort_values("accuracy", ascending=False)
    .reset_index(drop=True)
)

# Persist the ranking (without the index column) and display it.
res.to_csv('DATASET/model_results.csv', index=False)
res

Unnamed: 0,models,accuracy
0,Naive Bayes,75.59
1,Logistic Regression,75.48
2,SVM,74.35
3,Random Forest,66.82
4,Decision Tree,60.18
5,KNN,58.83


In [13]:
# Reload the saved ranking and tag a headline typed by the user with every model.
res = pd.read_csv("DATASET/model_results.csv")
user_input = input("Enter a news to predict its tags: ")

# The saved table is sorted by accuracy, so its row order is not a fixed model
# order. Look each fitted pipeline up by the name stored in the row so that a
# prediction always lands next to the model that produced it.
fitted_models = {
    "SVM": sv_model,
    "Naive Bayes": nb_model,
    "Logistic Regression": lr_model,
    "KNN": kn_model,
    "Decision Tree": dtc_model,
    "Random Forest": rfc_model,
}

# NOTE(review): the pipelines were trained on the preprocessed 'Merged' column,
# while the raw input is passed here unchanged - confirm whether the same
# preprocessing should be applied to user_input first.
res["predicted"] = [
    fitted_models[model_name].predict([user_input])[0]
    for model_name in res["models"]
]
res

Enter a news to predict its tags: L'aviez-vous remarqué dans Google Maps sur And..


Unnamed: 0,models,accuracy,predicted
0,Naive Bayes,75.59,Technology
1,Logistic Regression,75.48,Technology
2,SVM,74.35,Technology
3,Random Forest,66.82,Technology
4,Decision Tree,60.18,Technology
5,KNN,58.83,Technology
