In [1]:
import pandas as pd
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix
import joblib


In [2]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [6]:
df = pd.read_csv('news_data.csv')

In [7]:
df

Unnamed: 0,category,text
0,tech,New app ChatterAI trends among users
1,sports,Real Madrid wins the championship after budget...
2,politics,Leader David Kim addresses nation on education...
3,entertainment,Streaming platform releases The Last Dance
4,entertainment,Director Samantha Lee plans sequel to Space Rush
...,...,...
95,politics,Opposition questions decision on public health...
96,entertainment,Streaming platform releases War and Hope
97,sports,Manchester United wins the championship after ...
98,tech,Tech conference showcases brain-computer inter...


In [10]:
df['cleaned_text'] = df['text'].apply(clean_text)

In [11]:
df

Unnamed: 0,category,text,cleaed_text,cleaned_text
0,tech,New app ChatterAI trends among users,new app chatterai trends among users,new app chatterai trends among users
1,sports,Real Madrid wins the championship after budget...,real madrid wins the championship after budget...,real madrid wins the championship after budget...
2,politics,Leader David Kim addresses nation on education...,leader david kim addresses nation on education...,leader david kim addresses nation on education...
3,entertainment,Streaming platform releases The Last Dance,streaming platform releases the last dance,streaming platform releases the last dance
4,entertainment,Director Samantha Lee plans sequel to Space Rush,director samantha lee plans sequel to space rush,director samantha lee plans sequel to space rush
...,...,...,...,...
95,politics,Opposition questions decision on public health...,opposition questions decision on public health...,opposition questions decision on public health...
96,entertainment,Streaming platform releases War and Hope,streaming platform releases war and hope,streaming platform releases war and hope
97,sports,Manchester United wins the championship after ...,manchester united wins the championship after ...,manchester united wins the championship after ...
98,tech,Tech conference showcases brain-computer inter...,tech conference showcases braincomputer interface,tech conference showcases braincomputer interface


In [13]:
df = df.drop('cleaed_text',axis=1)

In [14]:
df

Unnamed: 0,category,text,cleaned_text
0,tech,New app ChatterAI trends among users,new app chatterai trends among users
1,sports,Real Madrid wins the championship after budget...,real madrid wins the championship after budget...
2,politics,Leader David Kim addresses nation on education...,leader david kim addresses nation on education...
3,entertainment,Streaming platform releases The Last Dance,streaming platform releases the last dance
4,entertainment,Director Samantha Lee plans sequel to Space Rush,director samantha lee plans sequel to space rush
...,...,...,...
95,politics,Opposition questions decision on public health...,opposition questions decision on public health...
96,entertainment,Streaming platform releases War and Hope,streaming platform releases war and hope
97,sports,Manchester United wins the championship after ...,manchester united wins the championship after ...
98,tech,Tech conference showcases brain-computer inter...,tech conference showcases braincomputer interface


In [15]:
vectorizer = TfidfVectorizer()
x = vectorizer.fit_transform(df['cleaned_text'])

In [16]:
x.toarray()

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.32520144, ..., 0.        , 0.        ,
        0.32520144],
       [0.        , 0.36734139, 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.32846332, ..., 0.        , 0.        ,
        0.32846332],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.46269286, 0.        ,
        0.        ]])

In [18]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['catagory_encoded'] = le.fit_transform(df['category'])

In [19]:
df

Unnamed: 0,category,text,cleaned_text,catagory_encoded
0,tech,New app ChatterAI trends among users,new app chatterai trends among users,4
1,sports,Real Madrid wins the championship after budget...,real madrid wins the championship after budget...,3
2,politics,Leader David Kim addresses nation on education...,leader david kim addresses nation on education...,2
3,entertainment,Streaming platform releases The Last Dance,streaming platform releases the last dance,1
4,entertainment,Director Samantha Lee plans sequel to Space Rush,director samantha lee plans sequel to space rush,1
...,...,...,...,...
95,politics,Opposition questions decision on public health...,opposition questions decision on public health...,2
96,entertainment,Streaming platform releases War and Hope,streaming platform releases war and hope,1
97,sports,Manchester United wins the championship after ...,manchester united wins the championship after ...,3
98,tech,Tech conference showcases brain-computer inter...,tech conference showcases braincomputer interface,4


In [20]:
y = df['catagory_encoded']

In [21]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=.2,random_state=42)

In [22]:
model = MultinomialNB()
model.fit(x_train,y_train)

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [23]:
y_pred = model.predict(x_test)

In [24]:
print(confusion_matrix(y_test,y_pred))

[[4 0 0 0 0]
 [0 5 0 0 0]
 [0 0 1 0 0]
 [0 0 0 2 0]
 [0 0 0 0 8]]


In [25]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         4
           1       1.00      1.00      1.00         5
           2       1.00      1.00      1.00         1
           3       1.00      1.00      1.00         2
           4       1.00      1.00      1.00         8

    accuracy                           1.00        20
   macro avg       1.00      1.00      1.00        20
weighted avg       1.00      1.00      1.00        20

