In [1]:
import pandas as pd


# Read our dataset using read_csv()
bbc_text = pd.read_csv(r"bbc-text.txt")
bbc_text=bbc_text.rename(columns = {'text': 'News_Headline'}, inplace = False)
bbc_text.head()

Unnamed: 0,category,News_Headline
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


In [2]:
bbc_text.category = bbc_text.category.map({'tech':0, 'business':1, 'sport':2, 'entertainment':3, 'politics':4})
bbc_text.category.unique()

array([0, 1, 2, 3, 4], dtype=int64)

In [3]:
from sklearn.model_selection import train_test_split
X = bbc_text.News_Headline
y = bbc_text.category
#split data
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.6, random_state = 1)

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
vector = TfidfVectorizer(stop_words = 'english',lowercase=False)
# fit the vectorizer on the training data
vector.fit(X_train)
vector.vocabulary_


{'microsoft': 13830,
 'makes': 13193,
 'anti': 1986,
 'piracy': 15982,
 'says': 18570,
 'clamping': 4647,
 'people': 15724,
 'running': 18349,
 'pirated': 15984,
 'versions': 22489,
 'windows': 23102,
 'operating': 15047,
 'restricting': 17858,
 'access': 1213,
 'security': 18823,
 'features': 8421,
 'genuine': 9383,
 'advantage': 1444,
 'scheme': 18632,
 'means': 13618,
 'prove': 16740,
 'software': 19653,
 'mid': 13833,
 '2005': 365,
 'allow': 1728,
 'unauthorised': 21937,
 'copies': 5482,
 'crucial': 5791,
 'fixes': 8677,
 'automatic': 2466,
 'updates': 22253,
 'options': 15086,
 'limited': 12739,
 'releases': 17564,
 'regular': 17487,
 'protect': 16715,
 'pcs': 15646,
 'detect': 6493,
 'automatically': 2467,
 'users': 22327,
 'manually': 13296,
 'download': 7079,
 'site': 19392,
 'programs': 16624,
 'downloads': 7084,
 'add': 1351,
 'ons': 15019,
 'giant': 9440,
 'offers': 14939,
 'try': 21775,
 'patches': 15569,
 'let': 12600,
 'run': 18341,
 'automated': 2465,
 'checking': 4448,


In [5]:
X_transformed = vector.transform(X_train)
X_transformed.toarray()
# for test data
X_test_transformed = vector.transform(X_test)

In [6]:
from sklearn.naive_bayes import MultinomialNB
naivebayes = MultinomialNB()
naivebayes.fit(X_transformed, y_train)

MultinomialNB()

In [7]:
from sklearn.metrics import classification_report
print(classification_report(naivebayes.predict(X_test_transformed), y_test))

              precision    recall  f1-score   support

           0       0.95      0.99      0.97       150
           1       0.98      0.96      0.97       214
           2       0.99      0.99      0.99       198
           3       0.96      1.00      0.98       152
           4       0.98      0.94      0.96       176

    accuracy                           0.97       890
   macro avg       0.97      0.97      0.97       890
weighted avg       0.97      0.97      0.97       890



In [8]:
headline1 = ['Portugal crash out of FIFA World Cup 2022, Ronaldo in tears']
vec = vector.transform(headline1).toarray()
print('Headline:', headline1)
print(str(list(naivebayes.predict(vec))[0]).replace('0', 'TECH').replace('1', 'BUSINESS').replace('2', 'SPORTS').replace('3','ENTERTAINMENT').replace('4','POLITICS'))
#naivebayes.predict(vec)[0]

Headline: ['Portugal crash out of FIFA World Cup 2022, Ronaldo in tears']
SPORTS


In [9]:
headline1 = ['There will be recession throughout the world as predicted by world bank']
vec = vector.transform(headline1).toarray()
print('Headline:', headline1)
print(str(list(naivebayes.predict(vec))[0]).replace('0', 'TECH').replace('1', 'BUSINESS').replace('2', 'SPORTS').replace('3','ENTERTAINMENT').replace('4','POLITICS'))

Headline: ['There will be recession throughout the world as predicted by world bank']
BUSINESS


In [10]:
#to save the model
import pickle

saved_model = pickle.dumps(naivebayes)

In [11]:
#load saved model
s = pickle.loads(saved_model)
s

MultinomialNB()

In [12]:
headline1 = ['There will be recession throughout the world as predicted by world bank']
vec = vector.transform(headline1).toarray()

s.predict(vec)[0]

1