In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from gensim.models import Word2Vec
from sklearn.preprocessing import MinMaxScaler

import pickle
import numpy as np

In [2]:
bbc_text = pd.read_csv('bbc-text.txt')
review=bbc_text.rename(columns = {'text': 'News_Headline'}, inplace = False)
review.category = bbc_text.category.map({'tech':0, 'business':1, 'sport':2, 'entertainment':3, 'politics':4})

In [3]:
X = review.News_Headline
y = review.category
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.6, random_state=1)

In [4]:
# Tokenize the text data
X_train_tokenized = [review.split() for review in X_train]
X_test_tokenized = [review.split() for review in X_test]

In [5]:
# Train the Word2Vec model
word2vec_model = Word2Vec(sentences=X_train_tokenized, vector_size=100, window=5, min_count=1, workers=4)


In [6]:
# Transform the text data to word vectors
X_train_transformed = []
for review in X_train_tokenized:
    vectors = [word2vec_model.wv[word] for word in review if word in word2vec_model.wv]
    if vectors:
        review_vector = np.mean(vectors, axis=0)
        X_train_transformed.append(review_vector)
X_test_transformed = []
for review in X_test_tokenized:
    vectors = [word2vec_model.wv[word] for word in review if word in word2vec_model.wv]
    if vectors:
        review_vector = np.mean(vectors, axis=0)
        X_test_transformed.append(review_vector)

In [7]:
# Reshape the word vectors to 2D
X_train_transformed = np.vstack(X_train_transformed)
X_test_transformed = np.vstack(X_test_transformed)

In [8]:
# Scale the word vectors to make them non-negative
scaler = MinMaxScaler()
X_train_transformed = scaler.fit_transform(X_train_transformed)
X_test_transformed = scaler.transform(X_test_transformed)

In [9]:
# Train the model
naivebayes = MultinomialNB()
naivebayes.fit(X_train_transformed, y_train)

MultinomialNB()

In [10]:
from sklearn.metrics import classification_report
print(classification_report(naivebayes.predict(X_test_transformed), y_test))

              precision    recall  f1-score   support

           0       0.75      0.71      0.73       163
           1       0.81      0.63      0.71       272
           2       0.72      0.77      0.75       184
           3       0.31      0.53      0.39        93
           4       0.73      0.69      0.71       178

    accuracy                           0.68       890
   macro avg       0.66      0.67      0.66       890
weighted avg       0.71      0.68      0.69       890



In [11]:
headline1 = ['There will be recession throughout the world as predicted by world bank']
headline1_tokenized = headline1[0].split()
headline1_vectors = [word2vec_model.wv[word] for word in headline1_tokenized if word in word2vec_model.wv]
if headline1_vectors:
    headline1_vector = np.mean(headline1_vectors, axis=0)
    headline1_transformed = scaler.transform(headline1_vector.reshape(1, -1))

prediction = naivebayes.predict(headline1_transformed)
k = str(list(prediction)[0]).replace('0', 'TECH').replace('1', 'BUSINESS').replace('2', 'SPORTS').replace('3', 'ENTERTAINMENT').replace('4', 'POLITICS')
print('Headline:', headline1)
print('Prediction:', k)


Headline: ['There will be recession throughout the world as predicted by world bank']
Prediction: BUSINESS
