In [14]:
import pandas as pd
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

In [15]:
# Load the news-article dataset from the CSV file in the working directory.
text_dataframe = pd.read_csv(filepath_or_buffer='bbc-text.csv')

In [16]:
# Preview the first five rows to check which columns were loaded.
text_dataframe.head(n=5)

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


In [17]:
# Integer code assigned to each news category label.
CATEGORY_CODES = {'tech': 0, 'business': 1, 'sport': 2, 'entertainment': 3, 'politics': 4}

# Encode only while the column still holds the raw string labels. Series.map()
# turns every value missing from the dict into NaN, so re-running this cell on
# the already-encoded integers would otherwise wipe out all labels.
if not pd.api.types.is_numeric_dtype(text_dataframe['category']):
    text_dataframe['category'] = text_dataframe['category'].map(CATEGORY_CODES)

# Show the distinct codes now present in the column.
print(text_dataframe['category'].unique())

[0 1 2 3 4]


In [18]:
# Hold out 20% of the articles for evaluation; the fixed random_state makes
# the split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    text_dataframe['text'],
    text_dataframe['category'],
    train_size=0.8,
    random_state=1,
)

In [19]:
# Learn the TF-IDF vocabulary (English stop words excluded) from the training
# split only, so nothing from the held-out documents influences the features.
vectorizer = TfidfVectorizer(stop_words = 'english')
vectorizer.fit(x_train)
# Report the vocabulary size. `vocabulary_` (term -> column index) is used
# instead of `get_feature_names()`, which was removed in scikit-learn 1.2;
# both have one entry per feature, so the printed count is the same.
print(len(vectorizer.vocabulary_))

26758


In [20]:
# Turn the training documents into TF-IDF vectors using the fitted vectorizer.
x_train_transform = vectorizer.transform(raw_documents=x_train)
# Printing the result lists its stored entries as "(row, column)  weight".
print(x_train_transform)

  (0, 26717)	0.05587807764963656
  (0, 26617)	0.025587873496872852
  (0, 26574)	0.0479150950550967
  (0, 26456)	0.01744230097795749
  (0, 26447)	0.05147297013390133
  (0, 26433)	0.021094448610051584
  (0, 26326)	0.06133429943683824
  (0, 26325)	0.03700265148069332
  (0, 26284)	0.037368841418367195
  (0, 26106)	0.04028429947661313
  (0, 26090)	0.043209122989996414
  (0, 26052)	0.06133429943683824
  (0, 25986)	0.06810638100165958
  (0, 25916)	0.028354744552360873
  (0, 25914)	0.021558339775093544
  (0, 25745)	0.04547365963606735
  (0, 25741)	0.07631332957506771
  (0, 25644)	0.028596968713628098
  (0, 25417)	0.025529249348903986
  (0, 25411)	0.07673317654216817
  (0, 25406)	0.02199792290947901
  (0, 25404)	0.04437262721285016
  (0, 25341)	0.042825063205568885
  (0, 25065)	0.03443110294865876
  (0, 24951)	0.06133429943683824
  :	:
  (1779, 5972)	0.05670468826722295
  (1779, 4956)	0.08932937325051649
  (1779, 4253)	0.053559219283503136
  (1779, 4179)	0.07966291285888974
  (1779, 3881)	0.075

In [21]:
# Multinomial Naive Bayes classifier with its default settings, trained on the
# TF-IDF vectors of the training documents and their category codes.
naive_bayes = MultinomialNB()
# Kept as the cell's last expression so the fitted estimator is displayed.
naive_bayes.fit(X=x_train_transform, y=y_train)

MultinomialNB()

In [22]:
# Vectorize the held-out documents with the same vectorizer (transform only,
# no re-fitting), so their columns line up with the training features.
x_test_transform = vectorizer.transform(raw_documents=x_test)
# Printing the result lists its stored entries as "(row, column)  weight".
print(x_test_transform)

  (0, 26621)	0.033062176504526614
  (0, 26521)	0.06457369953344005
  (0, 26450)	0.06526480396448622
  (0, 26136)	0.06327978050066899
  (0, 26081)	0.10345352695760723
  (0, 26016)	0.03635213315711279
  (0, 25979)	0.07828512935272965
  (0, 25914)	0.04131773847114211
  (0, 25872)	0.05751117411629345
  (0, 25678)	0.09663624729398569
  (0, 25536)	0.050262684583750966
  (0, 24782)	0.05422955959050565
  (0, 24582)	0.07312933709504633
  (0, 24355)	0.03431162634212532
  (0, 24275)	0.029701305129224018
  (0, 24140)	0.05201782990715864
  (0, 24036)	0.11143350353323332
  (0, 23404)	0.08440216866648166
  (0, 22999)	0.06898905241769764
  (0, 22882)	0.05035014265615312
  (0, 22863)	0.07437002012780079
  (0, 22613)	0.1362199599561635
  (0, 22501)	0.071619430475693
  (0, 22166)	0.07235550207934183
  (0, 21911)	0.05603487516340902
  :	:
  (444, 10114)	0.05425989609526761
  (444, 9826)	0.07617048826137043
  (444, 9533)	0.08579251046094569
  (444, 9137)	0.07101451147507185
  (444, 8884)	0.0710145114750718

In [23]:
# Predict a category code for every held-out document.
y_predict = naive_bayes.predict(X=x_test_transform)
# Share of correct predictions, as a percentage rounded to 4 decimals.
accuracy_pct = round(metrics.accuracy_score(y_true=y_test, y_pred=y_predict) * 100, 4)
print(f"Accuracy = {accuracy_pct}%")

Accuracy = 98.427%


In [24]:
# Cross-tabulate the true category codes against the predicted ones for the
# held-out documents and show the resulting count matrix.
confusion_matrix = metrics.confusion_matrix(y_true=y_test, y_pred=y_predict)
print(confusion_matrix)

[[ 75   1   0   0   3]
 [  0 114   0   0   0]
 [  0   1 107   0   0]
 [  0   0   0  68   0]
 [  0   1   1   0  74]]


In [27]:
# Micro-averaged precision and recall on the held-out documents, as
# percentages rounded to 4 decimals.
# NOTE(review): with exactly one label per document, micro-averaged precision
# and recall are both equal to accuracy, so these two figures repeat the
# accuracy cell; average='macro' would give per-class-balanced values instead.
precision_pct = round(metrics.precision_score(y_test, y_predict, average='micro') * 100, 4)
print(f"PRECISION SCORE : {precision_pct}%")
recall_pct = round(metrics.recall_score(y_test, y_predict, average='micro') * 100, 4)
print(f"RECALL SCORE : {recall_pct}%")

PRECISION SCORE : 98.427%
RECALL SCORE : 98.427%
