In [54]:
# 1. import required libraries

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB,MultinomialNB
from sklearn.metrics import accuracy_score,classification_report

In [55]:
# 3. Load the 20 Newsgroups training subset from its CSV file and preview the first rows.

train_path = "nlp_train.csv"
train_df = pd.read_csv(train_path)
train_df.head(5)

Unnamed: 0,text,target,category
0,I was wondering if anyone out there could enli...,7,rec.autos
1,A fair number of brave souls who upgraded thei...,4,comp.sys.mac.hardware
2,"well folks, my mac plus finally gave up the gh...",4,comp.sys.mac.hardware
3,\nDo you have Weitek's address/phone number? ...,1,comp.graphics
4,"From article <C5owCB.n3p@world.std.com>, by to...",14,sci.space


In [56]:
# (rows, columns) of the training frame as loaded, before any rows are dropped.
train_df.shape

(11314, 3)

In [57]:
# 4. Load the 20 Newsgroups test subset from its CSV file and preview the first rows.

test_path = "nlp_test.csv"
test_df = pd.read_csv(test_path)

test_df.head(5)



Unnamed: 0,text,target,category
0,I am a little confused on all of the models of...,7,rec.autos
1,I'm not familiar at all with the format of the...,5,comp.windows.x
2,"\nIn a word, yes.\n",0,alt.atheism
3,\nThey were attacking the Iraqis to drive them...,17,talk.politics.mideast
4,\nI've just spent two solid months arguing tha...,19,talk.religion.misc


In [58]:
# 5. Print all distinct target labels (category names) found in the train and test frames.

train_labels = train_df["category"].unique()
test_labels = test_df["category"].unique()

print("Target labels train df: ", train_labels)
print("\n\nTarget labels test df: ", test_labels)

Target labels train df:  ['rec.autos' 'comp.sys.mac.hardware' 'comp.graphics' 'sci.space'
 'talk.politics.guns' 'sci.med' 'comp.sys.ibm.pc.hardware'
 'comp.os.ms-windows.misc' 'rec.motorcycles' 'talk.religion.misc'
 'misc.forsale' 'alt.atheism' 'sci.electronics' 'comp.windows.x'
 'rec.sport.hockey' 'rec.sport.baseball' 'soc.religion.christian'
 'talk.politics.mideast' 'talk.politics.misc' 'sci.crypt']


Target labels test df:  ['rec.autos' 'comp.windows.x' 'alt.atheism' 'talk.politics.mideast'
 'talk.religion.misc' 'sci.med' 'soc.religion.christian' 'comp.graphics'
 'comp.os.ms-windows.misc' 'rec.motorcycles' 'comp.sys.mac.hardware'
 'misc.forsale' 'talk.politics.guns' 'sci.space'
 'comp.sys.ibm.pc.hardware' 'sci.crypt' 'rec.sport.baseball'
 'rec.sport.hockey' 'talk.politics.misc' 'sci.electronics']


In [59]:
# 6. Prepare train/test subsets restricted to the categories
#    alt.atheism, comp.graphics and sci.space.
# NOTE(review): this runs before the missing-text rows are dropped from train_df, and the
# later vectorizer/model cells read train_df / test_df rather than train_sub / test_sub --
# confirm whether the models were meant to be trained on this subset.

cats = ["alt.atheism", "comp.graphics", "sci.space"]

# Boolean row masks: True where the row's category is one of `cats`.
train_in_cats = train_df["category"].isin(cats)
test_in_cats = test_df["category"].isin(cats)

train_sub = train_df.loc[train_in_cats]
test_sub = test_df.loc[test_in_cats]

(train_sub.head(), test_sub.head())

(                                                 text  target       category
 3   \nDo you have Weitek's address/phone number?  ...       1  comp.graphics
 4   From article <C5owCB.n3p@world.std.com>, by to...      14      sci.space
 13  \n   {Description of "External Tank" option fo...      14      sci.space
 15  \nDon't be so sure.  Look what happened to Jap...       0    alt.atheism
 16  \nI certainly do use it whenever I have to do ...       1  comp.graphics,
                                                  text  target       category
 2                                 \nIn a word, yes.\n       0    alt.atheism
 9   :  \n: well, i have lots of experience with sc...       1  comp.graphics
 14  \nProbably because it IS rape.\n\n\nSo nothing...       0    alt.atheism
 17  Hello,\ni'm interested in those devices too.\n...       1  comp.graphics
 20  This is an invitation to send articles to the ...       1  comp.graphics)

In [60]:
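# 7, 8. Load train subset with the above three categories
# TODO: this cell contains no code; the cells that follow keep using the full train_df (all categories).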



In [61]:
# Count the missing values in each column of the training frame.
train_df.isna().sum()

text        218
target        0
category      0
dtype: int64

In [62]:
# Drop the training rows whose text is missing (train_df is modified in place).

train_df.dropna(axis=0, subset=["text"], inplace=True)

train_df.head(5)

Unnamed: 0,text,target,category
0,I was wondering if anyone out there could enli...,7,rec.autos
1,A fair number of brave souls who upgraded thei...,4,comp.sys.mac.hardware
2,"well folks, my mac plus finally gave up the gh...",4,comp.sys.mac.hardware
3,\nDo you have Weitek's address/phone number? ...,1,comp.graphics
4,"From article <C5owCB.n3p@world.std.com>, by to...",14,sci.space


In [63]:
# Re-count missing values per column of the training frame after the rows with missing text were dropped.
train_df.isna().sum()

text        0
target      0
category    0
dtype: int64

In [64]:
# 9. Print the distinct target labels currently present in the training frame.

new_train_labels = train_df["category"].unique()
print("Target labels new train df: ", new_train_labels)

Target labels new train df:  ['rec.autos' 'comp.sys.mac.hardware' 'comp.graphics' 'sci.space'
 'talk.politics.guns' 'sci.med' 'comp.sys.ibm.pc.hardware'
 'comp.os.ms-windows.misc' 'rec.motorcycles' 'talk.religion.misc'
 'misc.forsale' 'alt.atheism' 'sci.electronics' 'comp.windows.x'
 'rec.sport.hockey' 'rec.sport.baseball' 'soc.religion.christian'
 'talk.politics.mideast' 'talk.politics.misc' 'sci.crypt']


In [65]:
# 10. Print the text of the 5th training article (positional row 4 of the "text" column).

fifth_article = train_df["text"].iloc[4]
print("5th article : \n", fifth_article)

5th article : 
 From article <C5owCB.n3p@world.std.com>, by tombaker@world.std.com (Tom A Baker):


My understanding is that the 'expected errors' are basically
that don't have the right values in yet because they aren't
set till after launch, and suchlike. Rather than fix the code
and possibly introduce new bugs, they just tell the crew


In [66]:
# Same article fetched purely by position: row 4, column 0 (the first column of train_df).
print("5th article : ", train_df.iat[4, 0])

5th article :  From article <C5owCB.n3p@world.std.com>, by tombaker@world.std.com (Tom A Baker):


My understanding is that the 'expected errors' are basically
that don't have the right values in yet because they aren't
set till after launch, and suchlike. Rather than fix the code
and possibly introduce new bugs, they just tell the crew


In [67]:
# 11. Print the (rows, columns) shape of the train and test frames.

for label, frame in (("Shape of train : ", train_df), ("Shape of test df : ", test_df)):
    print(label, frame.shape)

Shape of train :  (11096, 3)
Shape of test df :  (7532, 3)


In [68]:
# 13. Convert the training text into a numerical bag-of-words matrix with CountVectorizer.
#     The vectorizer is fitted here on the training text; the same fitted `cv` is reused
#     later to transform the test text.

cv = CountVectorizer()

x_train = cv.fit_transform(train_df["text"])
y_train = train_df["category"]

# Display only the sparse-matrix summary (dtype, stored elements, shape), the same way the
# x_test cell does; print(x_train) additionally dumps the per-entry coordinate listing.
x_train

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 1103627 stored elements and shape (11096, 101631)>
  Coords	Values
  (0, 95844)	4
  (0, 97181)	1
  (0, 48754)	2
  (0, 18915)	2
  (0, 68847)	1
  (0, 88638)	1
  (0, 30074)	1
  (0, 37335)	1
  (0, 60560)	1
  (0, 68080)	2
  (0, 88767)	4
  (0, 25775)	4
  (0, 80623)	1
  (0, 88532)	6
  (0, 68781)	1
  (0, 31990)	1
  (0, 51326)	2
  (0, 34809)	1
  (0, 84538)	1
  (0, 57390)	1
  (0, 89360)	1
  (0, 21987)	1
  (0, 41715)	2
  (0, 55746)	1
  (0, 9843)	1
  :	:
  (11095, 40387)	1
  (11095, 81792)	1
  (11095, 81742)	1
  (11095, 96497)	1
  (11095, 89804)	1
  (11095, 23302)	1
  (11095, 82660)	1
  (11095, 85524)	1
  (11095, 70066)	1
  (11095, 21258)	1
  (11095, 62086)	1
  (11095, 71992)	1
  (11095, 87730)	1
  (11095, 84605)	1
  (11095, 61975)	1
  (11095, 26205)	1
  (11095, 71786)	1
  (11095, 78365)	1
  (11095, 89465)	1
  (11095, 56719)	1
  (11095, 54033)	1
  (11095, 26208)	1
  (11095, 52230)	1
  (11095, 4486)	1
  (11095, 96707)	1


In [69]:
# 14. Train a Bernoulli Naive Bayes classifier on the count features and category labels.

model = BernoulliNB()
model.fit(X=x_train, y=y_train)

In [81]:
# Count the missing values in each column of the test frame.
test_df.isna().sum()

text        0
target      0
category    0
dtype: int64

In [82]:
# Drop every test row that has a missing value in any column (test_df is modified in place).
# NOTE(review): the training frame is cleaned with subset=["text"] only -- confirm the
# broader "any column" rule is intended for the test frame.
test_df.dropna(axis=0, how="any", inplace=True)

In [83]:
# Re-count missing values per column of the test frame after the drop.
test_df.isna().sum()

text        0
target      0
category    0
dtype: int64

In [84]:

# 15. Convert the test text into count features with the CountVectorizer that was already
#     fitted on the training text (transform only -- the vectorizer is not re-fitted here).

y_test = test_df["category"]
x_test = cv.transform(raw_documents=test_df["text"])
x_test

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 665738 stored elements and shape (7370, 101631)>

In [74]:
# 16. Predict the target label for every row of the test matrix and print the first five.
# NOTE(review): this cell's execution count (74) is lower than that of the test-set
# cleaning/vectorizing cells placed above it (81-84); restart and run all to confirm
# the shown output matches the current x_test.

y_pred = model.predict(X=x_test)
print(y_pred[0:5])

['rec.autos' 'rec.motorcycles' 'rec.motorcycles' 'talk.politics.mideast'
 'rec.motorcycles']


In [75]:
# 17. Find the accuracy of the BernoulliNB predictions against the true test labels.

accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Test set accuracy : {accuracy:.2f}")


Test set accuracy : 0.46


In [76]:
# 18. Use TfidfVectorizer instead of CountVectorizer: fit it on the training text,
#     then reuse the fitted vectorizer to transform the test text.

tfv = TfidfVectorizer()

# Labels are the category names, taken straight from the frames.
y_tr = train_df["category"]
y_te = test_df["category"]

# Fit on train first; the test text is only transformed with the fitted vectorizer.
x_tr = tfv.fit_transform(raw_documents=train_df["text"])
x_te = tfv.transform(raw_documents=test_df["text"])

In [77]:
# 18 b) Train a multinomial Naive Bayes classifier on the TF-IDF training features.

multi = MultinomialNB()
multi.fit(X=x_tr, y=y_tr)

In [78]:
# 18 c) Predict labels for the TF-IDF test features and show the first five predictions.

y_predict = multi.predict(X=x_te)

y_predict[0:5]

array(['soc.religion.christian', 'comp.graphics',
       'soc.religion.christian', 'talk.politics.mideast', 'alt.atheism'],
      dtype='<U24')

In [79]:
# 19. Find the test accuracy of the multinomial NB model.

ac = accuracy_score(y_true=y_te, y_pred=y_predict)

print(f"Accuaracy of multinomial model : {ac:.2f}")

Accuaracy of multinomial model : 0.62
