### Implementing Naive Bayes classifier to detect the difficulty in differentiating an email related to hockey and baseball etc..

In [2]:
#importing email datasets from sklearn
from sklearn.datasets import fetch_20newsgroups

#importing naive bayes classifier
from sklearn.naive_bayes import MultinomialNB

#import countvectorizer to convert the text doc into matrix count
from sklearn.feature_extraction.text import CountVectorizer

emails = fetch_20newsgroups()

In [3]:
#to check the different categories of emails

emails.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [10]:
# As we are interested in hockey and baseball emails we will store those target emails in a variable 

email = fetch_20newsgroups(categories = ['rec.sport.baseball','rec.sport.hockey'])

In [13]:
email.data[5]

'From: mmb@lamar.ColoState.EDU (Michael Burger)\nSubject: More TV Info\nDistribution: na\nNntp-Posting-Host: lamar.acns.colostate.edu\nOrganization: Colorado State University, Fort Collins, CO  80523\nLines: 36\n\nUnited States Coverage:\nSunday April 18\n  N.J./N.Y.I. at Pittsburgh - 1:00 EDT to Eastern Time Zone\n  ABC - Gary Thorne and Bill Clement\n\n  St. Louis at Chicago - 12:00 CDT and 11:00 MDT - to Central/Mountain Zones\n  ABC - Mike Emerick and Jim Schoenfeld\n\n  Los Angeles at Calgary - 12:00 PDT and 11:00 ADT - to Pacific/Alaskan Zones\n  ABC - Al Michaels and John Davidson\n\nTuesday, April 20\n  N.J./N.Y.I. at Pittsburgh - 7:30 EDT Nationwide\n  ESPN - Gary Thorne and Bill Clement\n\nThursday, April 22 and Saturday April 24\n  To Be Announced - 7:30 EDT Nationwide\n  ESPN - To Be Announced\n\n\nCanadian Coverage:\n\nSunday, April 18\n  Buffalo at Boston - 7:30 EDT Nationwide\n  TSN - ???\n\nTuesday, April 20\n  N.J.D./N.Y. at Pittsburgh - 7:30 EDT Nationwide\n  TSN - ??

In [14]:
#email.target is a list of numbers corresponding to values in emails.target_name
email.target[5]

#1 corresponds to hockey

1

### Creating training and test dataset

In [15]:
#random_state is used so that every time split is performed in the same way

train_emails = fetch_20newsgroups(categories = ['rec.sport.baseball','rec.sport.hockey'], subset ='train',shuffle =True,random_state=108)

In [16]:

test_emails = fetch_20newsgroups(categories = ['rec.sport.baseball','rec.sport.hockey'], subset ='test',shuffle =True,random_state=108)

### Transforming the emails into the list of word counts using CountVectorizer

In [18]:
counter = CountVectorizer()

In [19]:
#We need to tell counter what possible words can exist in our emails
#.fit function of countvectorizer takes the entire data

counter.fit(train_emails.data + test_emails.data)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [20]:
#We can now make a list of the counts of our words in our training set.

train_counts = counter.transform(train_emails.data)

In [21]:
test_counts = counter.transform(test_emails.data)

### Making Naive Bayes Classifier

In [22]:
classifier = MultinomialNB()

In [23]:
classifier.fit(train_counts,train_emails.target)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [24]:
classifier.score(test_counts,test_emails.target)

0.9723618090452262