## Overview

See Section 7.2 of the scikit-learn User Guide (http://scikit-learn.org/stable/user_guide.html) for a description of the dataset used in this notebook.

## Load in the data - a subset of the 20 Newsgroups dataset

In [None]:
from sklearn.datasets import fetch_20newsgroups

# Restrict the corpus to four of the newsgroups.
categories = [
    'rec.sport.baseball',
    'talk.politics.guns',
    'comp.graphics',
    'sci.med',
]

# Training split only; shuffled with a fixed seed so the order is repeatable.
twentyTrain = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)

In [None]:
# The loaded object exposes the category (class) names and the documents;
# this cell and the following ones inspect them one at a time.
list(twentyTrain.target_names) # displays all the categories

In [None]:
type(twentyTrain)  # type of the container object returned by fetch_20newsgroups

In [None]:
len(twentyTrain.data)  # the size: number of documents in the training subset


In [None]:
# Number of file names recorded for the subset.
# NOTE(review): presumably one file per document, so this should equal len(twentyTrain.data) - confirm.
len(twentyTrain.filenames)

In [None]:
# Show one complete training document (.data holds the document texts).
print(twentyTrain.data[0])

# Show just the first three lines of that same document.
print(*twentyTrain.data[0].split("\n")[:3], sep="\n")

# .target holds the class id of each document; look up its readable name.
print("Target class is {}".format(
    twentyTrain.target_names[twentyTrain.target[0]]
))

### Remove the metadata (headers, footers and quotes) so the classifier doesn't overfit to it

In [None]:
# Reload the same training subset, this time stripping the headers, footers
# and quoted text from every post. This rebinds `twentyTrain`.
categories = [
    'rec.sport.baseball',
    'talk.politics.guns',
    'comp.graphics',
    'sci.med',
]
twentyTrain = fetch_20newsgroups(
    subset='train',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
    shuffle=True,
    random_state=42,  # random seed
)

# Print the first document again, now loaded with the metadata removed.
print(twentyTrain.data[0])

In [None]:
print(twentyTrain.target[:10])   # .target holds the class id of each document; show the first ten

In [None]:
# Translate the first ten class ids into their class names (.target_names).
for class_id in twentyTrain.target[:10]:
    print(twentyTrain.target_names[class_id])

##  Tokenising

The tokenising can be changed by changing the parameters to the Vectorizer:  
- `analyzer` and `ngram_range` params will allow tokenising by char n-grams.  
- `max_df` and `min_df` will allow document frequency reduction to be performed

Look up the documentation to see what can be changed. 

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()       # vectorizer created with no arguments, i.e. all defaults
count_vect.get_params()      # shows the default parameters

## Create the Term-Document Matrix

In [None]:
# Fit the vectorizer on the training documents and convert them to a count matrix in one call.
tdm = count_vect.fit_transform(twentyTrain.data)   #tdm is a matrix - 2-d array
# NOTE(review): per the scikit-learn docs the shape is (n_documents, n_vocabulary_terms) - confirm.
tdm.shape     


In [None]:
# count_vect.vocabulary_ is a dictionary mapping each term to its feature (column) index
# in the matrix, so this shows the index of the word 'and', not its frequency.
count_vect.vocabulary_.get('and')

### Transform the TDM to a normalised tf or tf-idf matrix 

Check the `TfidfTransformer` parameters - `use_idf` switches between tf and tf-idf, and `norm` selects l1 vs l2 normalisation

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer()   # transformer created with no arguments, i.e. all defaults
transformer.get_params()     #show default parameters 

In [None]:
# Fit the transformer on the count matrix and produce the re-weighted matrix in one call.
tdm_tfidf = transformer.fit_transform(tdm)   #transform the TDM
tdm_tfidf.shape  # displayed so it can be compared with tdm.shape above



## Build a NB classifier

In [None]:
from sklearn.naive_bayes import MultinomialNB
# Build the classifier from (features, class ids) of the training set.
clf = MultinomialNB()
clf.fit(tdm_tfidf, twentyTrain.target)

# Two hand-written test instances.
docs_test = ['I am sick', 'No more gun control']

# Run the test documents through the vectorizer and transformer that were
# fitted on the training data. Only transform() is called - no re-fitting -
# because the vocabulary was generated from the training data.
test_counts = count_vect.transform(docs_test)
test_tfidf = transformer.transform(test_counts)

# Predicted class id for each test document.
predicted = clf.predict(test_tfidf)

# Report each document next to the name of its predicted class.
for text, label_id in zip(docs_test, predicted):
    print('%r => %s' % (text, twentyTrain.target_names[label_id]))

## Use Pipeline to do it all in one

In [None]:
from sklearn.pipeline import Pipeline
# Chain the vectorizer, the tf-idf transformer and the classifier into a
# single estimator, so raw documents can be passed straight to fit/predict.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

# Fit every step on the raw training documents and their class ids.
text_clf.fit(twentyTrain.data, twentyTrain.target)

### Load the 20 NG test data

In [None]:
# Load the held-out test split for the same categories.
# The pipeline above was fitted on training data loaded with
# remove=('headers', 'footers', 'quotes'), so the test documents must be
# stripped the same way; otherwise the model is evaluated on text containing
# metadata it never saw during training and the score is not comparable.
# NOTE: this cell relies on `categories` still holding the four groups used
# for training; a later cell rebinds that name, so run the cells in order.
twenty_test = fetch_20newsgroups(subset='test',
                                 categories=categories,
                                 remove=('headers', 'footers', 'quotes'),
                                 shuffle=True,
                                 random_state=42)
docs_test = twenty_test.data

import numpy as np
predicted = text_clf.predict(docs_test)   # predicted class id per test document
np.mean(predicted == twenty_test.target)  # accuracy: fraction of predictions equal to the true class

### Using metrics package

In [None]:
from sklearn import metrics
# Print the per-class classification results for the test-set predictions.
print(metrics.classification_report(
    y_true=twenty_test.target,
    y_pred=predicted,
    target_names=twenty_test.target_names,
))

In [None]:
# Confusion matrix of true test classes vs. predicted classes (shown as the cell output).
# NOTE(review): per the scikit-learn docs, rows are true classes and columns are predicted classes - confirm.
metrics.confusion_matrix(twenty_test.target, predicted)

In [None]:
metrics.f1_score(twenty_test.target, predicted, average='macro')   # F-score computed with average='macro' (shown as the cell output)



## Using TfidfVectorizer 
`TfidfVectorizer` combines `CountVectorizer` and `TfidfTransformer` in a single step.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# A different four-category subset for this example.
# NOTE: this rebinds `categories` (and `predicted` further down), both of
# which earlier cells also use - run the notebook top to bottom.
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.med',
]

# Training split: fit the vectorizer and build the tf-idf matrix in one call.
newsgroups_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(newsgroups_train.data)

# Test split: reuse the fitted vectorizer (transform only, no re-fitting).
newsgroups_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=42,
)
vectors_test = vectorizer.transform(newsgroups_test.data)

# Train Naive Bayes with alpha=.01, predict the test set and print the report.
classifier = MultinomialNB(alpha=.01).fit(vectors, newsgroups_train.target)
predicted = classifier.predict(vectors_test)
print(metrics.classification_report(
    newsgroups_test.target,
    predicted,
    target_names=newsgroups_train.target_names,
))