#### This notebook aims to build  Document classification models using Logistic regression, SVC, & Multinomial Naive bayes.

In [9]:
# Import required libraries
import numpy as np
import pandas as pd
import nltk
import time
import pickle
import json
import unicodedata
import tabulate
from collections import Counter

from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus.reader.api import CorpusReader
from nltk.corpus.reader.api import CategorizedCorpusReader

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score
from sklearn.metrics import recall_score, f1_score
from sklearn.pipeline import Pipeline

import warnings
warnings.filterwarnings('ignore')

nltk.download('stopwords')   
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vijay\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vijay\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Read pickle files from corpus

In [2]:
# Regex pattern for extracting documents, pickle files & categories 
doc_pattern = r'(?!\.)[a-z_\s]+/[a-f0-9]+\.json'
pkl_pattern = r'(?!\.)[a-z_\s]+/[a-f0-9]+\.pickle'
cat_pattern = r'([a-z_\s]+)/.*'

# Create a class PickledCorpusReader
class PickledCorpusReader(CategorizedCorpusReader,CorpusReader):

    def __init__(self, root, fileids=pkl_pattern, **kwargs):
        """
        Initialize the corpus reader. Categorized arguments
        ('cat_pattern', 'cat_map', and 'cat_file') are passed
        to the  'CategorizedCorpusReader' constructor. The remaining
        arguments are passed to the CorpusReader constructor.
        """
        # Add the default category pattern if not passed into the class
        if not any (key.startswith('cat_') for key in kwargs.keys()):
            kwargs['cat_pattern'] = cat_pattern

        # Initialize NLP Corpus reader objects
        CategorizedCorpusReader.__init__(self, kwargs)
        CorpusReader.__init__(self, root, fileids)

    def resolve(self, fileids, categories):
        """
        Returns a list of fileids or categories depending on what is passed
        to each internal corpus reader function.
        """
        if fileids is not None and categories is not None:
            raise ValueError ("Specify fileids or categories, not both")

        if categories is not None:
            return self.fileids(categories)
        return fileids

    def docs(self, fileids=None, categories=None):
        """
        Returns the document from a pickled object for each file in corpus.
        """
        #List the fileids & categories
        fileids = self.resolve(fileids, categories)
        # Load one document into memory at a time
        for path, enc, fileid in self.abspaths(fileids, True, True):
            with open(path,'rb') as f:
                yield pickle.load(f)

    def paragraphs(self, fileids=None, categories=None):
        """
        Returns a genetator where each paragraph contains a list of sentences.
        """
        for doc in self.docs(fileids, categories):
            for paragraph in doc:
                yield paragraph

    def sentences(self, fileids=None, categories=None):
        """
        Returns a generator where each sentence contains a list of tokens
        """
        for paragraph in self.paragraphs(fileids, categories):
            for sent in paragraph:
                yield sent

    def tokens(self, fileids=None, categories=None):
        """
        Returns a list of tokens.
        """
        for sent in self.sentences(fileids,categories):
            for token in sent:
                yield token

    def words(self, fileids=None, categories=None):
        """
        Returns a list of (token, tag) tuples.
        """
        for token in self.tokens(fileids, categories):
            yield token[0]

In [3]:
# Use PickledCorpusReader class to read pickled files from Corpus
reader= PickledCorpusReader('C:\\Users\\vijay\\Desktop\\Rama\\Python NLP projects\\Document classification\\sample')
words = Counter(reader.words())
print(" Vocabulary_size: {:,} , Total_word_count : {:,} ".format(len(words.keys()),sum(words.values())))

 Vocabulary_size: 58,748 , Total_word_count : 1,624,862 


### Dataset information

Our dataset contains 12 document categories ('books', 'cinema', 'cooking', 'gaming', 'sports', 'tech', 'data_science',\   
 'design', 'news', 'politics', 'do_it_yourself', 'business').

Each categpry contains,

In [4]:
for category in reader.categories():

    n_docs = len(reader.fileids(categories=[category]))
    n_words = sum(1 for word in reader.words(categories=[category]))

    print(f'{category} contains {n_docs} documents and {n_words} words')

books contains 71 documents and 41438 words
business contains 389 documents and 222182 words
cinema contains 100 documents and 69153 words
cooking contains 30 documents and 37854 words
data_science contains 41 documents and 31354 words
design contains 55 documents and 18260 words
do_it_yourself contains 122 documents and 28050 words
gaming contains 128 documents and 70778 words
news contains 1159 documents and 850688 words
politics contains 149 documents and 88853 words
sports contains 118 documents and 68884 words
tech contains 176 documents and 97368 words


### Load the pickled files from reader 

In [5]:
# Create a CorpusLoader class
class CorpusLoader(object):
    # Initialize class variables
    def __init__(self, reader, folds=None, categories=None, shuffle=True):
        self.reader=reader
        self.folds=KFold(n_splits=folds,shuffle=shuffle)
        self.files=np.asarray(self.reader.fileids(categories=categories))

    # Method to access a listing of fileids by foldID.
    def fileids(self, idx=None):
        if idx is None:
            return self.files
        return self.files[idx]

    # Returns a genetator with one document at a time
    def documents(self,idx=None):
        for fileid in self.fileids(idx):
            yield list(self.reader.docs(fileids=[fileid]))

    # To look up label from the corpus & returns a label for each document.
    def labels(self, idx=None):
        return [
        self.reader.categories(fileids=[fileid])[0]
        for fileid in self.fileids(idx)
        ]

    # Iterator to split training & test data for each fold using KFold's Split()
    def __iter__(self):
        for train_index, test_index in self.folds.split(self.files):
            X_train = self.documents(train_index)
            y_train = self.labels(train_index)

            X_test = self.documents(test_index)
            y_test = self.labels(test_index)

            yield X_train, X_test, y_train, y_test

### Build the Document classifiers

In [6]:
# Dataset has already been tokenized, so create a identity for tokenizer.
def identity(words):
    return words

# Class for normalizing text.
class TextNormalizer(BaseEstimator,TransformerMixin):

    def __init__(self, language ='english'):
        self.stopwords = set(stopwords.words(language))
        self.lemmatizer = WordNetLemmatizer()

    def is_punct(self, token):
        """
        Check the punctuation character.
        """
        return all (
        unicodedata.category(char).startswith('P') for char in token
        )

    def is_stopword(self,token):
        return token.lower() in self.stopwords

    def doc_normalize(self, doc):
        """
        Removes stopwords and punctuation, lowercases, lemmatizes
        """
        return [
        self.lemmatize(token,tag).lower()
        for paragraph in doc
        for sent in paragraph
        for (token,tag) in sent
        if not self.is_punct(token) and not self.is_stopword(token)
        ]

    def lemmatize(self, token, pos_tag):
        """
        Return the WordNet POS tag from the Penn Treebank tag
        """
        tag = {
        'N' : wn.NOUN,
        'V' : wn.VERB,
        'R' : wn.ADV,
        'J' : wn.ADJ
        }.get(pos_tag[0], wn.NOUN)

        return self.lemmatizer.lemmatize(token, tag)

    def fit(self, X, y=None):
        return self

    def transform(self, documents):
        for doc in documents:
            yield self.doc_normalize(doc[0])

# Create pipeline for all models.
def pipeline(estimator):
    steps=[
    ('normalize',TextNormalizer()),
    ('vectorize',TfidfVectorizer(
    tokenizer = identity, preprocessor=None, lowercase=False
    ))
    ]
    # Add the estimator
    steps.append(('classifier',estimator))
    return Pipeline(steps)

labels = ['books','cinema','cooking','gaming','sports','tech','data_science',\
            'design','news','politics','do_it_yourself','business']

# Load the pickle files from coropus using CorpusLoader class
loader = CorpusLoader(reader, 10, shuffle=True, categories=labels)

models = []
names = [LogisticRegression, SGDClassifier, MultinomialNB]
for model in names:
    models.append(pipeline(model()))

def model_scores(models,loader):
    for model in models:
        name = model.named_steps['classifier'].__class__.__name__

        scores = {
        'model':str(model),
        'name':name,
        'accuracy':[],
        'precision':[],
        'recall':[],
        'f1':[],
        'time':[]
        }

        for X_train, X_test, y_train, y_test in loader:
            start = time.time()
            model.fit(X_train,y_train)
            y_pred = model.predict(X_test)

            scores['time'].append(time.time() - start)
            scores['accuracy'].append(accuracy_score(y_test, y_pred))
            scores['precision'].append(precision_score(y_test, y_pred,average='weighted'))
            scores['recall'].append(recall_score(y_test,y_pred,average='weighted'))
            scores['f1'].append(f1_score(y_test,y_pred,average='weighted'))

        scores['accuracy']=np.mean(scores['accuracy'])
        scores['precision']=np.mean(scores['precision'])
        scores['recall']=np.mean(scores['recall'])
        scores['f1']=np.mean(scores['f1'])
        yield scores

In [7]:
# Save the model scores in json file.
for scores in model_scores(models, loader):
    with open('results.json','a') as f:
        f.write(json.dumps(scores)+'\n')

### Let's have a look at the results.

In [11]:
headers= ['model','precision','recall','accuracy','f1']
# Read result file
df=pd.read_json('results.json',lines=True)
df=df.drop(['model','time'],axis=1)

table=list(df.values)
table.sort(key=lambda r:r[-1], reverse=True)
print(tabulate.tabulate(table,headers=headers))

model                 precision    recall    accuracy        f1
------------------  -----------  --------  ----------  --------
SGDClassifier          0.678107  0.67647     0.678107  0.657237
LogisticRegression     0.563833  0.587883    0.563833  0.472884
MultinomialNB          0.45863   0.302512    0.45863   0.290672
