#   Package Used
    
    -- The NLTK package is used for data preprocessing:
       . stopword removal
       . special character removal
       . part-of-speech (POS) tagging
       . tokenizing
       . lemmatizing each token according to its POS tag (noun by default)
       
    -- CountVectorizer from sklearn is used to convert the text into its vector form
    -- LDA (LatentDirichletAllocation) from sklearn is used; LDA is also available in the Gensim module
    

In [21]:
from sklearn.datasets import load_files
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import  CountVectorizer
from sklearn.pipeline import Pipeline
import numpy as np
import string
from nltk import WordNetLemmatizer
from nltk import sent_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords as sw
from nltk import wordpunct_tokenize
from nltk.corpus import wordnet as wn
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline



# Data PreProcessing Step
   
   The NLTKPreprocessor class is built on BaseEstimator and TransformerMixin and performs
   the data preprocessing step.
   
   An identity function is also defined; it is passed as the tokenizer argument of CountVectorizer
   so that the vectorizer's own tokenization is bypassed.
   
   
   

In [4]:
def identity(arg):
    """Return ``arg`` exactly as it was received.

    Handy wherever a callable is required but the value is already in its
    final form, e.g. as a tokenizer for documents that are already lists
    of tokens.
    """
    return arg

class NLTKPreprocessor(BaseEstimator, TransformerMixin):
    """Scikit-learn style transformer turning raw documents into lemma lists.

    Every document is split into sentences, tokenized, part-of-speech
    tagged, normalised (optional lower-casing and stripping), filtered for
    stopwords and punctuation-only tokens, and finally lemmatized with
    WordNet.

    Parameters
    ----------
    stopwords : collection of str, optional
        Tokens to drop. ``None`` (the default) selects NLTK's English
        stopword list. Any collection that is passed explicitly --
        including an empty one -- is used as given.
    punct : collection of str, optional
        Characters treated as punctuation; tokens consisting only of these
        characters are dropped. ``None`` (the default) selects
        ``string.punctuation``.
    lower : bool, default True
        Lower-case every token before filtering.
    strip : bool, default True
        Strip surrounding whitespace, then ``_``, then ``*`` from every
        token before filtering.
    """

    def __init__(self, stopwords=None, punct=None,
                 lower=True, strip=True):
        self.lower = lower
        self.strip = strip
        # Compare against None instead of relying on truthiness so that an
        # explicitly empty collection switches the filter off rather than
        # silently falling back to the defaults.
        self.stopwords = (
            stopwords if stopwords is not None else set(sw.words('english'))
        )
        self.punct = punct if punct is not None else set(string.punctuation)
        self.lemmatizer = WordNetLemmatizer()

    def fit(self, X, y=None):
        """Do nothing and return ``self``; there is nothing to learn."""
        return self

    def inverse_transform(self, X):
        """Join each token list in ``X`` back into a space-separated string."""
        return [" ".join(doc) for doc in X]

    def transform(self, X):
        """Return one list of lemmas per document in ``X``."""
        return [
            list(self.tokenize(doc)) for doc in X
        ]

    def tokenize(self, document):
        """Yield the lemmas of ``document`` one at a time.

        Stopwords and tokens made up solely of punctuation characters
        (which includes tokens that are empty after stripping) are skipped.
        """
        # Break the document into sentences
        for sent in sent_tokenize(document):
            # Tag on the untouched sentence tokens so the tagger sees the
            # original casing; normalisation happens afterwards.
            for token, tag in pos_tag(wordpunct_tokenize(sent)):
                if self.lower:
                    token = token.lower()
                if self.strip:
                    token = token.strip().strip('_').strip('*')

                # If stopword, ignore token and continue
                if token in self.stopwords:
                    continue

                # If punctuation, ignore token and continue
                if all(char in self.punct for char in token):
                    continue

                # Lemmatize the token and yield
                yield self.lemmatize(token, tag)

    def lemmatize(self, token, tag):
        """Lemmatize ``token`` using the WordNet POS derived from ``tag``.

        Only the first character of ``tag`` is inspected; anything that is
        not a noun, verb, adverb or adjective tag is treated as a noun.
        """
        # tag[:1] (rather than tag[0]) keeps an empty tag from raising and
        # lets it fall through to the noun default.
        wordnet_pos = {
            'N': wn.NOUN,
            'V': wn.VERB,
            'R': wn.ADV,
            'J': wn.ADJ
        }.get(tag[:1], wn.NOUN)

        return self.lemmatizer.lemmatize(token, wordnet_pos)

In [5]:
# Preprocessor instance built with the default constructor arguments.
preprocessor = NLTKPreprocessor()



#  Data Used
    
    Link - http://ai.stanford.edu/~amaas/data/sentiment/
    ACL paper - 2011
    Authors = {Maas, Andrew L.  and  Daly, Raymond E.  and  Pham, Peter T.  and  Huang, Dan  and  Ng, Andrew Y.  and
               Potts, Christopher},
    Title   = {Learning Word Vectors for Sentiment Analysis},
    
    
    Only the test data, which is of size 25000, is used for topic modelling.
    
    


In [6]:
# Read the review files from disk, then decode each raw document to text.
data = load_files('/aclImdb/test')
train_data = data.data
text_train = [raw_doc.decode('utf-8') for raw_doc in train_data]



# Preprocessing Extracted Data



In [7]:
# One list of lemmas per review.
preprocess_data = preprocessor.transform(text_train)

# vectorizing word using CountVectorizer



In [8]:
# The documents are already token lists, so the identity tokenizer is used
# and lower-casing is switched off.
vect = CountVectorizer(
    max_features=10000,
    max_df=.15,
    tokenizer=identity,
    preprocessor=None,
    lowercase=False,
)

In [9]:
vect

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=0.15, max_features=10000, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function identity at 0x7f15a8225268>, vocabulary=None)

In [10]:
# Learn the vocabulary and build the document-term count matrix.
vectorize_data = vect.fit_transform(preprocess_data)

# LDA package
  
  The whole document collection is divided into 10 topics.

In [11]:
# n_components is the supported name for the number of topics; the older
# n_topics keyword is deprecated and absent from newer scikit-learn releases.
lda = LatentDirichletAllocation(
    n_components=10,
    learning_method="batch",
    max_iter=25,
    random_state=0,
)

In [12]:
lda

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='batch', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=25, mean_change_tol=0.001,
             n_components=10, n_jobs=1, n_topics=10, perp_tol=0.1,
             random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [13]:
# Fit the LDA model on the count matrix and keep its per-document output.
document_topics = lda.fit_transform(vectorize_data)





# Topic Visualisation
   
     Topic visualisation uses the print_topics helper from the mglearn package (reproduced below).
    
     
   
   

In [17]:
def print_topics(topics, feature_names, sorting, topics_per_chunk=6,
                 n_words=20):
    """Print the top-ranked words of each topic as a fixed-width table.

    Topics are laid out side by side in groups of ``topics_per_chunk``.
    Each group consists of a header row, a separator row and up to
    ``n_words`` rows of words, followed by blank lines.

    Parameters
    ----------
    topics : sequence of int
        Topic indices (first-axis indices into ``sorting``) to display.
        Must support ``len()`` and slicing.
    feature_names : numpy.ndarray
        Vocabulary; indexed with an integer array taken from ``sorting``.
    sorting : numpy.ndarray
        2-D array in which ``sorting[t, r]`` is the vocabulary index of the
        word at rank ``r`` for topic ``t``.
    topics_per_chunk : int, default 6
        Number of topics printed next to each other.
    n_words : int, default 20
        Number of ranks to print. Ranks whose lookup is out of range (for
        instance when ``n_words`` exceeds the vocabulary size) are skipped.
    """
    for start in range(0, len(topics), topics_per_chunk):
        # The final chunk may hold fewer than topics_per_chunk topics.
        these_topics = topics[start:start + topics_per_chunk]
        len_this_chunk = len(these_topics)
        print(("topic {:<8}" * len_this_chunk).format(*these_topics))
        print(("-------- {0:<5}" * len_this_chunk).format(""))
        for rank in range(n_words):
            # Only an out-of-range lookup is tolerated; any other error is
            # a genuine problem and is allowed to propagate.
            try:
                words = feature_names[sorting[these_topics, rank]]
            except IndexError:
                continue
            print(("{:<14}" * len_this_chunk).format(*words))
        print("\n")
            

In [18]:
# For every topic, order the vocabulary indices from highest to lowest weight.
sorting = np.argsort(lda.components_, axis=1)[:, ::-1]
# get_feature_names() is absent from newer scikit-learn releases, which offer
# get_feature_names_out() instead; use whichever this installation provides.
if hasattr(vect, 'get_feature_names_out'):
    feature_names = np.array(vect.get_feature_names_out())
else:
    feature_names = np.array(vect.get_feature_names())
# Derive the topic count from the fitted model rather than hard-coding it.
print_topics(topics=range(sorting.shape[0]), feature_names=feature_names,
             sorting=sorting, topics_per_chunk=5, n_words=10)

topic 0       topic 1       topic 2       topic 3       topic 4       
--------      --------      --------      --------      --------      
book          role          funny         guy           kill          
read          performance   comedy        minute        black         
dvd           song          kid           actually      killer        
series        star          laugh         horror        woman         
tv            cast          joke          nothing       action        
novel         music         fun           2                          
version       john          old           10            drug          
saw           musical       game          stupid        murder        
original      big           guy           act           cop           
write         comedy        episode       waste         death         


topic 5       topic 6       topic 7       topic 8       topic 9       
--------      --------      --------      --------      --------      
musi

In [24]:
# End-to-end model: raw text -> lemma lists -> token counts -> LDA output.
model = Pipeline([
    ('preprocess', NLTKPreprocessor()),
    ('vect', CountVectorizer(max_features=10000, max_df=.15,
                             tokenizer=identity, preprocessor=None,
                             lowercase=False)),
    # n_components is the supported replacement for the deprecated n_topics.
    ('lda', LatentDirichletAllocation(n_components=10,
                                      learning_method="batch",
                                      max_iter=25, random_state=0)),
])

In [25]:
model

Pipeline(memory=None,
     steps=[('preprocess', NLTKPreprocessor(lower=True,
         punct={'.', '|', '?', '#', '*', '"', "'", ';', '&', ')', '(', '/', '`', ']', '>', '<', '=', '-', '@', '[', '^', '!', '}', '~', '_', '%', '$', '+', ',', '{', '\\', ':'},
         stopwords={'so', "isn't", 'after', 'o', 'our', "she's", 'itsel...           random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0))])

In [26]:
# Fit every pipeline step on the decoded review texts.
model.fit(text_train)



Pipeline(memory=None,
     steps=[('preprocess', NLTKPreprocessor(lower=True,
         punct={'.', '|', '?', '#', '*', '"', "'", ';', '&', ')', '(', '/', '`', ']', '>', '<', '=', '-', '@', '[', '^', '!', '}', '~', '_', '%', '$', '+', ',', '{', '\\', ':'},
         stopwords={'so', "isn't", 'after', 'o', 'our', "she's", 'itsel...           random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0))])

In [27]:
# Run a single unseen review through the fitted pipeline.
model.transform(['Hacksaw Ridge is a movie that shows us the pain of war.\
                  It brings our mind to virtualize a whole lot of things\
                  about the life of soldiers and their struggle to save the nation.\
                  It shows the real picture of world war-II and the real life history\
                  of an army man Doss. The movie is really a tribute to Doss and also\
                  to each and every army soldiers. Though the movie is not humorous, it\
                  shows the humanity towards mankind.'])

array([[ 0.00333389,  0.00333394,  0.00333413,  0.00333412,  0.00333376,
         0.00333413,  0.00333407,  0.96999388,  0.00333415,  0.00333393]])

# CONCLUSION

By looking at the top words of each topic, we can infer what the movie reviews are about: for example, topic 5 deals with music, topic 7 with war, another topic with horror, and so on.

To predict the topic of a new movie review, we only need to chain all three steps (preprocessing, vectorizing and LDA) into a pipeline and call the model's transform function. The review is assigned to the topic with the maximum probability.

For the Hacksaw Ridge review above, the score at position 7 (indexing starts from 0) is the maximum, so the review belongs to topic 7, which clearly indicates the movie's genre: war.
