In [1]:
# Basics
import os
import pandas as pd
import numpy as np
import pickle
from pprint import pprint

# NLP
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV
import pyLDAvis # this package is case sensitive
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
class TopicModellingSklearn:
    def __init__(self,text,min_df,max_df,n_components,random_state):
        """ 
        Initialize class. 
        
        Arguments:
            text - DF column containing text block
            min_df = Minimum number of articles the word must appear in for the
                word to be considered.
            max_df = Threshold for unique words to considered (drop words 
               appearing too frequently, as in stopwords)
            n_topics = Number of topics to consider
            random_seed = Random seed to use for the modelling
        """
        # Set up internal class variables
        self.text = text
        self.min_df = min_df
        self.max_df = max_df
        self.n_components = n_components
        self.random_state = random_state
        
        # Fit an LDA model
        self.LDA_model, self.word_frequency, self.vocabulary = self.LDA_model()

            
    def LDA_model(self):
        """ Fit text to an LDA model """
        stop_words_all = list(nltk.corpus.stopwords.words('english'))
        print(len(stop_words_all))
        stop_words_new = ["new","like","example","see","code",
                          "use","used","using","user","one","two","also",
                          "analysis","data","dataset","row","column",
                         "set","list","index","item","array",
                          "let","input","return","function","python",
                         "panda","package","number","would","figure","make","get"]
        stop_words_all.extend(stop_words_new)
        print(len(stop_words_all))
        
        word_frequency = TfidfVectorizer(min_df = self.min_df,
                                        stop_words=stop_words_all)
                        
        vocabulary = word_frequency.fit_transform(
                self.text.values.astype('U'))
        
        feature_names = word_frequency.get_feature_names()
        corpus_index = [n for n in self.text.values]
        df = pd.DataFrame(vocabulary.todense(), index=corpus_index, columns=feature_names)
        print(df.head())
        print(df.shape)
                
#         tfidf = TfidfVectorizer(vocabulary = myvocabulary, ngram_range = (1,3))
#         tfs = tfidf.fit_transform(corpus.values())
            
        LDA = LatentDirichletAllocation(n_components = self.n_components,
                                        random_state = self.random_state)
        LDA_model = LDA.fit(vocabulary)
        
        return LDA_model, word_frequency, vocabulary

In [3]:
def lemmatize_text(article_text):
        lemmatizer = WordNetLemmatizer()
        
        stop_words_all = nltk.corpus.stopwords.words('english')
        
        article_text_proc = []
        article_text = article_text.split(" ")
        for word in article_text:
            word = word.lower()
            if word not in stop_words_all:
                article_text_proc.append(lemmatizer.lemmatize(word))
        processed_text = " ".join(article_text_proc)
                
        return processed_text

In [4]:
# Import data
filedir = os.path.dirname(os.path.realpath('__file__'))
filename = os.path.join('../data/processed/articles_python.csv')
filename = os.path.abspath(os.path.realpath(filename))
articles_python = pd.read_csv(filename,index_col = "postId")

In [5]:
print("Original text of article 10")
print(articles_python["text"].iloc[325][0:500])

Original text of article 10
Classification Model Evaluation

What is Model Evaluation?
Model evaluation is the process of choosing between models, different model types, tuning parameters, and features. Better evaluation processes lead to better, more accurate models in your applications
In this article we’ll be discussing Model Evaluation for a supervised classification model. We’ll cover evaluation procedures, evaluation metrics, and where to apply them.
Prerequisites
Python 3.+
Anaconda (Scikit Learn, Numpy, Pandas, Mat


In [6]:
for i in range(0,len(articles_python)):
    if np.remainder(i,250) == 0:
        print("Now lemmatizing article {}".format(i))
    articles_python["text"].iloc[i] = lemmatize_text(articles_python["text"].iloc[i])

Now lemmatizing article 0


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


Now lemmatizing article 250
Now lemmatizing article 500
Now lemmatizing article 750
Now lemmatizing article 1000
Now lemmatizing article 1250


In [7]:
print(len(articles_python))
# Test that the lemmatization worked as intended
print("Lemmatized text of article 10")
print(articles_python["text"].iloc[325][0:500])

1438
Lemmatized text of article 10
classification model evaluation

what model evaluation?
model evaluation process choosing models, different model types, tuning parameters, features. better evaluation process lead better, accurate model applications
in article we’ll discussing model evaluation supervised classification model. we’ll cover evaluation procedures, evaluation metrics, apply them.
prerequisites
python 3.+
anaconda (scikit learn, numpy, pandas, matplotlib, seaborn)
jupyter notebook.
basic understanding supervised mach


In [8]:
# sklearn/LDA (unsupervised); text case consistency + lemmatization
model_alpha = TopicModellingSklearn(text=articles_python["text"],
                                       min_df = 3,
                                       max_df = 0.75,
                                       n_components = 3,
                                       random_state = 42)

179
213
                                                     00       000  00001  \
\nhow build alphazero ai using python keras\nte...  0.0  0.000000    0.0   
\npython perfect tool problem\nreflecting first...  0.0  0.000000    0.0   
\na complete machine learning project walk-thro...  0.0  0.019964    0.0   
train machine learning model google’s gpus free...  0.0  0.000000    0.0   
train ai convert design mockups html css\nwithi...  0.0  0.000000    0.0   

                                                    0001  00021  0005  001  \
\nhow build alphazero ai using python keras\nte...   0.0    0.0   0.0  0.0   
\npython perfect tool problem\nreflecting first...   0.0    0.0   0.0  0.0   
\na complete machine learning project walk-thro...   0.0    0.0   0.0  0.0   
train machine learning model google’s gpus free...   0.0    0.0   0.0  0.0   
train ai convert design mockups html css\nwithi...   0.0    0.0   0.0  0.0   

                                                    002  004  005 

In [9]:
# sklearn/LDA (unsupervised); text case consistency + lemmatization
model_beta = TopicModellingSklearn(text=articles_python["text"],
                                       min_df = 3,
                                       max_df = 0.75,
                                       n_components = 7,
                                       random_state = 42)

179
213
                                                     00       000  00001  \
\nhow build alphazero ai using python keras\nte...  0.0  0.000000    0.0   
\npython perfect tool problem\nreflecting first...  0.0  0.000000    0.0   
\na complete machine learning project walk-thro...  0.0  0.019964    0.0   
train machine learning model google’s gpus free...  0.0  0.000000    0.0   
train ai convert design mockups html css\nwithi...  0.0  0.000000    0.0   

                                                    0001  00021  0005  001  \
\nhow build alphazero ai using python keras\nte...   0.0    0.0   0.0  0.0   
\npython perfect tool problem\nreflecting first...   0.0    0.0   0.0  0.0   
\na complete machine learning project walk-thro...   0.0    0.0   0.0  0.0   
train machine learning model google’s gpus free...   0.0    0.0   0.0  0.0   
train ai convert design mockups html css\nwithi...   0.0    0.0   0.0  0.0   

                                                    002  004  005 

In [10]:
print("Log Likelihood: ", model_beta.LDA_model.score(model_beta.vocabulary))
print("Perplexity: ", model_beta.LDA_model.perplexity(model_beta.vocabulary))
print("Model parameters:")
pprint(model_beta.LDA_model.get_params())

Log Likelihood:  -150969.59261284457
Perplexity:  16231.23394788008
Model parameters:
{'batch_size': 128,
 'doc_topic_prior': None,
 'evaluate_every': -1,
 'learning_decay': 0.7,
 'learning_method': 'batch',
 'learning_offset': 10.0,
 'max_doc_update_iter': 100,
 'max_iter': 10,
 'mean_change_tol': 0.001,
 'n_components': 7,
 'n_jobs': None,
 'perp_tol': 0.1,
 'random_state': 42,
 'topic_word_prior': None,
 'total_samples': 1000000.0,
 'verbose': 0}


In [14]:
search_params = {'n_components' : [5,6,7,8,9,10]}

In [15]:
lda_base = LatentDirichletAllocation()
model_testing = GridSearchCV(lda_base, param_grid=search_params)

In [16]:
model_testing.fit(model_beta.vocabulary)

GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=LatentDirichletAllocation(batch_size=128,
                                                 doc_topic_prior=None,
                                                 evaluate_every=-1,
                                                 learning_decay=0.7,
                                                 learning_method='batch',
                                                 learning_offset=10.0,
                                                 max_doc_update_iter=100,
                                                 max_iter=10,
                                                 mean_change_tol=0.001,
                                                 n_components=10, n_jobs=None,
                                                 perp_tol=0.1,
                                                 random_state=None,
                                                 topic_word_prior=None,
                                   

In [None]:
best_lda_model = model_testing.best_estimator_

In [None]:
print("Best Model's Params: ", model_testing.best_params_)

In [None]:
print('Model beta:')
for i, topic in enumerate(model_beta.LDA_model.components_):
    print('Top words for topic {}:'.format(i))
    print([model_beta.word_frequency.get_feature_names()[i] for i in topic.argsort()[-20:]])
    print('\n')

In [None]:
model_dict = {0 : 'general machine learning',
             1 : 'general data science',
             2 : 'natural language processing',
             3 : 'natural language processing',
             4 : 'general data science',
             5 : 'neural networks',
             6 : 'clustering'}

In [None]:
LDAvis_prepared_alpha = pyLDAvis.sklearn.prepare(model_alpha.LDA_model, model_alpha.vocabulary, model_alpha.word_frequency)

In [None]:
pyLDAvis.display(LDAvis_prepared_alpha)

In [None]:
LDAvis_prepared_beta = pyLDAvis.sklearn.prepare(model_beta.LDA_model, model_beta.vocabulary, model_beta.word_frequency)

In [None]:
pyLDAvis.display(LDAvis_prepared_beta)

In [None]:
pyLDAvis.prepared_data_to_html(LDAvis_prepared, template_type='general')

In [None]:
# model_beta = model_beta.LDA_model
# filename = 'pickled_LDA_model.sav'
# pickle.dump(model_beta, open(filename, 'wb'))

In [None]:
?pyLDAvis.prepared_to_html?

In [None]:
?pyLDAvis.prepared_to_html

In [None]:
?pyLDAvis.prepared_data_to_html