In [1]:
# Basics
import os
import pandas as pd
import numpy as np
import pickle
from pprint import pprint

# NLP
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV
import pyLDAvis # this package is case sensitive
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
class TopicModellingSklearn:
    def __init__(self,text,min_df,max_df,n_components,random_state):
        """ 
        Initialize class. 
        
        Arguments:
            text - DF column containing text block
            min_df = Minimum number of articles the word must appear in for the
                word to be considered.
            max_df = Threshold for unique words to considered (drop words 
               appearing too frequently, as in stopwords)
            n_topics = Number of topics to consider
            random_seed = Random seed to use for the modelling
        """
        # Set up internal class variables
        self.text = text
        self.min_df = min_df
        self.max_df = max_df
        self.n_components = n_components
        self.random_state = random_state
        
        # Fit an LDA model
        self.LDA_model, self.word_frequency, self.vocabulary = self.LDA_model()

            
    def LDA_model(self):
        """ Fit text to an LDA model """
        stop_words_all = list(nltk.corpus.stopwords.words('english'))
        print(len(stop_words_all))
        stop_words_new = ["new","like","example","see","code",
                          "use","used","using","user","one","two","also",
                          "analysis","data","dataset","row","column",
                         "set","list","index","item","array",
                          "let","input","return","function","python",
                         "panda","package","number","would","figure","make","get"]
        stop_words_all.extend(stop_words_new)
        print(len(stop_words_all))
        
        word_frequency = CountVectorizer(min_df = self.min_df,
                                        stop_words=stop_words_all)
                
        vocabulary = word_frequency.fit_transform(
                self.text.values.astype('U'))
        
        LDA = LatentDirichletAllocation(n_components = self.n_components,
                                        random_state = self.random_state)
        LDA_model = LDA.fit(vocabulary)
        
        return LDA_model, word_frequency, vocabulary

In [3]:
def lemmatize_text(article_text):
        lemmatizer = WordNetLemmatizer()
        
        stop_words_all = nltk.corpus.stopwords.words('english')
        
        article_text_proc = []
        article_text = article_text.split(" ")
        for word in article_text:
            word = word.lower()
            if word not in stop_words_all:
                article_text_proc.append(lemmatizer.lemmatize(word))
        processed_text = " ".join(article_text_proc)
                
        return processed_text

In [9]:
# Import data
filedir = os.path.dirname(os.path.realpath('__file__'))
filename = os.path.join('../data/processed/articles_python.csv')
filename = os.path.abspath(os.path.realpath(filename))
articles_python = pd.read_csv(filename,index_col = "postId")

In [16]:
print("Original text of article 10")
print(articles_python["text"].iloc[100][0:500])

Original text of article 10

Automated Machine Learning Hyperparameter Tuning in Python
A complete walk through using Bayesian optimization for automated hyperparameter tuning in Python
Tuning machine learning hyperparameters is a tedious yet crucial task, as the performance of an algorithm can be highly dependent on the choice of hyperparameters. Manual tuning takes time away from important steps of the machine learning pipeline like feature engineering and interpreting results. Grid and random search are hands-off, but r


In [17]:
for i in range(0,len(articles_python)):
    if np.remainder(i,250) == 0:
        print("Now lemmatizing article {}".format(i))
    articles_python["text"].iloc[i] = lemmatize_text(articles_python["text"].iloc[i])

Now lemmatizing article 0
Now lemmatizing article 250
Now lemmatizing article 500
Now lemmatizing article 750
Now lemmatizing article 1000
Now lemmatizing article 1250


In [18]:
# Test that the lemmatization worked as intended
print("Lemmatized text of article 10")
print(articles_python["text"].iloc[100][0:500])

Lemmatized text of article 10

automated machine learning hyperparameter tuning python
a complete walk using bayesian optimization automated hyperparameter tuning python
tuning machine learning hyperparameters tedious yet crucial task, performance algorithm highly dependent choice hyperparameters. manual tuning take time away important step machine learning pipeline like feature engineering interpreting results. grid random search hands-off, require long run time waste time evaluating unpromising area search space. increasin


In [None]:
# sklearn/LDA (unsupervised); text case consistency + lemmatization
model_beta = TopicModellingSklearn(text=articles_python["text"],
                                       min_df = 3,
                                       max_df = 0.75,
                                       n_components = 7,
                                       random_state = 42)

In [None]:
print("Log Likelihood: ", model_beta.LDA_model.score(model_beta.vocabulary))
print("Perplexity: ", model_beta.LDA_model.perplexity(model_beta.vocabulary))
print("Model parameters:")
pprint(model_beta.LDA_model.get_params())

In [None]:
search_params = {'n_components' : [5,6,7,8,9,10],
                'min_df' : 3,
                'max_df' : 0.75,
                'random_state' : 42}

In [None]:
lda_base = LatentDirichletAllocation()
model_testing = GridSearchCV(lda_base, param_grid=search_params)

In [None]:
model_testing.fit(model_beta.vocabulary)

In [None]:
best_lda_model = model_testing.best_estimator_

In [None]:
print("Best Model's Params: ", model_testing.best_params_)

In [None]:
print('Model beta:')
for i, topic in enumerate(model_beta.LDA_model.components_):
    print('Top words for topic {}:'.format(i))
    print([model_beta.word_frequency.get_feature_names()[i] for i in topic.argsort()[-20:]])
    print('\n')

In [None]:
model_dict = {0 : 'general machine learning',
             1 : 'general data science',
             2 : 'natural language processing',
             3 : 'natural language processing',
             4 : 'general data science',
             5 : 'neural networks',
             6 : 'clustering'}

In [None]:
LDAvis_prepared = pyLDAvis.sklearn.prepare(model_beta.LDA_model, model_beta.vocabulary, model_beta.word_frequency)

In [None]:
pyLDAvis.display(LDAvis_prepared)

In [None]:
pyLDAvis.prepared_data_to_html(LDAvis_prepared, template_type='general')

In [None]:
model_beta = model_beta.LDA_model
filename = 'pickled_LDA_model.sav'
pickle.dump(model_beta, open(filename, 'wb'))

In [None]:
?pyLDAvis.prepared_to_html?

In [None]:
?pyLDAvis.prepared_to_html

In [None]:
?pyLDAvis.prepared_data_to_html