In [14]:
# Basics
import os
import pandas as pd
import numpy as np
import pickle

# NLP
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pyLDAvis # this package is case sensitive
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
class TopicModellingSklearn:
    def __init__(self,text,min_df,max_df,n_components,random_state):
        """ 
        Initialize class. 
        
        Arguments:
            text - DF column containing text block
            min_df = Minimum number of articles the word must appear in for the
                word to be considered.
            max_df = Threshold for unique words to considered (drop words 
               appearing too frequently, as in stopwords)
            n_topics = Number of topics to consider
            random_seed = Random seed to use for the modelling
        """
        # Set up internal class variables
        self.text = text
        self.min_df = min_df
        self.max_df = max_df
        self.n_components = n_components
        self.random_state = random_state
        
        # Fit an LDA model
        self.LDA_model, self.word_frequency, self.vocabulary = self.LDA_model()

            
    def LDA_model(self):
        """ Fit text to an LDA model """
        stop_words_all = list(nltk.corpus.stopwords.words('english'))
        print(len(stop_words_all))
        stop_words_new = ["new","like","example","see","code",
                          "use","used","using","user","one","two","also",
                          "analysis","data","dataset","row","column",
                         "set","list","index","item","array",
                          "let","input","return","function","python",
                         "panda","package","number","would","figure","make","get"]
        stop_words_all.extend(stop_words_new)
        print(len(stop_words_all))
        
        word_frequency = CountVectorizer(min_df = self.min_df,
                                        stop_words=stop_words_all)
        vocabulary = word_frequency.fit_transform(
                self.text.values.astype('U'))
        
        LDA = LatentDirichletAllocation(n_components = self.n_components,
                                        random_state = self.random_state)
        LDA_model = LDA.fit(vocabulary)
        
        return LDA_model, word_frequency, vocabulary

In [3]:
def lemmatize_text(article_text):
        lemmatizer = WordNetLemmatizer()
        
        stop_words_all = nltk.corpus.stopwords.words('english')
        
        article_text_proc = []
        article_text = article_text.split(" ")
        for word in article_text:
            word = word.lower()
            if word not in stop_words_all:
                article_text_proc.append(lemmatizer.lemmatize(word))
        processed_text = " ".join(article_text_proc)
                
        return processed_text

In [4]:
# Import data
filedir = os.path.dirname(os.path.realpath('__file__'))
filename = os.path.join('../data/processed/articles_python.csv')
filename = os.path.abspath(os.path.realpath(filename))
articles_python = pd.read_csv(filename,index_col = "postId")

In [5]:
print("Original text of article 10")
print(articles_python["text"].iloc[10][0:500])

Original text of article 10

Stock Prediction in Python
Make (and lose) fake fortunes while learning real Python
Trying to predict the stock market is an enticing prospect to data scientists motivated not so much as a desire for material gain, but for the challenge.We see the daily up and downs of the market and imagine there must be patterns we, or our models, can learn in order to beat all those day traders with business degrees. Naturally, when I started using additive models for time series prediction, I had to test th


In [6]:
for i in range(0,len(articles_python)):
    if np.remainder(i,250) == 0:
        print("Now lemmatizing article {}".format(i))
    articles_python["text"].iloc[i] = lemmatize_text(articles_python["text"].iloc[i])

Now lemmatizing article 0


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


Now lemmatizing article 250
Now lemmatizing article 500
Now lemmatizing article 750
Now lemmatizing article 1000
Now lemmatizing article 1250


In [7]:
print("Lemmatized text of article 10")
print(articles_python["text"].iloc[10][0:500])

Lemmatized text of article 10

stock prediction python
make (and lose) fake fortune learning real python
trying predict stock market enticing prospect data scientist motivated much desire material gain, challenge.we see daily down market imagine must pattern we, models, learn order beat day trader business degrees. naturally, started using additive model time series prediction, test method proving ground stock market simulated funds. inevitably, joined many others tried beat market day-to-day basis failed. however, process, 


In [8]:
# sklearn/LDA (unsupervised); text case consistency + lemmatization
model_beta = TopicModellingSklearn(text=articles_python["text"],
                                       min_df = 3,
                                       max_df = 0.75,
                                       n_components = 7,
                                       random_state = 42)

179
213


In [9]:
print('Model beta:')
for i, topic in enumerate(model_beta.LDA_model.components_):
    print('Top words for topic {}:'.format(i))
    print([model_beta.word_frequency.get_feature_names()[i] for i in topic.argsort()[-20:]])
    print('\n')

Model beta:
Top words for topic 0:
['learn', 'time', 'need', 'graph', 'method', 'test', 'missing', 'validation', 'problem', 'algorithm', 'first', 'random', 'prediction', 'tree', 'machine', 'training', 'learning', 'value', 'feature', 'model']


Top words for topic 1:
['different', 'take', 'better', 'need', 'value', 'way', 'good', 'first', 'could', 'action', 'state', 'machine', 'model', 'game', 'class', 'problem', 'algorithm', 'feature', 'time', 'learning']


Top words for topic 2:
['entity', 'negative', 'create', 'learn', 'need', 'sentiment', 'test', 'train', 'label', 'class', 'positive', 'accuracy', 'classifier', 'text', 'classification', 'ml', 'training', 'machine', 'learning', 'model']


Top words for topic 3:
['corpus', 'similarity', 'look', 'term', 'frequency', 'stock', 'first', 'sentence', 'value', 'vector', 'topic', 'words', 'based', 'model', 'document', 'plot', 'series', 'text', 'time', 'word']


Top words for topic 4:
['line', 'look', 'work', 'project', 'spark', 'following', 'w

In [16]:
model_dict = {0 : 'general machine learning',
             1 : 'general data science',
             2 : 'natural language processing',
             3 : 'natural language processing',
             4 : 'general data science',
             5 : 'neural networks',
             6 : 'clustering'}

In [10]:
LDAvis_prepared = pyLDAvis.sklearn.prepare(model_beta.LDA_model, model_beta.vocabulary, model_beta.word_frequency)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [13]:
pyLDAvis.display(LDAvis_prepared)

In [15]:
filename = 'pickled_LDA_model.sav'
pickle.dump(model_beta, open(filename, 'wb'))

In [19]:
test_dict={0: 'galileo',
          1 : 'faraday'}

test_var = 0

test_output = test_dict[test_var]
print(test_output)

galileo
