In [None]:
# Download clean_data dataset and extract it to the working dir

In [4]:
import os
import pandas as pd

all_text_samples = []

# file_list contains names of all files in the "clean_data" folder
file_list = os.listdir("clean_data/")


In [None]:
for file_name in file_list:
    # Construct fiename and its path
    file = (f"clean_data/" + file_name)
    
    # Now open file for reading
    my_text_file = open(file, encoding="utf8")
    file_data = my_text_file.read()
    
    # Append the data to the list
    all_text_samples.append(file_data)

# Convert list to dataframe
text_dataframe = pd.DataFrame(all_text_samples)
text_dataframe.columns = ["Text"]

In [2]:
# Show the first sample of one row of dataframe
print(text_dataframe["Text"][0])

2016 Update: Whether you enjoy myth busting, Python, or just all enterprise software, you will also likely enjoy Enterprise Software with Python, presented by the author of the article below, and published by O’Reilly.

PayPal enjoys a remarkable amount of linguistic pluralism in its programming culture. In addition to the long-standing popularity of C++ and Java, an increasing number of teams are choosing JavaScript and Scala, and Braintree‘s acquisition has introduced a sophisticated Ruby community.

One language in particular has both a long history at eBay and PayPal and a growing mindshare among developers: Python.

Python has enjoyed many years of grassroots usage and support from developers across eBay. Even before official support from management, technologists of all walks went the extra mile to reap the rewards of developing in Python. I joined PayPal a few years ago, and chose Python to work on internal applications, but I’ve personally found production PayPal Python code fr

In [5]:
text_dataframe.head()

Unnamed: 0,Text
0,"2016 Update: Whether you enjoy myth busting, P..."
1,Let's start with the truth. The 3-point shot w...
2,Media playback is not supported on this device...
3,Krampus with babies postcard (via riptheskull/...
4,"Last week, Michael Dorf published a long and c..."


In [6]:
len(text_dataframe)

7911

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

In [8]:
# max_df is between 0-1 or an INT


count_vectorizer = CountVectorizer(max_df=0.90, min_df=4, stop_words="english")

In [9]:
doc_term_matrix = count_vectorizer.fit_transform(text_dataframe["Text"])

In [10]:
# dtm contains 7911 articles
# 45783 terms (words)

doc_term_matrix

<7911x45783 sparse matrix of type '<class 'numpy.int64'>'
	with 3482007 stored elements in Compressed Sparse Row format>

In [11]:
# LDA section

In [12]:
from sklearn.decomposition import LatentDirichletAllocation

In [13]:
# n_components = no of topics. This for now is trial-and-error
# I'm starting with 10 topics that could be in this group of texts
lda = LatentDirichletAllocation(n_components=10, random_state=1)

In [14]:
lda.fit(doc_term_matrix)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='batch', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_components=10, n_jobs=None, n_topics=None, perp_tol=0.1,
             random_state=1, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [15]:
# Metrics - log likelihood - higher score = better
print("Log likelihood : ", lda.score(doc_term_matrix))


Log likelihood :  -54619996.62116641


In [16]:
# Perplexity - lower = better. 
# = exp(-1 * log likelihood per word)
print("Perplexity: ", lda.perplexity(doc_term_matrix))

Perplexity:  5676.978386836095


In [18]:
len(count_vectorizer.get_feature_names())

45783

In [30]:
count_vectorizer.get_feature_names()[1500]

'abbreviations'

In [31]:
lda.components_

array([[1.10639106e+01, 1.53501573e+03, 1.00001937e-01, ...,
        1.00011451e-01, 1.00002908e-01, 1.00002963e-01],
       [4.25320530e+01, 9.36013737e+02, 3.80958727e+00, ...,
        1.00013365e-01, 1.00000000e-01, 1.00036914e-01],
       [3.24149536e+01, 2.72326043e+01, 1.00026045e-01, ...,
        1.00002969e-01, 1.00016871e-01, 1.14851716e+00],
       ...,
       [1.00012362e-01, 2.99500946e+02, 1.00012134e-01, ...,
        6.09994945e+00, 1.00020349e-01, 1.39723201e+00],
       [4.01616522e+01, 6.68081695e+02, 1.00001637e-01, ...,
        1.00000183e-01, 1.00009829e-01, 1.00000000e-01],
       [7.95734719e+01, 2.86610395e+02, 2.47907651e+00, ...,
        1.00010873e-01, 2.20789665e+00, 1.00082060e-01]])

In [32]:
lda.components_.shape

(10, 45783)

In [33]:
first_topic = lda.components_[0]

In [35]:
first_topic.argsort()

array([41333, 18234, 18506, ..., 30209, 19360, 35646], dtype=int64)

In [36]:
first_topic.argsort()[-15:]

array([39493, 25798, 29929, 27795, 24148, 43058, 45551, 35934, 36067,
        7121, 39486,  7926, 30209, 19360, 35646], dtype=int64)

In [37]:
print(count_vectorizer.get_feature_names()[39493])

study


In [38]:
print(count_vectorizer.get_feature_names()[25798])

medical


In [41]:
word_list = []
probability_list = []

top_number = 20
topic_count = 0

for probability_number in lda.components_:
    text_message = f"Top words for topic {topic_count} are : "
    print(text_message)
    for number in probability_number.argsort()[-top_number:]:
        print([count_vectorizer.get_feature_names()[number]], end="")
        probability_list.append(number)
    print("\n")
    topic_count += 1

Top words for topic 0 are : 
['family']['hospital']['time']['parents']['year']['study']['medical']['patients']['new']['like']['university']['years']['says']['school']['care']['students']['children']['people']['health']['said']

Top words for topic 1 are : 
['low']['economic']['year']['high']['economy']['driving']['republican']['health']['income']['weight']['fat']['new']['cars']['just']['like']['diet']['percent']['car']['trump']['people']

Top words for topic 2 are : 
['example']['different']['ll']['want']['using']['new']['learning']['people']['code']['don']['things']['way']['just']['make']['like']['need']['data']['work']['use']['time']

Top words for topic 3 are : 
['ingredients']['plant']['years']['healthy']['recipe']['protein']['add']['plants']['cooking']['good']['eat']['oil']['use']['time']['just']['species']['make']['like']['water']['food']

Top words for topic 4 are : 
['world']['people']['market']['work']['time']['million']['industry']['years']['year']['ai']['big']['like']['says'

In [42]:
from sklearn.model_selection import GridSearchCV

# Defien the params that we want to use
search_params = {"n_components": [10, 15, 20, 25, 30], "learning_decay": [.5, .7, .9]}

# Init the model
lda_comparison = LatentDirichletAllocation()

# Init Grid Search Class
lda_comparison = GridSearchCV(lda_comparison, param_grid=search_params)

# Run the grid search
lda_comparison.fit(doc_term_matrix)



GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='batch', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_components=10, n_jobs=None, n_topics=None, perp_tol=0.1,
             random_state=None, topic_word_prior=None,
             total_samples=1000000.0, verbose=0),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_components': [10, 15, 20, 25, 30], 'learning_decay': [0.5, 0.7, 0.9]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [43]:
# Best model which gives the highest score
best_lda_model = lda_comparisoncomparisoncomparisoncomparison.best_estimator_

# Model parameters is used to store a list of params settings for allprint 
# parameters candidatelda_comparisonint("Best model params: ", lda_comparison.beprintarams_)

# Best log likelihoodlda_comparisonprint("Best log likelihood score", lda_comparison.best_score_)

# Best perplexity
print("Model perplexity : ", best_lda_model.perplexity(doc_term_matrix))


Best model params:  {'learning_decay': 0.5, 'n_components': 10}
Best log likelihood score -18738771.51805039
Model perplexity :  5636.416223420087
