**Tuning for the number of topics and passes**



In [None]:
import nltk
import numpy as np
import pandas as pd
from nltk import word_tokenize
import re
import seaborn as sns
import matplotlib.pyplot as plt
import smart_open
from gensim.models import LdaModel
from gensim.models import LdaMulticore


In [None]:
# Mount Google Drive inside the Colab runtime so that files under
# /content/drive can be read and written by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Base project folder on the mounted Drive; the input text folder and the
# tuning-results CSV are both addressed relative to this prefix.
path = "/content/drive/MyDrive/Journal_Analysis/"

In [None]:
def extract_documents(folder_name):
  """Read every non-empty ``.txt`` file located directly inside ``folder_name``.

  Args:
    folder_name: Path of the directory to scan (sub-directories are not
      searched).

  Returns:
    A ``(docs, allfiles)`` tuple of lists. ``docs`` holds the text of each
    non-empty ``.txt`` file. ``allfiles`` holds the path of every ``.txt``
    file found, *including* empty ones, so the two lists line up
    index-for-index only when no file is empty. Both follow sorted path
    order.
  """
  import os
  docs = []
  allfiles = []
  # os.scandir yields entries in arbitrary order; sorting keeps the corpus
  # order (and anything seeded downstream) reproducible between runs. The
  # context manager releases the directory handle promptly.
  with os.scandir(folder_name) as entries:
    txt_paths = sorted(
        entry.path for entry in entries
        if entry.path.endswith(".txt") and entry.is_file()
    )
  for path_paper in txt_paths:
    allfiles.append(path_paper)
    # Empty files are listed in allfiles but contribute no document.
    if os.stat(path_paper).st_size == 0:
      continue
    # Explicit encoding so the result does not depend on the platform locale.
    with open(path_paper, encoding="utf-8") as f:
      docs.append(f.read())
  return docs, allfiles

In [None]:
# Load the paper texts (docs) and the list of .txt file paths (allfiles)
# from the 'allpapersTXT' folder under the project path.
docs, allfiles = extract_documents(path + 'allpapersTXT')

In [None]:
# Sanity check: number of loaded documents, then the first 500 characters
# of the first document.
print(len(docs))
print(docs[0][:500])

226
The Artisan and His Audience: Identification with Work and Price Setting in a Handicraft Cluster in Southern India
Using ethnographic, experimental, and survey data from a handicraft cluster in southern India, this paper reports on a study of when and why people who identify with their work might sacrifice financial rewards in their economic decisions. Based on findings from ethnographic fieldwork, I hypothesize that the monetary value that individuals who identify with their work seek for their


In [None]:
# Path of the first .txt file found. Note: allfiles also lists empty files
# that docs skips, so allfiles[i] matches docs[i] only if no file is empty.
allfiles[0]

'/content/drive/MyDrive/Journal_Analysis/allpapersTXT/10.1177_0001839217725782.txt'

In [None]:
# Preview the first 500 characters of the first document (same preview as
# printed two cells above).
print(docs[0][:500])

The Artisan and His Audience: Identification with Work and Price Setting in a Handicraft Cluster in Southern India
Using ethnographic, experimental, and survey data from a handicraft cluster in southern India, this paper reports on a study of when and why people who identify with their work might sacrifice financial rewards in their economic decisions. Based on findings from ethnographic fieldwork, I hypothesize that the monetary value that individuals who identify with their work seek for their


In [None]:
# Tokenize and normalise the documents.
from nltk.tokenize import RegexpTokenizer

# A token is a maximal run of word characters (letters, digits, underscore).
tokenizer = RegexpTokenizer(r'\w+')

# For every document: split into tokens, lowercase each token, then keep only
# tokens that are not purely numeric (words that merely contain digits stay)
# and that are at least 5 characters long.
docs = [
    [
        word
        for word in (raw.lower() for raw in tokenizer.tokenize(text))
        if not word.isnumeric() and len(word) > 4
    ]
    for text in docs
]



In [None]:
# Lemmatize the documents.
import nltk
nltk.download('wordnet')  # WordNet data used by the lemmatizer
from nltk.stem.wordnet import WordNetLemmatizer

# Replace every token with the lemma returned by WordNetLemmatizer.lemmatize
# (called with its default arguments).
lemmatizer = WordNetLemmatizer()
docs = [list(map(lemmatizer.lemmatize, doc)) for doc in docs]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [None]:
# Build the stop-word list: NLTK's English stop words plus corpus-specific
# terms that should not appear in the topics.
nltk.download('stopwords')
en_stop = nltk.corpus.stopwords.words('english')
print(len(en_stop))  # size of the stock NLTK list

# Domain-specific additions.
en_stop += [
    "organization", "organizational", "organizing", "would", "could",
    "result", "study", "model", "likely", "effect", "effected", "variable",
    "variables", "measure", "include", "suggest", "first", "level", "research",
]
print(len(en_stop))  # size after adding the custom terms

# Quick check that the custom terms were added (displayed as the cell output).
"organization" in en_stop

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
179
198


True

In [None]:
# Remove stop words. en_stop is a list, so testing each token against it
# directly costs a linear scan per token; a set gives the same result with
# constant-time lookups.
stop_set = set(en_stop)
docs = [[token for token in doc if token not in stop_set] for doc in docs]

In [None]:
# Detect frequent bigrams and append them to the documents.
from gensim.models import Phrases

# Learn two-word phrases from the corpus (min_count=20: only ones that appear
# 20 times or more).
bigram = Phrases(docs, min_count=20)
for doc in docs:
    # Tokens containing '_' in the phrase-transformed document are treated as
    # bigrams. They are collected first and appended afterwards, so the
    # original unigrams stay in the document alongside the bigrams.
    detected = [token for token in bigram[doc] if '_' in token]
    doc.extend(detected)



In [None]:
# Remove rare and common tokens.
from gensim.corpora import Dictionary

# Create a dictionary representation of the documents.
dictionary = Dictionary(docs)
print(len(dictionary))  # vocabulary size before filtering

# Filter out words that occur in fewer than 3 documents (no_below=3), or in
# more than 70% of the documents (no_above=0.7).
dictionary.filter_extremes(no_below=3, no_above=0.7)
print(len(dictionary))  # vocabulary size after filtering


25096
11914


In [None]:
# Bag-of-words representation of the documents.
corpus = [dictionary.doc2bow(doc) for doc in docs]

In [None]:
# Report the vocabulary size after filtering and the number of documents in
# the bag-of-words corpus.
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

Number of unique tokens: 11914
Number of documents: 226


In [None]:
def modify_string(probability_string):
  """Split a topic string of the form ``p1*w1 + p2*w2 + ...`` into two lists.

  The string is cut at every "+", and each piece is cut at "*". The part
  before the first "*" is converted to ``float``; the part after it is
  returned verbatim, so any quotes or surrounding whitespace present in the
  input are kept in the words.

  Returns:
    ``(prob, words)``: the list of floats and the list of word strings, in
    the order the terms appear in ``probability_string``.
  """
  pieces = [term.split("*") for term in probability_string.split("+")]
  prob = [float(piece[0]) for piece in pieces]
  words = [piece[1] for piece in pieces]
  return prob, words

In [None]:
def make_df(topics):
  """Arrange topics side by side in one DataFrame with two-level columns.

  Each item of ``topics`` is read as ``item[0]`` (topic id) and ``item[1]``
  (topic string, parsed with ``modify_string``). The outer column level is
  "topic <id>"; the inner level has "Probabilities" and "Words".
  """
  column_keys = []
  frames = []
  for entry in topics:
    topic_id, topic_string = entry[0], entry[1]
    column_keys.append("topic {0}".format(topic_id))
    prob, words = modify_string(topic_string)
    # Built with one row per field, then transposed so each field is a column.
    frame = pd.DataFrame(data=[prob, words], index=["Probabilities", "Words"]).transpose()
    frames.append(frame)
  # keys= supplies the outer column level:
  # https://stackoverflow.com/questions/40820017/how-to-create-a-multilevel-dataframe-in-pandas
  return pd.concat(frames, axis=1, keys=column_keys)


Hyperparameter Tuning

Parameters to tune: number of topics and number of passes

In [None]:
# supporting function
def compute_coherence_values(corpus, dictionary, k, passes, eta, alpha, texts=None):
    """Train one LDA model and return its c_v coherence score.

    Args:
        corpus: Bag-of-words corpus passed to ``LdaModel``.
        dictionary: Dictionary used as ``id2word`` for the model and passed
            to the coherence model.
        k: Number of topics.
        passes: Number of training passes.
        eta: Forwarded to ``LdaModel`` as ``eta``.
        alpha: Forwarded to ``LdaModel`` as ``alpha``.
        texts: Tokenized documents used by the coherence model. When omitted,
            the notebook-level ``docs`` variable is used, which is what this
            function always did before the parameter existed.

    Returns:
        The value returned by ``CoherenceModel.get_coherence()``.
    """
    from gensim.models import CoherenceModel

    if texts is None:
        # Backward-compatible fallback to the notebook global; pass `texts`
        # explicitly to avoid depending on hidden kernel state.
        texts = docs

    # chunksize, iterations and random_state are fixed so that runs with
    # different k / passes are directly comparable.
    lda_model = LdaModel(
            corpus=corpus, id2word=dictionary, num_topics=k,
            chunksize=2000, eta=eta,
            passes=passes, iterations=400, alpha=alpha, random_state=42)

    coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v')

    return coherence_model_lda.get_coherence()

In [None]:
# Candidate numbers of topics. np.arange excludes the stop value, so with
# these settings the range is 10..14 and max_topics itself is not tried.
min_topics = 10
max_topics = 15
step_size = 1
topics_range = np.arange(min_topics, max_topics, step_size)
print(topics_range)

[10 11 12 13 14]


Looping over topics
 <br>eta = "auto", alpha = "auto", passes = 1000

In [None]:
import time
t1 = time.time()
# Validation set is the whole corpus
# Topics range (np.arange excludes max_topics)
min_topics = 10
max_topics = 15
step_size = 1
topics_range = np.arange(min_topics, max_topics, step_size)

# Passes range
passes_range = [1000]

model_results = {
                 'Topics': [],
                 'Passes': [],
                 'Coherence': []
                }
results_file = path + 'lda_fullcorpus_tuning_results.csv'

# Looping over parameter ranges
for num_topics in topics_range:
  for passes in passes_range:
    cv = compute_coherence_values(corpus, dictionary, num_topics, passes, "auto", "auto")
    print(num_topics)
    print(cv)

    # Save the model results
    model_results['Topics'].append(num_topics)
    model_results['Passes'].append(passes)
    model_results['Coherence'].append(cv)

    # Checkpoint after every parameter set: each fit is long-running, and if
    # the run is interrupted the finished results would otherwise be lost and
    # df_tuning would never be defined.
    df_tuning = pd.DataFrame(model_results)
    df_tuning.to_csv(results_file, index=False)
    print("done with one set of parameters")

# Final save (also covers the case where the parameter ranges are empty).
df_tuning = pd.DataFrame(model_results)
df_tuning.to_csv(results_file, index=False)
t2 = time.time()
print("Time elapsed in minutes: {}".format((t2-t1)/60))

10
0.3811563177476281
done with one set of parameters
11
0.3742224012177109
done with one set of parameters


In [None]:
# Display the tuning results. df_tuning is created by the tuning cell above,
# so this raises NameError if that cell has not defined it yet.
df_tuning

NameError: ignored

In [None]:
# Display the tuning results.
# NOTE(review): the saved output below has an 'Unnamed: 0' column and passes
# of 200/500/1000, which the tuning cell above (passes_range = [1000],
# index=False) does not produce. Presumably df_tuning was reloaded from an
# earlier results CSV in a cell that is no longer present; confirm.
df_tuning

Unnamed: 0,Topics,Passes,Coherence
0,10,200,0.341962
1,10,500,0.358769
2,10,1000,0.360546
3,11,200,0.325037
4,11,500,0.343494
5,11,1000,0.352838
6,12,200,0.338209
7,12,500,0.346423
8,12,1000,0.347784
9,13,200,0.348951
