# Topic Modeling Analysis Reference Methods

The following contains reference methods used for the LDA topic model analysis in the submitted paper:

### CONTENT ANALYSIS, AGING AND AUTOBIOGRAPHICAL MEMORY Differences in the Content and Coherence of Autobiographical Memories Between Younger and Older Adults: Insights from Text Analysis 


-----------------

This document presents the primary steps in building the Latent Dirichlet Allocation (LDA) topic model used in the associated analysis.

## Data Processing

Transcripts were loaded into a pandas DataFrame and tagged with additional information about each respondent.
Cleaning involved:
- removing stop words
- removal of specific non-verbal entries (dashes, etc)
- lemmatization of input tokens and associated POS tagging




The data file after cleaning was called  **lda_df**

and contained the following columns:

- subject_id - id of the subject giving narration
- gender
- age - numeric age
- group - grouped age
- stage - life stage that the transcript narrates (i.e., childhood memory, adult memory, etc.)
- transcript_id - the id for this transcript (unique for subject_id X stage)

- token - each row has one token from that transcript
- lemma - the lemmatization of the token
- pos - associated part of speech tag (granular category)
- pos_type - broader grouping of the pos (ie, noun, verb, etc)


## Model Building

In [None]:
# IMPORT STATEMENTS
import pandas as pd
import os
import tqdm
import string 
import re
import numpy as np
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# nltk
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk import sent_tokenize
from nltk import word_tokenize

# lemmatizer 
from nltk.stem import WordNetLemmatizer 
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

# spacy's lemmatizer was used
import spacy


### Method Used For Model Building 


#### LDA_DF -> Input Documents
 Generate Input Documents from lda_df, based on queries over transcripts (eg, age) and lemmas (eg, using POS to select)

In [None]:
def create_input_docs(TRANSCRIPT_QUERY, LEMMA_TOKEN_QUERY,lemma_or_token,lda_df, lda_df_name):
  """Build one list of tokens (or lemmas) per transcript from a token-level dataframe.

  Args:
    TRANSCRIPT_QUERY: ``DataFrame.query`` expression selecting rows by
      transcript-level attributes. Empty (or None) keeps every row.
    LEMMA_TOKEN_QUERY: ``DataFrame.query`` expression selecting rows by
      token-level attributes. Empty (or None) keeps every row.
    lemma_or_token: name of the column whose values make up each document.
    lda_df: dataframe with one row per token; must contain a
      ``transcript_id`` column and the ``lemma_or_token`` column.
    lda_df_name: label for ``lda_df``; only used in the progress printout.

  Returns:
    A pandas Series indexed by ``transcript_id`` whose values are lists of
    the selected column's entries for that transcript.
  """
  # DataFrame.query returns a new frame, so lda_df itself is never modified.
  selected_tokens = lda_df

  # 1) FILTER BASED ON TRANSCRIPT_QUERY (skipped when no query is given)
  if TRANSCRIPT_QUERY:
    selected_tokens = selected_tokens.query(TRANSCRIPT_QUERY)

  # 2) FILTER BASED ON LEMMA_TOKEN_QUERY (skipped when no query is given;
  #    passing an empty expression to DataFrame.query raises an error)
  if LEMMA_TOKEN_QUERY:
    selected_tokens = selected_tokens.query(LEMMA_TOKEN_QUERY)

  # 3) SET INPUT_DOCS AS: one list of values per transcript
  input_docs = selected_tokens.groupby("transcript_id")[lemma_or_token].apply(list)

  # Checking:
  print("lda_df_name: ", lda_df_name, " | TRANSCRIPT_QUERY:", TRANSCRIPT_QUERY, " | LEMMA_TOKEN_QUERY:", LEMMA_TOKEN_QUERY, " | lemma_or_token:", lemma_or_token)
  print("Working with %s docs with a total of %s tokens"%(len(input_docs), selected_tokens.shape[0]))

  return input_docs

#### Input Documents --> Corpus
Uses input documents to create an associated corpus & dictionary for handling bag of words

In [None]:
def create_dictionary_and_corpus(input_docs, do_filter, no_below_i, no_above_i):
  """Create the gensim dictionary and bag-of-words corpus for the input documents.

  Args:
    input_docs: iterable of documents, each a list of string tokens. It is
      iterated more than once, so it must not be a one-shot generator.
    do_filter: when truthy, prune the dictionary with ``filter_extremes``
      before the corpus is built.
    no_below_i: forwarded to ``Dictionary.filter_extremes`` as ``no_below``.
    no_above_i: forwarded to ``Dictionary.filter_extremes`` as ``no_above``.

  Returns:
    Tuple ``(id2word, corpus)``: the dictionary and a list holding one
    ``doc2bow`` result per input document.
  """
  # 1) create Dictionary
  id2word = corpora.Dictionary(input_docs)

  # 2) FILTER EXTREME WORDS (must happen before the corpus is built so the
  #    bag-of-words vectors only reference the tokens that were kept)
  if do_filter:
    id2word.filter_extremes(no_below=no_below_i, no_above=no_above_i)

  # 3) map corpus with this dictionary - this uses a BOW model
  #    (built once; a second identical pass would only repeat the work)
  corpus = [id2word.doc2bow(text) for text in input_docs]

  # 4) View the first document's bag-of-words as a sanity check
  print(corpus[:1])

  return id2word, corpus

#### Documents + Corpus --> LDA Model
Run LDA Model

In [None]:
def run_lda_model(corpus, id2word, NUM_TOPICS, output_file):
  """Train an LDA model, print its topics, and save the same listing to a file.

  Args:
    corpus: bag-of-words corpus passed to ``gensim.models.LdaModel``.
    id2word: dictionary mapping token ids to words for that corpus.
    NUM_TOPICS: number of topics to fit.
    output_file: path of the text file that receives the pretty-printed topics.

  Returns:
    The trained ``gensim.models.LdaModel``.
  """
  # 1) fit the lda model with NUM_TOPICS (fixed random_state for reproducibility)
  lda_model = gensim.models.LdaModel(corpus=corpus,
                                     id2word=id2word,
                                     alpha='auto',
                                     eta='auto',
                                     num_topics=NUM_TOPICS,
                                     random_state=100,
                                     chunksize=100,
                                     passes=10,
                                     per_word_topics=True)

  #per_word_topics (bool) – If True, the model also computes a list of topics,
  #sorted in descending order of most likely topics for each word, along with
  #their phi values multiplied by the feature length (i.e. word count)

  # 2) fetch the topic keywords once and reuse them for both outputs
  topics = lda_model.print_topics()
  pprint(topics)

  # 3) open the file and write topics into it; explicit UTF-8 so non-ASCII
  #    topic words do not depend on the platform's default encoding
  with open(output_file, 'wt', encoding='utf-8') as out:
    pprint(topics, stream=out)

  return lda_model


In [None]:
def get_topic_scores(lda_model, input_docs):
    """Reshape per-document topic scores into a long-format dataframe.

    Args:
        lda_model: accepted for interface compatibility; this function does
            not read it.
        input_docs: pandas Series whose values are mappings of
            ``topic index -> score`` (each value must provide ``.items()``).

    Returns:
        DataFrame with one row per (document, topic) pair: the reset index
        of ``input_docs``, a ``topic`` label column and a ``value`` column.
    """
    # One list of (topic_index, score) pairs per document, same index as the input.
    pairs_per_doc = pd.Series(
        [list(scores.items()) for scores in input_docs],
        index=input_docs.index,
    )

    # explode() gives every pair its own row; the unnamed value column comes out as 0.
    long_df = pairs_per_doc.explode().reset_index().rename(columns={0: "topic_score"})
    pair_column = long_df["topic_score"]

    # Topic indices are 0-based; label them starting from 1.
    long_df["topic"] = pair_column.map(lambda pair: "topic %s" % (pair[0] + 1))
    long_df["value"] = pair_column.map(lambda pair: pair[1])

    return long_df.drop(columns="topic_score")

#### LDA Model --> Coherence Score
Determining the coherence and perplexity for LDA model

In [None]:
def compute_perplexity_and_coherence(lda_model,corpus,input_docs,id2word):
  """Print and return the model's log-perplexity bound and c_v coherence.

  Args:
    lda_model: trained LDA model to evaluate.
    corpus: bag-of-words corpus passed to ``lda_model.log_perplexity``.
    input_docs: tokenized documents passed to ``CoherenceModel`` as ``texts``.
    id2word: dictionary passed to ``CoherenceModel`` as ``dictionary``.

  Returns:
    Tuple ``(perplexity_score, coherence_lda)``.
  """
  # 1) Perplexity.
  # NOTE(review): per gensim's documentation, log_perplexity returns the
  # per-word likelihood bound (perplexity = 2^(-bound)), so a value closer
  # to zero indicates a better fit - confirm before comparing models.
  bound = lda_model.log_perplexity(corpus)
  print('\nPerplexity: ', bound)

  # 2) Coherence Score using the 'c_v' measure.
  cv_score = CoherenceModel(
      model=lda_model,
      texts=input_docs,
      dictionary=id2word,
      coherence='c_v',
  ).get_coherence()
  print('\nCoherence Score: ', cv_score)

  return bound, cv_score

#### Graph Coherence Scores


In [None]:
def plot_graph_of_coherence(coherence_values, num_topics_min, num_topics_max, graph_title):
  """Plot coherence score against the number of topics and show the figure.

  Args:
    coherence_values: sequence of coherence scores, one per topic count in
      ``range(num_topics_min, num_topics_max)``.
    num_topics_min: first topic count on the x-axis (inclusive).
    num_topics_max: end of the topic-count range (exclusive, as in ``range``).
    graph_title: title displayed above the plot.
  """
  # x-axis: every candidate number of topics, in steps of 1
  x = range(num_topics_min, num_topics_max, 1)

  plt.plot(x, coherence_values)
  plt.title(graph_title)
  plt.xlabel("Number of topics")
  plt.ylabel("Coherence score")
  # Labels must be a sequence of strings: a bare string is iterated character
  # by character, which would label the line "c" instead of the full name.
  plt.legend(["coherence_values"], loc='best')
  plt.show()