<a href="https://colab.research.google.com/github/smuratsirin/539b-Econometrics-/blob/master/Topic_Modelling_LDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import spacy
import gensim
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim import corpora
from gensim.models import LdaModel
import random
# Seed Python's built-in `random` module so code that draws from it is repeatable.
# NOTE(review): libraries that keep their own RNG state (NumPy-based ones, for
# example) are presumably unaffected by this call - confirm before relying on it.
random.seed(42)

In [None]:
# Read the Web of Science export; the pipeline below reads its 'Abstract' column.
df = pd.read_csv(
    '/content/sample_data/wos_2009.csv',
    encoding="latin1",
)

# spaCy pipeline used for tokenisation, stop-word flags and lemmas
nlp = spacy.load('en_core_web_sm')

# Extra tokens to drop on top of spaCy's own stop-word list
additional_stopwords = {
    "ltd",
    "right",
    "c.right",
    "copyright",
}

# Preprocess text using spaCy
def preprocess(text):
    """
    Turn one raw document into a space-separated string of lemmas.

    Steps:
    1. Return an empty string for missing values (as detected by ``pd.isna``).
    2. Run the text through the module-level spaCy pipeline ``nlp``.
    3. Skip tokens flagged as stop words or punctuation, and tokens whose
       lower-cased text is listed in ``additional_stopwords``.
    4. Join the lemmas of the remaining tokens with single spaces.
    """
    # Missing values become an empty document
    if pd.isna(text):
        return ""

    lemmas = []
    for token in nlp(text):
        if token.is_stop or token.is_punct:
            continue
        if token.text.lower() in additional_stopwords:
            continue
        lemmas.append(token.lemma_)
    return ' '.join(lemmas)

# Apply preprocessing to the 'Abstract' column in the DataFrame
df['processed_text'] = df['Abstract'].apply(preprocess)

# Vectorization using TF-IDF (Term Frequency-Inverse Document Frequency)
# max_df=0.95: Ignore terms that appear in more than 95% of the documents.
# min_df=2: Ignore terms that appear in fewer than 2 documents.
# NOTE(review): X is not consumed by the LDA steps below, which build their own
# bag-of-words corpus; it is kept in case other cells rely on it.
vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
X = vectorizer.fit_transform(df['processed_text'])

# Convert documents into a format suitable for Gensim's LDA model.
# Split every processed document once and reuse the token lists for both the
# dictionary and the bag-of-words corpus.
tokenized_docs = [text.split() for text in df['processed_text']]
# Create a dictionary representation of the documents
dictionary = corpora.Dictionary(tokenized_docs)
# Create a bag-of-words representation of the documents
corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_docs]

# Train the LDA (Latent Dirichlet Allocation) model
# num_topics=15: Number of topics to be identified by the model.
# passes=15: Number of passes through the corpus during training.
# random_state=42: Seed the model's own random generator so repeated runs
# produce the same topics.
lda_model = LdaModel(
    corpus,
    num_topics=15,
    id2word=dictionary,
    passes=15,
    random_state=42,
)

# Assign topics to documents
def get_document_topic(bow, model):
    """
    Get the dominant topic for a given document represented as a bag-of-words.

    The full topic distribution is requested (``minimum_probability=0.0``)
    instead of relying on the model's default probability cut-off. With the
    default cut-off, a document whose probability mass is spread thinly over
    many topics can come back with no topics at all, which would make
    ``max`` raise ``ValueError``.

    :param bow: Bag-of-words representation of the document.
    :param model: Trained LDA model; must provide
        ``get_document_topics(bow, minimum_probability=...)`` returning
        ``(topic_id, probability)`` pairs.
    :return: Id of the topic with the highest probability (the first one
        encountered when several topics tie).
    """
    topics = model.get_document_topics(bow, minimum_probability=0.0)
    # Each entry is (topic_id, probability): pick the pair with the largest
    # probability and return its id.
    return max(topics, key=lambda topic_prob: topic_prob[1])[0]

# Apply the function to get the dominant topic for each document
df['topic'] = [get_document_topic(bow, lda_model) for bow in corpus]

# Extract the most relevant keywords for each topic
# num_topics=lda_model.num_topics: Request every topic the model was trained
#   with, so each topic id stored in df['topic'] has a keyword entry no matter
#   how many topics the model has.
# num_words=15: Number of keywords to return for each topic.
# formatted=False: Return (word, probability) pairs instead of formatted strings.
topics_keywords = lda_model.show_topics(
    num_topics=lda_model.num_topics,
    num_words=15,
    formatted=False,
)
# Map topic id -> list of its keywords, dropping the probabilities
topic_keywords_list = {
    topic_id: [word for word, _ in word_probs]
    for topic_id, word_probs in topics_keywords
}


In [None]:
# Write the DataFrame, including the new per-document columns, to a CSV file
df.to_csv('wos_2009_literature_review_with_topics.csv', index=False)

# Build one row per topic, with its keywords flattened into a single
# comma-separated string, and write that table to its own CSV file
keywords_data = [
    {'topic': topic_id, 'keywords': ', '.join(words)}
    for topic_id, words in topic_keywords_list.items()
]
keywords_df = pd.DataFrame(keywords_data)
keywords_df.to_csv('topic_keywords.csv', index=False)

# Print the keyword list of every topic
for topic, keywords in topic_keywords_list.items():
    joined_keywords = ', '.join(keywords)
    print(f"Topic {topic}: {joined_keywords}")

# Keep the same information available in a variable as a list of records
topic_keywords = [
    {"topic": topic_id, "keywords": words}
    for topic_id, words in topic_keywords_list.items()
]