In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [1]:
# Shell escape: clone the project repository into the notebook's current working directory.
# NOTE(review): the URL ends with a trailing '.', and git keeps it in the target directory name
# (see the "Cloning into ..." output) — confirm this is the real repository name.
# NOTE(review): the data is later read from /content/sample_data/, not from this clone — confirm
# how text_docs.csv gets there.
!git clone https://github.com/saikrishnadeshavath/Text-Data-Exploration-and-Topic-Modeling-using-LDA.

Cloning into 'Text-Data-Exploration-and-Topic-Modeling-using-LDA.'...
remote: Enumerating objects: 3, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects: 100% (2/2), done.[K
remote: Total 3 (delta 0), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (3/3), done.


In [None]:
# Read the documents CSV into a DataFrame, then preview the first five rows
# (five is the default for head; it is spelled out here for the reader).
df = pd.read_csv(filepath_or_buffer='/content/sample_data/text_docs.csv')
df.head(n=5)

Unnamed: 0,document_id,text
0,1,The stock market has been experiencing volatil...
1,2,"The economy is growing, and businesses are opt..."
2,3,Climate change is a critical issue that needs ...
3,4,Advances in artificial intelligence have revol...
4,5,The rise of electric vehicles is shaping the f...


# Task
Explore the dataset "text_docs", preprocess the text data, and generate topics using Latent Dirichlet Allocation (LDA).

## Data exploration

### Subtask:
Load the dataset, check the number of rows and unique documents, and identify necessary preprocessing steps.


**Reasoning**:
The dataset is already loaded in the previous steps. I will now print the number of rows and unique document IDs and identify necessary preprocessing steps based on the displayed data.



In [None]:
# Print the row count and the number of distinct document_id values;
# equal numbers mean that no document id is repeated.
print(f"Number of rows: {len(df)}")
print(f"Number of unique documents: {df['document_id'].nunique()}")

# Preprocessing plan, derived from the rows shown by df.head():
# 1. Convert the text to lowercase.
# 2. Remove punctuation.
# 3. Remove stop words.
# 4. Tokenize.
# 5. Lemmatize or stem the tokens.

Number of rows: 10
Number of unique documents: 10


## Preprocessing

### Subtask:
Clean the text data by removing stop words, punctuation, and performing stemming or lemmatization.


**Reasoning**:
Define a function to preprocess the text data and apply it to the 'text' column of the DataFrame.



In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources for this notebook, in the same order as before.
# NOTE(review): 'punkt' does not appear to be used by the visible cells
# (tokenization is done with str.split) — confirm before dropping it.
for resource_name in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource_name)

from functools import lru_cache


@lru_cache(maxsize=None)
def _text_resources():
    """Return the English stop-word set and a lemmatizer, built once and reused.

    Loading the stop-word list and constructing the lemmatizer for every
    document is loop-invariant work, so the pair is cached after the first
    successful call.
    """
    return frozenset(stopwords.words('english')), WordNetLemmatizer()


def preprocess_text(text):
    """Normalize one document into a space-separated string of lemmatized tokens.

    Steps: lowercase, strip every character that is neither a word character
    nor whitespace, split on whitespace, drop English stop words, lemmatize.

    Parameters
    ----------
    text : str
        Raw document text. Non-string values (for example the NaN that pandas
        produces for an empty CSV cell, or None) are treated as an empty
        document instead of raising AttributeError.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; '' when nothing remains.
    """
    if not isinstance(text, str):
        return ''

    stop_words, lemmatizer = _text_resources()
    # \w keeps letters, digits and underscores; everything else except whitespace is removed.
    tokens = re.sub(r'[^\w\s]', '', text.lower()).split()
    return ' '.join(
        lemmatizer.lemmatize(word) for word in tokens if word not in stop_words
    )

# Clean every document and store the result next to the original text.
cleaned_docs = df['text'].apply(preprocess_text)
df['cleaned_text'] = cleaned_docs
# Show raw and cleaned text side by side for the first rows.
display(df.loc[:, ['text', 'cleaned_text']].head())

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,text,cleaned_text
0,The stock market has been experiencing volatil...,stock market experiencing volatility due econo...
1,"The economy is growing, and businesses are opt...",economy growing business optimistic future
2,Climate change is a critical issue that needs ...,climate change critical issue need immediate g...
3,Advances in artificial intelligence have revol...,advance artificial intelligence revolutionized...
4,The rise of electric vehicles is shaping the f...,rise electric vehicle shaping future automobil...


## Prepare data for LDA

### Subtask:
Create a dictionary and corpus for the preprocessed text data.


**Reasoning**:
Create a dictionary and corpus for the preprocessed text data using gensim.



**Reasoning**:
An earlier attempt to import `gensim` failed because the library was not installed. Install the library before trying to import it again.



In [None]:
# Install gensim into the running kernel's environment; the following cell imports it.
# NOTE(review): no version is pinned — consider pinning one so reruns are reproducible.
%pip install gensim



**Reasoning**:
Now that gensim is installed, retry creating the dictionary and corpus.



In [None]:
from gensim import corpora, models

# Split each cleaned document back into its tokens, then let gensim
# assign an integer id to every unique token.
text_data = list(map(str.split, df['cleaned_text']))
dictionary = corpora.Dictionary(documents=text_data)

# Bag-of-words corpus: one list of (token_id, count) pairs per document.
corpus = list(map(dictionary.doc2bow, text_data))

print("Dictionary created:")
print(dictionary)
print("\nCorpus created (first document):")
print(corpus[0])

Dictionary created:
Dictionary<61 unique tokens: ['due', 'economic', 'experiencing', 'market', 'stock']...>

Corpus created (first document):
[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)]


**Reasoning**:
The previous command was interrupted. I will retry the code to create the dictionary and corpus.



**Reasoning**:
The DataFrame `df` was not found, most likely because the kernel restarted. The next cell is therefore self-contained: it reloads the data, repeats the preprocessing, and then recreates the dictionary and corpus.



In [None]:
import pandas as pd
from gensim import corpora, models

# Reload the documents CSV so this cell does not depend on earlier kernel state.
df = pd.read_csv(filepath_or_buffer='/content/sample_data/text_docs.csv')

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Make sure the NLTK resources are present, requested in the same order as before.
# NOTE(review): 'punkt' does not appear to be used by the visible cells
# (tokenization is done with str.split) — confirm before dropping it.
for resource_name in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource_name)

from functools import lru_cache


@lru_cache(maxsize=None)
def _text_resources():
    """Return the English stop-word set and a lemmatizer, built once and reused.

    Loading the stop-word list and constructing the lemmatizer for every
    document is loop-invariant work, so the pair is cached after the first
    successful call.
    """
    return frozenset(stopwords.words('english')), WordNetLemmatizer()


def preprocess_text(text):
    """Normalize one document into a space-separated string of lemmatized tokens.

    Steps: lowercase, strip every character that is neither a word character
    nor whitespace, split on whitespace, drop English stop words, lemmatize.

    Parameters
    ----------
    text : str
        Raw document text. Non-string values (for example the NaN that pandas
        produces for an empty CSV cell, or None) are treated as an empty
        document instead of raising AttributeError.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; '' when nothing remains.
    """
    if not isinstance(text, str):
        return ''

    stop_words, lemmatizer = _text_resources()
    # \w keeps letters, digits and underscores; everything else except whitespace is removed.
    tokens = re.sub(r'[^\w\s]', '', text.lower()).split()
    return ' '.join(
        lemmatizer.lemmatize(word) for word in tokens if word not in stop_words
    )

# Clean every document and store the result next to the original text.
cleaned_docs = df['text'].apply(preprocess_text)
df['cleaned_text'] = cleaned_docs


# Split each cleaned document back into its tokens, then let gensim
# assign an integer id to every unique token.
text_data = list(map(str.split, df['cleaned_text']))
dictionary = corpora.Dictionary(documents=text_data)

# Bag-of-words corpus: one list of (token_id, count) pairs per document.
corpus = list(map(dictionary.doc2bow, text_data))

print("Dictionary created:")
print(dictionary)
print("\nCorpus created (first document):")
print(corpus[0])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Dictionary created:
Dictionary<61 unique tokens: ['due', 'economic', 'experiencing', 'market', 'stock']...>

Corpus created (first document):
[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)]


## Generate topics using LDA

### Subtask:
Apply Latent Dirichlet Allocation to extract topics and display the top words for each topic.


**Reasoning**:
Apply Latent Dirichlet Allocation to extract topics and display the top words for each topic using the previously created dictionary and corpus.



In [None]:
from gensim.models import LdaModel

# Fit an LDA model on the bag-of-words corpus built above.
# The topic count is kept at 3 because the dataset is small, and
# random_state is fixed for reproducibility.
lda_model = LdaModel(
    corpus=corpus,
    num_topics=3,
    id2word=dictionary,
    random_state=42,
)

# Show the five top words of every topic; each entry is printed as a
# (topic_id, weighted-word string) tuple.
print("Top words for each topic:")
for topic_id, topic_terms in lda_model.print_topics(num_words=5):
    print((topic_id, topic_terms))



Top words for each topic:
(0, '0.046*"industry" + 0.045*"digital" + 0.043*"platform" + 0.029*"revolutionized" + 0.029*"advance"')
(1, '0.042*"future" + 0.035*"industry" + 0.030*"rise" + 0.030*"treatment" + 0.030*"electric"')
(2, '0.035*"market" + 0.035*"experiencing" + 0.035*"volatility" + 0.035*"uncertainty" + 0.035*"due"')


## Summary:

### Data Analysis Key Findings

*   The dataset contains 10 rows, with each row representing a unique document.
*   The text preprocessing steps included converting text to lowercase, removing punctuation and stop words, and lemmatizing the remaining tokens.
*   The `gensim` library was used to create a dictionary and corpus from the preprocessed text, which are necessary inputs for the LDA model.
*   An LDA model was trained to identify 3 topics from the text data.

### Insights or Next Steps

*   The top words for each topic provide a basis for interpreting the themes present in the documents. Further analysis could involve manually labeling these topics based on the words.
*   For a larger dataset, increasing the number of topics (`num_topics`) and the amount of training (`passes` and `iterations`) in the LDA model might yield more granular and accurate topic representations.
