In [2]:
# import required packages and libraries
import math
import re
import time

import matplotlib.pyplot as plt
import pandas as pd
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel
from gensim.models.ldamodel import LdaModel
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [3]:
# import linkedin data csv file
# Text such as "KÃ¡roly" in the loaded frame is what UTF-8 bytes look like when
# decoded as ISO-8859-1, so try UTF-8 first. ISO-8859-1 is kept as a fallback
# for files that contain bytes which are not valid UTF-8.
LINKEDIN_CSV_PATH = "/Users/apurva/Downloads/linkedin-instructor-cleaned.csv"
try:
    df = pd.read_csv(LINKEDIN_CSV_PATH, encoding='utf-8')
except UnicodeDecodeError:
    df = pd.read_csv(LINKEDIN_CSV_PATH, encoding='ISO-8859-1')
df

Unnamed: 0,index_number,followers_count_linkedin,number_of_connections_mapped_linkedin,about_section_linkedin,average_reaction_count_linkedin,average_comment_count_linkedin,average_post_frequency_linkedin
0,156,2570,1,"Hello, welcome :)I'm an experienced web develo...",8.00,1.67,126.43
1,932,1780,1,I'm a passionate Learner & Teacher. I teach Ga...,23.00,4.00,121.76
2,2059,1244,1,Innovative and results-driven product designer...,36.67,12.00,152.20
3,1100,16206,1,Creator of the hit Docker Mastery series on Ud...,10.67,0.33,1.00
4,1229,12194,1,I am a tech entrepreneur and author. I teach ...,0.33,0.00,91.32
...,...,...,...,...,...,...,...
80,2448,12156,1,I love to explain complicated things in a simp...,0.00,0.00,304.40
81,2379,3417,1,Best Seller Courses:Complete VMWare vSphere ES...,15.00,2.00,1.00
82,2215,3007,1,"Hello, I am KÃ¡roly Nyisztor, a seasoned softw...",20.00,2.67,1.00
83,1271,46666,1,Raghav is a teacher and founder of AutomationS...,234.67,3.33,12.81


In [4]:
# Function to calculate coherence score
def calculate_coherence(dictionary, corpus, texts, limit, start=2, step=3, random_state=None):
    """Train one LDA model per candidate topic count and score each with c_v coherence.

    Candidate topic counts are ``range(start, limit, step)``, so ``limit`` is
    exclusive.

    Parameters
    ----------
    dictionary : gensim.corpora.Dictionary
        Token/id mapping, passed to both ``LdaModel`` and ``CoherenceModel``.
    corpus : iterable
        Bag-of-words documents used to train each ``LdaModel``.
    texts : list of list of str
        Tokenised documents used by ``CoherenceModel`` for the 'c_v' measure.
    limit : int
        Exclusive upper bound on the number of topics to try.
    start : int, optional
        First number of topics to try.
    step : int, optional
        Increment between candidate topic counts.
    random_state : optional
        Forwarded to ``LdaModel`` so that repeated runs can be made
        reproducible. The default ``None`` keeps the previous unseeded
        behaviour.

    Returns
    -------
    tuple of (list, list)
        ``(model_list, coherence_values)``, index-aligned with the candidate
        topic counts. Both lists are empty when the candidate range is empty.
    """
    model_list = []
    coherence_values = []
    for num_topics in range(start, limit, step):
        model = LdaModel(
            corpus,
            num_topics=num_topics,
            id2word=dictionary,
            random_state=random_state,
        )
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

In [5]:
# Function to find the optimal number of topics
def find_optimal_topics(dictionary, corpus, texts, limit, start=2, step=3):
    """Return the candidate topic count with the highest coherence score.

    Candidates are ``range(start, limit, step)``; each is scored by
    ``calculate_coherence``. On ties the smallest topic count wins.

    Parameters
    ----------
    dictionary, corpus, texts
        Passed through unchanged to ``calculate_coherence``.
    limit : int
        Exclusive upper bound on the number of topics to try.
    start : int, optional
        First number of topics to try.
    step : int, optional
        Increment between candidate topic counts.

    Returns
    -------
    int
        The selected number of topics.

    Raises
    ------
    ValueError
        If ``range(start, limit, step)`` yields no candidates.
    """
    _, coherence_values = calculate_coherence(dictionary, corpus, texts, limit, start, step)

    if not coherence_values:
        raise ValueError(
            "no candidate topic counts to evaluate: "
            f"range(start={start}, limit={limit}, step={step}) is empty"
        )

    # NaN compares False against everything, so a NaN in first position would
    # win a plain max() and hide every real score. Rank only non-NaN scores.
    scored = [
        (score, position)
        for position, score in enumerate(coherence_values)
        if not math.isnan(score)
    ]
    if not scored:
        # Nothing usable to rank; fall back to the first candidate.
        return start

    # max() keeps the first maximal element, i.e. the smallest topic count on ties.
    _, best_position = max(scored, key=lambda pair: pair[0])
    return start + best_position * step

In [6]:
# Create a function to apply topic modeling with stopword removal and stemming
def apply_topic_modeling(text):
    """Fit an LDA model on a single text and return its topics.

    The text is split into sentences on '.', '!' and '?', repeated sentences
    are dropped, and the remaining words are stopword-filtered and stemmed.
    The number of topics (2 to 9) is chosen with ``find_optimal_topics`` and a
    final model is trained with that many topics.

    Parameters
    ----------
    text : object
        Value to model. Anything that is not a ``str`` (for example a missing
        value) is skipped.

    Returns
    -------
    list or None
        The result of ``LdaModel.print_topics`` (5 words per topic), or
        ``None`` when ``text`` is not a string or has no words left after
        cleaning.
    """
    # Non-string cells have nothing to model.
    if not isinstance(text, str):
        return None

    # Remove repeated sentences. Fragments are stripped so that "Hi. Hi." is
    # recognised as a repetition, and dict.fromkeys keeps first-seen order
    # (set() order for strings is not stable between interpreter runs, which
    # made the token order and therefore the fitted topics run-dependent).
    fragments = (fragment.strip() for fragment in re.split(r'[.!?]', text))
    unique_sentences = list(dict.fromkeys(fragment for fragment in fragments if fragment))
    cleaned_text = ' '.join(unique_sentences)

    # Split the cleaned text into words
    words = re.findall(r'\b\w+\b', cleaned_text)
    if not words:
        return None

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word.lower() not in stop_words]

    # Apply stemming
    stemmer = PorterStemmer()
    words = [stemmer.stem(word) for word in words]

    # Remove specific common words
    words = [word for word in words if word.lower() not in ['of', 'the', 'also']]

    # An empty vocabulary cannot be modelled; skip instead of failing mid-apply.
    if not words:
        return None

    # Create a Dictionary from the processed words
    dictionary = Dictionary([words])

    # Create a corpus from the processed words
    # NOTE(review): every token becomes its own one-word document here, while
    # the coherence texts below are a single document - confirm this is the
    # intended modelling unit.
    corpus = [dictionary.doc2bow([word]) for word in words]

    # Find the optimal number of topics
    optimal_num_topics = find_optimal_topics(dictionary, corpus, [words], limit=10, start=2, step=1)

    # Train the LDA model with the optimal number of topics
    lda_model = LdaModel(corpus, num_topics=optimal_num_topics, id2word=dictionary, passes=10)

    # Return the topics and associated keywords
    return lda_model.print_topics(num_topics=optimal_num_topics, num_words=5)


In [7]:
# Run the topic model on every 'about_section_linkedin' value and store the
# per-row result in a new 'lda_topics' column
about_sections = df['about_section_linkedin']
df['lda_topics'] = about_sections.apply(apply_topic_modeling)

# Print the DataFrame with the new column
print(df)

    index_number  followers_count_linkedin  \
0            156                      2570   
1            932                      1780   
2           2059                      1244   
3           1100                     16206   
4           1229                     12194   
..           ...                       ...   
80          2448                     12156   
81          2379                      3417   
82          2215                      3007   
83          1271                     46666   
84          1991                       568   

    number_of_connections_mapped_linkedin  \
0                                       1   
1                                       1   
2                                       1   
3                                       1   
4                                       1   
..                                    ...   
80                                      1   
81                                      1   
82                                      1 

In [8]:
# Write the DataFrame out to a csv file
topics_csv_path = '/Users/apurva/Downloads/Topic_Modelling.csv'
df.to_csv(topics_csv_path)