In [10]:
import pandas as pd
from gensim import corpora
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel
import pyLDAvis.gensim_models
import matplotlib.pyplot as plt

# Load the preprocessed text data from a CSV file
file_path = "../Prepossesing/data_cleaned.csv"
df = pd.read_csv(file_path, encoding="utf-8")
# Keyword the analysis is about; the prompt-building cell reads this name.
keyword = "Covid"

# Tokenize the text in the 'full_text' column (whitespace split).
# pandas reads empty CSV fields back as NaN (a float), which has no .split();
# drop those rows and coerce the rest to str so tokenization cannot crash.
texts = [text.split() for text in df['full_text'].dropna().astype(str)]

# Create a dictionary (token -> integer id) from the tokenized text
dictionary = corpora.Dictionary(texts)

# Convert each document to a Bag of Words (BoW) representation
corpus = [dictionary.doc2bow(text) for text in texts]

# Track the best model seen so far and its coherence score
best_coherence_score = -1
best_lda_model = None
best_num_topics = 0

# Range of topic counts to explore (2 through 19 inclusive)
num_topics_range = range(2, 20)

# Train one LDA model per candidate topic count and keep the most coherent one
for num_topics in num_topics_range:
    lda_model = LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=num_topics,
        random_state=100,  # fixed seed so repeated runs are comparable
        update_every=1,
        chunksize=100,
        passes=10,
        alpha='auto',
        per_word_topics=True,
    )

    # Calculate the c_v coherence score for the current model
    coherence_model = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model.get_coherence()

    # Keep the model if it beats the best score so far.
    # A NaN score compares False here, so it can never be selected.
    if coherence_score > best_coherence_score:
        best_coherence_score = coherence_score
        best_lda_model = lda_model
        best_num_topics = num_topics

# Fail with a clear message instead of an AttributeError on None further down
if best_lda_model is None:
    raise RuntimeError(
        "No LDA model produced a valid coherence score; check that 'full_text' contains usable text."
    )

# Print the best number of topics and its corresponding coherence score
print(f"Best Number of Topics: {best_num_topics}")
print(f"Best Coherence Score: {best_coherence_score}")
# String form of the topic count; the prompt-building cell reads this name.
best_num_topics_str = str(best_num_topics)

# Print and explore the topics of the best model.
# With formatted=False each entry is (topic_id, [(word, weight), ...]).
topics = best_lda_model.show_topics(num_topics=best_num_topics, num_words=20, formatted=False)
print(topics)
for topic_id, word_weights in topics:
    print(f"Topic {topic_id + 1}:")
    print([word for word, _ in word_weights])
    print()

# Visualize topics using pyLDAvis (optional) and save the result as HTML
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(best_lda_model, corpus, dictionary)
pyLDAvis.save_html(vis, 'lda.html')


Best Number of Topics: 19
Best Coherence Score: 0.4575255387769461
[(0, [('salah', 0.053483587), ('sayang', 0.031713936), ('liburan', 0.0233829), ('jujur', 0.019518092), ('jaga', 0.015854241), ('akhir', 0.0157795), ('ngikutin', 0.013912284), ('satunya', 0.012257635), ('kemarin', 0.011682264), ('terus', 0.0115154125), ('gw', 0.011275583), ('banget', 0.0073509384), ('buzzer', 0.004614594), ('disitu', 0.0046141087), ('please', 0.0046135033), ('bukan', 0.002685802), ('manfaatnya', 0.0020447616), ('informasi', 0.0020447616), ('seputar', 0.0020447616), ('lewatkan', 0.0020444384)]), (1, [('jadi', 0.03567928), ('tuh', 0.03521112), ('gini', 0.019321559), ('era', 0.019191612), ('makin', 0.017409433), ('biar', 0.015772937), ('menjadi', 0.013660997), ('soalnya', 0.009766507), ('ini', 0.008885702), ('kantor', 0.008721063), ('dari', 0.008571089), ('anak', 0.008138344), ('the', 0.007853459), ('pemprov', 0.007853339), ('polusi', 0.00631342), ('punya', 0.0062782774), ('bgt', 0.0061628832), ('ikut', 0.0

In [None]:
import pandas as pd
from gensim import corpora
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel
import matplotlib.pyplot as plt

# Load the preprocessed text data from a CSV file
file_path = "../Prepossesing/data_cleaned.csv"
df = pd.read_csv(file_path, encoding="utf-8")

# Tokenize the text in the 'full_text' column (whitespace split).
# pandas reads empty CSV fields back as NaN (a float), which has no .split();
# drop those rows and coerce the rest to str so tokenization cannot crash.
texts = [text.split() for text in df['full_text'].dropna().astype(str)]

# Create a dictionary (token -> integer id) from the tokenized text
dictionary = corpora.Dictionary(texts)

# Convert each document to a Bag of Words (BoW) representation
corpus = [dictionary.doc2bow(text) for text in texts]

# Candidate topic counts (2 through 19 inclusive) and the coherence score of each
num_topics_range = range(2, 20)
coherence_scores = []

# Train one LDA model per candidate topic count and record its coherence
for num_topics in num_topics_range:
    lda_model = LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=num_topics,
        random_state=100,  # fixed seed so repeated runs are comparable
        update_every=1,
        chunksize=100,
        passes=10,
        alpha='auto',
        per_word_topics=True,
    )

    # Calculate the c_v coherence score for the current model
    coherence_model = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model.get_coherence()

    coherence_scores.append(coherence_score)

# Plot coherence against the number of topics
fig, ax = plt.subplots(figsize=(20, 6))
topic_counts = list(num_topics_range)
ax.plot(topic_counts, coherence_scores, marker='o', linestyle='-')
# Topic counts are discrete, so put one tick on each evaluated value
ax.set_xticks(topic_counts)
ax.set_xlabel("Number of Topics")
ax.set_ylabel("Coherence Score")
ax.set_title("Topic Coherence Evaluation")
ax.grid(True)
plt.show()


In [11]:
import os

import openai

# Read the API key from the environment instead of embedding it in the notebook:
# a key stored in a notebook is exposed to everyone who can read the file or its history.
# NOTE(review): any key that was previously committed here should be revoked and rotated.
api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError("Set the OPENAI_API_KEY environment variable before running this cell.")
openai.api_key = api_key

# Number of highest-weighted words to send to the model for each topic
TOP_WORDS_PER_TOPIC = 10

# `topics` comes from the LDA cell: a list of (topic_id, [(word, weight), ...]).
# Build one numbered keyword line per topic so the model sees every topic,
# not only the first one, and can tell the topics apart.
topic_keywords = []
for topic_id, word_weights in topics:
    # Sort by weight, largest first, then keep the top words
    ranked = sorted(word_weights, key=lambda pair: pair[1], reverse=True)
    top_words = [word for word, _ in ranked[:TOP_WORDS_PER_TOPIC]]
    topic_keywords.append(f"{topic_id + 1}. " + " ".join(top_words))

# Build the prompt from the keywords.
# RASCEF = Role + ( Action + Step + Context + Example ) + Format
# `role` is inserted after "Anda adalah" in the template below, so it holds only the role itself.
role = "AI Linguistik"
action = "menentukan kalimat dari beberapa topik berdasarkan dari kumpulan kata-kata hasil dari proses LDA"
step = "mempertimbangkan bobot setiap topik yang ada pada penomoran angka dalam merangkai kata-kata kunci menjadi kalimat yang padu untuk sebuah topik yang di perbincangkan di Twitter"
context = f"membahas tentang keyword: {keyword} dengan berbagai pandangan masyarakat terhadap topic tersebut dengan hasil LDA dengan kata-kata kunci berikut : \n\n" + "\n".join(topic_keywords)
example = "topik dengan contoh topik 1 : pemerintah yang mengambil tindakan untuk covid 19"
format_str = f"dengan format numbering list dengan 1 topik untuk 1 kalimat utama dengan jumlah sesuai jumlah topik yang diberikan yaitu : {best_num_topics_str}"

# Combine the components into the RASCEF prompt
prompt = f"# RASCEF = Role + ( Action + Step + Context + Example ) + Format\n\nAnda adalah {role} yang {action} {step} {context}. Buatkan dari {example}, {format_str}"

# Generate a completion using the ChatCompletion endpoint
# NOTE(review): openai.ChatCompletion is the pre-1.0 SDK interface — confirm the installed version.
response = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt}
    ],
    temperature=0.5
    # High temperature (e.g. 0.8 - 1.0): more creative and varied responses.
    # Medium temperature (e.g. 0.2 - 0.7): a balance between creativity and relevance.
    # Low temperature (e.g. 0.1 - 0.2): highly deterministic, conservative responses.
)

# Extract the generated text from the first choice of the response
generated_sentence = response['choices'][0]['message']['content']

# Print the raw topics followed by the generated sentences
print(topics)
print(generated_sentence)



[(0, [('salah', 0.053483587), ('sayang', 0.031713936), ('liburan', 0.0233829), ('jujur', 0.019518092), ('jaga', 0.015854241), ('akhir', 0.0157795), ('ngikutin', 0.013912284), ('satunya', 0.012257635), ('kemarin', 0.011682264), ('terus', 0.0115154125), ('gw', 0.011275583), ('banget', 0.0073509384), ('buzzer', 0.004614594), ('disitu', 0.0046141087), ('please', 0.0046135033), ('bukan', 0.002685802), ('manfaatnya', 0.0020447616), ('informasi', 0.0020447616), ('seputar', 0.0020447616), ('lewatkan', 0.0020444384)]), (1, [('jadi', 0.03567928), ('tuh', 0.03521112), ('gini', 0.019321559), ('era', 0.019191612), ('makin', 0.017409433), ('biar', 0.015772937), ('menjadi', 0.013660997), ('soalnya', 0.009766507), ('ini', 0.008885702), ('kantor', 0.008721063), ('dari', 0.008571089), ('anak', 0.008138344), ('the', 0.007853459), ('pemprov', 0.007853339), ('polusi', 0.00631342), ('punya', 0.0062782774), ('bgt', 0.0061628832), ('ikut', 0.0060685836), ('pasangan', 0.005939511), ('many', 0.005939169)]), (2,