<a href="https://colab.research.google.com/github/rickoefendi/Sentimen_Analysis-Topic-Modeling-/blob/main/LDA%26Decision_Tree_C4_5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **PREPROCESSING**

In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download the NLTK resources used below (only needs to run once per
# environment). 'punkt_tab' is the tokenizer data that word_tokenize looks up
# in recent NLTK releases; on older releases nltk.download should simply
# report it as unavailable and carry on.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Load the raw dataset; its 'full_text' column is cleaned further down.
data = pd.read_csv('data_Riko.csv')

# Lemmatizer backed by WordNet.
# NOTE(review): WordNet is an English lexical resource while the stop-word
# list below is Indonesian - confirm the lemmatizer has the intended effect.
lemmatizer = WordNetLemmatizer()

# Indonesian stop words, kept in a set for fast membership tests.
stop_words = set(stopwords.words('indonesian'))

# Function to preprocess text
def preprocess(text):
    """Clean one raw text entry for the downstream labeling and modeling steps.

    Steps, in order: strip URLs, drop every character that is not an ASCII
    letter or whitespace (this also removes digits), lowercase, tokenize,
    remove stop words, lemmatize, and re-join the tokens with single spaces.

    Args:
        text: Raw text. Non-string values (for example the NaN that pandas
            yields for a missing cell) are treated as empty text.

    Returns:
        The cleaned text as one space-separated string. An empty string is
        returned when the input is not a string or nothing survives cleaning.
    """
    # Missing cells arrive as non-string values; re.sub would raise TypeError.
    if not isinstance(text, str):
        return ''

    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    # Keep only ASCII letters and whitespace. Digits are removed by this same
    # pattern, so no separate digit-stripping pass is needed.
    text = re.sub(r'[^A-Za-z\s]', '', text)

    # Lowercase before tokenizing so stop-word matching is case-insensitive.
    tokens = word_tokenize(text.lower())

    # Drop stop words, then lemmatize what is left.
    lemmatized_tokens = [
        lemmatizer.lemmatize(word)
        for word in tokens
        if word not in stop_words
    ]

    return ' '.join(lemmatized_tokens)

# Run every entry of the raw text column through the cleaning function and
# store the result in a 'cleaned_text' column.
cleaned_column = data['full_text'].apply(preprocess)
data['cleaned_text'] = cleaned_column

# Write the dataset, now including 'cleaned_text', to a new CSV without the
# index column.
data.to_csv('cleaned_data_wisata.csv', index=False)

print("Preprocessing completed. Cleaned data saved to 'cleaned_data_wisata.csv'.")


# **LABELING**

In [None]:
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import matplotlib.pyplot as plt
import seaborn as sns

# Fetch the VADER lexicon (only needs to run once per environment).
nltk.download('vader_lexicon')

# VADER sentiment scorer used by the labeling function below.
sia = SentimentIntensityAnalyzer()

# Read the preprocessed dataset written by the preprocessing step.
data = pd.read_csv('cleaned_data_wisata.csv')

def vader_sentiment_labeling(text):
    """Label a text as "positive", "negative" or "neutral" using VADER.

    Args:
        text: Text to score. Any non-string value is scored as an empty
            string.

    Returns:
        "positive" when the compound score is >= 0.05, "negative" when it is
        <= -0.05, and "neutral" otherwise.
    """
    # NOTE(review): VADER's lexicon is English while the pipeline filters
    # Indonesian stop words - confirm the scores are meaningful for this data.

    # Non-string input (e.g. a missing cell) is scored as empty text.
    scored_text = text if isinstance(text, str) else ""

    # Only the compound score decides the label.
    compound = sia.polarity_scores(scored_text)['compound']

    if compound >= 0.05:
        return "positive"
    if compound <= -0.05:
        return "negative"
    return "neutral"

# Label every cleaned text with its VADER sentiment class.
data['sentiment'] = data['cleaned_text'].apply(vader_sentiment_labeling)

# Keep only the positive/negative rows; neutral entries are discarded.
is_not_neutral = data['sentiment'] != 'neutral'
filtered_data = data[is_not_neutral]

# Write the labeled, filtered rows to a new CSV file.
filtered_data.to_csv('filtered_labeled_data_wisata.csv', index=False)

print("Labeling completed. Filtered labeled data saved to 'filtered_labeled_data_wisata.csv'.")

# Plot the sentiment distribution (neutral class excluded).
plt.figure(figsize=(10, 6))
sns.countplot(data=filtered_data, x='sentiment', palette='viridis')
plt.title('Distribusi Sentimen (Positif vs Negatif)')
plt.xlabel('Sentimen')
plt.ylabel('Jumlah')
plt.show()

# **Latent Dirichlet Allocation**

In [None]:
!pip install gensim pyLDAvis matplotlib

In [None]:
import pandas as pd
import gensim
from gensim import corpora
from gensim.models.ldamodel import LdaModel
import pyLDAvis
import pyLDAvis.gensim_models
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Load the preprocessed data.
data = pd.read_csv('cleaned_data_wisata.csv')

# Empty cleaned texts come back from the CSV as NaN. Replace them with ''
# before casting: astype(str) alone would turn NaN into the literal string
# 'nan', which would pass the empty-row filter below and end up in the model
# as a one-word document.
data['cleaned_text'] = data['cleaned_text'].fillna('').astype(str)

# Drop rows whose text is empty or whitespace only.
data = data[data['cleaned_text'].str.strip().astype(bool)]

# Tokenize on whitespace (the text was already cleaned and re-joined with
# single spaces by the preprocessing step).
tokenized_texts = [text.split() for text in data['cleaned_text']]

# Build the token dictionary and the bag-of-words corpus.
dictionary = corpora.Dictionary(tokenized_texts)
corpus = [dictionary.doc2bow(text) for text in tokenized_texts]

# Train the LDA model; random_state is fixed so runs are repeatable.
lda_model = LdaModel(corpus, num_topics=5, id2word=dictionary, passes=15, random_state=42)

# Save the LDA model.
lda_model.save('lda_model_wisata.gensim')

# Save the dictionary.
dictionary.save('dictionary.dict')

# Prepare the pyLDAvis visualization.
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)

# Save the visualization as a standalone HTML file.
pyLDAvis.save_html(vis, 'lda_visualization.html')
print("LDA visualization saved as 'lda_visualization.html'.")



In [None]:
# Show the ten highest-weighted words of every topic.
topics = lda_model.print_topics(num_words=10)

print("Topik-topik yang dihasilkan:")
for topic_entry in topics:
    print(topic_entry)

In [None]:
# Build the pyLDAvis data for the trained model.
vis = pyLDAvis.gensim_models.prepare(
    topic_model=lda_model,
    corpus=corpus,
    dictionary=dictionary,
)

# Render the visualization directly in the cell output (nothing is saved).
pyLDAvis.display(vis)

In [None]:
from gensim.models import CoherenceModel

# Compute the c_v topic coherence of the trained LDA model from the tokenized
# texts and the dictionary built during training.
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=tokenized_texts,
    dictionary=dictionary,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()

print(f"Koherensi Model LDA: {coherence_lda:.4f}")

# **Decision Tree**

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import matplotlib.pyplot as plt
import joblib

# Indonesian stop words handed to the TF-IDF vectorizer (duplicates removed).
stop_words_id = [
    'akan', 'adalah', 'aku', 'apa', 'apakah', 'atau', 'dalam', 'dan', 'di', 'untuk',
    'dengan', 'ini', 'itu', 'jika', 'kamu', 'kami', 'kita', 'mereka', 'saya',
    'tidak', 'yang', 'oleh', 'sebagai', 'dari', 'ke', 'pada',
]

# Load labeled data
data = pd.read_csv('filtered_labeled_data_wisata.csv')

texts = data['cleaned_text']
y = data['sentiment']

# Split the raw texts BEFORE extracting features. Fitting TF-IDF on the full
# dataset would let the test documents influence the vocabulary and IDF
# weights, leaking test information into training and inflating the scores.
texts_train, texts_test, y_train, y_test = train_test_split(
    texts, y, test_size=0.2, random_state=42
)

# Feature extraction: fit on the training texts only, then apply the fitted
# vocabulary/IDF weights to the test texts.
vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words=stop_words_id)
X_train = vectorizer.fit_transform(texts_train)
X_test = vectorizer.transform(texts_test)

# Train the decision tree with a limited depth to keep it small and readable.
max_depth = 3  # maximum depth of the tree
clf = DecisionTreeClassifier(max_depth=max_depth, random_state=42)
clf.fit(X_train, y_train)

# Predict on the held-out split.
y_pred = clf.predict(X_test)

# Overall metrics (class-weighted averages).
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average='weighted'))
print("Recall:", recall_score(y_test, y_pred, average='weighted'))
print("F1 Score:", f1_score(y_test, y_pred, average='weighted'))

# Per-class results.
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Save the trained model together with the fitted vectorizer: the tree can
# only score new text that was transformed with this exact vocabulary.
joblib.dump(clf, 'decision_tree_model.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

# Visualize the decision tree.
plt.figure(figsize=(20, 10))
plot_tree(clf,
          feature_names=vectorizer.get_feature_names_out(),
          class_names=clf.classes_,
          filled=True,
          fontsize=10)
plt.title("Decision Tree Visualization")

# Save the visualization as a PDF.
plt.savefig('decision_tree_visualization.pdf')

print("Decision Tree visualization saved as 'decision_tree_visualization.pdf'.")


In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import pandas as pd
from gensim import corpora

# LdaModel is required to reload the saved model. This cell's own import list
# does not bring it into scope, so import it here; otherwise the cell raises
# NameError unless the LDA training cell ran earlier in the same session.
from gensim.models.ldamodel import LdaModel

# Load the LDA model and dictionary saved by the training cell.
lda_model = LdaModel.load('lda_model_wisata.gensim')
dictionary = corpora.Dictionary.load('dictionary.dict')

# Draw one word cloud per topic.
num_topics = lda_model.num_topics
for i in range(num_topics):
    plt.figure(figsize=(8, 5))

    # Top 20 (word, weight) pairs of the topic, as a word -> weight mapping.
    words = dict(lda_model.show_topic(i, 20))

    # Build the word cloud, sizing each word by its topic weight.
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(words)

    # Show the word cloud; topics are numbered from 1 in titles and file names.
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(f'Topic {i+1}')

    # Save the figure before plt.show() so the written PNG is not blank.
    plt.savefig(f'topic_{i+1}_wordcloud.png')
    plt.show()

print("Word clouds saved as 'topic_X_wordcloud.png'.")


In [None]:
import numpy as np

# Read the labeled (positive/negative only) dataset.
data = pd.read_csv('filtered_labeled_data_wisata.csv')

# Whitespace-tokenize each cleaned text and convert it to bag-of-words using
# the dictionary that is already in scope.
tokenized_texts = [document.split() for document in data['cleaned_text']]
corpus = list(map(dictionary.doc2bow, tokenized_texts))

# Ask the LDA model for the topic distribution of every document.
topic_distribution = list(map(lda_model.get_document_topics, corpus))

# Helper used to attach the dominant topic of each document to the data.
def get_dominant_topic(topics):
    """Return the id of the highest-weighted topic.

    Args:
        topics: Sequence of entries whose first item is the topic id and whose
            second item is the topic weight.

    Returns:
        The id of the entry with the largest weight (the earliest such entry
        when several share the largest weight), or None when `topics` is
        empty.
    """
    if not topics:
        return None

    dominant = None
    for candidate in topics:
        # Strict '>' keeps the earliest entry when weights tie.
        if dominant is None or candidate[1] > dominant[1]:
            dominant = candidate
    return dominant[0]

# Attach the dominant topic of every document to the data.
data['dominant_topic'] = [get_dominant_topic(doc_topics) for doc_topics in topic_distribution]

# Count documents per (dominant topic, sentiment) pair; combinations that do
# not occur are filled with 0.
sentiment_per_topic = data.groupby(['dominant_topic', 'sentiment']).size().unstack(fill_value=0)

print("Sentiment distribution per topic:")
print(sentiment_per_topic)

# Bar chart of the sentiment distribution per topic.
sentiment_per_topic.plot(kind='bar', figsize=(8, 4))
plt.title('Sentiment Distribution per Topic')
plt.xlabel('Topic')
plt.ylabel('Count')

# If any document had no dominant topic (None), pandas stores the column as
# floats, which would produce labels such as 'Topic 1.0'. Cast the ids back to
# int so the labels always read 'Topic 1', 'Topic 2', ...
topic_labels = [f'Topic {int(topic_id) + 1}' for topic_id in sentiment_per_topic.index]
plt.xticks(ticks=np.arange(len(topic_labels)), labels=topic_labels, rotation=45)
plt.legend(title='Sentiment')
plt.tight_layout()

# Save the chart as a PNG before showing it.
plt.savefig('sentiment_distribution_per_topic.png')
plt.show()

print("Sentiment distribution chart saved as 'sentiment_distribution_per_topic.png'.")
