In [2]:
# 1. Data Loading and Initial Setup

import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from IPython.display import display

# Make sure the NLTK resources this notebook relies on are installed.
# stopwords.words() and word_tokenize() raise LookupError until their data
# packages have been downloaded once, which breaks a run on a fresh environment.
for resource_path, package_name in (
    ("corpora/stopwords", "stopwords"),
    ("tokenizers/punkt", "punkt"),
    ("tokenizers/punkt_tab", "punkt_tab"),  # tokenizer data used by newer NLTK releases
):
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(package_name, quiet=True)

# Load the raw tweets from the CSV file
data = pd.read_csv("tweets-data.csv")

# Show a short preview so the reader can confirm the file was read
print("Loaded data:")
display(data.head(5))

# Stopword sources: NLTK's Indonesian and English lists plus project-specific
# noise tokens (markup residue, URL fragments, filler words).
# NOTE(review): clean_and_stem_text lowercases its input before filtering, so
# upper-case or multi-word entries below cannot match a single token there.
stop_words_indo_nltk = stopwords.words('indonesian')
stop_words_eng_nltk = stopwords.words('english')
additional_stopwords = [
  'yang', 'di', 'ke', 'dari', 'ini', 'itu', 'pada', 'untuk', 'dan', 'dengan',
  'adalah', 'saya', 'kamu', 'dia', 'kita', 'mereka', 'akan', 'atau', 'seperti',
  'FFFF00', 't co', 'FFFF00 ', 'https', 'segyongstar', 'ipi', 'ye', 'ha', 'a', 't',
  'co' , 'i', 'font', 'fontcolor', 'fontcolor=', 'mkkkkkkkkkkk', '=', '#', '"', 'FFFF00', 'ffff'
]
# De-duplicate the three sources into a single list
stop_words_id = list(set(stop_words_indo_nltk + stop_words_eng_nltk + additional_stopwords))

# Initialize Sastrawi Stemmer
factory = StemmerFactory()
stemmer = factory.create_stemmer()

ModuleNotFoundError: No module named 'Sastrawi'

In [None]:
# 2. Text Cleaning and Stemming

# Pull out the tweet body; it is the only column the later steps work on
text = data["full_text"]

print("Selected text column:")

# Lift pandas' truncation limits so cell contents and row counts are not cut off
for option_name in ('display.max_colwidth', "display.max_rows"):
    pd.set_option(option_name, None)

# Wrap the Series in a one-column DataFrame so it renders as a table
text_df = text.to_frame(name="full_text")

# Preview the first 10 rows
display(text_df.head(10))

# Function to clean and stem the text data
def clean_and_stem_text(text):
    """Normalize one tweet: lowercase, strip noise, drop stopwords, stem.

    Steps, in order:
      1. lowercase the text;
      2. remove URLs;
      3. turn every run of characters that is not an ASCII letter into a
         single space (digits, punctuation, emoji, tabs, newlines and
         backslash escapes all disappear here);
      4. tokenize with ``word_tokenize``;
      5. drop tokens that appear in ``stop_words_id``;
      6. stem the remaining tokens with ``stemmer``.

    Parameters
    ----------
    text : str
        Raw tweet text. Any non-string value (for example a NaN coming from
        an empty cell) is treated as an empty tweet.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces; may be the empty string.
    """
    # Missing values are not strings and have no .lower(); treat them as empty.
    if not isinstance(text, str):
        return ""

    text = text.lower()

    # URLs must be removed BEFORE punctuation is stripped: afterwards
    # "https://t.co/x" has become "https t co x" and the pattern cannot match.
    text = re.sub(r'https?://\S+', ' ', text)

    # Keep ASCII letters only. Doing this in one pass also removes Unicode
    # whitespace, which would otherwise be turned into '?' by an ASCII
    # re-encode and end up as a bogus token.
    text = re.sub(r'[^a-z]+', ' ', text)

    tokens = word_tokenize(text)  # Tokenize the text
    filtered_tokens = [word for word in tokens if word not in stop_words_id]  # Remove stopwords
    stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]  # Stem the tokens
    return " ".join(stemmed_tokens)  # Join the cleaned and stemmed tokens
                                                          
# Run every tweet through the cleaning/stemming function
text = text.apply(clean_and_stem_text)

# Keep pandas from truncating rows or cell contents in the preview below
for option_name in ("display.max_rows", 'display.max_colwidth'):
    pd.set_option(option_name, None)

# Collect the processed tweets in a one-column DataFrame
cleaned_data = pd.DataFrame({'cleaned_text': text})

# Preview the first 20 processed tweets
print("Cleaned and stemmed text data:")
display(cleaned_data.head(20))

# Persist the processed tweets (without the index) for the next step
cleaned_data.to_csv("cleaned_data.csv", index=False)

In [None]:
# 3. Translation

import torch
import numpy as np
from transformers import MarianMTModel, MarianTokenizer

# Read back the cleaned tweets written to disk earlier
cleaned_data = pd.read_csv("cleaned_data.csv")

# Pick the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# MarianMT checkpoint used for the translation step, with its matching tokenizer
model_name = 'Helsinki-NLP/opus-mt-id-en'
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# Place the model weights on the selected device
model.to(device)

# Function to translate text
def translate_text(text):
    """Translate one text with the module-level MarianMT ``model``/``tokenizer``.

    Parameters
    ----------
    text : str
        Text to translate. Values that are not strings (for example NaN read
        back from an empty CSV cell) and blank strings are returned unchanged
        without calling the model.

    Returns
    -------
    str
        The decoded translation. If the input was skipped, or if translation
        raised, the input is returned unchanged (the error is printed).
    """
    # Nothing to translate: skip the tokenizer instead of letting it raise and
    # printing a spurious error for every empty row.
    if not isinstance(text, str) or not text.strip():
        return text
    try:
        # Tokenize the text; truncation=True caps the input at the tokenizer's
        # maximum length so over-long texts are shortened instead of being fed
        # past the model's limit.
        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
        input_ids = inputs["input_ids"].to(device)
        attention_mask = inputs["attention_mask"].to(device)
        # Generate translation
        translated = model.generate(input_ids=input_ids, attention_mask=attention_mask)
        # Decode the generated tokens (single input, so only the first sequence)
        translated_text = tokenizer.decode(translated[0], skip_special_tokens=True)
        return translated_text
    except Exception as e:
        # Best effort: report the failure and keep the untranslated text
        print(f"Error translating text: {e}")
        return text

# Translate the cleaned and stemmed text data, one row at a time
cleaned_data['translated_text'] = cleaned_data['cleaned_text'].apply(translate_text)

# Set pandas display options to show all rows and full width of columns
pd.set_option("display.max_rows", None)
pd.set_option('display.max_colwidth', None)

# Preview the first 20 translations
print("Translated text data:")
display(cleaned_data['translated_text'].head(20))

# Save the cleaned and translated columns to a new CSV file
cleaned_data.to_csv("translated_data.csv", index=False)
print("Translated data saved to translated_data.csv")

In [None]:
# 4. Sentiment Analysis 
 
from nltk.sentiment.vader import SentimentIntensityAnalyzer 
import pandas as pd 
 
# Read the translated tweets written by the translation step
translated_data = pd.read_csv("translated_data.csv")

# Keep a handle on the column that holds the translated text
translated_text = translated_data["translated_text"]

# VADER analyzer used to score each translated tweet
analyzer = SentimentIntensityAnalyzer()
 
# Function to compute the sentiment score of one text
def get_sentiment(translated_text):
    """Return the VADER compound score for one text.

    Missing input (anything ``pd.isna`` reports as missing) yields ``None``;
    otherwise the text is scored with the module-level ``analyzer`` and the
    ``"compound"`` entry of the result is returned.
    """
    is_missing = pd.isna(translated_text)
    if not is_missing:
        scores = analyzer.polarity_scores(translated_text)
        return scores["compound"]
    # No text to score
    return None
 
# Add sentiment score to the translated data
translated_data["sentiment"] = translated_data["translated_text"].apply(get_sentiment)

# Preview the first 20 texts next to their scores
print("Translated data with sentiment score:")
display(translated_data[['translated_text', 'sentiment']].head(20))

# Save the data with sentiment scores to a new CSV file
translated_data.to_csv("translated_data_with_sentiment.csv", index=False)
print("Translated data with sentiment scores saved to translated_data_with_sentiment.csv")
 
# Function to classify sentiment score into positive, neutral, or negative
def sentiment_label(score, threshold=0.05):
    """Map a compound sentiment score to a categorical label.

    Parameters
    ----------
    score : float or None
        Compound score to classify. Missing values (``None``/NaN) are
        propagated as ``None`` rather than being reported as "neutral", so
        rows that could not be scored stay distinguishable from neutral ones.
    threshold : float, default 0.05
        Scores at or above ``threshold`` are "positive" and scores at or below
        ``-threshold`` are "negative". The boundary is inclusive, following
        the convention documented for VADER compound scores.

    Returns
    -------
    str or None
        "positive", "negative" or "neutral"; ``None`` for a missing score.
    """
    # NaN compares False against everything, which would fall through to
    # "neutral"; None would raise TypeError. Handle both explicitly.
    if pd.isna(score):
        return None
    if score >= threshold:
        return "positive"
    if score <= -threshold:
        return "negative"
    return "neutral"
 
# Load the translated data with sentiment scores
sentiment_data = pd.read_csv("translated_data_with_sentiment.csv")

# Add sentiment label to the data
sentiment_data["sentiment_label"] = sentiment_data["sentiment"].apply(sentiment_label)

print("Data with sentiment label:")
# Create a DataFrame with 'translated_text', 'sentiment', and 'sentiment_label' columns
selected_data = sentiment_data[['translated_text', 'sentiment', 'sentiment_label']]

# Set pandas display options to show the full width of the 'translated_text' column
pd.set_option('display.max_colwidth', None)

# Display the DataFrame in table format
display(selected_data.head(20))

# Save the DataFrame with sentiment labels to a new CSV file
selected_data.to_csv("sentiment_data_with_labels.csv", index=False)
print("Sentiment data with labels saved to sentiment_data_with_labels.csv")

In [None]:
# 5. Visualization
import matplotlib.pyplot as plt
from wordcloud import WordCloud
# Load the translated data with sentiment labels
final_data = pd.read_csv("sentiment_data_with_labels.csv")

# Fill NaN values with empty strings and convert all entries to strings,
# so the join below never sees a float
final_data['translated_text'] = final_data['translated_text'].fillna('').astype(str)

# Combine all the translated text into a single string
text_combined = " ".join(final_data['translated_text'])

# Create and configure the word cloud; the stopword lists come from the
# setup cell at the top of the notebook
wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='white',
    stopwords=set(stop_words_indo_nltk + stop_words_eng_nltk + additional_stopwords),
    max_words=200,
    contour_color='steelblue',
    contour_width=3
).generate(text_combined)

# Display the word cloud using matplotlib
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud of Data Tweets')
plt.show()
# Tally how many rows carry each sentiment label
sentiment_counts = final_data["sentiment_label"].value_counts()

# Two-column summary table: one row per label with its frequency
sentiment_table = pd.DataFrame(
    {
        "Sentiment": sentiment_counts.index,
        "Count": sentiment_counts.values,
    }
)
print(sentiment_table.to_string(index=False))

# Bar chart of the label frequencies
fig, ax = plt.subplots()
ax.bar(sentiment_counts.index, sentiment_counts.values)
ax.set_xlabel("Sentiment")
ax.set_ylabel("Count")
ax.set_title("Sentiment Distribution of UKT Increase Comments")
plt.show()