### Import Packages

In [1]:
import pandas as pd
import html
import re
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from nltk import tokenize
import nltk
from nltk.tokenize import sent_tokenize
from langdetect import detect

In [2]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Students\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Cleaning Text

In [3]:
def clean_text(review):
    # Step 1: Fix encoding issues (e.g., replacing "â€™" with correct characters)
    try:
        review = review.encode('latin1', errors='ignore').decode('utf-8', errors='ignore')
    except UnicodeEncodeError:
        pass  # If there's an encoding error, skip this step

    # Step 2: Remove list brackets and leading/trailing quotes
    review = re.sub(r"^\[|\]$", "", review)  # Remove square brackets at the start and end
    review = review.replace("'", "")  # Remove single quotes

    # Step 3: Remove extra backslashes
    review = review.replace("\\", "")

    # Step 4: Remove commas followed by double or single quotes
    review = re.sub(r',["\']', '', review)

    # Step 5: Replace double commas with a single comma
    review = re.sub(r',+', ',', review)

    # Step 6: Remove any instances of ".,", ",.", or ",,"
    review = re.sub(r'\.,|,\.', '.', review)  # Replace ".,", ",." with a single period
    review = re.sub(r',,', ',', review)  # Replace ",," with a single comma

    # Step 7: Remove hashtags
    review = re.sub(r'#\w+', '', review)

    # Step 8: Remove URLs
    review = re.sub(r'http\S+', '', review)

    # Step 9: Remove emojis
    emoji_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # emoticon
        u"\U0001F300-\U0001F5FF"  # simbol & dingbat
        u"\U0001F680-\U0001F6FF"  # transportasi & simbol map
        u"\U0001F700-\U0001F77F"  # simbol kuno
        u"\U0001F780-\U0001F7FF"  # simbol kuno tambahan
        u"\U0001F800-\U0001F8FF"  # simbol tanda batas
        u"\U0001F900-\U0001F9FF"  # emoticon tambahan
        u"\U0001FA00-\U0001FA6F"  # simbol musik
        u"\U0001FA70-\U0001FAFF"  # simbol musik tambahan
        u"\U00002702-\U000027B0"  # simbol karakter
        u"\U000024C2-\U0001F251"  # simbol katakter tambahan
        "]+", flags=re.UNICODE
    )
    review = emoji_pattern.sub(r'', review)

    # Step 10: Remove unwanted whitespace characters
    review = re.sub(r'\s+', ' ', review)  # Mengganti semua spasi, tab, dan newline dengan satu spasi
    review = review.replace('\n', ' ')
    review = review.replace('\\n', ' ')
    review = review.replace('\t', ' ')
    review = review.replace('\\t', ' ')
    review = review.replace('\\u', ' ')
    review = review.replace('\\', ' ')
    review = review.replace('\v', ' ')
    review = review.replace('\f', ' ')
    review = review.replace('\r', ' ')
    review = review.replace('\x0b', ' ')  # Vertical Tab
    review = review.replace('\x0c', ' ')  # Form Feed
    review = re.sub(r'\s+', ' ', review)  # Menghapus spasi ganda yang mungkin muncul setelah penggantian

    # Step 11: Decode HTML entities
    review = html.unescape(review)

    # Step 12: Remove HTML tags
    review = re.sub(r'<.*?>', '', review)

    # Step 13: Remove non-ASCII characters
    review = re.sub(r'[^\x00-\x7F]+', '', review)

    return review.strip()  # Menghapus spasi di awal dan akhir teks

### Vader Agregate

In [4]:


# Function to load the InSet lexicon
def load_inset_lexicon():
    positive_words = {}
    negative_words = {}
    
    # Load the positive lexicon file
    with open('positive.tsv', 'r', encoding='utf-8') as pos_file:
        next(pos_file)  # Skip the header line if present
        for line in pos_file:
            word, score = line.strip().split('\t')
            positive_words[word] = float(score)
    
    # Load the negative lexicon file
    with open('negative.tsv', 'r', encoding='utf-8') as neg_file:
        next(neg_file)  # Skip the header line if present
        for line in neg_file:
            word, score = line.strip().split('\t')
            negative_words[word] = float(score)
    
    return positive_words, negative_words

# Load the lexicons
positive_words, negative_words = load_inset_lexicon()

# Combine the positive and negative lexicons
combined_lexicon_dict = {**positive_words, **negative_words}

def sentiment(text):
    analyzer = SentimentIntensityAnalyzer()

    # Update the VADER lexicon with the combined lexicon
    analyzer.lexicon.update(combined_lexicon_dict)

    sentence_list = nltk.sent_tokenize(text)
    total_compound = 0.0
    for sentence in sentence_list:
        sentiment_scores = analyzer.polarity_scores(sentence)
        compound = sentiment_scores['compound']
        total_compound += compound

    average_compound = total_compound / len(sentence_list) if sentence_list else 0
    
    return round(average_compound, 4)

### IMPORT DATA

In [16]:
df = pd.read_csv("cnbc_raw.csv")
df['article_date'] = pd.to_datetime(df['article_date'])
df_filtered = df[df['article_date'].dt.year >= 2008].copy()
df_filtered

Unnamed: 0,id,crawl_timestamp,article_date,url,title,content,author,platform_id
0,754141,2024-08-21 23:29:20.794 +0700,2024-03-28,https://www.cnbcindonesia.com/market/202403280...,"Dolar AS Nyaris Rp 16.000, Suku Bunga BI Gimana?","Jakarta, CNBC Indonesia - Rupiah kembali melem...","Tim Redaksi,",5
1,754142,2024-08-21 23:29:21.010 +0700,2024-03-28,https://www.cnbcindonesia.com/tech/20240327144...,"Nadiem Kabur, Bandar Kripto Raksasa Usahakan T...","Jakarta, CNBC Indonesia - Nadiem Anjarwalla, s...","Redaksi,",5
2,754143,2024-08-21 23:29:21.193 +0700,2024-03-28,https://www.cnbcindonesia.com/market/202403280...,"Data AS & Situasi Global Bergejolak, Rupiah Me...","Jakarta, CNBC Indonesia - Rupiah melemah terha...","Muhammad Reza Ilham Taufani,",5
3,754144,2024-08-21 23:29:21.367 +0700,2024-03-28,https://www.cnbcindonesia.com/news/20240327172...,Nah Ini Dia! Biang Kerok yang Bikin Kelas Mene...,"Jakarta, CNBC Indonesia - Kondisi kelas meneng...","Ilham Restu,",5
4,754145,2024-08-21 23:29:21.557 +0700,2024-03-28,https://www.cnbcindonesia.com/market/202403280...,"Rupiah Makin Dekati Rp16.000/US$, Bagaimana Na...","Jakarta, CNBC indonesia - Nilai tukar rupiah k...","Tasya Natalia,",5
...,...,...,...,...,...,...,...,...
68905,754136,2024-08-21 23:29:19.710 +0700,2024-03-29,https://www.cnbcindonesia.com/research/2024032...,"Harga Emas Cetak Rekor 2 Hari Beruntun, Tembus...","Jakarta, CNBC Indonesia - Harga emas tak berhe...","mae,",5
68906,754137,2024-08-21 23:29:20.075 +0700,2024-03-28,https://www.cnbcindonesia.com/market/202403281...,"Analisa Penyebab Rupiah Lesu, Dolar AS Bisa ke...","Jakarta, CNBC Indonesia-Ekonom menilai terdapa...","M Rosseno Aji Nugroho,",5
68907,754138,2024-08-21 23:29:20.271 +0700,2024-03-28,https://www.cnbcindonesia.com/market/202403281...,"Sidang Sengketa Pemilu 2024 Berlanjut, IHSG Le...","Jakarta, CNBC Indonesia - Indeks Harga Saham G...","Chandra Dwi,",5
68908,754139,2024-08-21 23:29:20.443 +0700,2024-03-28,https://www.cnbcindonesia.com/market/202403280...,"Stok AS Lemah & Harapan Suku Bunga Turun, Harg...","Jakarta, CNBC Indonesia - Harga minyak mentah ...","Muhammad Reza Ilham Taufani,",5


### RUN CLEAN AND VADER ANALYSIS

In [17]:
def data_prep(df):
    df['clean'] = df['content'].apply(clean_text)
    df = df[df['clean'].notna()]
    df["compound"] = df['clean'].apply(sentiment)
    return df

In [18]:
df_filtered = data_prep(df_filtered)
df_filtered

Unnamed: 0,id,crawl_timestamp,article_date,url,title,content,author,platform_id,clean,compound
0,754141,2024-08-21 23:29:20.794 +0700,2024-03-28,https://www.cnbcindonesia.com/market/202403280...,"Dolar AS Nyaris Rp 16.000, Suku Bunga BI Gimana?","Jakarta, CNBC Indonesia - Rupiah kembali melem...","Tim Redaksi,",5,"Jakarta, CNBC Indonesia - Rupiah kembali melem...",-0.6577
1,754142,2024-08-21 23:29:21.010 +0700,2024-03-28,https://www.cnbcindonesia.com/tech/20240327144...,"Nadiem Kabur, Bandar Kripto Raksasa Usahakan T...","Jakarta, CNBC Indonesia - Nadiem Anjarwalla, s...","Redaksi,",5,"Jakarta, CNBCIndonesia - Nadiem Anjarwalla, sa...",-0.1486
2,754143,2024-08-21 23:29:21.193 +0700,2024-03-28,https://www.cnbcindonesia.com/market/202403280...,"Data AS & Situasi Global Bergejolak, Rupiah Me...","Jakarta, CNBC Indonesia - Rupiah melemah terha...","Muhammad Reza Ilham Taufani,",5,"Jakarta, CNBC Indonesia - Rupiah melemah terha...",-0.7640
3,754144,2024-08-21 23:29:21.367 +0700,2024-03-28,https://www.cnbcindonesia.com/news/20240327172...,Nah Ini Dia! Biang Kerok yang Bikin Kelas Mene...,"Jakarta, CNBC Indonesia - Kondisi kelas meneng...","Ilham Restu,",5,"Jakarta, CNBC Indonesia - Kondisi kelas meneng...",-0.8883
4,754145,2024-08-21 23:29:21.557 +0700,2024-03-28,https://www.cnbcindonesia.com/market/202403280...,"Rupiah Makin Dekati Rp16.000/US$, Bagaimana Na...","Jakarta, CNBC indonesia - Nilai tukar rupiah k...","Tasya Natalia,",5,"Jakarta, CNBC indonesia - Nilai tukar rupiah k...",-0.4082
...,...,...,...,...,...,...,...,...,...,...
68905,754136,2024-08-21 23:29:19.710 +0700,2024-03-29,https://www.cnbcindonesia.com/research/2024032...,"Harga Emas Cetak Rekor 2 Hari Beruntun, Tembus...","Jakarta, CNBC Indonesia - Harga emas tak berhe...","mae,",5,"Jakarta, CNBC Indonesia - Harga emas tak berhe...",-0.5124
68906,754137,2024-08-21 23:29:20.075 +0700,2024-03-28,https://www.cnbcindonesia.com/market/202403281...,"Analisa Penyebab Rupiah Lesu, Dolar AS Bisa ke...","Jakarta, CNBC Indonesia-Ekonom menilai terdapa...","M Rosseno Aji Nugroho,",5,"Jakarta, CNBC Indonesia-Ekonom menilai terdapa...",-0.8603
68907,754138,2024-08-21 23:29:20.271 +0700,2024-03-28,https://www.cnbcindonesia.com/market/202403281...,"Sidang Sengketa Pemilu 2024 Berlanjut, IHSG Le...","Jakarta, CNBC Indonesia - Indeks Harga Saham G...","Chandra Dwi,",5,"Jakarta, CNBC Indonesia - Indeks Harga Saham G...",-0.6759
68908,754139,2024-08-21 23:29:20.443 +0700,2024-03-28,https://www.cnbcindonesia.com/market/202403280...,"Stok AS Lemah & Harapan Suku Bunga Turun, Harg...","Jakarta, CNBC Indonesia - Harga minyak mentah ...","Muhammad Reza Ilham Taufani,",5,"Jakarta, CNBC Indonesia - Harga minyak mentah ...",-0.6049


In [19]:
df_fix = df_filtered[["id", "clean", "compound"]]
df_fix

Unnamed: 0,id,clean,compound
0,754141,"Jakarta, CNBC Indonesia - Rupiah kembali melem...",-0.6577
1,754142,"Jakarta, CNBCIndonesia - Nadiem Anjarwalla, sa...",-0.1486
2,754143,"Jakarta, CNBC Indonesia - Rupiah melemah terha...",-0.7640
3,754144,"Jakarta, CNBC Indonesia - Kondisi kelas meneng...",-0.8883
4,754145,"Jakarta, CNBC indonesia - Nilai tukar rupiah k...",-0.4082
...,...,...,...
68905,754136,"Jakarta, CNBC Indonesia - Harga emas tak berhe...",-0.5124
68906,754137,"Jakarta, CNBC Indonesia-Ekonom menilai terdapa...",-0.8603
68907,754138,"Jakarta, CNBC Indonesia - Indeks Harga Saham G...",-0.6759
68908,754139,"Jakarta, CNBC Indonesia - Harga minyak mentah ...",-0.6049


In [20]:
df_fix.to_csv("{}_compound.csv".format("cnbc"), index = False, sep = ";")