In [1]:
# pip install googletrans==4.0.0-rc1

Collecting googletrans==4.0.0-rc1
  Downloading googletrans-4.0.0rc1.tar.gz (20 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting httpx==0.13.3 (from googletrans==4.0.0-rc1)
  Downloading httpx-0.13.3-py3-none-any.whl.metadata (25 kB)
Collecting hstspreload (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading hstspreload-2024.8.1-py3-none-any.whl.metadata (2.1 kB)
Collecting chardet==3.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading chardet-3.0.4-py2.py3-none-any.whl.metadata (3.2 kB)
Collecting idna==2.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading idna-2.10-py2.py3-none-any.whl.metadata (9.1 kB)
Collecting rfc3986<2,>=1.3 (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading rfc3986-1.5.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting httpcore==0.9.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading httpcore-0.9.1-py3-none-any.whl.metadata (4.6 kB)
Collecting h11<0.10,>=0.8 (from httpcore==0.9.*->httpx==0.13.3->googl

In [1]:
import re
import html
import pandas as pd
from googletrans import Translator
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [2]:
def lowercase(review):
    """Convert text to lowercase."""
    if review is None:
        return review
    return review.lower()

def remove_hashtags(review):
    """Menghapus hashtag dari teks menggunakan regex."""
    pattern = r'#\w+'
    return re.sub(pattern, '', review)

def remove_url(review):
    """Menghapus URL dari teks menggunakan regex."""
    return re.sub(r'http\S+', '', review)

def remove_emoji(review):
    """Menghapus emoji dari teks menggunakan regex."""
    emoji_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # emoticon
        u"\U0001F300-\U0001F5FF"  # simbol & dingbat
        u"\U0001F680-\U0001F6FF"  # transportasi & simbol map
        u"\U0001F700-\U0001F77F"  # simbol kuno
        u"\U0001F780-\U0001F7FF"  # simbol kuno tambahan
        u"\U0001F800-\U0001F8FF"  # simbol tanda batas
        u"\U0001F900-\U0001F9FF"  # emoticon tambahan
        u"\U0001FA00-\U0001FA6F"  # simbol musik
        u"\U0001FA70-\U0001FAFF"  # simbol musik tambahan
        u"\U00002702-\U000027B0"  # simbol karakter
        u"\U000024C2-\U0001F251"  # simbol katakter tambahan
        "]+", flags=re.UNICODE
    )
    return emoji_pattern.sub(r'', review)

def remove_whitespace_chars(review):
    """Menghapus karakter whitespace yang tidak diinginkan dan spasi ganda."""
    review = re.sub(r'\s+', ' ', review)  # Mengganti semua spasi, tab, dan newline dengan satu spasi
    review = review.replace('\n', ' ')
    review = review.replace('\\n', ' ')
    review = review.replace('\t', ' ')
    review = review.replace('\\t', ' ')
    review = review.replace('\\u', ' ')
    review = review.replace('\\', ' ')
    review = review.replace('\v', ' ')
    review = review.replace('\f', ' ')
    review = review.replace('\r', ' ')
    review = review.replace('\x0b', ' ')  # Vertical Tab
    review = review.replace('\x0c', ' ')  # Form Feed
    review = re.sub(r'\s+', ' ', review)  # Menghapus spasi ganda yang mungkin muncul setelah penggantian
    return review.strip()  # Menghapus spasi di awal dan akhir teks

def decode_html_entities(review):
    """Menghapus entitas HTML dari teks."""
    return html.unescape(review)

def remove_html(review):
    """Menghapus elemen HTML dari teks menggunakan regex."""
    pattern = r'<.*?>'
    return re.sub(pattern, '', review)

def remove_irrelevant(review):
    """Menghapus karakter yang tidak relevan seperti angka, simbol, dan tanda baca."""
    return re.sub('[^a-zA-Z\s]', '', review)

def convert_eng(review):
    """Menerjemahkan teks dari bahasa Indonesia ke bahasa Inggris."""
    if review is None:
        raise ValueError("Input review tidak boleh None")
    
    translator = Translator()
    
    try:
        translation = translator.translate(review, src='id', dest='en')
        print("Respon API:", translation)
        return translation.text
    except Exception as e:
        print(f"Terjadi kesalahan saat menerjemahkan: {e}")
        return None


In [3]:
def data_prep(df):
    df['filtering'] = df['konten'].apply(remove_url)
    df['filtering'] = df['filtering'].apply(remove_hashtags)
    df['filtering'] = df['filtering'].apply(remove_emoji)
    df['filtering'] = df['filtering'].apply(remove_whitespace_chars)
    df['filtering'] = df['filtering'].apply(decode_html_entities)
    df['filtering'] = df['filtering'].apply(remove_html)
    df['filtering'] = df['filtering'].apply(remove_irrelevant)
    df['filtering'] = df['filtering'].apply(convert_eng)
    df['filtering'] = df['filtering'].apply(lowercase)
    return df


In [4]:
df = pd.read_csv("hasil-scrap-cnbc-5page.csv")
df = df[df['konten'] != ""].reset_index(drop=True)
df = df.loc[1:100]
df['konten'] = df['konten'].astype(str)
df

Unnamed: 0.1,Unnamed: 0,title,link,date,penulis,terbit,konten
1,1,InternasionalUpdate Demo Maut Bangladesh: Mili...,https://www.cnbcindonesia.com/news/20240806115...,2024-08-07,"luc,CNBC Indonesia",06 August 2024 13:10,"['Jakarta, CNBC Indonesia - Panglima Militer B..."
2,2,"Black Monday Guncang Pasar Modal Dunia, Petaka...",https://www.cnbcindonesia.com/market/202408060...,2024-08-07,"Mentari Puspadini,CNBC Indonesia",06 August 2024 10:05,"['Jakarta, CNBC Indonesia - Pasar modal dunia ..."
3,3,"Selamatkan Ekonomi Akhir Tahun, Jokowi Mau Bel...",https://www.cnbcindonesia.com/news/20240805160...,2024-08-07,"Arrijal Rachman,CNBC Indonesia",05 August 2024 17:05,"['Jakarta, CNBC Indonesia - Pemerintah berteka..."
4,4,Bursa Saham Dunia Jepang Hingga AS Ambruk Berj...,https://www.cnbcindonesia.com/market/202408051...,2024-08-07,"Chandra Dwi,CNBC Indonesia",05 August 2024 14:23,"['Jakarta, CNBC Indonesia - Pasar saham global..."
5,5,"InternasionalPengangguran Makin Banyak, AS Ber...",https://www.cnbcindonesia.com/news/20240805101...,2024-08-07,"luc,CNBC Indonesia",05 August 2024 10:15,"['Jakarta, CNBC Indonesia - Tanda-tanda lesuny..."
...,...,...,...,...,...,...,...
96,96,OJK Sebut Ada Tekanan ke Likuiditas Bank di RI...,https://www.cnbcindonesia.com/market/202407151...,2024-08-07,"Zefanya Aprilia,CNBC Indonesia",15 July 2024 19:50,"['Jakarta, CNBC Indonesia - Otoritas Jasa Keua..."
97,97,"Bukan AS atau China, Ini Pemberi Utang Terbesa...",https://www.cnbcindonesia.com/research/2024071...,2024-08-07,"Susi Setiawati,CNBC Indonesia",15 July 2024 14:55,"['Jakarta, CNBC Indonesia - Utang Luar Negeri ..."
98,98,Prabowo Tak Mungkin Ugal-ugalan: 2025 RI Butuh...,https://www.cnbcindonesia.com/news/20240715113...,2024-08-07,"Arrijal Rachman,CNBC Indonesia",15 July 2024 12:50,"['Jakarta, CNBC Indonesia - Beban utang jatuh ..."
99,99,Jangan Sembarangan Buka Deposito Koperasi! Bac...,https://www.cnbcindonesia.com/mymoney/20240715...,2024-08-07,"Financial Expert,CNBC Indonesia",15 July 2024 11:05,"['Jakarta, CNBC Indonesia - Salah satu jenis k..."


In [5]:
df = data_prep(df)

Respon API: Translated(src=id, dest=en, text=Jakarta CNBC Indonesia Military Commander Bangladesh will meet with student protest leaders on Tuesday when the country is waiting for the formation of a new government the day after Prime Minister Sheikh Hasina resigned and escaped following the violent rebellion that killed hundreds of student leaders who pioneered the movement against the work quotathen it becomes a demand that Hasina respects said that they want a new temporary government with the winner of Nobel Peace Muhammad Yunus as the Head of the Advisor of the Government other than what we recommend will not be accepted by Nahid Islam, one of the main organizers of the Student Movement in a video on Facebook with three other organizers as quoted as quoted as quoted as quotedFrom the reuters we will not accept a government supported or led by the military we have also discussed with Muhammad Yunus and he agreed to take this responsibility for our invitation to add Islam Yunus and h

In [6]:
df = df.dropna()

In [7]:
# Extract compound scores for each row
analyzer = SentimentIntensityAnalyzer()
df['Compound'] = df['filtering'].apply(lambda x: analyzer.polarity_scores(x)['compound'])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Compound'] = df['filtering'].apply(lambda x: analyzer.polarity_scores(x)['compound'])


In [8]:
df

Unnamed: 0.1,Unnamed: 0,title,link,date,penulis,terbit,konten,filtering,Compound
1,1,InternasionalUpdate Demo Maut Bangladesh: Mili...,https://www.cnbcindonesia.com/news/20240806115...,2024-08-07,"luc,CNBC Indonesia",06 August 2024 13:10,"['Jakarta, CNBC Indonesia - Panglima Militer B...",jakarta cnbc indonesia military commander bang...,-0.9081
2,2,"Black Monday Guncang Pasar Modal Dunia, Petaka...",https://www.cnbcindonesia.com/market/202408060...,2024-08-07,"Mentari Puspadini,CNBC Indonesia",06 August 2024 10:05,"['Jakarta, CNBC Indonesia - Pasar modal dunia ...",jakarta cnbc indonesia world capital market ye...,-0.9896
3,3,"Selamatkan Ekonomi Akhir Tahun, Jokowi Mau Bel...",https://www.cnbcindonesia.com/news/20240805160...,2024-08-07,"Arrijal Rachman,CNBC Indonesia",05 August 2024 17:05,"['Jakarta, CNBC Indonesia - Pemerintah berteka...",jakarta cnbc indonesia the government is deter...,0.9928
5,5,"InternasionalPengangguran Makin Banyak, AS Ber...",https://www.cnbcindonesia.com/news/20240805101...,2024-08-07,"luc,CNBC Indonesia",05 August 2024 10:15,"['Jakarta, CNBC Indonesia - Tanda-tanda lesuny...",jakarta cnbc indonesia signs the sluggish econ...,-0.9889
6,6,Alasan Sri Mulyani Cs Ngotot APBN Harus Sehat:...,https://www.cnbcindonesia.com/news/20240805062...,2024-08-07,"Arrijal Rachman,CNBC Indonesia",05 August 2024 07:55,"['Jakarta, CNBC Indonesia - Menteri Keuangan S...",jakarta cnbc indonesia minister of finance sri...,-0.9957
...,...,...,...,...,...,...,...,...,...
96,96,OJK Sebut Ada Tekanan ke Likuiditas Bank di RI...,https://www.cnbcindonesia.com/market/202407151...,2024-08-07,"Zefanya Aprilia,CNBC Indonesia",15 July 2024 19:50,"['Jakarta, CNBC Indonesia - Otoritas Jasa Keua...",jakarta cnbc indonesia financial services auth...,0.9894
97,97,"Bukan AS atau China, Ini Pemberi Utang Terbesa...",https://www.cnbcindonesia.com/research/2024071...,2024-08-07,"Susi Setiawati,CNBC Indonesia",15 July 2024 14:55,"['Jakarta, CNBC Indonesia - Utang Luar Negeri ...",jakarta cnbc indonesia foreign debt uln indone...,0.9104
98,98,Prabowo Tak Mungkin Ugal-ugalan: 2025 RI Butuh...,https://www.cnbcindonesia.com/news/20240715113...,2024-08-07,"Arrijal Rachman,CNBC Indonesia",15 July 2024 12:50,"['Jakarta, CNBC Indonesia - Beban utang jatuh ...",jakarta cnbc indonesia the burden of debt due ...,-0.9210
99,99,Jangan Sembarangan Buka Deposito Koperasi! Bac...,https://www.cnbcindonesia.com/mymoney/20240715...,2024-08-07,"Financial Expert,CNBC Indonesia",15 July 2024 11:05,"['Jakarta, CNBC Indonesia - Salah satu jenis k...",jakarta cnbc indonesia one type of cooperative...,-0.7600


In [9]:
df.to_csv("hasil_data.csv",index=False)