In [9]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Load Data

In [10]:
# Replace 'your_file_path.csv' with the actual path to your file in Google Drive
# For example: '/content/drive/My Drive/my_data_folder/my_data.csv'
file_path = '/content/drive/My Drive/NLP/data_link_berita_with_content.csv'

# You can use pandas to read the data, assuming it's a CSV file
import pandas as pd

try:
  df = pd.read_csv(file_path)
  print(f"Successfully loaded data from {file_path}")
  display(df.head())
except FileNotFoundError:
  print(f"Error: File not found at {file_path}")
except Exception as e:
  print(f"An error occurred: {e}")

Successfully loaded data from /content/drive/My Drive/NLP/data_link_berita_with_content.csv


Unnamed: 0,link,judul,konten,tanggal,portal,tag
0,https://kumparan.com/kumparanbisnis/garuda-ind...,Garuda Indonesia Kembali RUPSLB di Tengah Isu ...,Garuda Indonesia Kembali RUPSLB di Tengah Isu ...,2025-09-30 00:00:00,Kumparan,Manajemen
1,https://www.bloombergtechnoz.com/detail-news/8...,Garuda Gelar RUPSLB di Tengah Isu Masuknya Dir...,Garuda Gelar RUPSLB di Tengah Isu Masuknya Dir...,2025-09-29 00:00:00,Bloomberg Technoz,Manajemen
2,https://voi.id/ekonomi/519004/komisi-v-dpr-bak...,Komisi V DPR Bakal Dalami Dugaan Mafia Jual Be...,JAKARTA - Ketua Komisi V DPR Lasarus mengataka...,2025-09-29 00:00:00,VOI.ID,Rute/Operasional
3,https://in.investing.com/news/company-news/gar...,Garuda Indonesia adds air cargo capacity to We...,,2025-09-29 00:00:00,Investing.com India,Lainnya
4,https://www.kompasiana.com/zainularifin2714/68...,Rencana Merger Garuda Indonesia - Pelita Air: ...,"Latar Belakang\nPada pertengahan 2023, wacana ...",2025-09-29 00:00:00,Kompasiana.com,Lainnya


# Remove empty rows

In [11]:
# Remove rows where the 'konten' column is NaN
df_cleaned = df.dropna(subset=['konten'])

print("Original DataFrame shape:", df.shape)
print("Cleaned DataFrame shape:", df_cleaned.shape)

display(df_cleaned.head())

Original DataFrame shape: (614, 6)
Cleaned DataFrame shape: (533, 6)


Unnamed: 0,link,judul,konten,tanggal,portal,tag
0,https://kumparan.com/kumparanbisnis/garuda-ind...,Garuda Indonesia Kembali RUPSLB di Tengah Isu ...,Garuda Indonesia Kembali RUPSLB di Tengah Isu ...,2025-09-30 00:00:00,Kumparan,Manajemen
1,https://www.bloombergtechnoz.com/detail-news/8...,Garuda Gelar RUPSLB di Tengah Isu Masuknya Dir...,Garuda Gelar RUPSLB di Tengah Isu Masuknya Dir...,2025-09-29 00:00:00,Bloomberg Technoz,Manajemen
2,https://voi.id/ekonomi/519004/komisi-v-dpr-bak...,Komisi V DPR Bakal Dalami Dugaan Mafia Jual Be...,JAKARTA - Ketua Komisi V DPR Lasarus mengataka...,2025-09-29 00:00:00,VOI.ID,Rute/Operasional
4,https://www.kompasiana.com/zainularifin2714/68...,Rencana Merger Garuda Indonesia - Pelita Air: ...,"Latar Belakang\nPada pertengahan 2023, wacana ...",2025-09-29 00:00:00,Kompasiana.com,Lainnya
5,https://www.cnnindonesia.com/ekonomi/202509292...,Dony Oskaria Pastikan Merger Pelita Air-Garuda...,--\nPlt Menteri Badan Usaha Milik Negara (BUMN...,2025-09-29 00:00:00,CNN Indonesia,Lainnya


# Remove English titles

In [12]:
!pip install langdetect
from langdetect import detect
import numpy as np

# Function to detect language, handles potential errors
def detect_language(text):
    try:
        # Ensure the input is a string
        if isinstance(text, str):
            return detect(text)
        else:
            return 'unknown' # Or handle non-string input as needed
    except:
        return 'unknown' # Return 'unknown' if language detection fails

# Apply the function to the 'judul' column and create a new column for detected language
df_cleaned['detected_language'] = df_cleaned['judul'].apply(detect_language)

# Filter out rows where the detected language is 'en' (English)
df_filtered = df_cleaned[df_cleaned['detected_language'] != 'en'].drop(columns=['detected_language'])


print("Original DataFrame shape:", df_cleaned.shape)
print("DataFrame shape after removing English titles:", df_filtered.shape)

display(df_filtered.head())

Original DataFrame shape: (533, 7)
DataFrame shape after removing English titles: (492, 6)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['detected_language'] = df_cleaned['judul'].apply(detect_language)


Unnamed: 0,link,judul,konten,tanggal,portal,tag
0,https://kumparan.com/kumparanbisnis/garuda-ind...,Garuda Indonesia Kembali RUPSLB di Tengah Isu ...,Garuda Indonesia Kembali RUPSLB di Tengah Isu ...,2025-09-30 00:00:00,Kumparan,Manajemen
1,https://www.bloombergtechnoz.com/detail-news/8...,Garuda Gelar RUPSLB di Tengah Isu Masuknya Dir...,Garuda Gelar RUPSLB di Tengah Isu Masuknya Dir...,2025-09-29 00:00:00,Bloomberg Technoz,Manajemen
2,https://voi.id/ekonomi/519004/komisi-v-dpr-bak...,Komisi V DPR Bakal Dalami Dugaan Mafia Jual Be...,JAKARTA - Ketua Komisi V DPR Lasarus mengataka...,2025-09-29 00:00:00,VOI.ID,Rute/Operasional
4,https://www.kompasiana.com/zainularifin2714/68...,Rencana Merger Garuda Indonesia - Pelita Air: ...,"Latar Belakang\nPada pertengahan 2023, wacana ...",2025-09-29 00:00:00,Kompasiana.com,Lainnya
5,https://www.cnnindonesia.com/ekonomi/202509292...,Dony Oskaria Pastikan Merger Pelita Air-Garuda...,--\nPlt Menteri Badan Usaha Milik Negara (BUMN...,2025-09-29 00:00:00,CNN Indonesia,Lainnya


# Save as csv

In [13]:
# Save the cleaned DataFrame to a new CSV file
output_file_path = '/content/drive/My Drive/NLP/data_link_berita_with_content_cleaned.csv' # You can change the path if needed

try:
    df_filtered.to_csv(output_file_path, index=False)
    print(f"Successfully saved cleaned data to {output_file_path}")
except Exception as e:
    print(f"An error occurred while saving the file: {e}")

Successfully saved cleaned data to /content/drive/My Drive/NLP/data_link_berita_with_content_cleaned.csv
