# [TEST] Cleaning CSV Master

# 0. Libraries and Packages

In [3]:
import numpy as np
import pandas as pd

In [4]:
file_path = '/Users/salmadanu/Desktop/Skripsi/skripsi-env/narasipal/master_csv/kompas_master.csv'
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,content_PublishedDate,content_editor,content_site,content_tags,content_title,content_total_words,content_type,url
0,"3 Januari 2024, 10:46 WIB",unknown,Kompas.com,unknown,Pejabat Senior Hamas Tewas Usai Serangan Drone...,unknown,Video,https://video.kompas.com/watch/1135553/pejabat...
1,"3 Januari 2024, 09:43 WIB",Arini Kusuma Jati,Kompas.com,unknown,Israel Bakal Bela Diri di Kasus Kejahatan Geno...,unknown,Video,https://video.kompas.com/watch/1135421/israel-...
2,"3 Januari 2024, 08:12 WIB",unknown,Kompas.com,unknown,Momen Militer Israel Temukan Berbagai Senjata ...,unknown,Video,https://video.kompas.com/watch/1135211/momen-m...
3,"3 Januari 2024, 08:05 WIB",Anneke Sherina Ramadhani,Kompas.com,unknown,Ribuan Warga Palestina Kembali ke Gaza Utara U...,unknown,Video,https://video.kompas.com/watch/1135209/ribuan-...
4,"3 Januari 2024, 07:17 WIB",Anggie Puspariana,Kompas.com,unknown,"Terperangkap Dalam Perang, Ini yang Dilakukan ...",unknown,Video,https://video.kompas.com/watch/1135175/terpera...


# 1. Cleaning Data
### Format:
- `id` = `int`
- `content_PublishedDate` = `datetime` YYYY-MM-DD HH:MM:SS
- `content_editor` = `object`
- `content_site` = `object `Kompas.com
- `content_tags` = `str`
- `content_title` = `str`
- `content_total_words` = `int`
- `content_type` = `object`
- `url` = `int`

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4575 entries, 0 to 4574
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   content_PublishedDate  1645 non-null   object
 1   content_editor         1645 non-null   object
 2   content_site           1645 non-null   object
 3   content_tags           1645 non-null   object
 4   content_title          1645 non-null   object
 5   content_total_words    1645 non-null   object
 6   content_type           1645 non-null   object
 7   url                    4575 non-null   object
dtypes: object(8)
memory usage: 286.1+ KB


## Checking for duplicate rows

In [6]:
duplicate = df[df.duplicated]
duplicate.info()

<class 'pandas.core.frame.DataFrame'>
Index: 89 entries, 2641 to 4019
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   content_PublishedDate  29 non-null     object
 1   content_editor         29 non-null     object
 2   content_site           29 non-null     object
 3   content_tags           29 non-null     object
 4   content_title          29 non-null     object
 5   content_total_words    29 non-null     object
 6   content_type           29 non-null     object
 7   url                    89 non-null     object
dtypes: object(8)
memory usage: 6.3+ KB


In [7]:
df_clean = df.drop_duplicates()

In [8]:
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4486 entries, 0 to 4574
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   content_PublishedDate  1616 non-null   object
 1   content_editor         1616 non-null   object
 2   content_site           1616 non-null   object
 3   content_tags           1616 non-null   object
 4   content_title          1616 non-null   object
 5   content_total_words    1616 non-null   object
 6   content_type           1616 non-null   object
 7   url                    4486 non-null   object
dtypes: object(8)
memory usage: 315.4+ KB


## Standardizing `content_PublishedDate` and extracting `content_title` from `url`
YYYY-MM-DD HH:MM:SS

Logic (use regex)
for all rows in df
if `content_PublishedDate` has value:
    if format is not YYYY-MM-DD HH:MM:SS
        change to YYYY-MM-DD HH:MM:SS based on:
            4 number sequence = YYYY
            MM ikutin mapping:
                January = 01
                February = 02
            add space
            2 number sequence = DD
            erase ,
            erase WIB
            HH:MM to HH:SS:00
    else:
        keep format
else:
    null


if `content_title` is NULL:
    df[content_title] = last part of url (all text from the end before the last '/')
    remove -
    make title case

In [9]:
import pandas as pd
import re
from urllib.parse import urlparse

In [10]:
month_map = {
    "January": "01", 
    "February": "02", 
    "March": "03", 
    "April": "04",
    "May": "05", 
    "June": "06", 
    "July": "07", 
    "August": "08",
    "September": "09", 
    "October": "10", 
    "November": "11", 
    "December": "12"
}

In [11]:
df_clean = df.copy()

# Funtion for standardizing datetime
def standardize_date(date_str):
    if pd.isna(date_str):
        return None
    match = re.search(r"(\d{1,2}) (\w+) (\d{4}), (\d{2}):(\d{2}) WIB", date_str)
    if match:
        day, month, year, hour, minute = match.groups()
        return f"{year}-{month_map.get(month, 'XX')}-{int(day):02d} {hour}:{minute}:00"
    return date_str 

# Funtion for extracting judul berita from url
def extract_title_from_url(url):
    if pd.isna(url):
        return None
    path = urlparse(url).path
    last_part = path.rstrip("/").split("/")[-1]
    return last_part.replace("-", " ").title()

# Apply transformations
df_clean.loc[:, "content_PublishedDate"] = df_clean["content_PublishedDate"].apply(standardize_date)
df_clean.loc[:, "content_title"] = df_clean.apply(
    lambda row: extract_title_from_url(row["url"]) if pd.isna(row["content_title"]) else row["content_title"], axis=1
)

df_clean


Unnamed: 0,content_PublishedDate,content_editor,content_site,content_tags,content_title,content_total_words,content_type,url
0,2024-XX-03 10:46:00,unknown,Kompas.com,unknown,Pejabat Senior Hamas Tewas Usai Serangan Drone...,unknown,Video,https://video.kompas.com/watch/1135553/pejabat...
1,2024-XX-03 09:43:00,Arini Kusuma Jati,Kompas.com,unknown,Israel Bakal Bela Diri di Kasus Kejahatan Geno...,unknown,Video,https://video.kompas.com/watch/1135421/israel-...
2,2024-XX-03 08:12:00,unknown,Kompas.com,unknown,Momen Militer Israel Temukan Berbagai Senjata ...,unknown,Video,https://video.kompas.com/watch/1135211/momen-m...
3,2024-XX-03 08:05:00,Anneke Sherina Ramadhani,Kompas.com,unknown,Ribuan Warga Palestina Kembali ke Gaza Utara U...,unknown,Video,https://video.kompas.com/watch/1135209/ribuan-...
4,2024-XX-03 07:17:00,Anggie Puspariana,Kompas.com,unknown,"Terperangkap Dalam Perang, Ini yang Dilakukan ...",unknown,Video,https://video.kompas.com/watch/1135175/terpera...
...,...,...,...,...,...,...,...,...
4570,2023-11-29 17:15:00,unknown,Kompas.com,unknown,Momen Sandera Hamas Lambaikan Tangannya Saat D...,unknown,Video,https://video.kompas.com/watch/1062854/momen-s...
4571,2023-11-29 17:14:00,Aditya Jaya Iswara,Kompas.com,"sejarah Tepi Barat, sejarah wilayah Tepi Barat...",Sejarah Wilayah Tepi Barat dalam Konflik Israe...,2158,Standard,http://internasional.kompas.com/read/2023/11/2...
4572,2023-11-29 17:13:00,unknown,Kompas.com,unknown,"Dari Podium Sidang PBB, Menlu Retno Suarakan K...",unknown,Video,https://video.kompas.com/watch/1062853/dari-po...
4573,2023-11-29 17:09:49,Dani Prabowo,Kompas.com,"Israel, Palestina, pemilu, Bentrok di Bitung","Pasca Bentrokan di Bitung, Kapolri Minta Masya...",261,Standard,http://nasional.kompas.com/read/2023/11/29/170...


## New dataframe `df_title` with just `content_title`

In [12]:
df_title = df_clean[['content_title']]
df_title.head()

Unnamed: 0,content_title
0,Pejabat Senior Hamas Tewas Usai Serangan Drone...
1,Israel Bakal Bela Diri di Kasus Kejahatan Geno...
2,Momen Militer Israel Temukan Berbagai Senjata ...
3,Ribuan Warga Palestina Kembali ke Gaza Utara U...
4,"Terperangkap Dalam Perang, Ini yang Dilakukan ..."


## `df_title` to CSV

In [13]:
df_title.to_csv('/Users/salmadanu/Desktop/Skripsi/skripsi-env/narasipal/cleaning/title_csv/kompas_titles.csv', index=True)