In [1]:
import pandas as pd
import numpy as np
import re
from tqdm import tqdm
import datetime
tqdm.pandas()

In [2]:
# Paths of the crawled Daum and Naver news CSV files (relative to the working directory).
daum_open_path = './data/daum_data/daum_data.csv'
naver_open_path = './data/naver_data/news_naver_IT_contents_n.csv'

# Load both sources. on_bad_lines='skip' tells pandas to skip malformed lines in the
# Naver file instead of raising, so some rows may be silently missing from naver_df.
daum_df = pd.read_csv(daum_open_path)
naver_df = pd.read_csv(naver_open_path, on_bad_lines='skip')

In [3]:
# Work on copies so the frames loaded from disk stay untouched.
copy_daum_df = daum_df.copy()
copy_naver_df = naver_df.copy()

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() only adds a stray "None" line to the output.
copy_daum_df.info()
copy_naver_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 830608 entries, 0 to 830607
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   Unnamed: 0   830608 non-null  int64 
 1   platform     830608 non-null  object
 2   title        830604 non-null  object
 3   category     830608 non-null  object
 4   article_url  830608 non-null  object
 5   date         830608 non-null  int64 
 6   publisher    830604 non-null  object
 7   content      828905 non-null  object
dtypes: int64(2), object(6)
memory usage: 50.7+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 292503 entries, 0 to 292502
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   article_url  292503 non-null  object
 1   title        292487 non-null  object
 2   reg_date     292474 non-null  object
 3   publisher    292467 non-null  object
 4   author       292462 non-null  object
 5   sub_t

In [4]:
# Give both frames matching column names. The assignment is positional, so each
# list must contain exactly one name per existing column, in order.
daum_columns = ['','platform_id','title','category','article_url','publication_date','publisher_id','content']
naver_columns = ['article_url','title','publication_date','publisher_id','author','sub_title','content']
copy_daum_df.columns = daum_columns
copy_naver_df.columns = naver_columns

# Bring both date columns to 'YYYY-MM-DD' strings. Values that cannot be parsed
# become NaT (errors='coerce').
# NOTE(review): re-running this cell on the same frames would re-parse the already
# formatted Daum strings with format='%Y%m%d', which presumably coerces them to
# NaT -- run it once per fresh copy.
daum_dates = pd.to_datetime(copy_daum_df['publication_date'], format='%Y%m%d', errors='coerce')
naver_dates = pd.to_datetime(copy_naver_df['publication_date'], errors='coerce')

copy_daum_df['publication_date'] = daum_dates.dt.strftime('%Y-%m-%d')
copy_naver_df['publication_date'] = naver_dates.dt.strftime('%Y-%m-%d')


In [5]:
# Preview the first three Daum rows after the column rename and date normalisation.
copy_daum_df.head(3)

Unnamed: 0,Unnamed: 1,platform_id,title,category,article_url,publication_date,publisher_id,content
0,0,daum,"롯데렌탈, 쏘카 지분 17.9% 추가 매입...2대 주주 지위",autos,https://v.daum.net/v/20230831212156741,2023-08-31,오토타임즈,\n\n -총 32.9% 지분 보유로 2대 주주 지위 \n\n\n -미래 모빌리티 ...
1,1,daum,"르노코리아, 주요모델 가격 인하… ‘가성비’로 승부수",autos,https://v.daum.net/v/20230831200507575,2023-08-31,세계일보,최근 국내 시장에서 부진한 성적을 내고 있는 르노코리아자동차가 내년 신차 출시까지의...
2,2,daum,"위기의 르노코리아…""200만원 내렸다"" 가격 인하 승부수",autos,https://v.daum.net/v/20230831200116487,2023-08-31,한국경제,'신차 부재' 여파로 올해 내수 시장서 고전을 면치 못하는 르노코리아자동차가 가격 ...


In [6]:
# Preview the first three Naver rows after the column rename and date normalisation.
copy_naver_df.head(3)

Unnamed: 0,article_url,title,publication_date,publisher_id,author,sub_title,content
0,https://n.news.naver.com/mnews/article/003/001...,삼성 中 시장에 300만원 초호화폰 내놓는 속사정,2023-08-05,뉴시스,윤현성,"삼성, '심계천하' 시리즈 신작 10월 출시 전망…300만원 육박할 듯 고가 프리미...",지난해 10월 중국 시장에 출시된 삼성전자 심계천하 'W23 5G'와 'W23 플립...
1,��에 못미친데다,출시를 앞둔 새모델인 아이폰15프로 및 프로맥스 모델의 일부 디스플레이 부품이 신...,,,,,
2,https://n.news.naver.com/mnews/article/092/000...,애플 주가 4.8% 폭락…시총 3조 달러 무너졌다,2023-08-05,지디넷코리아,김익현,아이폰 등 HW매출 부진 실망감…올들어 최대 낙폭,애플 주가가 지난 해 9월 말 이후 가장 큰 폭으로 하락했다. 그 여파로 시가총액도...


In [7]:
# Remove duplicate articles (same URL, then same title, then same body, keeping the
# first occurrence each time) and finally drop rows that have no body at all.
for frame in (copy_daum_df, copy_naver_df):
    for key in ('article_url', 'title', 'content'):
        frame.drop_duplicates(subset=key, keep='first', inplace=True)
    frame.dropna(subset=['content'], inplace=True)


def _strip_angle_spans(text):
    """Remove every non-greedy <...> span from text."""
    return re.sub(r'<.*?>', '', text)


def _strip_bracket_spans(text):
    """Remove every non-greedy [...] span from text."""
    return re.sub(r'\[.*?\]', '', text)


# Build 'clear_str': the body without <...> and [...] spans and without
# leading/trailing whitespace. Three separate passes, each with its own progress bar.
for frame in (copy_daum_df, copy_naver_df):
    frame['clear_str'] = frame['content'].progress_apply(_strip_angle_spans)
    frame['clear_str'] = frame['clear_str'].progress_apply(_strip_bracket_spans)
    frame['clear_str'] = frame['clear_str'].progress_apply(str.strip)

100%|██████████| 760612/760612 [00:01<00:00, 628648.17it/s]
100%|██████████| 760612/760612 [00:01<00:00, 492411.78it/s]
100%|██████████| 760612/760612 [00:00<00:00, 874700.65it/s]
100%|██████████| 255019/255019 [00:00<00:00, 614041.54it/s]
100%|██████████| 255019/255019 [00:00<00:00, 510196.13it/s]
100%|██████████| 255019/255019 [00:00<00:00, 1288608.12it/s]


In [8]:
# Per-token cleaning function
def process_words(words, del_words, stopwords):
    """Clean the tokens of one article and return the tokens that survive.

    Each token goes through these steps, in order:
      1. A token equal to '기자' is dropped, and the most recently kept token is
         removed with it.
      2. A token containing any keyword from ``del_words`` is dropped.
      3. A token containing '@' is dropped.
      4. Every character that is not a word character, whitespace or '.' is removed.
      5. A token containing an ASCII letter is lower-cased.
      6. Each entry of ``stopwords`` is stripped from the end of the token, in the
         order given.
      7. Tokens of length 0 or 1 are dropped.

    Args:
        words: iterable of string tokens.
        del_words: iterable of keywords; non-string and empty entries are ignored.
        stopwords: iterable of suffixes; non-string and empty entries are ignored.

    Returns:
        list[str]: the cleaned tokens, in their original order.
    """
    # Keep only usable entries. A non-string keyword (e.g. NaN from a missing
    # publisher) would raise TypeError in a substring test, '' is contained in
    # every token, and an empty suffix would make word[:-0] erase the token.
    keywords = [kw for kw in del_words if isinstance(kw, str) and kw]
    suffixes = [sfx for sfx in stopwords if isinstance(sfx, str) and sfx]

    # One alternation regex answers "does the token contain any keyword?" in a
    # single search instead of scanning the whole keyword list for every token.
    # The re module caches compiled patterns, so repeated calls stay cheap.
    del_pattern = re.compile('|'.join(map(re.escape, keywords))) if keywords else None
    junk_chars = re.compile(r'[^\w\s.]')
    ascii_letter = re.compile('[a-zA-Z]')

    processed_words = []
    for word in words:
        if word == '기자':
            if processed_words:
                processed_words.pop()
            continue

        if del_pattern is not None and del_pattern.search(word):
            continue

        if '@' in word:
            continue

        word = junk_chars.sub('', word)

        if ascii_letter.search(word):
            word = word.lower()

        for suffix in suffixes:
            if word.endswith(suffix):
                word = word[:-len(suffix)]

        if len(word) > 1:
            processed_words.append(word)
    return processed_words

In [9]:
# Deletion keywords: every distinct publisher name found in either frame.
# Missing publishers are dropped first, because a NaN keyword is not a string
# and cannot be used in a substring test against a token.
press_name = list(
    set(copy_daum_df['publisher_id'].dropna().unique())
    | set(copy_naver_df['publisher_id'].dropna().unique())
)

# Suffixes stripped from the end of tokens, in this order. Every entry needs its
# own trailing comma: two adjacent string literals are silently concatenated
# ('경향비즈' '영상' would become the single entry '경향비즈영상').
stopword = ['기자', 'com', '.co', '저작권', '무단', '전재', '재배포', 'Copyr', 'copyr', '경향비즈',
        '영상', '취재', '편집', '문의', '금지', '특파원', '아이뉴스', '한경', '뉴스', '보도합니다']

In [10]:
def _clean_tokens(words):
    """Run process_words with the publisher keywords and the suffix stopword list."""
    return process_words(words, press_name, stopword)


# Split each cleaned body on whitespace, then clean the resulting tokens.
for frame in (copy_daum_df, copy_naver_df):
    frame['split_word'] = frame['clear_str'].progress_apply(str.split)
    frame['processed_words'] = frame['split_word'].progress_apply(_clean_tokens)

# Store the number of tokens that survived cleaning.
for frame in (copy_daum_df, copy_naver_df):
    frame['word_ctn'] = frame['processed_words'].progress_apply(len)

100%|██████████| 760612/760612 [00:17<00:00, 43193.95it/s]
100%|██████████| 760612/760612 [23:05<00:00, 548.81it/s]  
100%|██████████| 255019/255019 [00:10<00:00, 23657.10it/s]
100%|██████████| 255019/255019 [09:13<00:00, 460.87it/s]
100%|██████████| 760612/760612 [00:00<00:00, 1497546.34it/s]
100%|██████████| 255019/255019 [00:00<00:00, 1461255.33it/s]


In [11]:
def _split_sentences(sentence):
    """Split text on periods that have no digit directly before or after them.

    Blank fragments are discarded: re.split returns an empty string after a
    trailing period (and [''] for empty text), which would otherwise be stored
    and counted as a sentence.
    """
    return [part for part in re.split(r'(?<!\d)\.(?!\d)', sentence) if part.strip()]


# Rebuild one text per article from its cleaned tokens.
for frame in (copy_daum_df, copy_naver_df):
    frame['clear_sentence'] = frame['processed_words'].progress_apply(lambda words: ' '.join(words))

# Store the character count, the list of sentences and the sentence count.
for frame in (copy_daum_df, copy_naver_df):
    frame['letter_ctn'] = frame['clear_sentence'].progress_apply(len)
    frame['clear_sentence_split'] = frame['clear_sentence'].progress_apply(_split_sentences)
    frame['sentence_ctn'] = frame['clear_sentence_split'].progress_apply(len)

100%|██████████| 760612/760612 [00:05<00:00, 141453.81it/s]
100%|██████████| 255019/255019 [00:02<00:00, 126386.10it/s]
100%|██████████| 760612/760612 [00:00<00:00, 1400614.89it/s]
100%|██████████| 760612/760612 [00:29<00:00, 25909.96it/s]
100%|██████████| 760612/760612 [00:00<00:00, 1648418.96it/s]
100%|██████████| 255019/255019 [00:00<00:00, 1275085.58it/s]
100%|██████████| 255019/255019 [00:06<00:00, 41160.14it/s]
100%|██████████| 255019/255019 [00:00<00:00, 1604521.85it/s]


In [12]:
# Drop rows whose body is empty or whitespace-only.
# .copy() makes each filtered result an independent frame, so adding columns to it
# afterwards does not raise SettingWithCopyWarning.
copy_daum_df = copy_daum_df[copy_daum_df['content'].str.strip().astype(bool)].copy()
copy_naver_df = copy_naver_df[copy_naver_df['content'].str.strip().astype(bool)].copy()

In [22]:
# Unique-id builder
def make_tag_id(url):
    """Derive an article id from its URL.

    * URL containing 'daum': 'd' + the last path segment without its first four
      characters.
    * URL containing 'naver': 'n' + the last two path segments concatenated.
    * Anything else, including non-string values such as NaN and URLs with too
      few path segments: the sentinel 'unknwon_id'.

    Args:
        url: the article URL; expected to be a string.

    Returns:
        str: the derived id, or 'unknwon_id' when none can be built.
    """
    # The sentinel spelling is kept exactly as it was so previously written
    # data and any consumer comparing against it stay compatible.
    unknown = 'unknwon_id'

    # A missing URL read from CSV is a float NaN; `'daum' in url` would raise on it.
    if not isinstance(url, str):
        return unknown

    parts = url.split('/')
    if 'daum' in url:
        return f'd{parts[-1][4:]}'
    # The naver id needs two path segments; without a '/' there is no parts[-2].
    if 'naver' in url and len(parts) >= 2:
        return f'n{parts[-2]}{parts[-1]}'
    return unknown

In [24]:
# Add the derived article id to both frames as column 'tag'.
for frame in (copy_daum_df, copy_naver_df):
    frame['tag'] = frame['article_url'].apply(make_tag_id)

In [27]:
# Columns kept in the exported data set, identical for both sources.
export_columns = ['tag','publication_date','article_url','letter_ctn', 'word_ctn', 'sentence_ctn', 'clear_sentence_split']

prog_copy_daum_df = copy_daum_df[export_columns]
prog_copy_naver_df = copy_naver_df[export_columns]

# Stack the Daum rows on top of the Naver rows; each frame keeps its own index labels.
all_prog_copy_df = pd.concat([prog_copy_daum_df, prog_copy_naver_df])

In [28]:
# Write the merged frame to CSV without the index column, encoded as 'utf-8-sig'
# (UTF-8 with a byte-order mark).
save_path = './data/01. preprocessed_data.csv'
all_prog_copy_df.to_csv(save_path, index=False, encoding='utf-8-sig')