In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
# Location of the raw news CSV, relative to the notebook's working directory
open_path = './news_data.csv'

# Load the raw articles; this frame is kept as-is and copied before any cleaning
df = pd.read_csv(open_path)

In [3]:
# Work on a copy so the originally loaded frame `df` is not modified by later cells
copy_df = df.copy()
# Print column names, non-null counts and dtypes of the working copy
copy_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5624 entries, 0 to 5623
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   title     5624 non-null   object
 1   category  5624 non-null   object
 2   url       5624 non-null   object
 3   date      5624 non-null   int64 
 4   press     5624 non-null   object
 5   reporter  5621 non-null   object
 6   content   5588 non-null   object
dtypes: int64(1), object(6)
memory usage: 307.7+ KB


In [4]:
# De-duplicate articles, then drop rows that have no body text.
# The passes run in this order (url, title, content) and each keeps the first occurrence.
for dedup_key in ('url', 'title', 'content'):
    copy_df.drop_duplicates(subset=dedup_key, keep='first', inplace=True)
copy_df.dropna(subset=['content'], inplace=True)


# Remove <...> and [...] spans from a body string, then trim surrounding whitespace
def _strip_markup(text):
    without_tags = re.sub(r'<.*?>', '', text)
    without_brackets = re.sub(r'\[.*?\]', '', without_tags)
    return without_brackets.strip()


copy_df['clear_str'] = copy_df['content'].apply(_strip_markup)

In [5]:
# Per-word cleanup function
def process_words(words, del_words, stopwords):
    """Clean the whitespace-split tokens of one article.

    Each token is handled in order:

    1. A token equal to '기자' is dropped together with the most recently
       kept word (presumably the reporter's name in a byline - confirm).
    2. A token containing any entry of ``del_words`` as a substring is dropped.
    3. A token containing '@' is dropped.
    4. Every character that is not a word character, whitespace or '.' is removed.
    5. A token containing an ASCII letter is lower-cased.
    6. Each entry of ``stopwords`` is stripped from the end of the token, in
       list order (so several suffixes can be removed one after another).
    7. Tokens of length 0 or 1 are dropped.

    Args:
        words: Iterable of string tokens.
        del_words: Iterable of substrings; empty or non-string entries are ignored.
        stopwords: Iterable of suffixes; empty or non-string entries are ignored.

    Returns:
        list[str]: The surviving, cleaned tokens in their original order.
    """
    # Empty entries would match every token ('' in word is always True and
    # word[:-0] is ''), and non-strings such as NaN raise TypeError on `in`,
    # so they are filtered out once up front.
    del_words = [entry for entry in del_words if isinstance(entry, str) and entry]
    stopwords = [entry for entry in stopwords if isinstance(entry, str) and entry]

    processed_words = []
    for word in words:
        if word == '기자':
            # Also discard the word kept immediately before it.
            if processed_words:
                processed_words.pop()
            continue

        if any(del_word in word for del_word in del_words):
            continue

        if '@' in word:
            continue

        # Keep word characters, whitespace and '.', which is needed later for
        # sentence splitting.
        word = re.sub(r'[^\w\s.]', '', word)

        if re.search('[a-zA-Z]', word):
            word = word.lower()

        for stopword in stopwords:
            if word.endswith(stopword):
                word = word[:-len(stopword)]

        # Single characters and empty leftovers carry no information.
        if len(word) > 1:
            processed_words.append(word)
    return processed_words

In [6]:
# Word lists used for cleanup
# press_name: every distinct value of the 'press' column; later passed to
# process_words as del_words, so any token containing one of them is dropped
press_name = copy_df['press'].unique()
# Suffixes stripped from the end of each word by process_words.
# Every entry needs its own trailing comma: adjacent string literals are
# silently concatenated by Python, which would merge two entries into one.
stopword = ['기자', 'com', '.co', '저작권', '무단', '전재', '재배포', 'Copyr', 'copyr', '경향비즈',
        '영상', '취재', '편집', '문의', '금지', '특파원', '아이뉴스', '한경', '뉴스', '보도합니다']

In [7]:
# Tokenise the cleaned body on whitespace
copy_df['split_word'] = copy_df['clear_str'].str.split()
# Clean every token list: press names act as del_words, `stopword` as the suffix list
copy_df['processed_words'] = copy_df['split_word'].apply(
    process_words, args=(press_name, stopword)
)
# Number of words that survived the cleanup
copy_df['word_ctn'] = copy_df['processed_words'].apply(len)

In [8]:
# Rebuild each article as one space-joined string of cleaned words
copy_df['clear_sentence'] = copy_df['processed_words'].apply(' '.join)
# Character count of the rebuilt text
copy_df['letter_ctn'] = copy_df['clear_sentence'].apply(len)
# Split on '.' unless the dot has a digit directly before or after it (e.g. "3.5")
sentence_boundary = re.compile(r'(?<!\d)\.(?!\d)')
copy_df['clear_sentence_split'] = copy_df['clear_sentence'].apply(sentence_boundary.split)
# Number of fragments produced by the split.
# NOTE(review): text ending in '.' yields a trailing empty fragment, which is counted too.
copy_df['sentence_ctn'] = copy_df['clear_sentence_split'].apply(len)

In [9]:
# Keep only rows whose raw content is non-empty once surrounding whitespace is trimmed
has_content = copy_df['content'].str.strip().astype(bool)
copy_df = copy_df[has_content]

In [10]:
# Unique-id builder
def make_tag_id(url):
    """Build a short unique tag for an article from its URL.

    * If 'daum' appears in the URL: 'd' followed by the last path segment
      with its first four characters removed.
    * Otherwise, if 'naver' appears in the URL: 'n' followed by the last two
      path segments concatenated.

    Args:
        url: Article URL string.

    Returns:
        str: The tag.

    Raises:
        ValueError: If the URL contains neither 'daum' nor 'naver'. The
            original code failed here with an UnboundLocalError.
    """
    segments = url.split('/')
    if 'daum' in url:
        return f'd{segments[-1][4:]}'
    if 'naver' in url:
        return f'n{segments[-2]}{segments[-1]}'
    raise ValueError(f'cannot build tag id: url is neither daum nor naver: {url!r}')

In [11]:
# Attach the unique id column.
# The URLs are read from copy_df itself rather than the unfiltered `df`, so
# only the rows that survived cleaning are processed and nothing depends on
# implicit index alignment between the two frames. `assign` returns a new
# frame, which avoids SettingWithCopyWarning on the boolean-filtered copy_df.
copy_df = copy_df.assign(tag=copy_df['url'].apply(make_tag_id))

In [12]:
save_path = './preprocessed_news_data.csv'
# Columns written to the output file, in this order
export_columns = [
    'tag',
    'date',
    'url',
    'letter_ctn',
    'word_ctn',
    'sentence_ctn',
    'clear_sentence_split',
]
# No index column is written; the 'utf-8-sig' codec prefixes the file with a BOM
copy_df[export_columns].to_csv(save_path, index=False, encoding='utf-8-sig')