In [2]:
import pandas as pd
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
import os 
import re

# 임의 데이터 형식 통일 
# 하나의 파일로 데이터 다 통합하기

"""뉴스 영상 스크래핑 구간 
위기발생 - 1달 ~ 위기 대응 + 1달

유튜브 영상 스크래핑 구간
1. 위기발생 전 2년 ~ 위기 발생 전날
2. 위기 발생 당일 ~ 위기 대응 전날
3. 위기 대응 날 ~ 1달"""

information = [["아이린","2020.10.20","2020.10.22"],["조현아","2014.12.05","2014.12.08"],["박나래","2021.03.24","2021.03.25"],["설현","2016.05.03","2016.05.12"],
               ["양팡","2020.08.05","2020.08.08"],["강민경","2020.07.15","2020.07.17"], ["유희열","2022.06.14","2022.06.14"],
               ["홍진영","2020.11.05","2020.11.06"],["설민석","2020.12.29","2020.12.29"]]

In [2]:
# 파일 병합 전처리 과정
import chardet

path = os.getcwd()

total_naver = []
total_yt_transcript = []
total_yt_comment = []

for info in information:
    query_path = path + "/text_file" + "/" + info[0]

    for file_name in os.listdir(query_path):
        file_path = query_path + "/" + file_name
        print(file_name)
        
        # check 
        with open(file_path, 'rb') as f:
            result = chardet.detect(f.read())
            detected_encoding = result['encoding']
        try:
            df = pd.read_csv(file_path, encoding=detected_encoding)
            df_add = pd.DataFrame({"query":[info[0]] * len(df)})
            df = pd.concat([df,df_add],axis=1)
        
        except pd.errors.EmptyDataError:
            print(f"Skipping malformed or empty file: {file_name}")
            continue
         
        if "with_comments" in file_name:
            total_naver.append(df)
            print("comment +1")
        
        elif "youtube_transcript" in file_name:
            total_yt_transcript.append(df)
            print("transcript +1")

        elif "youtube_comment" in file_name:
            total_yt_comment.append(df)
            print("comment +1")
        
yt_transcript = pd.concat(total_yt_transcript, ignore_index=True)
yt_transcript.to_csv("yt_transcript.csv", index=False)

yt_comment = pd.concat(total_yt_comment, ignore_index=True)
yt_comment.to_csv("yt_comment.csv", index=False)

naver = pd.concat(total_naver, ignore_index = True)

def clean_text(text):
    if not isinstance(text, str):
        return text  # Return as-is if not a string
    
    text = re.sub("작성자에 의해 삭제된 댓글입니다\. \|", "", text)
    text = re.sub("클린봇이 부적절한 표현을 감지한 댓글입니다\. \|", "", text)
    text = re.sub("작성자에 의해 삭제된 댓글입니다\.", "", text)
    text = re.sub("클린봇이 부적절한 표현을 감지한 댓글입니다\.", "", text)


    return text

naver["comments"] = naver["comments"].apply(clean_text)

naver.to_csv('naver.csv', index=False)

  text = re.sub("작성자에 의해 삭제된 댓글입니다\. \|", "", text)
  text = re.sub("클린봇이 부적절한 표현을 감지한 댓글입니다\. \|", "", text)
  text = re.sub("작성자에 의해 삭제된 댓글입니다\.", "", text)
  text = re.sub("클린봇이 부적절한 표현을 감지한 댓글입니다\.", "", text)


아이린_success_2020-09-20.csv
아이린_youtube_transcript_2018-10-19T00:00:00Z_to_2020-10-19T23:59:59Z.csv
transcript +1
아이린_youtube_transcript_2020-10-20T00:00:00Z_to_2020-10-21T23:59:59Z.csv
transcript +1
아이린_youtube_comment_2020-10-22T00:00:00Z_to_2020-12-22T23:59:59Z.csv
comment +1
아이린_with_comments_2020-09-20.csv
comment +1
아이린_youtube_comment_2020-10-20T00:00:00Z_to_2020-10-21T23:59:59Z.csv
comment +1
아이린_youtube_comment_2018-10-19T00:00:00Z_to_2020-10-19T23:59:59Z.csv
comment +1
아이린_youtube_transcript_2020-10-22T00:00:00Z_to_2020-12-22T23:59:59Z.csv
transcript +1
아이린_links_2020-09-20.csv
아이린_exception_2020-09-20.csv
조현아_youtube_transcript_2014-12-08T00:00:00Z_to_2015-01-08T23:59:59Z.csv
transcript +1
조현아_success_2014-11-05.csv
조현아_exception_2014-11-05.csv
조현아_youtube_comment_2014-12-08T00:00:00Z_to_2015-01-08T23:59:59Z.csv
comment +1
조현아_youtube_transcript_2012-12-04T00:00:00Z_to_2014-12-04T23:59:59Z.csv
transcript +1
조현아_l

In [3]:
columns = ['query', 'url', 'comment']
naver_comment = pd.DataFrame(columns=columns)

for i in range(len(naver)):
    comment_list = naver.loc[i,"comments"]
    if isinstance(comment_list, str):
        comment_list = naver.loc[i,"comments"].split("|")
        repeat = len(comment_list)
    else:
        repeat = 1
    
    
    adding_df = pd.DataFrame({"query" : repeat *[naver.loc[i,"query"]], "url" : repeat *[naver.loc[i,"URL"]], "comment" : comment_list})
    naver_comment = pd.concat([naver_comment,adding_df],ignore_index=True)

naver_comment.to_csv("naver_divided_comment.csv",index=False)