# 1. 데이터 로드

In [1]:
#  버전 설치
# !pip install -r requirements.txt

In [1]:
import os

import asyncio
import aiohttp
import time
import pandas as pd
import numpy as np
from dtw import *
import pickle
from pytrends.request import TrendReq
from concurrent.futures import ThreadPoolExecutor
from pytz import timezone
from datetime import datetime
from pytrends.request import TrendReq
import nest_asyncio
from models.naver.blog import activity_rate
from api_set import APIClient
import utils
import models.crawling.trend as trend 
from models.crawling.collect_keywords import collect_keywords
from models.crawling.google_trend import collect_rising_keywords
from models.naver.news import main_news 
from models.crawling.select_keyword import select_keyword, rising_keyword_analysis, monthly_rule
from models.anaysis import execute_analysis , process_results

Importing the dtw module. When using in academic works please cite:
  T. Giorgino. Computing and Visualizing Dynamic Time Warping Alignments in R: The dtw Package.
  J. Stat. Soft., doi:10.18637/jss.v031.i07.



# 2. API설정

In [2]:

# API 설정
BASE_URL = utils.get_secret("BASE_URL")
CUSTOMER_ID = utils.get_secret("CUSTOMER_ID")
API_KEY = utils.get_secret("API_KEY")
SECRET_KEY = utils.get_secret("SECRET_KEY")
URI = utils.get_secret("URI")
METHOD = utils.get_secret("METHOD")
# API 클라이언트 인스턴스 생성
api_client = APIClient(BASE_URL, CUSTOMER_ID, API_KEY, SECRET_KEY,URI,METHOD)


# 3. 연관검색어 수집

In [3]:
# 키 로드
keywords_data = utils.load_keywords('main_keyword.json')

# 오늘의 날짜 가져오기
formatted_today, day = utils.get_today_date()


utils.make_directory('./data')
utils.make_directory('./data/rl_srch')
utils.make_directory(f'./data/rl_srch/{day}')  # 키워드별 연관검색어 리스트 저장

# 검색어 리스트와 결과 저장 경로 설정
srch_keyword = ['keyword_final']  
save_path = './data/rl_srch/'  

In [4]:


nest_asyncio.apply()

async def main(srch_keyword, day):
    # 오늘 날짜로 폴더 경로 생성
    folder_path = './data/rl_srch/' + datetime.now().strftime('%y%m%d')
    file_path = f"{folder_path}/collected_keywords.csv"
    
    # 폴더가 존재하는지 확인
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
    
    # 파일이 존재하는지 확인
    if os.path.isfile(file_path):
        # 파일이 존재하면, 데이터를 읽어옵니다.
        collected_keywords_data = pd.read_csv(file_path)
    else:
        # 파일이 없으면, collect_keywords 함수를 호출해서 데이터를 수집합니다.
        collected_keywords_data = await collect_keywords(srch_keyword, day)
        # 결과를 CSV로 저장
        collected_keywords_data.to_csv(file_path, index=False)
    
    return collected_keywords_data
collected_keywords_data=asyncio.run(main(srch_keyword, day))

collected_keywords_dat_copy=asyncio.run(main(srch_keyword, day))

In [5]:
## 중복검색어컬럼 생성, 중복제거하고 각 키워드별로 50개씩 집계하는 로직 (함수화 필요할까 고민해야댐)

# 1. collected_keywords_data의 복사본 생성
temp_df = collected_keywords_data.copy()

# 2. 새로운 컬럼 '중복검색어' 추가 (초기값으로 빈 문자열 할당)
temp_df['중복검색어'] = ''

# 3. 연관키워드별로 해당하는 모든 검색어를 찾는 딕셔너리 생성
keywords_dict = {}
for index, row in temp_df.iterrows():
    associated_keyword = row['연관키워드']
    search_keyword = row['검색어']
    if associated_keyword in keywords_dict:
        # 이미 리스트에 있는 경우 중복을 피하기 위해 추가하지 않음
        if search_keyword not in keywords_dict[associated_keyword]:
            keywords_dict[associated_keyword].append(search_keyword)
    else:
        # 새로운 키워드인 경우 리스트 초기화
        keywords_dict[associated_keyword] = [search_keyword]

# 4. '중복검색어' 컬럼을 채워 넣음
for index, row in temp_df.iterrows():
    associated_keyword = row['연관키워드']
    # 연관키워드에 해당하는 모든 검색어를 '중복검색어' 컬럼에 할당
    temp_df.at[index, '중복검색어'] = ','.join(keywords_dict[associated_keyword])
df_list_test = []
already_selected = set()
for _, group in temp_df.groupby('검색어'):
    selected_rows = []  # Collect rows to append
    for index, row in group.iterrows():
        if row['연관키워드'] not in already_selected:
            selected_rows.append(row)
            already_selected.add(row['연관키워드'])
        if len(selected_rows) >= 50:
            break
    # Append all selected rows at once to improve performance
    filtered_group = pd.DataFrame(selected_rows)
    df_list_test.append(filtered_group)
collected_keywords_data = pd.concat(df_list_test, ignore_index=True)



In [6]:

# df_list = [group for _, group in collected_keywords_data.groupby('검색어')]
# collected_keywords_data = utils.merge_and_mark_duplicates_limited(df_list)


In [7]:
matching_count = (collected_keywords_data['연관키워드'] == 'S&P500ETF').sum()

matching_count

1

In [8]:
collected_keywords_data= utils.add_client_info(collected_keywords_data)
new_columns = ['일별급상승', '주별급상승', '월별급상승', '주별지속상승', '월별지속상승', '월별규칙성']

for column in new_columns:
    collected_keywords_data[column] = 0

In [9]:
duplicates = collected_keywords_data['연관키워드'].duplicated(keep=False)

# 중복된 '연관키워드'가 있는지 여부를 출력
if duplicates.any():
    print("중복된 '연관키워드'가 존재합니다.")
    # 중복된 '연관키워드'를 보고 싶다면 다음 코드를 사용
    print(collected_keywords_data[duplicates]['연관키워드'].unique())
else:
    print("중복된 '연관키워드'가 존재하지 않습니다.")

중복된 '연관키워드'가 존재하지 않습니다.


In [10]:
def groupped_df(name,collected_keywords_data):
    grouped = collected_keywords_data.groupby(name)
    df_list = [group for _, group in grouped]
    return df_list
df_list=groupped_df('id',collected_keywords_data)
n=len(df_list)
print(n)

5


In [11]:

# 데이터를 로드하거나 크롤링하여 반환하는 비동기 함수
async def load_or_crawl_data(df_list, clients):
    today_date_str = datetime.now().strftime("%y%m%d")
    directory = f"./data/trend_data/{today_date_str}"
    save_path = f"{directory}/data_{today_date_str}.pkl"
    
    # 파일이 존재하면 데이터 로드
    if os.path.exists(save_path):
        with open(save_path, 'rb') as f:
            results = pickle.load(f)
    else:
        # 파일이 없으면 비동기 크롤링 시작
        results = await run_all(df_list, clients)
        # 결과 데이터 저장
        if not os.path.exists(directory):
            os.makedirs(directory)
        with open(save_path, 'wb') as f:
            pickle.dump(results, f)
    
    return results

# 비동기 크롤링 함수
async def trend_main(df, clients):
    params = {
        "search_keywords": list(df['연관키워드']),
        "id": df['id'].iloc[0],
        "pw": df['pw'].iloc[0],
        "api_url": "https://openapi.naver.com/v1/datalab/search",
        "name": '연관검색어'
    }
    api_url = "https://openapi.naver.com/v1/datalab/search"
    
    # trend_maincode 함수 실행
    results = await trend.trend_maincode(params, clients, api_url)
    return results

async def run_all(df_list, clients):
    tasks = [trend_main(df, clients) for df in df_list]
    results = await asyncio.gather(*tasks)
    return results

clients = utils.get_secret("clients")  # clients 정보를 로드

# 이벤트 루프 실행 및 데이터 로드 또는 크롤링
trend_main_data = asyncio.run(load_or_crawl_data(df_list, clients))
results = trend_main_data.copy()

직렬로 처리

In [12]:

# start_time = time.time()
# select_periods = ['daily', 'weekly', 'month']
# rising_periods=['weekly', 'month']

# formatted_today, today_date = utils.get_today_date()
# month_rule_list=[]
# select_list=[[],[],[]]

# rising_list=[[],[]]
# rising_month_list=[]


# i = 0
# # 일별, 주별, 월별 키워드 선택 실행
# for period in select_periods:
#     for keyword_df_group in results:
#         for keyword_df in keyword_df_group:
#             selected_tmp, selected_graph, selected_info = select_keyword(keyword_df, today_date, period)
#             if selected_graph is not None:
#                 # 데이터프레임의 열 이름을 출력합니다.
#                 selected_graph['InfoData'] = selected_info
#                 select_list[i].append(selected_graph)
#             else:
#                 pass
#     i += 1
# # 월별, 주별, 일별 키워드 분석 실행

#     # 각 분석 기간에 대해 결과 집합을 순회합니다.
# for keyword_group in results:
#     # 키워드 그룹의 각 키워드 데이터프레임에 대해 순회합니다.
#     for keyword_data in keyword_group:
#         # 월별 규칙을 적용하여 결과를 가져옵니다.
#         monthly_data, monthly_chart, similarity_rate, rising_months = monthly_rule(keyword_data, today_date, 'month')
        
#         if monthly_data is not None:
#             # 결과 데이터프레임의 열 이름을 가져옵니다.
#             column_names = monthly_data.columns
#             rising_month_list.append([rising_months,column_names[0]])
#             # 결과 데이터프레임에서 값 리스트를 추출합니다.
#             data_values_list = monthly_data[column_names].values
#             # 월별 차트에 데이터 값을 추가합니다.
#             monthly_chart['Indicator'] = data_values_list
#             monthly_chart['InfoData'] = similarity_rate
#             # 상승 월 정보를 추가합니다. 상승 월이 없는 경우 0으로 설정합니다.
#             monthly_chart['RisingMonth'] = 0
            
#             # 최종 결과 리스트에 수정된 월별 차트를 추가합니다.
#             month_rule_list.append(monthly_chart)
                
# # 주별, 월별 상승 키워드 분석 실행
# rising_analysis_periods = ['weekly', 'month']
# i=0
# for period in rising_analysis_periods:
#     for keyword_df_group in results:
#         for keyword_df in keyword_df_group:
#             rising_tmp, rising_graph, rising_info = rising_keyword_analysis(keyword_df, today_date, period)
#             if rising_tmp is not None:
#                 column_names=rising_tmp.columns
#                 data_values_list = rising_tmp[column_names].values
#                 rising_graph['Indicator'] = data_values_list
#                 rising_graph['InfoData'] = rising_info

#                 rising_list[i].append(rising_graph)
#     i=i+1




# end_time = time.time()
# print(f"Analysis completed in {end_time - start_time} seconds.")

병렬로 처리

In [13]:

# 전역 변수로 리스트 초기화
month_rule_list_a = []
rising_list_a = [[], []]  # 주별 상승, 월별 상승
select_list_a = [[], [], []]  # 일별 선택, 주별 선택, 월별 선택
future = execute_analysis(results,month_rule_list_a,rising_list_a,select_list_a)


month_rule_list=[]
select_list=[[],[],[]]

rising_list=[[],[]]
rising_month_list=[]


# 각 리스트를 처리
select_list[0] = process_results(select_list_a[0])
select_list[1] = process_results(select_list_a[1])
select_list[2] = process_results(select_list_a[2])

rising_list[0] = process_results(rising_list_a[0])
rising_list[1] = process_results(rising_list_a[1])

# month_rule_list_a를 처리하면서 추가 데이터 처리를 포함
for result in month_rule_list_a:
    if not all(value is None for value in result) and result[0] is not None:
        column_names = result[0].columns
        data_values_list = result[0][column_names].values
        additional_data = {
            'Indicator': data_values_list,
            'RisingMonth': 0,
            '유형': '월별규칙성'  # 모든 결과에 대해 '유형'을 '월별규칙성'으로 설정
        }
        month_rule_list += process_results([result], additional_data=additional_data)


# Graph_result

In [14]:

# 리스트와 유형을 매핑
lists_and_types = [
    (select_list[0], '일별급상승'),
    (select_list[1], '주별급상승'),
    (select_list[2], '월별급상승'),
    (rising_list[0], '주별지속상승'),
    (rising_list[1], '월별지속상승'),
    (month_rule_list, '월별규칙성')
]


# 모든 리스트를 처리하고 하나의 데이터프레임으로 병합
processed_dfs = [utils.process_and_concat(df_list, label) for df_list, label in lists_and_types]

# 비어 있지 않은 DataFrame들만 병합
graph_result = pd.concat([df for df in processed_dfs if not df.empty]).reset_index(drop=True)

graph_result.reset_index(drop=True, inplace=True)
# 불필요한 컬럼 삭제 및 '주간지속상승'을 '주별지속상승'으로 수정

graph_result = graph_result.drop(columns=['InfoData'])
graph_result['유형'].replace({'주간지속상승': '주별지속상승'}, inplace=True)

# 정렬
graph_result.sort_values(by=['연관검색어', '유형', '검색일자'], ascending=[True, True, True], inplace=True)

# 최종 결과 출력
graph_result.reset_index(drop=True, inplace=True)




The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  graph_result['유형'].replace({'주간지속상승': '주별지속상승'}, inplace=True)


In [15]:
flags_and_lists = [
    ("일별 급상승", select_list[0]),
    ("주별 급상승", select_list[1]),
    ("주별 지속상승", rising_list[0]),
    ("월별 급상승", select_list[2]),
    ("월별 지속상승", rising_list[1]),
    ("월별 규칙성", month_rule_list),
]
for flag_name, data_list in flags_and_lists:
    # data_list가 리스트인지 확인
    if not isinstance(data_list, list):
        print(f"{flag_name}: data_list가 리스트가 아닙니다.")
        continue
    
    # data_list 내의 각 요소가 DataFrame인지, '연관검색어' 컬럼이 있는지 확인
    for idx, df in enumerate(data_list):
        if not isinstance(df, pd.DataFrame):
            print(f"{flag_name}: 인덱스 {idx}에 DataFrame이 아닌 요소가 있습니다.")
        elif "연관검색어" not in df.columns:
            print(f"{flag_name}: 인덱스 {idx}의 DataFrame에 '연관검색어' 컬럼이 없습니다.")

# utils.update_keywords_flag 함수를 호출하기 전에 각 data_list의 유효성 검사
for flag_name, data_list in flags_and_lists:
    # 데이터 프레임으로 구성된 리스트만 유지
    valid_data_list = [df for df in data_list if isinstance(df, pd.DataFrame) and "연관검색어" in df.columns]
    
    # 유효한 데이터 리스트만을 사용하여 키워드 플래그 업데이트
    utils.update_keywords_flag(collected_keywords_data, valid_data_list, flag_name)


In [16]:
 # process_data : 지정된 조건에 따라 데이터를 필터링하고, 추가 처리를 통해 최종 데이터프레임을 반환하는 함수.
def safe_process_data(process_function, data, category1, category2, selection):
    """
    process_function: 데이터 처리 함수 (예: utils.process_data)
    data: 처리할 데이터프레임
    category1, category2: 데이터 처리 함수에 전달될 카테고리 인자
    selection: 데이터 처리 함수에 전달될 선택 리스트 또는 기타 인자
    
    반환값: 처리된 데이터프레임 또는 빈 데이터프레임
    """
    if data is not None and not data.empty:
        try:
            return process_function(data, category1, category2, selection)
        except Exception as e:
            print(f"Error processing data: {e}")
            # 처리 중 오류가 발생한 경우 빈 데이터프레임 반환
            return pd.DataFrame()
    else:
        print("No data available.")
        return pd.DataFrame()

info_result_daily_select = safe_process_data(utils.process_data, collected_keywords_data, '일별 급상승', '일별 급상승', select_list[0])
info_result_weekly_select = utils.process_data(collected_keywords_data, '주별 급상승', '주별 급상승', select_list[1])
info_result_monthly_select = utils.process_data(collected_keywords_data, '월별 급상승', '월별 급상승', select_list[2]) 

info_result_weekly_continuous = utils.process_data(collected_keywords_data, '주별 지속상승', '주별 지속상승', rising_list[0])

info_result_monthly_continuous = utils.process_data(collected_keywords_data, '월별 지속상승', '월별 지속상승', rising_list[1])

info_result_monthly_pattern = utils.process_data(collected_keywords_data, '월별 규칙성', '월별 규칙성', month_rule_list)

info_result_final = pd.concat([info_result_daily_select,info_result_weekly_select, info_result_monthly_select,\
                               info_result_weekly_continuous, info_result_monthly_continuous,\
                                  info_result_monthly_pattern]).reset_index(drop=True)

# 구글/ 네이버 한꺼번에

In [17]:
info_result_final

Unnamed: 0,연관키워드,월간검색수_합계,중복검색어,일별 급상승,주별 급상승,주별 지속상승,월별 급상승,월별 지속상승,월별 규칙성,유형,지표,상승월
0,한국비엔씨주가,123380.0,"달러환율,ELS",1.0,,,,,,일별 급상승,452.38%,
1,금리인하,63700.0,"미국주식,WTI,미국금리",1.0,1.0,,1.0,,,일별 급상승,56.16%,
2,삼성전자주가전망,19480.0,"금리,재테크,테마주,특징주,공모주,배당주,주가지수,달러환율,ETF",1.0,,,,,,일별 급상승,94.01%,
3,바이비트수수료,3000.0,돈버는앱,1.0,,,,,,일별 급상승,9.24%,
4,DEX거래소,4370.0,"디지털자산,암호화폐",1.0,1.0,,1.0,,,일별 급상승,60.74%,
...,...,...,...,...,...,...,...,...,...,...,...,...
779,퇴사통보기간,15660.0,퇴직연금,,,,,,1.0,월별 규칙성,92.7%,
780,퇴사사유,10390.0,퇴직연금,,,,1.0,,1.0,월별 규칙성,92.49%,
781,사직서작성방법,7930.0,퇴직연금,,,,,,1.0,월별 규칙성,93.39%,
782,일용직퇴직금,6610.0,퇴직연금,,,,,,1.0,월별 규칙성,91.96%,


##### 뉴스링크,제목 수집 (네이버)

In [18]:
async def collect_google_keywords(target_keywords):
    today_date = datetime.now().strftime("%y%m%d")
    directory_path = f"./data/trend_data/{today_date}"
    file_path = os.path.join(directory_path, f"google_data_{today_date}.pkl")
    
    if not os.path.exists(directory_path):
        os.makedirs(directory_path)
    
    if os.path.exists(file_path):
        with open(file_path, 'rb') as file:
            rising_keywords_results = pickle.load(file)
    else:
        rising_keywords_results = await collect_rising_keywords(target_keywords)
        with open(file_path, 'wb') as file:
            pickle.dump(rising_keywords_results, file)
    
    return rising_keywords_results

async def collect_news_keywords(target_keywords):
    today_date = datetime.now().strftime("%y%m%d")
    directory_path = f"./data/trend_data/{today_date}"
    file_path = os.path.join(directory_path, f"news_data_{today_date}.pkl")
    
    if not os.path.exists(directory_path):
        os.makedirs(directory_path)
    
    if os.path.exists(file_path):
        with open(file_path, 'rb') as file:
            news_data = pickle.load(file)
    else:
        news_data = await main_news(target_keywords)
        with open(file_path, 'wb') as file:
            pickle.dump(news_data, file)
    
    return news_data

# 메인 비동기 실행 함수
async def main(target_keywords):
    google_keywords_results, news_keywords_results = await asyncio.gather(
        collect_google_keywords(target_keywords),
        collect_news_keywords(target_keywords)
    )
    
    return google_keywords_results, news_keywords_results
target_keywords = list(set(info_result_final['연관키워드']))
print(target_keywords)

rising_keywords_results,news_data=asyncio.run(main(target_keywords))

['PYTHON', '씨씨에스주가', '비트코인사는법', 'ISA계좌서민형', '아파트', '주식사이트', '일본주식', '소득세법', '코인선물', '정보공개', '김프', '월배당ETF', '비트코인선물', '지급명령강제집행', '보험설계사', '근저당설정해지', '고시원', 'NPL', 'KB다이렉트', '청창사', '콜드월렛', '건강보험종류', 'KB', '주식차트사이트', '1억투자', 'NH투자증권', '코인종류', '미국채권ETF', 'AI주식', '대출금리계산기', '금시세1돈', '대여금반환청구소송', '비트코인거래소', '데이터분석', '상속한정승인', '하이닉스주가', 'PEF', '코인추천', '마멘토', '코인거래소', '비트코인찾기', '코인단타', '아이큐코인', '채권양도통지서', '사직', '2월신용카드캐시백', '전세자금대출금리비교', 'ISA계좌수수료', 'HLB주가', '해외선물투자', '연말정산세액공제', '카드이벤트', '신용정보', '상속인', 'IRP퇴직연금수령', 'ISC주가', '금시세18K', '배당기준일', '사망보험', '금투자', '자동차다이렉트', '모바일신용카드', '공모전', '24K금값', '3월신용카드이벤트', '코인거래소순위', '퇴직연금계좌', '자동차보험비교견적사이트', '디지털트윈', '레고켐바이오주가', '대부업사업자등록', '복리예금', 'AMD주가', '미국채ETF', '핸드폰부업', '지분경매', '일자리사이트', '코인하는법', '한미반도체주가', '공제보험', 'INSURANCE', 'KODEX미국S&P500TR', '배당주순위', '플레이댑', 'TSMC주가', '비갱신암보험', '바이비트수수료', '청구이의의소', 'SCHD주가', 'CMA이자', '퇴사통보기간', '주식계산기', '울산다운2지구우미린모델하우스', '전고체관련주', '적금추천', '사업자금대출', '정부지원', '금1돈시세', '관세', '아파트월세', '줍줍분양', '수원금거래소', 'QQQ주가', '일자리

In [34]:
################################
#활동성 분석
################################

today_date = datetime.now().strftime("%y%m%d")
directory_path = f"./data/target_keywords/{today_date}"
file_path = os.path.join(directory_path, "target_keywords.txt")

if not os.path.exists(directory_path):
    # 디렉토리가 존재하지 않는 경우, 디렉토리 생성
    os.makedirs(directory_path)

# 파일이 존재하는지 확인
if not os.path.exists(file_path):
    # 키워드를 파일에 작성
    with open(file_path, 'w') as file:
        for keyword in target_keywords:
            file.write("%s\n" % keyword)
    result = f"{file_path}에 키워드 저장됨"
else:
    result = f"{file_path} 파일이 이미 존재합니다. 작업을 건너뜁니다."

result
activity_rate(f'{directory_path}/target_keywords.txt')


Processing Keywords:   0%|          | 0/547 [00:00<?, ?it/s]




Processing Keywords:   0%|          | 1/547 [00:05<51:14,  5.63s/it]




Processing Keywords:   0%|          | 2/547 [00:11<50:26,  5.55s/it]




Processing Keywords:   1%|          | 3/547 [00:16<49:28,  5.46s/it]




Processing Keywords:   1%|          | 4/547 [00:21<48:55,  5.41s/it]




Processing Keywords:   1%|          | 5/547 [00:27<51:04,  5.65s/it]




Processing Keywords:   1%|          | 6/547 [00:33<50:27,  5.60s/it]




Processing Keywords:   1%|▏         | 7/547 [00:39<50:51,  5.65s/it]




Processing Keywords:   1%|▏         | 8/547 [00:44<51:13,  5.70s/it]




Processing Keywords:   2%|▏         | 9/547 [00:50<50:59,  5.69s/it]




Processing Keywords:   2%|▏         | 10/547 [00:56<50:49,  5.68s/it]




Processing Keywords:   2%|▏         | 11/547 [01:01<50:14,  5.62s/it]




Processing Keywords:   2%|▏         | 12/547 [01:07<50:41,  5.69s/it]




Processing Keywords:   2%|▏         | 13/547 [01:13<50:26,  5.67s/it]




Processing Keywords:   3%|▎         | 14/547 [01:19<52:19,  5.89s/it]




Processing Keywords:   3%|▎         | 15/547 [01:25<51:53,  5.85s/it]




Processing Keywords:   3%|▎         | 16/547 [01:31<53:24,  6.04s/it]




Processing Keywords:   3%|▎         | 17/547 [01:37<52:41,  5.96s/it]




Processing Keywords:   3%|▎         | 18/547 [01:43<52:19,  5.94s/it]




Processing Keywords:   3%|▎         | 19/547 [01:49<51:20,  5.83s/it]




Processing Keywords:   4%|▎         | 20/547 [01:54<49:59,  5.69s/it]




Processing Keywords:   4%|▍         | 21/547 [01:59<49:16,  5.62s/it]




Processing Keywords:   4%|▍         | 22/547 [02:05<50:02,  5.72s/it]




Processing Keywords:   4%|▍         | 23/547 [02:11<50:36,  5.79s/it]




Processing Keywords:   4%|▍         | 24/547 [02:17<50:55,  5.84s/it]




Processing Keywords:   5%|▍         | 25/547 [02:23<50:21,  5.79s/it]




Processing Keywords:   5%|▍         | 26/547 [02:29<50:02,  5.76s/it]




Processing Keywords:   5%|▍         | 27/547 [02:34<49:19,  5.69s/it]




Processing Keywords:   5%|▌         | 28/547 [02:40<48:54,  5.65s/it]




Processing Keywords:   5%|▌         | 29/547 [02:46<49:25,  5.72s/it]




Processing Keywords:   5%|▌         | 30/547 [02:51<49:15,  5.72s/it]




Processing Keywords:   6%|▌         | 31/547 [02:57<49:20,  5.74s/it]




Processing Keywords:   6%|▌         | 32/547 [03:03<48:31,  5.65s/it]




Processing Keywords:   6%|▌         | 33/547 [03:08<49:01,  5.72s/it]




Processing Keywords:   6%|▌         | 34/547 [03:15<49:47,  5.82s/it]




Processing Keywords:   6%|▋         | 35/547 [03:20<49:17,  5.78s/it]




Processing Keywords:   7%|▋         | 36/547 [03:26<48:54,  5.74s/it]




Processing Keywords:   7%|▋         | 37/547 [03:31<48:30,  5.71s/it]




Processing Keywords:   7%|▋         | 38/547 [03:37<48:29,  5.72s/it]




Processing Keywords:   7%|▋         | 39/547 [03:43<48:22,  5.71s/it]




Processing Keywords:   7%|▋         | 40/547 [03:49<48:17,  5.72s/it]




Processing Keywords:   7%|▋         | 41/547 [03:54<48:12,  5.72s/it]




Processing Keywords:   8%|▊         | 42/547 [04:00<48:42,  5.79s/it]




Processing Keywords:   8%|▊         | 43/547 [04:06<48:19,  5.75s/it]




Processing Keywords:   8%|▊         | 44/547 [04:12<47:59,  5.72s/it]




Processing Keywords:   8%|▊         | 45/547 [04:17<47:21,  5.66s/it]




Processing Keywords:   8%|▊         | 46/547 [04:23<46:45,  5.60s/it]




Processing Keywords:   9%|▊         | 47/547 [04:28<46:46,  5.61s/it]




Processing Keywords:   9%|▉         | 48/547 [04:34<46:30,  5.59s/it]




Processing Keywords:   9%|▉         | 49/547 [04:39<45:56,  5.54s/it]




Processing Keywords:   9%|▉         | 50/547 [04:45<46:07,  5.57s/it]




Processing Keywords:   9%|▉         | 51/547 [04:51<46:56,  5.68s/it]




Processing Keywords:  10%|▉         | 52/547 [04:56<46:46,  5.67s/it]




Processing Keywords:  10%|▉         | 53/547 [05:02<46:55,  5.70s/it]




Processing Keywords:  10%|▉         | 54/547 [05:08<46:00,  5.60s/it]




Processing Keywords:  10%|█         | 55/547 [05:13<46:05,  5.62s/it]




Processing Keywords:  10%|█         | 56/547 [05:19<46:10,  5.64s/it]




Processing Keywords:  10%|█         | 57/547 [05:25<46:10,  5.65s/it]




Processing Keywords:  11%|█         | 58/547 [05:30<45:14,  5.55s/it]




Processing Keywords:  11%|█         | 59/547 [05:35<44:54,  5.52s/it]




Processing Keywords:  11%|█         | 60/547 [05:41<44:56,  5.54s/it]




Processing Keywords:  11%|█         | 61/547 [05:47<45:13,  5.58s/it]




Processing Keywords:  11%|█▏        | 62/547 [05:52<45:13,  5.59s/it]




Processing Keywords:  12%|█▏        | 63/547 [05:58<44:59,  5.58s/it]




Processing Keywords:  12%|█▏        | 64/547 [06:03<44:51,  5.57s/it]




Processing Keywords:  12%|█▏        | 65/547 [06:09<44:53,  5.59s/it]




Processing Keywords:  12%|█▏        | 66/547 [06:15<45:28,  5.67s/it]




Processing Keywords:  12%|█▏        | 67/547 [06:20<44:46,  5.60s/it]




Processing Keywords:  12%|█▏        | 68/547 [06:26<44:25,  5.56s/it]




Processing Keywords:  13%|█▎        | 69/547 [06:32<45:50,  5.75s/it]




Processing Keywords:  13%|█▎        | 70/547 [06:38<45:31,  5.73s/it]




Processing Keywords:  13%|█▎        | 71/547 [06:44<46:01,  5.80s/it]




Processing Keywords:  13%|█▎        | 72/547 [06:49<45:13,  5.71s/it]




Processing Keywords:  13%|█▎        | 73/547 [06:55<45:00,  5.70s/it]




Processing Keywords:  14%|█▎        | 74/547 [07:00<44:47,  5.68s/it]




Processing Keywords:  14%|█▎        | 75/547 [07:06<44:30,  5.66s/it]




Processing Keywords:  14%|█▍        | 76/547 [07:12<44:49,  5.71s/it]




Processing Keywords:  14%|█▍        | 77/547 [07:17<44:12,  5.64s/it]




Processing Keywords:  14%|█▍        | 78/547 [07:23<43:40,  5.59s/it]




Processing Keywords:  14%|█▍        | 79/547 [07:29<44:06,  5.65s/it]




Processing Keywords:  15%|█▍        | 80/547 [07:34<43:42,  5.62s/it]




Processing Keywords:  15%|█▍        | 81/547 [07:40<43:16,  5.57s/it]




Processing Keywords:  15%|█▍        | 82/547 [07:45<42:59,  5.55s/it]




Processing Keywords:  15%|█▌        | 83/547 [07:51<43:09,  5.58s/it]




Processing Keywords:  15%|█▌        | 84/547 [07:56<42:17,  5.48s/it]




Processing Keywords:  16%|█▌        | 85/547 [08:02<42:39,  5.54s/it]




Processing Keywords:  16%|█▌        | 86/547 [08:07<42:43,  5.56s/it]




Processing Keywords:  16%|█▌        | 87/547 [08:13<42:25,  5.53s/it]




Processing Keywords:  16%|█▌        | 88/547 [08:18<42:24,  5.54s/it]




Processing Keywords:  16%|█▋        | 89/547 [08:24<42:06,  5.52s/it]




Processing Keywords:  16%|█▋        | 90/547 [08:29<42:12,  5.54s/it]




Processing Keywords:  17%|█▋        | 91/547 [08:35<42:43,  5.62s/it]




Processing Keywords:  17%|█▋        | 92/547 [08:41<42:49,  5.65s/it]




Processing Keywords:  17%|█▋        | 93/547 [08:47<42:38,  5.64s/it]




Processing Keywords:  17%|█▋        | 94/547 [08:52<42:19,  5.61s/it]




Processing Keywords:  17%|█▋        | 95/547 [08:58<42:03,  5.58s/it]




Processing Keywords:  18%|█▊        | 96/547 [09:03<41:49,  5.56s/it]




Processing Keywords:  18%|█▊        | 97/547 [09:09<41:36,  5.55s/it]




Processing Keywords:  18%|█▊        | 98/547 [09:14<41:22,  5.53s/it]




Processing Keywords:  18%|█▊        | 99/547 [09:20<41:27,  5.55s/it]




Processing Keywords:  18%|█▊        | 100/547 [09:25<41:41,  5.60s/it]




Processing Keywords:  18%|█▊        | 101/547 [09:31<41:35,  5.60s/it]




Processing Keywords:  19%|█▊        | 102/547 [09:37<41:21,  5.58s/it]




Processing Keywords:  19%|█▉        | 103/547 [09:42<40:54,  5.53s/it]




Processing Keywords:  19%|█▉        | 104/547 [09:48<41:13,  5.58s/it]




Processing Keywords:  19%|█▉        | 105/547 [09:53<41:33,  5.64s/it]




Processing Keywords:  19%|█▉        | 106/547 [09:59<41:26,  5.64s/it]




Processing Keywords:  20%|█▉        | 107/547 [10:05<41:09,  5.61s/it]




Processing Keywords:  20%|█▉        | 108/547 [10:10<41:37,  5.69s/it]




Processing Keywords:  20%|█▉        | 109/547 [10:16<41:16,  5.65s/it]




Processing Keywords:  20%|██        | 110/547 [10:21<40:42,  5.59s/it]




Processing Keywords:  20%|██        | 111/547 [10:27<40:30,  5.57s/it]




Processing Keywords:  20%|██        | 112/547 [10:32<40:10,  5.54s/it]




Processing Keywords:  21%|██        | 113/547 [10:38<40:15,  5.57s/it]




Processing Keywords:  21%|██        | 114/547 [10:44<40:04,  5.55s/it]




Processing Keywords:  21%|██        | 115/547 [10:49<39:40,  5.51s/it]




Processing Keywords:  21%|██        | 116/547 [10:55<40:05,  5.58s/it]




Processing Keywords:  21%|██▏       | 117/547 [11:00<39:28,  5.51s/it]




Processing Keywords:  22%|██▏       | 118/547 [11:06<39:35,  5.54s/it]




Processing Keywords:  22%|██▏       | 119/547 [11:12<40:57,  5.74s/it]




Processing Keywords:  22%|██▏       | 120/547 [11:17<40:26,  5.68s/it]




Processing Keywords:  22%|██▏       | 121/547 [11:23<40:31,  5.71s/it]




Processing Keywords:  22%|██▏       | 122/547 [11:29<40:21,  5.70s/it]




Processing Keywords:  22%|██▏       | 123/547 [11:35<41:35,  5.88s/it]




Processing Keywords:  23%|██▎       | 124/547 [11:41<40:49,  5.79s/it]




Processing Keywords:  23%|██▎       | 125/547 [11:46<39:31,  5.62s/it]




Processing Keywords:  23%|██▎       | 126/547 [11:52<39:13,  5.59s/it]




Processing Keywords:  23%|██▎       | 127/547 [11:58<40:07,  5.73s/it]




Processing Keywords:  23%|██▎       | 128/547 [12:03<39:22,  5.64s/it]




Processing Keywords:  24%|██▎       | 129/547 [12:09<38:53,  5.58s/it]




Processing Keywords:  24%|██▍       | 130/547 [12:14<38:56,  5.60s/it]




Processing Keywords:  24%|██▍       | 131/547 [12:20<38:35,  5.57s/it]




Processing Keywords:  24%|██▍       | 132/547 [12:25<38:52,  5.62s/it]




Processing Keywords:  24%|██▍       | 133/547 [12:31<39:28,  5.72s/it]




Processing Keywords:  24%|██▍       | 134/547 [12:37<38:56,  5.66s/it]




Processing Keywords:  25%|██▍       | 135/547 [12:42<38:45,  5.64s/it]




Processing Keywords:  25%|██▍       | 136/547 [12:49<39:58,  5.83s/it]




Processing Keywords:  25%|██▌       | 137/547 [12:55<39:48,  5.83s/it]




Processing Keywords:  25%|██▌       | 138/547 [13:00<39:12,  5.75s/it]




Processing Keywords:  25%|██▌       | 139/547 [13:06<39:02,  5.74s/it]




Processing Keywords:  26%|██▌       | 140/547 [13:11<38:27,  5.67s/it]




Processing Keywords:  26%|██▌       | 141/547 [13:17<38:42,  5.72s/it]




Processing Keywords:  26%|██▌       | 142/547 [13:23<38:15,  5.67s/it]




Processing Keywords:  26%|██▌       | 143/547 [13:28<38:02,  5.65s/it]




Processing Keywords:  26%|██▋       | 144/547 [13:34<37:25,  5.57s/it]




Processing Keywords:  27%|██▋       | 145/547 [13:39<36:57,  5.52s/it]




Processing Keywords:  27%|██▋       | 146/547 [13:45<36:52,  5.52s/it]




Processing Keywords:  27%|██▋       | 147/547 [13:50<37:04,  5.56s/it]




Processing Keywords:  27%|██▋       | 148/547 [13:56<37:31,  5.64s/it]




Processing Keywords:  27%|██▋       | 149/547 [14:02<36:56,  5.57s/it]




Processing Keywords:  27%|██▋       | 150/547 [14:07<37:12,  5.62s/it]




Processing Keywords:  28%|██▊       | 151/547 [14:13<36:40,  5.56s/it]




Processing Keywords:  28%|██▊       | 152/547 [14:18<35:59,  5.47s/it]




Processing Keywords:  28%|██▊       | 153/547 [14:24<36:20,  5.53s/it]




Processing Keywords:  28%|██▊       | 154/547 [14:29<35:59,  5.50s/it]




Processing Keywords:  28%|██▊       | 155/547 [14:34<35:48,  5.48s/it]




Processing Keywords:  29%|██▊       | 156/547 [14:40<35:35,  5.46s/it]




Processing Keywords:  29%|██▊       | 157/547 [14:45<35:10,  5.41s/it]




Processing Keywords:  29%|██▉       | 158/547 [14:51<35:50,  5.53s/it]




Processing Keywords:  29%|██▉       | 159/547 [14:56<35:15,  5.45s/it]




Processing Keywords:  29%|██▉       | 160/547 [15:02<34:51,  5.40s/it]




Processing Keywords:  29%|██▉       | 161/547 [15:07<35:08,  5.46s/it]




Processing Keywords:  30%|██▉       | 162/547 [15:13<35:03,  5.46s/it]




Processing Keywords:  30%|██▉       | 163/547 [15:18<34:39,  5.42s/it]




Processing Keywords:  30%|██▉       | 164/547 [15:24<35:26,  5.55s/it]




Processing Keywords:  30%|███       | 165/547 [15:30<35:48,  5.62s/it]




Processing Keywords:  30%|███       | 166/547 [15:35<35:55,  5.66s/it]




Processing Keywords:  31%|███       | 167/547 [15:42<36:59,  5.84s/it]




Processing Keywords:  31%|███       | 168/547 [15:47<36:39,  5.80s/it]




Processing Keywords:  31%|███       | 169/547 [15:53<37:07,  5.89s/it]




Processing Keywords:  31%|███       | 170/547 [15:59<36:41,  5.84s/it]




Processing Keywords:  31%|███▏      | 171/547 [16:05<36:13,  5.78s/it]




Processing Keywords:  31%|███▏      | 172/547 [16:11<36:34,  5.85s/it]




Processing Keywords:  32%|███▏      | 173/547 [16:17<37:16,  5.98s/it]




Processing Keywords:  32%|███▏      | 174/547 [16:23<36:16,  5.84s/it]




Processing Keywords:  32%|███▏      | 175/547 [16:29<37:13,  6.00s/it]




Processing Keywords:  32%|███▏      | 176/547 [16:35<36:31,  5.91s/it]




Processing Keywords:  32%|███▏      | 177/547 [16:40<35:27,  5.75s/it]




Processing Keywords:  33%|███▎      | 178/547 [16:45<34:34,  5.62s/it]




Processing Keywords:  33%|███▎      | 179/547 [16:51<34:22,  5.60s/it]




Processing Keywords:  33%|███▎      | 180/547 [16:57<34:21,  5.62s/it]




Processing Keywords:  33%|███▎      | 181/547 [17:02<34:04,  5.58s/it]




Processing Keywords:  33%|███▎      | 182/547 [17:08<34:40,  5.70s/it]




Processing Keywords:  33%|███▎      | 183/547 [17:14<34:27,  5.68s/it]




Processing Keywords:  34%|███▎      | 184/547 [17:19<34:22,  5.68s/it]




Processing Keywords:  34%|███▍      | 185/547 [17:25<34:36,  5.74s/it]




Processing Keywords:  34%|███▍      | 186/547 [17:31<34:18,  5.70s/it]




Processing Keywords:  34%|███▍      | 187/547 [17:35<31:09,  5.19s/it]




Processing Keywords:  34%|███▍      | 188/547 [17:40<31:28,  5.26s/it]




Processing Keywords:  35%|███▍      | 189/547 [17:46<32:14,  5.40s/it]




Processing Keywords:  35%|███▍      | 190/547 [17:52<32:38,  5.49s/it]




Processing Keywords:  35%|███▍      | 191/547 [17:58<33:08,  5.59s/it]




Processing Keywords:  35%|███▌      | 192/547 [18:03<32:11,  5.44s/it]




Processing Keywords:  35%|███▌      | 193/547 [18:08<32:40,  5.54s/it]




Processing Keywords:  35%|███▌      | 194/547 [18:15<33:48,  5.75s/it]




Processing Keywords:  36%|███▌      | 195/547 [18:20<32:59,  5.62s/it]




Processing Keywords:  36%|███▌      | 196/547 [18:25<32:15,  5.52s/it]




Processing Keywords:  36%|███▌      | 197/547 [18:31<33:01,  5.66s/it]




Processing Keywords:  36%|███▌      | 198/547 [18:37<32:45,  5.63s/it]




Processing Keywords:  36%|███▋      | 199/547 [18:42<32:16,  5.56s/it]




Processing Keywords:  37%|███▋      | 200/547 [18:48<32:24,  5.60s/it]




Processing Keywords:  37%|███▋      | 201/547 [18:53<32:04,  5.56s/it]




Processing Keywords:  37%|███▋      | 202/547 [18:59<32:18,  5.62s/it]




Processing Keywords:  37%|███▋      | 203/547 [19:06<34:15,  5.98s/it]




Processing Keywords:  37%|███▋      | 204/547 [19:12<34:09,  5.97s/it]




Processing Keywords:  37%|███▋      | 205/547 [19:17<33:15,  5.83s/it]




Processing Keywords:  38%|███▊      | 206/547 [19:23<32:47,  5.77s/it]




Processing Keywords:  38%|███▊      | 207/547 [19:29<32:25,  5.72s/it]




Processing Keywords:  38%|███▊      | 208/547 [19:34<32:20,  5.72s/it]




Processing Keywords:  38%|███▊      | 209/547 [19:41<33:12,  5.90s/it]




Processing Keywords:  38%|███▊      | 210/547 [19:47<33:23,  5.94s/it]




Processing Keywords:  39%|███▊      | 211/547 [19:53<33:16,  5.94s/it]




Processing Keywords:  39%|███▉      | 212/547 [19:58<32:33,  5.83s/it]




Processing Keywords:  39%|███▉      | 213/547 [20:04<32:11,  5.78s/it]




Processing Keywords:  39%|███▉      | 214/547 [20:09<31:24,  5.66s/it]




Processing Keywords:  39%|███▉      | 215/547 [20:15<31:08,  5.63s/it]




Processing Keywords:  39%|███▉      | 216/547 [20:20<30:45,  5.57s/it]




Processing Keywords:  40%|███▉      | 217/547 [20:26<30:15,  5.50s/it]




Processing Keywords:  40%|███▉      | 218/547 [20:31<30:01,  5.48s/it]




Processing Keywords:  40%|████      | 219/547 [20:37<30:13,  5.53s/it]




Processing Keywords:  40%|████      | 220/547 [20:42<30:17,  5.56s/it]




Processing Keywords:  40%|████      | 221/547 [20:48<29:57,  5.51s/it]




Processing Keywords:  41%|████      | 222/547 [20:53<29:32,  5.45s/it]




Processing Keywords:  41%|████      | 223/547 [20:59<29:31,  5.47s/it]




Processing Keywords:  41%|████      | 224/547 [21:04<29:23,  5.46s/it]




Processing Keywords:  41%|████      | 225/547 [21:10<29:33,  5.51s/it]




Processing Keywords:  41%|████▏     | 226/547 [21:15<30:02,  5.61s/it]




Processing Keywords:  41%|████▏     | 227/547 [21:21<29:58,  5.62s/it]




Processing Keywords:  42%|████▏     | 228/547 [21:27<30:16,  5.70s/it]




Processing Keywords:  42%|████▏     | 229/547 [21:33<30:01,  5.67s/it]




In [20]:
#######################################
# 뉴스링크,제목,연관검색어 데이터프레임 생성
#######################################


name_list = list(news_data.keys())  
# DataFrame 초기화
news_df = pd.DataFrame()

# 모든 키워드에 대해 처리
for keyword in name_list:
    # 뉴스 항목이 있는 경우 데이터 추가
    for news_item in news_data[keyword]:
        news_row = [keyword, news_item[0], news_item[1]]  # 연관키워드, 뉴스제목, 뉴스링크
        news_df = pd.concat([news_df, pd.DataFrame([news_row])], ignore_index=True)

    # 뉴스 항목 수가 10개에 미치지 못하면 나머지를 빈 행으로 채움
    for _ in range(10 - len(news_data[keyword])):
        empty_row = [keyword, None, None]  # 연관키워드, 빈 뉴스제목, 빈 뉴스링크
        news_df = pd.concat([news_df, pd.DataFrame([empty_row])], ignore_index=True)

# 칼럼 이름 설정
news_df.columns = ['연관검색어', '뉴스제목', '뉴스링크']


# merge 

In [21]:
keyword_activity_rates = pd.read_csv(f'{directory_path}/keyword_activity_rates.csv')
keyword_activity_rates.columns = ['연관검색어', '활동성']

# '활동성' 열의 데이터를 백분율 형태의 문자열로 변환
keyword_activity_rates['활동성'] = keyword_activity_rates['활동성'].apply(lambda x: f"{x}%")
# news_df와 keyword_activity_rates를 '연관검색어' 열을 기준으로 병합
keyword_activity_rates = keyword_activity_rates.drop_duplicates(subset=['연관검색어'])
merged_keyword_activity_rates = pd.merge(news_df, keyword_activity_rates, on='연관검색어', how='left')




####
# 네이버 merge
####
collected_keywords_dat_copy.rename(columns={'연관키워드': '연관검색어'}, inplace=True)
info_result_final.rename(columns={'연관키워드': '연관검색어'}, inplace=True)
# collected_keywords_dat_copy에서 '연관키워드'와 '검색어'를 기준으로 중복 제거
collected_keywords_dat_copy = collected_keywords_dat_copy.drop_duplicates(subset=['연관검색어'], keep='first')
# 이제 merged_keyword_activity_rates와 결합
final_merged_df = pd.merge(merged_keyword_activity_rates, collected_keywords_dat_copy[['연관검색어', '검색어']], on='연관검색어', how='left')



In [22]:
final_merged_df_copy = final_merged_df.copy()

# 구글검색어 컬럼을 초기화합니다.
final_merged_df_copy['구글검색어'] = None

# 이후의 모든 작업은 final_merged_df_copy에 대해 수행합니다.
i = 0
for keyword, queries in rising_keywords_results.items():
    filled_queries = queries[:10] + [None] * (10 - len(queries[:10]))
    for query in filled_queries:
        if i < len(final_merged_df_copy):
            final_merged_df_copy.at[i, '구글검색어'] = query
            i += 1
        else:
            break


# final_merged_df의 '검색어' 컬럼에서 각 10번째 검색어를 추출합니다.
keyword_list_per_10 = final_merged_df_copy['검색어'].tolist()[::10]


 
# collected_keywords_dat_copy에서 각 검색어별 상위 10개 연관검색어를 가져옵니다.
# 여기서는 각 검색어별로 가장 높은 월간검색수를 가진 상위 10개를 선정합니다.
top_keywords_by_search = collected_keywords_dat_copy.groupby('검색어').apply(
    lambda x: x.nlargest(10, '월간검색수_합계')
).reset_index(drop=True)



# 새로운 DataFrame을 초기화합니다. 이 DataFrame에는 각 검색어별 상위 10개 연관검색어가 포함됩니다.
new_rows_for_final_df = []


for keyword in keyword_list_per_10:
    # 특정 키워드에 대한 상위 10개 연관 검색어 추출
    top_queries_for_keyword = top_keywords_by_search[top_keywords_by_search['검색어'] == keyword].head(10)
    
    # 추출된 연관 검색어를 결과 리스트에 추가
    num_rows_added = 0  # 추가된 연관 검색어의 수를 추적
    for _, row in top_queries_for_keyword.iterrows():
        new_rows_for_final_df.append(row['연관검색어'])
        num_rows_added += 1
    
    # 10개 미만인 경우 나머지를 None으로 채우기
    for _ in range(10 - num_rows_added):
        new_rows_for_final_df.append(None)


# new_rows_for_final_df의 길이를 확인하고 final_merged_df의 '네이버검색어' 컬럼에 값을 할당합니다.
# 주의: new_rows_for_final_df의 길이가 final_merged_df의 행 수와 동일해야 합니다.
# 만약 길이가 다르다면, 길이가 맞도록 조정이 필요합니다.
if len(new_rows_for_final_df) == len(final_merged_df_copy):
    final_merged_df_copy['네이버검색어'] = new_rows_for_final_df
else:
    print("경고: '네이버검색어' 데이터의 길이가 final_merged_df와 다릅니다. 데이터 확인이 필요합니다.")

# 최종 DataFrame 확인
#final_merged_df_copy

  top_keywords_by_search = collected_keywords_dat_copy.groupby('검색어').apply(


# 형식 수정

In [23]:

#info_result_final = info_result_final.drop(columns=["일별 급상승", "주별 급상승", "주별 지속상승", "월별 급상승", "월별 지속상승", "월별 규칙성"])

final_merged_df_result = pd.merge(info_result_final, final_merged_df_copy, how='left', on='연관검색어')
today_date = datetime.now().strftime("%Y-%m-%d")

# '기준일자' 컬럼을 가장 앞에 추가
final_merged_df_result.insert(0, '기준일자', today_date)
# 컬럼명 변경: '중복검색어' -> '검색키워드', '월간검색수_합계' -> '검색량'

final_merged_df_result.rename(columns={'중복검색어': '검색키워드', '월간검색수_합계': '검색량'}, inplace=True)

final_merged_df_result = final_merged_df_result.drop(columns=["검색어"])


final_merged_df_result['상승월'] = None
# rising_month_list의 각 항목에 대해 반복 처리
for month_info in rising_month_list:
    months, keyword = month_info  # month_info는 각각의 월 목록과 키워드를 포함합니다.
    keyword_rows = final_merged_df_result[final_merged_df_result['연관검색어'] == keyword]  # 해당 키워드에 대한 행만 선택합니다.
    
    if not keyword_rows.empty:

        for i, month in enumerate(months):
            if i < len(keyword_rows):
                final_merged_df_result.loc[keyword_rows.index[i], '상승월'] = month
            else:
                break  # 월의 개수보다 더 많은 행에 대해서는 처리를 중단합니다.



# 형식맞추기 위한 info_result_final 순서 정렬
info_result_af_copy=pd.DataFrame()
a = final_merged_df_result.query("`유형` == '일별 급상승'")
b = final_merged_df_result.query("`유형` == '주별 급상승' or `유형` == '주별 지속상승'")
c = final_merged_df_result.query("`유형` == '월별 급상승' or `유형` == '월별 지속상승' or `유형` == '월별 규칙성'")
a_sort=a.sort_values(by=['연관검색어', '유형'], ascending=[True, True])
b_sort = b.sort_values(by=['연관검색어', '유형'], ascending=[True, True])
c_sort = c.sort_values(by=['연관검색어', '유형'], ascending=[True, True])
info_result_af_copy=pd.concat([a_sort,b_sort,c_sort])

# 형식을 위한 이름 변경
new_column_order = ['기준일자', '유형', '연관검색어', '검색키워드', '검색량', '지표', '뉴스제목', '뉴스링크', '활동성', '구글검색어', '네이버검색어', '상승월']
info_result_af_copy_reordered = info_result_af_copy[new_column_order]

# 혹시나 모를 예외처리(형식에 어긋나는 것을 예방)
info_result_af_copy_reordered['뉴스제목'] = info_result_af_copy_reordered['뉴스제목'].str.replace("|", "")
info_result_af_copy_reordered['연관검색어'] = info_result_af_copy_reordered['연관검색어'].str.replace("|", "")
info_result_af_copy_reordered['네이버검색어'] = info_result_af_copy_reordered['네이버검색어'].str.replace("|", "")
info_result_af_copy_reordered['구글검색어'] = info_result_af_copy_reordered['구글검색어'].str.replace("|", "")
info_result_af_copy_reordered['뉴스제목'] = info_result_af_copy_reordered['뉴스제목'].str.replace("–", "-")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  info_result_af_copy_reordered['뉴스제목'] = info_result_af_copy_reordered['뉴스제목'].str.replace("|", "")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  info_result_af_copy_reordered['연관검색어'] = info_result_af_copy_reordered['연관검색어'].str.replace("|", "")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  info_

In [24]:
# 유형 순서 정렬
info_result_af_copy_reordered_modified = info_result_af_copy_reordered.copy()


# 인덱스 재설정
info_result_af_copy_reordered_modified.reset_index(drop=True, inplace=True)

sort_order = {
    "일별 급상승": 1,
    "주별 급상승": 2,
    "주별 지속상승": 3,
    "월별 급상승": 4,
    "월별 지속상승": 5,
    "월별 규칙성" : 6
}

# 유형 컬럼에 대한 정렬 순서를 적용하기 위해 임시 컬럼 추가
info_result_af_copy_reordered_modified['sort_key'] = info_result_af_copy_reordered_modified['유형'].map(sort_order)

# 임시 컬럼을 기준으로 정렬
info_result_af_copy_reordered_modified = info_result_af_copy_reordered_modified.sort_values(by=['sort_key', '연관검색어'], ascending=[True, True])

# 임시 컬럼 삭제
info_result_af_copy_reordered_modified.drop('sort_key', axis=1, inplace=True)

info_result_af_copy_reordered_modified.reset_index(drop=True, inplace=True)


In [25]:
# 형식에 맞춰서 띄어쓰기 변경
# '유형' 컬럼의 값을 바꾸기 위한 딕셔너리 정의
replace_values = {
    '일별 급상승': '일별급상승',
    '주별 급상승': '주별급상승',
    '주별 지속상승': '주별지속상승',
    '월별 급상승': '월별급상승',
    '월별 지속상승': '월별지속상승',
    '월별 규칙성': '월별규칙성'
}

# '유형' 컬럼 내의 값을 바꾸기
graph_result['유형'] = graph_result['유형'].replace(replace_values)

### nan값 제거

In [26]:
na_related_search_terms = list(graph_result[pd.isna(graph_result['검색량'])]['연관검색어'])
unique_na_related_search_terms  = list(set(na_related_search_terms))

filtered_graph_result = graph_result[~graph_result['연관검색어'].isin(unique_na_related_search_terms)]


filtered_info_result_af_copy_reordered_modified = info_result_af_copy_reordered_modified[~info_result_af_copy_reordered_modified['연관검색어'].isin(unique_na_related_search_terms)]

In [27]:
filtered_graph_result_updated = filtered_graph_result.iloc[:, :-2]
filtered_graph_result_updated_a = filtered_graph_result_updated[filtered_graph_result_updated['유형'] == '일별급상승']

# 올바른 조건을 사용하여 필터링
filtered_graph_result_updated_b = filtered_graph_result_updated[
    filtered_graph_result_updated['유형'].isin(['월별급상승', '월별지속상승', '월별규칙성'])]
filtered_graph_result_updated_c = filtered_graph_result_updated[
    filtered_graph_result_updated['유형'].isin(['주별급상승', '주별지속상승'])]
sorted_filtered_graph_result_updated_b = filtered_graph_result_updated_b.sort_values(by=['연관검색어', '유형', '검색일자'])
sorted_filtered_graph_result_updated_c = filtered_graph_result_updated_c.sort_values(by=['연관검색어', '유형', '검색일자'])


In [28]:
combined_df = pd.concat([filtered_graph_result_updated_a, sorted_filtered_graph_result_updated_c, sorted_filtered_graph_result_updated_b], axis=0)
combined_df.reset_index(inplace = True, drop = True)

In [29]:
# 전송용 결과 테이블 생성 함수

def make_csv(table) :

  # 컬럼 추출
  col_a = ''
  col_b = ''

  for col in table.columns :
    col_a = str(col) + '|||'
    col_b = col_b + col_a
  col_b


  # 행 추출
  row_list = []

  for j in range(0, len(table)) :
    tmp_a = ''
    tmp_b = ''

    for i in range(0, len(table.columns)) :
      tmp_a = str(table.iloc[j,i]) + '|||'
      tmp_b = tmp_b + tmp_a
    row_list.append(tmp_b)

  row_list.insert(0,col_b)
  df = pd.DataFrame(row_list)

  return df


In [30]:
info_data = filtered_info_result_af_copy_reordered_modified.fillna(' ')
info_data.reset_index(inplace = True, drop = True)
today = datetime.now(timezone('Asia/Seoul'))
formatted_today = today.strftime('%y%m%d')


In [31]:
result_csv = make_csv(info_data)

# 현재 날짜를 'yyMMdd' 형식으로 포맷팅
today = datetime.now(timezone('Asia/Seoul'))
formatted_today = today.strftime('%y%m%d')

# 저장할 경로
save_path = f'./data/result_out/{formatted_today}'

# 해당 경로가 존재하지 않으면 생성
if not os.path.exists(save_path):
    os.makedirs(save_path)

# CSV 파일 저장
result_csv.to_csv(f'{save_path}/info_{formatted_today}.csv', encoding='utf-8-sig', index=False, header=False)


In [32]:
combined_df_a = combined_df[combined_df['유형'] == '일별급상승']

# 올바른 조건을 사용하여 필터링
combined_df_b = combined_df[
    combined_df['유형'].isin(['월별급상승', '월별지속상승', '월별규칙성'])]
combined_df_c = combined_df[
    combined_df['유형'].isin(['주별급상승', '주별지속상승'])]
sorted_combined_df__b = combined_df_b.sort_values(by=['유형', '연관검색어', '검색일자'])
sorted_combined_df__c = combined_df_c.sort_values(by=['유형', '연관검색어', '검색일자'])
combined_df = pd.concat([combined_df_a, sorted_combined_df__c, sorted_combined_df__b], axis=0)
combined_df.reset_index(inplace = True, drop = True)

In [33]:
result_graph = make_csv(combined_df)


# 현재 날짜를 'yyMMdd' 형식으로 포맷팅
today = datetime.now(timezone('Asia/Seoul'))
formatted_today = today.strftime('%y%m%d')

# 저장할 경로
save_path = f'./data/result_out/{formatted_today}'

# 해당 경로가 존재하지 않으면 생성
if not os.path.exists(save_path):
    os.makedirs(save_path)

# CSV 파일 저장
result_graph.to_csv(f'{save_path}/graph_{formatted_today}.csv', encoding='utf-8-sig', index=False, header=False)
