In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
import os
import sys


In [3]:
pd.set_option('display.max_colwidth', None)  # 전체 열 너비를 무제한으로 설

# 현재 경로
print(os.getcwd())

# id,title,content,likes,url,author,views,created_at,updated_at,조회수
df_post = pd.read_csv("../data/코나 화재/naver_cafe_posts.csv")
df_comment = pd.read_csv("../data/코나 화재/naver_cafe_comments.csv")


/Users/admin/Desktop/teamproject/softeer-team-project/wh/comment_difference_EDA


In [4]:
# id,title,content,likes,url,author,views,created_at,updated_at,조회수
# views -> int64, created_at -> datetime64[ns], updated_at -> datetime64[ns]
def convert_10000(value : str):
    value = value.replace(',', '')
    
    if '만' in value:
        return float(value.replace('만', '')) * 10000
    return int(value)

df_post['views'] = df_post['views'].apply(convert_10000)


In [5]:
# created_at, updated_at -> datetime64[ns]
df_post['created_at'] = pd.to_datetime(df_post['created_at'])
df_post['updated_at'] = pd.to_datetime(df_post['updated_at'])

In [6]:
# post_id,cmt_content,cmt_author,cmt_created_at
df_comment['cmt_created_at'] = pd.to_datetime(df_comment['cmt_created_at'])

df_comment.dtypes

post_id                    int64
cmt_content               object
cmt_author                object
cmt_created_at    datetime64[ns]
dtype: object

In [7]:
# 먼저 df_comment에서 post_id별로 댓글 수를 집계합니다.
comment_count = df_comment.groupby('post_id').size().reset_index(name='comment_count')

# df_filtered에 comment_count를 병합합니다.
df_filtered = pd.merge(df_post, comment_count, left_on='id', right_on='post_id', how='left')

# comment_count가 없는 경우 NaN이 될 수 있으므로, 이를 0으로 대체합니다.
df_filtered['comment_count'] = df_filtered['comment_count'].fillna(0).astype(int)



In [8]:
def get_comment(post_id, df_comment):
    comments = df_comment[df_comment['post_id'] == post_id]
    return comments


In [29]:
def get_comment_num_by_persent(df_filtered : pd.DataFrame, persent) -> int:
    # df_filtered['comment_count'].quantile(0.75)
    return int(df_filtered['comment_count'].quantile(persent))

# comment_count 변수의 값보다 comment_count가 큰부분과 작은 부분으로 df_filtered를 나눕니다.
def divide_filtered_df(df_filtered : pd.DataFrame, comment_count):
    df_filtered_less = df_filtered[df_filtered['comment_count'] < comment_count]
    df_filtered_more = df_filtered[df_filtered['comment_count'] >= comment_count]
    
    return df_filtered_less, df_filtered_more
    

In [30]:
get_comment_num_by_persent(df_filtered, 0.97)


51

In [42]:
value = 50
# value = get_comment_num_by_persent(df_filtered, 0.97)
df_filtered_less, df_filtered_more = divide_filtered_df(df_filtered, value)

In [33]:
df_filtered_less.dtypes 

id                        int64
title                    object
content                  object
likes                   float64
url                      object
author                   object
views                   float64
created_at       datetime64[ns]
updated_at       datetime64[ns]
post_id                 float64
comment_count             int64
dtype: object

In [43]:
# id, title, content, likes, url, author, views, created_at, updated_at, comment_count중 post_id, comment_count는 drop 하지않습니다. 나머지는 드랍합니다.
df_filtered_less = df_filtered_less.drop(['id', 'title', 'content', 'likes', 'url', 'author', 'views', 'created_at', 'updated_at'], axis=1)
df_filtered_more = df_filtered_more.drop(['id', 'title', 'content', 'likes', 'url', 'author', 'views', 'created_at', 'updated_at'], axis=1)


In [40]:
df_filtered_more.dtypes

id                 int64
post_id          float64
comment_count      int64
dtype: object

In [54]:
from joblib import Parallel, delayed

def process_post_id(post_id, df_comment):
    # 해당 post_id에 대한 댓글 필터링
    comments = df_comment[df_comment['post_id'] == post_id]
    
    # 댓글이 없는 경우 기본값으로 채우기
    if comments.empty:
        full_interval_df = pd.DataFrame({
            'post_id': [post_id],
            'time_interval': [0],
            'count': [0],
            'cumulative_num': [0]
        })
        return full_interval_df
    
    # 시간순으로 정렬
    comments['cmt_created_at'] = pd.to_datetime(comments['cmt_created_at'])
    comments = comments.sort_values('cmt_created_at')
    
    # 첫 댓글이 달린 시간을 기준으로 5분 단위로 분리하기
    comments['time_elapsed'] = (comments['cmt_created_at'] - comments['cmt_created_at'].iloc[0]).dt.total_seconds() // 60
    
    # 5분 간격으로 time_elapsed의 범위를 생성
    min_interval = comments['time_elapsed'].min() // 5 * 5
    max_interval = (comments['time_elapsed'].max() // 5 + 1) * 5
    time_range = pd.DataFrame({'time_interval': range(int(min_interval), int(max_interval) + 1, 5)})
    
    # 각 5분 단위의 댓글 수 집계
    comments['time_interval'] = (comments['time_elapsed'] // 5) * 5
    count_per_interval = comments.groupby('time_interval').size().reset_index(name='count')
    
    # 5분 단위의 time_range에 댓글 수를 병합
    full_interval_df = pd.merge(time_range, count_per_interval, on='time_interval', how='left')
    full_interval_df['count'] = full_interval_df['count'].fillna(0).astype(int)
    
    # 누적 댓글 수 계산
    full_interval_df['cumulative_num'] = full_interval_df['count'].cumsum()
    
    # post_id 추가
    full_interval_df['post_id'] = post_id
    
    return full_interval_df

def process_filtered_less(df_filtered_less, df_comment):
    # df_filtered_less의 post_id 리스트
    post_ids = df_filtered_less['post_id'].unique()
    
    # 병렬 처리
    results = Parallel(n_jobs=-1)(delayed(process_post_id)(post_id, df_comment) for post_id in post_ids)
    
    # 결과 결합
    df_commulate_num = pd.concat(results, ignore_index=True)
    
    # df_commulate_num에 nan이 있는지 확인
    print(df_commulate_num.isnull().sum())
    
    # df_filtered_less와 병합
    final_df = pd.merge(df_filtered_less, df_commulate_num, on='post_id', how='right')
    
    # 댓글이 없는 경우 time_interval, count, cumulative_num 0으로 채우기
    final_df[['time_interval', 'count', 'cumulative_num']] = final_df[['time_interval', 'count', 'cumulative_num']].fillna(0)
    
    return final_df

# 함수 사용 예시
# df_filtered_less와 df_comment가 이미 정의되어 있다고 가정합니다.
final_df = process_filtered_less(df_filtered_less, df_comment)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

time_interval     0
count             0
cumulative_num    0
post_id           1
dtype: int64


In [53]:
# final_df post_id별로  comment_count 오름차순 정렬
final_df = final_df.sort_values('comment_count', ascending=True)

final_df.head(40)

Unnamed: 0,post_id,comment_count,time_interval,count,cumulative_num
80966,,0,0,0,0
80937,,0,0,0,0
80938,,0,0,0,0
80939,,0,0,0,0
80940,,0,0,0,0
80941,,0,0,0,0
80942,,0,0,0,0
80943,,0,0,0,0
80944,,0,0,0,0
80945,,0,0,0,0


In [10]:

def transform_cumlate_num(df):
    df_commulate_num = pd.DataFrame(columns=['post_id', 'time_interval', 'count', 'cumulative_num'])
    df['post_id']
    for i in range(len(post_id_list)): 
        # post_id,cmt_content,cmt_author,cmt_created_at
        comments = get_comment(post_id_list[i], df_comment)
        
        # cmt_created_at을 datetime으로 변환
        comments['cmt_created_at'] = pd.to_datetime(comments['cmt_created_at'])
        comments = comments.sort_values('cmt_created_at')

        # 시간순으로 정렬
        comments = comments.sort_values('cmt_created_at')

        # 첫 댓글이 달린 시간을 기준으로 5분 단위로 분리하기
        comments['time_elapsed'] = (comments['cmt_created_at'] - comments['cmt_created_at'].iloc[0]).dt.total_seconds() // 60
        
        # 5분 간격으로 time_elapsed의 범위를 생성
        min_interval = comments['time_elapsed'].min() // 5 * 5
        max_interval = (comments['time_elapsed'].max() // 5 + 1) * 5
        time_range = pd.DataFrame({'time_interval': range(int(min_interval), int(max_interval) + 1, 5)})

        # 각 5분 단위의 댓글 수 집계
        comments['time_interval'] = (comments['time_elapsed'] // 5) * 5
        count_per_interval = comments.groupby('time_interval').size().reset_index(name='count')

        # 5분 단위의 time_range에 댓글 수를 병합
        full_interval_df = pd.merge(time_range, count_per_interval, on='time_interval', how='left')
        full_interval_df['count'] = full_interval_df['count'].fillna(0).astype(int)

        # 누적 댓글 수 계산
        full_interval_df['cumulative_num'] = full_interval_df['count'].cumsum()
        
        # full_interval_df에 post_id를 추가
        full_interval_df['post_id'] = int(post_id_list[i])
        
        # df_commulate_num에 full_interval_df를 추가
        df_commulate_num = pd.concat([df_commulate_num, full_interval_df], ignore_index=True)
    


    
    
    
    