In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
import os
import sys
import scipy.stats as stats
import plotly.graph_objects as go


In [2]:
pd.set_option('display.max_colwidth', None)  # 전체 열 너비를 무제한으로 설

# 현재 경로
print(os.getcwd())

# id,title,content,likes,url,author,views,created_at,updated_at,조회수
df_post = pd.read_csv("../data/코나 화재/naver_cafe_posts.csv")
df_comment = pd.read_csv("../data/코나 화재/naver_cafe_comments.csv")


/Users/admin/Desktop/teamproject/softeer-team-project/EDA/wh/comment_difference_EDA


In [3]:
# id,title,content,likes,url,author,views,created_at,updated_at,조회수
# views -> int64, created_at -> datetime64[ns], updated_at -> datetime64[ns]
def convert_10000(value : str):
    value = value.replace(',', '')
    
    if '만' in value:
        return float(value.replace('만', '')) * 10000
    return int(value)

df_post['views'] = df_post['views'].apply(convert_10000)


In [4]:
# created_at, updated_at -> datetime64[ns]
df_post['created_at'] = pd.to_datetime(df_post['created_at'])
df_post['updated_at'] = pd.to_datetime(df_post['updated_at'])

In [5]:
# post_id,cmt_content,cmt_author,cmt_created_at
df_comment['cmt_created_at'] = pd.to_datetime(df_comment['cmt_created_at'])

df_comment.dtypes

post_id                    int64
cmt_content               object
cmt_author                object
cmt_created_at    datetime64[ns]
dtype: object

In [6]:
# 먼저 df_comment에서 post_id별로 댓글 수를 집계합니다.
comment_count = df_comment.groupby('post_id').size().reset_index(name='comment_count')

# df_filtered에 comment_count를 병합합니다.
df_filtered = pd.merge(df_post, comment_count, left_on='id', right_on='post_id', how='right')

# comment_count가 없는 경우 NaN이 될 수 있으므로, 이를 0으로 대체합니다.
df_filtered['comment_count'] = df_filtered['comment_count'].fillna(0).astype(int)

# df_filtered의 'post_id' 컬럼에 nan이 있는지 확인합니다.
df_filtered['post_id'].isnull().sum()




np.int64(0)

In [7]:
def get_comment(post_id, df_comment):
    comments = df_comment[df_comment['post_id'] == post_id]
    return comments


In [8]:
def get_comment_num_by_persent(df_filtered : pd.DataFrame, persent) -> int:
    # df_filtered['comment_count'].quantile(0.75)
    return int(df_filtered['comment_count'].quantile(persent))

# comment_count 변수의 값보다 comment_count가 큰부분과 작은 부분으로 df_filtered를 나눕니다.
def divide_filtered_df(df_filtered : pd.DataFrame, comment_count):
    df_filtered_less = df_filtered[df_filtered['comment_count'] < comment_count]
    df_filtered_more = df_filtered[df_filtered['comment_count'] >= comment_count]
    
    return df_filtered_less, df_filtered_more
    

In [9]:
get_comment_num_by_persent(df_filtered, 0.97)


51

In [10]:
value = 50
# value = get_comment_num_by_persent(df_filtered, 0.97)
df_filtered_less, df_filtered_more = divide_filtered_df(df_filtered, value)

In [11]:
df_filtered_less.dtypes 

id                        int64
title                    object
content                  object
likes                   float64
url                      object
author                   object
views                   float64
created_at       datetime64[ns]
updated_at       datetime64[ns]
post_id                   int64
comment_count             int64
dtype: object

In [12]:
# id, title, content, likes, url, author, views, created_at, updated_at, comment_count중 post_id, comment_count는 drop 하지않습니다. 나머지는 드랍합니다.
df_filtered_less = df_filtered_less.drop(['id', 'title', 'content', 'likes', 'url', 'author', 'views', 'created_at', 'updated_at'], axis=1)
df_filtered_more = df_filtered_more.drop(['id', 'title', 'content', 'likes', 'url', 'author', 'views', 'created_at', 'updated_at'], axis=1)


In [13]:
df_filtered_more.dtypes

post_id          int64
comment_count    int64
dtype: object

In [14]:
from joblib import Parallel, delayed

def process_post_id(post_id, df_comment):
    # 해당 post_id에 대한 댓글 필터링
    comments = df_comment[df_comment['post_id'] == post_id]
    
    # 댓글이 없는 경우 기본값으로 채우기
    if comments.empty:
        full_interval_df = pd.DataFrame({
            'post_id': [post_id],
            'time_interval': [0],
            'count': [0],
            'cumulative_num': [0]
        })
        return full_interval_df
    
    # 시간순으로 정렬
    comments['cmt_created_at'] = pd.to_datetime(comments['cmt_created_at'])
    comments = comments.sort_values('cmt_created_at')
    
    # 첫 댓글이 달린 시간을 기준으로 5분 단위로 분리하기
    comments['time_elapsed'] = (comments['cmt_created_at'] - comments['cmt_created_at'].iloc[0]).dt.total_seconds() // 60
    
    # 5분 간격으로 time_elapsed의 범위를 생성
    min_interval = comments['time_elapsed'].min() // 5 * 5
    max_interval = (comments['time_elapsed'].max() // 5 + 1) * 5
    time_range = pd.DataFrame({'time_interval': range(int(min_interval), int(max_interval) + 1, 5)})
    
    # 각 5분 단위의 댓글 수 집계
    comments['time_interval'] = (comments['time_elapsed'] // 5) * 5
    count_per_interval = comments.groupby('time_interval').size().reset_index(name='count')
    
    # 5분 단위의 time_range에 댓글 수를 병합
    full_interval_df = pd.merge(time_range, count_per_interval, on='time_interval', how='left')
    full_interval_df['count'] = full_interval_df['count'].fillna(0).astype(int)
    
    # 누적 댓글 수 계산
    full_interval_df['cumulative_num'] = full_interval_df['count'].cumsum()
    
    # post_id 추가
    full_interval_df['post_id'] = post_id
    
    return full_interval_df

def process_filtered(df_filtered_less, df_comment):
    
    # df_filtered_less에 post_id에 nan이 있는지 확인
    print('nan이 있는지 확인')
    print(np.isnan(df_filtered_less['post_id']).sum())
    
    # df_filtered_less의 post_id 리스트
    post_ids = df_filtered_less['post_id'].unique()
    
    
    # 병렬 처리
    results = Parallel(n_jobs=-1)(delayed(process_post_id)(post_id, df_comment) for post_id in post_ids)
    
    # 결과 결합
    df_commulate_num = pd.concat(results, ignore_index=True)
    
    # df_commulate_num에 nan이 있는지 확인
    print(df_commulate_num.isnull().sum())
    
    # df_commulate_num의 post_id 가 nand인 행 정보 출력
    print('문제가 되는 행 정보')
    print(df_commulate_num[df_commulate_num['post_id'].isnull()])   
    
    # df_filtered_less와 병합
    final_df = pd.merge(df_filtered_less, df_commulate_num, on='post_id', how='right')
    
    # 댓글이 없는 경우 time_interval, count, cumulative_num 0으로 채우기
    final_df[['time_interval', 'count', 'cumulative_num']] = final_df[['time_interval', 'count', 'cumulative_num']].fillna(0)
    
    return final_df

# 함수 사용 예시
# df_filtered_less와 df_comment가 이미 정의되어 있다고 가정합니다.
final_df_less = process_filtered(df_filtered_less, df_comment)
final_df_more = process_filtered(df_filtered_more, df_comment)

nan이 있는지 확인
0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

time_interval     0
count             0
cumulative_num    0
post_id           0
dtype: int64
문제가 되는 행 정보
Empty DataFrame
Columns: [time_interval, count, cumulative_num, post_id]
Index: []
nan이 있는지 확인
0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

time_interval     0
count             0
cumulative_num    0
post_id           0
dtype: int64
문제가 되는 행 정보
Empty DataFrame
Columns: [time_interval, count, cumulative_num, post_id]
Index: []


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

In [15]:
# final_df post_id별로  comment_count 오름차순 정렬
final_df_less = final_df_less.sort_values('comment_count', ascending=True)
final_df_more = final_df_more.sort_values('comment_count', ascending=True)


In [16]:
# +e 표현식을 사용하지 않고 출력
pd.options.display.float_format = '{:.2f}'.format
final_df_less.describe()

Unnamed: 0,post_id,comment_count,time_interval,count,cumulative_num
count,5113545.0,5113545.0,5113545.0,5113545.0,5113545.0
mean,821596.34,19.16,192018.94,0.01,16.76
std,335737.23,12.1,269132.92,0.12,11.67
min,345305.0,1.0,0.0,0.0,1.0
25%,529822.0,9.0,15545.0,0.0,7.0
50%,705421.0,17.0,75310.0,0.0,14.0
75%,1167669.0,26.0,262730.0,0.0,24.0
max,1537241.0,49.0,1472020.0,13.0,49.0


In [17]:
# time interval 설정
"""
TIME_INTERVAL_RANGE은 360으로 설정합니다. 이는 5분 간격으로 6시간을 의미합니다. 
설정값을 바꿀 수 있습니다.
"""

TIME_INTERVAL_RANGE = 360

# final_df_less['time_interval']이 360보다 큰 경우를 제외
final_df_less = final_df_less[final_df_less['time_interval'] <= TIME_INTERVAL_RANGE]
final_df_more = final_df_more[final_df_more['time_interval'] <= TIME_INTERVAL_RANGE]


In [18]:
normal_df_less = final_df_less.groupby('time_interval')['cumulative_num'].mean().reset_index()
normal_df_more = final_df_more.groupby('time_interval')['cumulative_num'].mean().reset_index()

In [19]:
# time_interval별 mean과 std 계산
stats_df_less = final_df_less.groupby('time_interval')['cumulative_num'].agg(['mean', 'std']).reset_index()
stats_df_more = final_df_more.groupby('time_interval')['cumulative_num'].agg(['mean', 'std']).reset_index()


In [20]:
# 정규 분포를 따르는 확률 밀도 함수 계산
def calculate_pdf(row):
    # x 값 생성 (0 ~ 200 범위)
    x = np.linspace(0, 200, 201)  # 201 points from 0 to 200
    
    # 확률 밀도 함수 계산
    pdf = stats.norm.pdf(x, row['mean'], row['std'])
    
    return pd.DataFrame({'time_interval': row['time_interval'], 'cumulative_num': x, 'pdf': pdf})

# 각 time_interval에 대해 확률 밀도 함수 계산
pdf_df_less = pd.concat([calculate_pdf(row) for _, row in stats_df_less.iterrows()], ignore_index=True)
pdf_df_more = pd.concat([calculate_pdf(row) for _, row in stats_df_more.iterrows()], ignore_index=True)

In [21]:
# time_interval별로 PDF를 시각화
fig = px.line(pdf_df_more, x='cumulative_num', y='pdf', color='time_interval', title='Probability Density Function Over Time Intervals (0-200)')
fig.show()

In [22]:
# time_interval별로 PDF를 시각화
fig = px.line(pdf_df_less, x='cumulative_num', y='pdf', color='time_interval', title='Probability Density Function Over Time Intervals (0-200)')
fig.show()

In [23]:
cnt = 0
# time_interval 별로 그래프를 그리기
for time_interval in pdf_df_less['time_interval'].unique():
    if cnt > 5 :
        break
    cnt += 1
    
    # 해당 time_interval에 대한 데이터를 필터링
    less_df = pdf_df_less[pdf_df_less['time_interval'] == time_interval]
    more_df = pdf_df_more[pdf_df_more['time_interval'] == time_interval]

    # 그래프 객체 생성
    fig = go.Figure()

    # pdf_df_less에 대한 선 그래프 추가
    fig.add_trace(go.Scatter(x=less_df['cumulative_num'], y=less_df['pdf'], mode='lines',
                             name='Less', line=dict(color='blue')))

    # pdf_df_more에 대한 선 그래프 추가
    fig.add_trace(go.Scatter(x=more_df['cumulative_num'], y=more_df['pdf'], mode='lines',
                             name='More', line=dict(color='red')))

    # 레이아웃 설정
    fig.update_layout(
        title=f'Probability Density Function Comparison (Time Interval: {time_interval})',
        xaxis_title='Cumulative Number',
        yaxis_title='PDF',
        showlegend=True
    )

    # 그래프 표시
    fig.show()

In [24]:
import numpy as np

def calculate_cumulative_area_by_interval(pdf_df_less, pdf_df_more, x_value):
    """
    time_interval별로 주어진 x_value부터 200까지의 확률 밀도 함수(PDF) 면적을 계산하고, 
    pdf_df_more에서 pdf_df_less의 면적을 뺀 값을 반환합니다.
    """
    results = []

    time_intervals = pdf_df_less['time_interval'].unique()
    
    for interval in time_intervals:
        # time_interval별로 필터링
        less_filtered = pdf_df_less[(pdf_df_less['time_interval'] == interval) & (pdf_df_less['cumulative_num'] >= x_value)]
        more_filtered = pdf_df_more[(pdf_df_more['time_interval'] == interval) & (pdf_df_more['cumulative_num'] >= x_value)]
        
        # 면적 계산 (trapezoid에 의한 적분)
        area_less = np.trapezoid(less_filtered['pdf'], less_filtered['cumulative_num']) if not less_filtered.empty else 0
        area_more = np.trapezoid(more_filtered['pdf'], more_filtered['cumulative_num']) if not more_filtered.empty else 0
        
        # 차이 계산
        # area_diff = (area_more * 0.03) / (area_less * 0.97)
        # area_diff = area_more - area_less
        area_diff = area_more / area_less
        
        # 결과 저장
        results.append({
            'time_interval': interval,
            'area_less': area_less,
            'area_more': area_more,
            'area_difference': area_diff,
            'input_value': x_value
        })
    
    return pd.DataFrame(results)

# 예시로 x = 100일 때 계산
x_value = 40
area_diffs_by_interval = calculate_cumulative_area_by_interval(pdf_df_less, pdf_df_more, x_value)

print(area_diffs_by_interval)


    time_interval  area_less  area_more  \
0            0.00       0.00       0.00   
1            5.00       0.00       0.00   
2           10.00       0.00       0.00   
3           15.00       0.00       0.00   
4           20.00       0.00       0.00   
..            ...        ...        ...   
68         340.00       0.00       0.72   
69         345.00       0.00       0.72   
70         350.00       0.00       0.72   
71         355.00       0.00       0.73   
72         360.00       0.00       0.73   

                                                                                      area_difference  \
0  41450652184498243003828807963854309779210664764769798561170404632620405927300770204936501723136.00   
1                                                         1071356869158620797910693674466515353600.00   
2                                                                          45529637758502654967808.00   
3                                                              

In [25]:
final_df_more.describe()

Unnamed: 0,post_id,comment_count,time_interval,count,cumulative_num
count,6024.0,6024.0,6024.0,6024.0,6024.0
mean,937533.94,74.04,179.16,0.69,38.26
std,402675.89,18.62,105.07,1.39,18.2
min,501858.0,50.0,0.0,0.0,1.0
25%,562962.5,55.0,90.0,0.0,25.0
50%,742785.0,70.0,180.0,0.0,39.0
75%,1331941.0,97.0,270.0,1.0,50.0
max,1533858.0,100.0,360.0,16.0,88.0


In [26]:
# final_df_more과 area_diffs_by_interval을 join한다. final_df_more['cumlate_num']과 area_diffs_by_interval['input_value']를 기준 1, 'time_interval'을 기준으로 join한다.
# final_df_more = pd.merge(final_df_more, area_diffs_by_interval, left_on=['cumulative_num', 'time_interval'], right_on=['input_value', 'time_interval'], how='left')
# final_df_less = pd.merge(final_df_less, area_diffs_by_interval, left_on=['cumulative_num', 'time_interval'], right_on=['input_value', 'time_interval'], how='left')

In [27]:
final_df_more.describe()

Unnamed: 0,post_id,comment_count,time_interval,count,cumulative_num
count,6024.0,6024.0,6024.0,6024.0,6024.0
mean,937533.94,74.04,179.16,0.69,38.26
std,402675.89,18.62,105.07,1.39,18.2
min,501858.0,50.0,0.0,0.0,1.0
25%,562962.5,55.0,90.0,0.0,25.0
50%,742785.0,70.0,180.0,0.0,39.0
75%,1331941.0,97.0,270.0,1.0,50.0
max,1533858.0,100.0,360.0,16.0,88.0


In [28]:
final_df_more

Unnamed: 0,post_id,comment_count,time_interval,count,cumulative_num
495920,1040577,50,360,0,41
495919,1040577,50,355,0,41
495918,1040577,50,350,0,41
495917,1040577,50,345,0,41
495916,1040577,50,340,0,41
...,...,...,...,...,...
496852,1094067,100,135,0,72
496853,1094067,100,140,0,72
496854,1094067,100,145,0,72
496843,1094067,100,90,2,67


In [29]:
def calculate_cumulative_area_by_interval_one_row(row, pdf_df_less, pdf_df_more, x_value):
    """
    각 행별로 주어진 x_value부터 200까지의 확률 밀도 함수(PDF) 면적을 계산하고, 
    pdf_df_more에서 pdf_df_less의 면적을 뺀 값을 반환합니다.
    """
    interval = row['time_interval']
    
    # time_interval별로 필터링
    less_filtered = pdf_df_less[(pdf_df_less['time_interval'] == interval) & (pdf_df_less['cumulative_num'] >= x_value)]
    more_filtered = pdf_df_more[(pdf_df_more['time_interval'] == interval) & (pdf_df_more['cumulative_num'] >= x_value)]
    
    # 면적 계산 (trapezoid에 의한 적분)
    area_less = np.trapezoid(less_filtered['pdf'], less_filtered['cumulative_num']) if not less_filtered.empty else 0
    area_more = np.trapezoid(more_filtered['pdf'], more_filtered['cumulative_num']) if not more_filtered.empty else 0
    
    # 차이 계산
    # area_diff = (area_more * 0.03) / (area_less * 0.97)
    # area_diff = area_more - area_less
    area_diff = area_more / area_less
    
    # 결과 반환
    return pd.Series({
        'area_less': area_less,
        'area_more': area_more,
        'area_difference': area_diff,
        'input_value': x_value
    })

# apply를 사용하여 각 행에 대해 함수 적용
result_less = final_df_less.apply(
    lambda row: calculate_cumulative_area_by_interval_one_row(row, pdf_df_less, pdf_df_more, row['cumulative_num']),
    axis=1
)

result_more = final_df_more.apply(
    lambda row: calculate_cumulative_area_by_interval_one_row(row, pdf_df_less, pdf_df_more, row['cumulative_num']),
    axis=1
)

# 원래 데이터프레임에 결과를 추가
final_df_less = pd.concat([final_df_less, result_less], axis=1)
final_df_more = pd.concat([final_df_more, result_more], axis=1)



In [34]:
final_df_less.describe(percentiles=[0.05, 0.1, 0.15, 0.20, 0.25, 0.3, 0.35, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.97, 1])

Unnamed: 0,post_id,comment_count,time_interval,count,cumulative_num,area_less,area_more,area_difference,input_value
count,141394.0,141394.0,141394.0,141394.0,141394.0,141394.0,141394.0,141394.0,141394.0
mean,769466.63,14.83,167.2,0.18,9.38,0.53,0.94,8697.72,9.38
std,365147.6,10.28,105.92,0.62,7.68,0.28,0.1,2510471.49,7.68
min,345305.0,1.0,0.0,0.0,1.0,0.0,0.0,1.12,1.0
5%,471792.0,3.0,10.0,0.0,1.0,0.02,0.76,1.14,1.0
10%,512549.0,4.0,25.0,0.0,2.0,0.08,0.85,1.16,2.0
15%,514347.0,5.0,40.0,0.0,2.0,0.16,0.89,1.2,2.0
20%,516432.0,6.0,60.0,0.0,3.0,0.24,0.92,1.23,3.0
25%,519343.0,7.0,75.0,0.0,4.0,0.31,0.94,1.27,4.0
30%,523752.0,8.0,90.0,0.0,4.0,0.38,0.95,1.31,4.0


In [35]:
# describe에 상위 90, 95, 96, 97, 98, 99, 100%를 추가
final_df_more.describe(percentiles=[0.05, 0.1, 0.15, 0.20, 0.25, 0.3, 0.35, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.97, 1])



Unnamed: 0,post_id,comment_count,time_interval,count,cumulative_num,area_less,area_more,area_difference,input_value
count,6024.0,6024.0,6024.0,6024.0,6024.0,6024.0,6024.0,6024.0,6024.0
mean,937533.94,74.04,179.16,0.69,38.26,0.05,0.5,5.3698312738226766e+19,38.26
std,402675.89,18.62,105.07,1.39,18.2,0.13,0.29,2.356342532162225e+21,18.2
min,501858.0,50.0,0.0,0.0,1.0,0.0,0.0,1.12,1.0
5%,514018.0,51.0,15.0,0.0,8.0,0.0,0.05,2.31,8.0
10%,516502.0,52.0,35.0,0.0,13.0,0.0,0.09,5.2,13.0
15%,530549.0,53.0,50.0,0.0,18.0,0.0,0.13,12.17,18.0
20%,541947.0,54.0,70.0,0.0,21.0,0.0,0.2,33.88,21.0
25%,562962.5,55.0,90.0,0.0,25.0,0.0,0.26,74.91,25.0
30%,571193.0,59.0,105.0,0.0,29.0,0.0,0.32,216.54,29.0


In [37]:
# final_df_more의 time_interval이 0인 경우
final_df_more[final_df_more['time_interval'] == 0].describe(percentiles=[0.05, 0.1, 0.15, 0.20, 0.25, 0.3, 0.35, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.97, 1])

Unnamed: 0,post_id,comment_count,time_interval,count,cumulative_num,area_less,area_more,area_difference,input_value
count,83.0,83.0,83.0,83.0,83.0,83.0,83.0,83.0,83.0
mean,935382.96,73.92,0.0,3.95,3.95,0.3,0.53,332789534862.82,3.95
std,404916.06,18.75,0.0,2.73,2.73,0.28,0.27,3031856866859.52,2.73
min,501858.0,50.0,0.0,1.0,1.0,0.0,0.0,1.12,1.0
5%,514127.8,51.0,0.0,1.0,1.0,0.0,0.03,1.12,1.0
10%,516874.0,52.0,0.0,1.0,1.0,0.0,0.13,1.12,1.0
15%,531031.4,53.0,0.0,1.3,1.3,0.01,0.23,1.21,1.3
20%,546481.8,54.0,0.0,2.0,2.0,0.01,0.23,1.41,2.0
25%,562064.0,55.0,0.0,2.0,2.0,0.04,0.35,1.41,2.0
30%,570320.2,58.6,0.0,2.0,2.0,0.04,0.35,1.41,2.0


In [38]:
# final_df_more의 time_interval이 0인 경우
final_df_less[final_df_less['time_interval'] == 0].describe(percentiles=[0.05, 0.1, 0.15, 0.20, 0.25, 0.3, 0.35, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.97, 1])

Unnamed: 0,post_id,comment_count,time_interval,count,cumulative_num,area_less,area_more,area_difference,input_value
count,2530.0,2530.0,2530.0,2530.0,2530.0,2530.0,2530.0,2530.0,2530.0
mean,761964.94,12.79,0.0,2.15,2.15,0.54,0.73,5748.24,2.15
std,362726.02,10.1,0.0,1.53,1.53,0.25,0.18,254140.52,1.53
min,345305.0,1.0,0.0,1.0,1.0,0.0,0.0,1.12,1.0
5%,490619.0,2.0,0.0,1.0,1.0,0.04,0.35,1.12,1.0
10%,513158.1,3.0,0.0,1.0,1.0,0.12,0.49,1.12,1.0
15%,514369.15,4.0,0.0,1.0,1.0,0.12,0.49,1.12,1.0
20%,516121.4,4.0,0.0,1.0,1.0,0.3,0.63,1.12,1.0
25%,519162.5,5.0,0.0,1.0,1.0,0.3,0.63,1.12,1.0
30%,522638.8,6.0,0.0,1.0,1.0,0.54,0.76,1.12,1.0


In [40]:
# final_df_more의 time_interval이 0인 경우
final_df_more[final_df_more['time_interval'] == 30].describe(percentiles=[0.05, 0.1, 0.15, 0.20, 0.25, 0.3, 0.35, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.97, 1])

Unnamed: 0,post_id,comment_count,time_interval,count,cumulative_num,area_less,area_more,area_difference,input_value
count,83.0,83.0,83.0,83.0,83.0,83.0,83.0,83.0,83.0
mean,935382.96,73.92,30.0,1.92,19.13,0.1,0.51,374480504.99,19.13
std,404916.06,18.75,0.0,1.75,9.57,0.18,0.3,2234849237.84,9.57
min,501858.0,50.0,30.0,0.0,3.0,0.0,0.01,1.31,3.0
5%,514127.8,51.0,30.0,0.0,6.0,0.0,0.04,1.84,6.0
10%,516874.0,52.0,30.0,0.0,7.2,0.0,0.09,2.23,7.2
15%,531031.4,53.0,30.0,0.0,9.3,0.0,0.11,3.42,9.3
20%,546481.8,54.0,30.0,0.0,11.0,0.0,0.16,5.2,11.0
25%,562064.0,55.0,30.0,1.0,12.0,0.0,0.25,6.96,12.0
30%,570320.2,58.6,30.0,1.0,13.0,0.0,0.31,9.57,13.0


In [39]:
# final_df_more의 time_interval이 0인 경우
final_df_less[final_df_less['time_interval'] == 30].describe(percentiles=[0.05, 0.1, 0.15, 0.20, 0.25, 0.3, 0.35, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.97, 1])

Unnamed: 0,post_id,comment_count,time_interval,count,cumulative_num,area_less,area_more,area_difference,input_value
count,2329.0,2329.0,2329.0,2329.0,2329.0,2329.0,2329.0,2329.0,2329.0
mean,762986.44,13.68,30.0,0.39,5.98,0.54,0.89,3322.15,5.98
std,363261.81,10.02,0.0,0.8,4.91,0.27,0.12,84930.58,4.91
min,345305.0,2.0,30.0,0.0,1.0,0.0,0.07,1.15,1.0
5%,475706.4,3.0,30.0,0.0,1.0,0.02,0.63,1.15,1.0
10%,513050.2,4.0,30.0,0.0,1.0,0.08,0.74,1.15,1.0
15%,514350.8,5.0,30.0,0.0,2.0,0.15,0.8,1.22,2.0
20%,516232.6,5.0,30.0,0.0,2.0,0.27,0.85,1.22,2.0
25%,519180.0,6.0,30.0,0.0,2.0,0.34,0.88,1.22,2.0
30%,523006.2,7.0,30.0,0.0,3.0,0.42,0.9,1.31,3.0


In [42]:
# final_df_more의 time_interval이 0인 경우
final_df_more[final_df_more['time_interval'] == 240].describe(percentiles=[0.05, 0.1, 0.15, 0.20, 0.25, 0.3, 0.35, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.97, 1])

Unnamed: 0,post_id,comment_count,time_interval,count,cumulative_num,area_less,area_more,area_difference,input_value
count,83.0,83.0,83.0,83.0,83.0,83.0,83.0,83.0,83.0
mean,935382.96,73.92,240.0,0.24,45.57,0.02,0.5,2558119988509887.0,45.57
std,404916.06,18.75,0.0,0.73,15.67,0.09,0.29,2.328680276500173e+16,15.67
min,501858.0,50.0,240.0,0.0,8.0,0.0,0.0,1.54,8.0
5%,514127.8,51.0,240.0,0.0,21.1,0.0,0.06,8.36,21.1
10%,516874.0,52.0,240.0,0.0,23.2,0.0,0.09,13.01,23.2
15%,531031.4,53.0,240.0,0.0,29.3,0.0,0.14,62.52,29.3
20%,546481.8,54.0,240.0,0.0,32.2,0.0,0.21,173.04,32.2
25%,562064.0,55.0,240.0,0.0,37.0,0.0,0.25,828.5,37.0
30%,570320.2,58.6,240.0,0.0,39.0,0.0,0.34,1820.45,39.0


In [41]:
# final_df_more의 time_interval이 0인 경우
final_df_less[final_df_less['time_interval'] == 240].describe(percentiles=[0.05, 0.1, 0.15, 0.20, 0.25, 0.3, 0.35, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.97, 1])

Unnamed: 0,post_id,comment_count,time_interval,count,cumulative_num,area_less,area_more,area_difference,input_value
count,1778.0,1778.0,1778.0,1778.0,1778.0,1778.0,1778.0,1778.0,1778.0
mean,772742.43,15.44,240.0,0.04,11.09,0.53,0.97,44.13,11.09
std,366539.4,10.37,0.0,0.23,8.25,0.28,0.05,412.41,8.25
min,345305.0,2.0,240.0,0.0,1.0,0.0,0.57,1.12,1.0
5%,469661.6,4.0,240.0,0.0,2.0,0.02,0.87,1.15,2.0
10%,511410.4,5.0,240.0,0.0,3.0,0.07,0.92,1.19,3.0
15%,514248.25,5.0,240.0,0.0,3.0,0.14,0.95,1.19,3.0
20%,516489.6,6.0,240.0,0.0,4.0,0.24,0.97,1.24,4.0
25%,519389.25,8.0,240.0,0.0,5.0,0.32,0.97,1.29,5.0
30%,524115.8,8.0,240.0,0.0,6.0,0.36,0.98,1.36,6.0
