In [14]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from utils import *

# Data Loading & Preprocessing

In [15]:
# Event under analysis. The same string is used as the data sub-directory
# name and, split on whitespace, as the keyword list for filtering.
event = "아이오닉 iccu"

# Community sites whose post / comment CSV dumps are loaded.
communities = [
    'clien',
    'bobae',
    'fmkorea',
    'naver_cafe',
]

In [16]:
# Build a single frame of posts from every community, each row carrying the
# number of comments found for that post.
per_community_dfs = []
for community in communities:
    posts_df = pd.read_csv(f'../data/{event}/{community}_posts.csv')
    posts_df['from'] = community  # tag each post with its source community
    posts_df['created_at'] = posts_df['created_at'].apply(parse_dates)

    comments_df = pd.read_csv(f'../data/{event}/{community}_comments.csv')
    if community == 'clien':
        # TODO: in the clien dataset the cmt_author and post_id columns are
        # swapped, so the header is overridden positionally here.
        # NOTE(review): this assumes the clien file has exactly four columns.
        comments_df.columns = ['cmt_author', 'cmt_count', 'post_id', 'cmt_created_at']

    # Per-post comment count (rows with a null cmt_author are not counted),
    # returned as a flat two-column frame: post_id, cmt_count.
    comments_df = (
        comments_df.groupby('post_id')['cmt_author']
        .count()
        .rename('cmt_count')
        .reset_index()
    )

    # Left join keeps posts that have no comments (cmt_count is NaN for them).
    per_community_df = pd.merge(posts_df, comments_df, left_on='id', right_on='post_id', how='left')
    per_community_dfs.append(per_community_df)

# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# every community's rows keep their own 0-based labels and the index contains
# duplicates.
df = pd.concat(per_community_dfs, ignore_index=True)

In [17]:
# Sanity check: overall (rows, columns), then the missing-value count per column.
print(df.shape)
df.isna().sum()

(1403, 12)


id               0
title            0
content         32
likes            0
url              0
author           0
views            0
created_at       0
updated_at    1267
from             0
post_id         35
cmt_count       35
dtype: int64

In [18]:
# Preprocessing
df = (
    df.dropna(subset=['created_at'])  # discard posts that have no creation time
    .assign(
        # views / likes: stringify, run through remove_commna, then convert
        # to a number (presumably strips thousands separators -- see utils).
        views=lambda d: d['views'].map(str).apply(remove_commna).apply(convert_str_to_int),
        likes=lambda d: d['likes'].map(str).apply(remove_commna).apply(convert_str_to_float),
        # a missing comment count becomes 0
        cmt_count=lambda d: d['cmt_count'].fillna(0).map(int),
    )
)
print(df.shape)

# filtering by keyword & add created_day column
keywords = event.split()  # one keyword per whitespace-separated token of the event name
filtered_df = add_created_day_col(filter_by_keyword(df, keywords)).sort_values(by='created_at')

(1403, 12)


In [19]:
# Earliest created_day among the keyword-filtered posts.
filtered_df['created_day'].min()

'2021-02-26'

# 화제성 시각화

In [20]:
# Per day Post (count)
vis_df = filtered_df

# Fixed plotting window. The previous version first computed the bounds from
# filtered_df.created_day.min() / .max() and then immediately overwrote them
# with these literals; the dead assignments were removed.
day_start = '2024-01-01'
day_end = '2024-07-31'

title = "Per day Number of Posts (count)"
result_df = plot_per_day_post_counts_with_events(
    vis_df,
    title=title,
    day_start=day_start,
    day_end=day_end,
)

In [21]:
# Per day number of views (total)
title = "Per day Number of Views (Total)"
day_start, day_end = '2024-01-01', '2024-07-31'  # date bounds handed to the plot helper
vis_df = filtered_df
result_df = plot_per_day_target_val_sum_with_events(
    vis_df,
    title=title,
    target_val='views',
    day_start=day_start,
    day_end=day_end,
)

In [22]:
# Per day number of comments (total)
title = "Per day Number of Comments (Total)"
day_start, day_end = '2024-01-01', '2024-07-31'  # date bounds handed to the plot helper
vis_df = filtered_df
result_df = plot_per_day_target_val_sum_with_events(
    vis_df,
    title=title,
    target_val='cmt_count',
    day_start=day_start,
    day_end=day_end,
)

In [23]:
# Per day number of likes (total)
title = "Per day Number of Likes (Total)"
day_start, day_end = '2024-01-01', '2024-07-31'  # date bounds handed to the plot helper
vis_df = filtered_df
result_df = plot_per_day_target_val_sum_with_events(
    vis_df,
    title=title,
    target_val='likes',
    day_start=day_start,
    day_end=day_end,
)

In [24]:
# Rows of the filtered posts whose created_day equals '2024-06-05'.
a = filtered_df.loc[filtered_df['created_day'].eq('2024-06-05')]