In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
import os
from utils import *
from sklearn.linear_model import ElasticNet, LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.datasets import make_regression
from scipy.optimize import curve_fit

# Root directory for the per-event input CSVs and for the `splitted/` output
# written later in the notebook. Relative to the notebook's working directory.
DATA_DIR = "../data"

# Load All Datasets

In [2]:
# Events to analyse; each name is used as a sub-directory of DATA_DIR when loading.
# Only one event is enabled right now. The wider candidate list that was
# previously used is kept here for reference:
#   "벤츠 화재" (Benz fire), "아이오닉 누수" (Ioniq leak),
#   "아이오닉 iccu" (Ioniq ICCU), "코나 화재" (Kona fire)
events = [
    "코나 화재",
]

# Community sources; each one contributes a `<community>_posts.csv` and a
# `<community>_comments.csv` file per event.
communities = [
    'clien',
    'bobae',
    'fmkorea',
    'naver_cafe',
]

In [3]:
# Read every post for every (event, community) pair and attach the number of
# comments each post received. The result is a single frame `df` holding all
# posts, with `from` (source community), `post_id` and `cmt_count` added.
total_dfs = []
for event in events:
    per_community_dfs = []
    for community in communities:
        posts_df = pd.read_csv(f'{DATA_DIR}/{event}/{community}_posts.csv')
        posts_df['from'] = community  # remember which community the post came from

        # Use DATA_DIR here as well so posts and comments are always read from
        # the same root (this path used to be hard-coded to '../data').
        comments_df = pd.read_csv(f'{DATA_DIR}/{event}/{community}_comments.csv')
        if community == 'clien':
            # TODO: in the clien dataset the cmt_author and post_id columns are
            # swapped, so the header is overwritten positionally.
            # NOTE(review): this assumes the clien comments file has exactly
            # four columns - confirm against the data.
            comments_df.columns = ['cmt_author', 'cmt_count', 'post_id', 'cmt_created_at']

        # One row per post_id with the count of non-null cmt_author values.
        # Named aggregation yields the flat ['post_id', 'cmt_count'] columns
        # directly, so no MultiIndex flattening is needed afterwards.
        comments_df = comments_df.groupby('post_id', as_index=False).agg(
            cmt_count=('cmt_author', 'count'),
        )

        # Left join keeps posts that have no comments (cmt_count is NaN for them).
        per_community_df = pd.merge(
            posts_df, comments_df, left_on='id', right_on='post_id', how='left'
        )
        per_community_dfs.append(per_community_df)
    per_event_df = pd.concat(per_community_dfs)
    total_dfs.append(per_event_df)

# Each CSV carries its own 0..n index; renumber so row labels in `df` are unique.
df = pd.concat(total_dfs, ignore_index=True)

In [4]:
# Preprocessing: drop unusable rows and normalise column types on `df`.

# Posts without a creation time are removed.
df = df.dropna(subset=['created_at'])

# views / likes are stringified, passed through `remove_commna`, then through
# the matching numeric converter. All three helpers come from `utils`;
# presumably they strip thousands separators and parse numbers - confirm there.
numeric_converters = {
    'views': convert_str_to_int,
    'likes': convert_str_to_float,
}
for column, to_number in numeric_converters.items():
    df[column] = df[column].map(str).apply(remove_commna).apply(to_number)

# Creation timestamps are parsed with the `parse_dates` helper from `utils`.
df['created_at'] = df['created_at'].apply(parse_dates)

# Posts with no matching comment rows have NaN after the left merge; count them as 0.
df['cmt_count'] = df['cmt_count'].fillna(0).map(int)

# Split Dataset (Hot Posts / Cold Posts)

In [5]:
# Threshold values used to separate hot posts from cold posts. Each call passes
# 0.97 to `get_target_val_by_percent` (from `utils`); presumably that returns
# the column value at the 97th percentile, i.e. the top-3% cut-off - confirm
# in utils.
top_3_percent_view = get_target_val_by_percent(df, 'views', 0.97)
top_3_percent_number_of_comments = get_target_val_by_percent(df, 'cmt_count', 0.97)
top_3_percent_likes = get_target_val_by_percent(df, 'likes', 0.97)

# The labels match the variables being printed (they used to say "top_5_percent").
# Adjacent f-strings are used instead of backslash continuations so that no
# indentation whitespace ends up inside the printed text.
print(
    f"top_3_percent_view: {top_3_percent_view}\n"
    f"top_3_percent_number_of_comments: {top_3_percent_number_of_comments}\n"
    f"top_3_percent_likes: {top_3_percent_likes}"
)

top_5_percent_view: 10186     
top_5_percent_number_of_comments: 56     
top_5_percent_likes: 17


In [6]:
# A post is "hot" when it reaches the view threshold OR the comment-count
# threshold. The likes threshold is not part of the rule (that clause was
# commented out in an earlier version of this cell).
reaches_view_threshold = df['views'].ge(top_3_percent_view)
reaches_comment_threshold = df['cmt_count'].ge(top_3_percent_number_of_comments)
condition = reaches_view_threshold | reaches_comment_threshold

# The mask above must be built first, because cmt_count is removed here:
# only the information obtained from the posts themselves is kept.
df = df.drop(columns=['post_id', 'cmt_count'])

hot_posts_df = df.loc[condition]
cold_posts_df = df.loc[~condition]
# Sanity check: the two partitions together cover every row exactly once.
assert len(hot_posts_df) + len(cold_posts_df) == len(df)

# Persist both partitions without the index column.
os.makedirs(f"{DATA_DIR}/splitted/", exist_ok=True)
hot_posts_df.to_csv(f'{DATA_DIR}/splitted/hot_posts.csv', index=False)
cold_posts_df.to_csv(f'{DATA_DIR}/splitted/cold_posts.csv', index=False)