In [4]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from datetime import datetime

# from bs4 import BeautifulSoup
# from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.by import By
from time import sleep

In [51]:
from utils.scraping_utils import get_proxy_list
from random import choice
PROXY_SERVER_LIST = get_proxy_list(only_https=True)

In [67]:
CHROME_PATH = "/opt/homebrew/bin/chromedriver"
HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36",
    }
PARAMS = {}


def scrape_wadiz(use_proxy=False)->pd.DataFrame:
    URL = "https://www.wadiz.kr/web/wreward/main?keyword=&endYn=ALL&order=recommend"

    options = webdriver.ChromeOptions()
    options.add_argument(f"user-agent={HEADERS['User-Agent']}")
    if use_proxy:
        proxy_server = choice(PROXY_SERVER_LIST)
        options.add_argument(f"--proxy-server={proxy_server}")
        print(f"Using proxy server {proxy_server}")

    driver = webdriver.Chrome(executable_path=CHROME_PATH, options=options)
    
    # 와디즈 페이지 열기
    MAX_ATTEPMTS = 5
    for attempt in range(MAX_ATTEPMTS):
        try:
            driver.get(URL)
            driver.implicitly_wait(15)
            print("Driver get succeed.")
            break
        except:
            print(f"Driver get failed. Retry {attempt + 1}")
            continue
    else:
        print("Driver get failed. Return None.")
        driver.quit()
        return None

    # 바닥까지 스크롤 내리기
    try:
        for i in tqdm(range(1_000_000)):
            btn_view_more = driver.find_element(By.CLASS_NAME, "ProjectListMoreButton_button__27eTb")
            sleep(0.3)
            driver.execute_script("arguments[0].click();", btn_view_more)
    except:
        print("Expanding all done.")
    else:
        print("Expanding all done by max expanding.")   

    table = driver.find_element(By.CLASS_NAME, "ProjectCardList_container__3Y14k")
    rows = table.find_elements(By.CLASS_NAME, "ProjectCardList_item__1owJa")

    # 모든 card 데이터 수집
    df = []
    for i, card in tqdm(enumerate(rows)):
        try:
            link = card.find_element(By.CLASS_NAME, "CardLink_link__1k83H").get_attribute("href")
            title = card.find_element(By.CLASS_NAME, "CommonCard_title__1oKJY").text
            category = card.find_element(By.CLASS_NAME, "RewardProjectCard_category__2muXk").text
            maker_name = card.find_element(By.CLASS_NAME, "RewardProjectCard_makerName__2q4oH").text
            fund_percent = card.find_element(By.CLASS_NAME, "RewardProjectCard_percent__3TW4_").text
            fund_amount = card.find_element(By.CLASS_NAME, "RewardProjectCard_amount__2AyJF").text
            remaining_day = card.find_element(By.CLASS_NAME, "RewardProjectCard_remainingDay__2TqyN").text
        except:
            print(f"Row {i} failed to read and skipped.")
            continue
        else:
            df.append([title, category, maker_name, fund_percent, fund_amount, remaining_day, link])
    
    driver.quit()
    return pd.DataFrame(df, columns=["Title", "Category", "Maker", "FundPercentage", "FundAmount", "RemainingDay", "Link"])


In [5]:
df = scrape_wadiz(use_proxy=False)
df.to_csv("./data/wadiz_list.csv", encoding="utf-8-sig")
df.info()

NameError: name 'scrape_wadiz' is not defined

In [6]:
df = pd.read_csv("./data/wadiz_list.csv", index_col=0)

df["FundPercentage"] = df["FundPercentage"].apply(lambda x: int(x.replace("%", ""))) # % 제거
df["FundAmount"] = df["FundAmount"].apply(lambda x: x.replace("원", "")) # 원 제거
df["FundAmount"] = df["FundAmount"].apply(lambda x: int(x.replace(",", ""))) # , 제거

df = df[df["RemainingDay"] == "종료"] # 종료된 펀딩만 필터링

df_raw = df
df_raw = df_raw.set_index(["Category"])
df_group = df[["Category", "FundPercentage", "FundAmount"]]
df_group_sum = df_group.groupby(by=["Category"]).sum()
df_group = df_group.groupby(by=["Category"]).mean().sort_values(by=["FundPercentage"], ascending=False)

category_sorter = list(df_group.index)
category_sorter_index = dict(zip(category_sorter, range(len(category_sorter))))

# 그룹 순서에 맞게 정렬
df_raw = df_raw.reset_index()
df_raw["CategoryRank"] = df_raw["Category"].map(category_sorter_index)
df_raw.sort_values(["CategoryRank"], ascending=True, inplace=True)
df_raw.drop("CategoryRank", 1, inplace = True)

df_group_sum = df_group_sum.reset_index()
df_group_sum["CategoryRank"] = df_group_sum["Category"].map(category_sorter_index)
df_group_sum.sort_values(["CategoryRank"], ascending=True, inplace=True)
df_group_sum.drop("CategoryRank", 1, inplace = True)


df_group = df_group.reset_index()

df_group
df_raw
df_group_sum

  df_raw.drop("CategoryRank", 1, inplace = True)
  df_group_sum.drop("CategoryRank", 1, inplace = True)


Unnamed: 0,Category,FundPercentage,FundAmount
7,출판,490449,3406373025
11,테크·가전,7525783,101146987076
3,뷰티,6745345,41457127598
6,여행·레저,1796133,26651461742
2,베이비·키즈,354260,3127504001
8,캐릭터·굿즈,145567,1012642059
5,스포츠·모빌리티,1952713,23970784036
0,게임·취미,487921,6359821728
14,홈·리빙,6504387,57150661709
12,패션·잡화,10525444,90078383283


In [7]:
import plotly.graph_objects as go
import plotly.express as px

In [8]:
fig = px.bar(df_group, x="Category", y="FundPercentage", color="Category", title="펀딩 달성률 평균")
# fig = px.bar(df_group, x="Category", y="FundPercentage", color_discrete_sequence=color_scale, title="펀딩 달성률")
fig.update_layout(showlegend=False, yaxis_title="펀딩 달성률 (%)")
fig.show()

fig2 = px.bar(df_group, x="Category", y="FundAmount", color="Category", title="펀딩 금액 평균")
fig2.update_layout(showlegend=False, yaxis_title="펀딩 금액 (원)")
fig2.show()

fig3 = px.bar(df_group_sum, x="Category", y="FundAmount", color="Category", title="펀딩 금액 총합")
fig3.update_layout(showlegend=False, yaxis_title="펀딩 금액 (원)")
fig3.show()

fig4 = px.box(df_raw, x="Category", y="FundAmount", color="Category", title="펀딩 달성금액")
fig4.update_layout(showlegend=False, yaxis_title="펀딩 달성금액 (원)")
fig4.show()



urllib3 (1.26.9) or chardet (5.0.0)/charset_normalizer (2.0.4) doesn't match a supported version!



* 펀딩 금액 총합 기준으로, 테크/가전, 패션/잡화, 홈/리빙 분야의 소비자 수요가 가장 높다고 볼 수 있다.
    * 크라우드 펀딩이 소비자의 수요를 나타낼 수 있을까?
    * 와디즈의 특화 분야가 있을 수 있다 -> 대한민국의 소비자 수요를 정확히 파악하지 않을 수 있다.
    * 크라우드 펀딩 자체가 어려운 분야가 있을 수 있다. ex) 소프트웨어 개발, 서비스 제공?
    * 리워드형 펀딩 특성상, 제품 위주 -> 지분투자형 크라우드 펀딩이 소비자 수요를 나타내는가?
* 펀딩 달성률 평균은 각 프로젝트의 펀딩 목표 금액에 따라 달라진다 -> 펀딩 목표 금액을 조사해보자.
* 1인당 펀딩 금액이 높은 분야도 알아보자.