In [2]:
import json
from datetime import datetime

In [24]:
# Earliest accepted creation time. Built from a naive datetime, so the resulting
# Unix timestamp is interpreted in the local timezone of the machine running this.
timestamp_20150101 = datetime(2015, 1, 1).timestamp()


def filter_condition(json_data, timestamp):
    """Decide whether one scraped project record belongs in the dataset.

    Args:
        json_data: mapping holding the project fields (the ``'data'`` entry of a
            scraped line).
        timestamp: Unix timestamp of the scrape the record comes from.

    Returns:
        True only when every check below holds, otherwise False.

    Raises:
        KeyError: when a key needed by a check that is actually reached is
            missing. Checks run in a fixed order and stop at the first failure,
            so later keys are not touched once a record has been rejected.
    """
    # Project is located in the US.
    if not (json_data['location']['country'] == 'US'):
        return False
    # Deadline had already passed when the scrape was taken.
    if not (timestamp - int(json_data['deadline']) >= 0):
        return False
    # Only finished campaigns are of interest.
    if json_data['state'] not in ('failed', 'successful'):
        return False
    # The campaign was actually launched.
    if not (int(json_data['launched_at']) > 0):
        return False
    # Drop contradictory records: goal reached but marked as failed.
    if float(json_data['usd_pledged']) >= int(json_data['goal']) and json_data['state'] == 'failed':
        return False
    if not (json_data['currency'] == 'USD'):
        return False
    # Blurb must contain at least 10 space-separated tokens.
    if not (len(json_data['blurb'].split(' ')) >= 10):
        return False
    # Name, slug and creator name must all be non-empty.
    has_texts = (
        len(json_data['name']) > 0
        and len(json_data['slug']) > 0
        and len(json_data['creator']['name']) > 0
    )
    if not has_texts:
        return False
    # Created on or after the 2015-01-01 cutoff.
    return int(json_data['created_at']) >= timestamp_20150101

In [25]:
# Build the JSON dataset that covers every scraped year.
def generate_total_dataset():
    """Merge the yearly raw scrapes into one filtered, de-duplicated JSON-lines file.

    Every raw file is read line by line. Each non-blank line is parsed as JSON and
    its ``'data'`` entry is checked with ``filter_condition`` against the scrape
    date of the file it came from. A record that passes, and whose ``id`` has not
    been written before, is appended to the output file as exactly one line.

    Per input file a summary line is printed with three counters:
        passed   -- records written to the output file
        filtered -- records rejected by the filter or whose id was already written
        error    -- records that raised ``KeyError`` (a required key is missing)
    After all files, the number of distinct kept ids is printed.
    """
    # (raw file name, scrape date) kept side by side so the two cannot drift apart.
    raw_scrapes = [
        ('Kickstarter_2016-01-28T09_15_08_781Z.json', datetime(2016, 1, 28)),
        ('Kickstarter_2017-01-15T22_21_04_985Z.json', datetime(2017, 1, 15)),
        ('Kickstarter_2018-01-12T10_20_09_196Z.json', datetime(2018, 1, 12)),
        ('Kickstarter_2019-01-17T03_20_02_630Z.json', datetime(2019, 1, 17)),
        ('Kickstarter_2020-01-16T03_20_15_556Z.json', datetime(2020, 1, 16)),
        ('Kickstarter_2021-01-14T03_20_05_328Z.json', datetime(2021, 1, 14)),
        ('Kickstarter_2022-01-20T03_20_11_451Z.json', datetime(2022, 1, 20)),
        ('Kickstarter_2023-01-05T03_20_04_136Z.json', datetime(2023, 1, 5)),
        ('Kickstarter_2024-01-15T14_13_05_649Z.json', datetime(2024, 1, 15)),
    ]
    ids = set()  # ids already written, used to avoid duplicating the same project
    input_base_path = './kickstarter-raw-jsons'
    output_file_path = './kickstarter-filtered-total-json/kickstarter-20150101-20240101.json'

    with open(output_file_path, 'w') as output_file:
        for raw_json_filename, scraped_at in raw_scrapes:
            input_file_path = f'{input_base_path}/{raw_json_filename}'
            scraped_unix_timestamp = scraped_at.timestamp()

            error_samples = 0
            filtered_samples = 0
            pass_cnt = 0

            with open(input_file_path, 'r') as input_file:
                for json_str in input_file:
                    # A blank line is not a record; json.loads would raise on it.
                    if not json_str.strip():
                        continue
                    try:
                        # The 'data' lookup is inside the try so that a record
                        # without it is counted as an error instead of aborting.
                        json_data = json.loads(json_str.strip())['data']
                        keep = (
                            filter_condition(json_data, scraped_unix_timestamp)
                            and json_data['id'] not in ids
                        )
                    except KeyError:
                        error_samples += 1
                        continue

                    if not keep:
                        filtered_samples += 1
                        continue

                    # The last line of a file may lack its newline; without this
                    # the next file's first record would be glued onto it.
                    output_file.write(json_str if json_str.endswith('\n') else f'{json_str}\n')
                    ids.add(json_data['id'])
                    pass_cnt += 1
            print(f'{input_file_path}: filtered={filtered_samples}, error={error_samples}, passed={pass_cnt}')
    print(f'total={len(ids)}')


In [26]:
# Build the merged, filtered dataset on disk; one summary line is printed per raw file.
generate_total_dataset()

./kickstarter-raw-jsons/Kickstarter_2016-01-28T09_15_08_781Z.json: filtered=120654, error=690, passed=26085
./kickstarter-raw-jsons/Kickstarter_2017-01-15T22_21_04_985Z.json: filtered=151919, error=649, passed=20324
./kickstarter-raw-jsons/Kickstarter_2018-01-12T10_20_09_196Z.json: filtered=183177, error=1364, passed=12251
./kickstarter-raw-jsons/Kickstarter_2019-01-17T03_20_02_630Z.json: filtered=187928, error=226, passed=19694
./kickstarter-raw-jsons/Kickstarter_2020-01-16T03_20_15_556Z.json: filtered=199137, error=195, passed=11939
./kickstarter-raw-jsons/Kickstarter_2021-01-14T03_20_05_328Z.json: filtered=211058, error=215, passed=10007
./kickstarter-raw-jsons/Kickstarter_2022-01-20T03_20_11_451Z.json: filtered=28074, error=41, passed=1998
./kickstarter-raw-jsons/Kickstarter_2023-01-05T03_20_04_136Z.json: filtered=7180, error=8, passed=244
./kickstarter-raw-jsons/Kickstarter_2024-01-15T14_13_05_649Z.json: filtered=23169, error=40, passed=2748
total=105290


In [28]:
def generate_yr(start_yr, start_mn, start_dy, end_yr, end_mn, end_dy):
    """Copy the records created inside a date window into their own JSON-lines file.

    Reads the merged dataset line by line and keeps every record whose
    ``created_at`` satisfies ``start <= created_at < end``. Each kept record is
    written as exactly one line, so the result can be read back line by line.
    The number of kept records is printed.

    Args:
        start_yr, start_mn, start_dy: first day of the window (inclusive).
        end_yr, end_mn, end_dy: day that ends the window (exclusive).

    Both bounds are built from naive datetimes, so they are interpreted in the
    local timezone of the machine running this.
    """
    start_ts = datetime(start_yr, start_mn, start_dy).timestamp()
    end_ts = datetime(end_yr, end_mn, end_dy).timestamp()
    target_file_path = './kickstarter-filtered-total-json/kickstarter-20150101-20240101.json'
    output_base_path = './kickstarter-filtered-yr-json'
    # NOTE(review): the end date is joined with '=' while the start date uses '-'.
    # It looks like a typo, but the name is left untouched because other code may
    # already open these files by it -- confirm before renaming.
    output_file_path = f'{output_base_path}/kickstarter-{start_yr}-{start_mn}-{start_dy}-{end_yr}={end_mn}={end_dy}.json'
    total_cnt = 0

    with open(output_file_path, 'w') as output_file, open(target_file_path, 'r') as target_file:
        for json_str in target_file:
            record = json_str.strip()
            # A blank line is not a record; json.loads would raise on it.
            if not record:
                continue
            json_data = json.loads(record)['data']
            if start_ts <= json_data['created_at'] < end_ts:
                # Terminate every record with a newline; writing the stripped
                # text alone would concatenate all records onto a single line.
                output_file.write(f'{record}\n')
                total_cnt += 1

    print(f'total: {total_cnt}')


In [29]:
# Split the merged dataset into one file per calendar year, 2015 through 2023:
# each call covers Jan 1 of `yr` up to, but not including, Jan 1 of the next year.
for yr in range(2015, 2024):
    generate_yr(yr, 1, 1, yr + 1, 1, 1)

total: 29800
total: 20113
total: 15761
total: 14660
total: 11553
total: 8842
total: 1762
total: 1169
total: 1630
