# Final Reader for Kickstarter Data

In [1]:
from glob import glob
from flatten_json import flatten
import json
import pandas as pd
import os

In [None]:
# Accumulators for the records that are kept (regular and suspended projects)
rows = []
suspended_rows = []

path = "D:/Downloads/Kickstarter"

# States to drop; each value is the number of rows dropped for that state
# in the file currently being processed
DROP_STATES = {'live': 0, 'started': 0, 'submitted': 0}

# Sets for unique creator IDs, project IDs and row identifiers
creator_ids = set()
project_ids = set()
unique_rows = set()

suspended_creator_ids = set()
suspended_project_ids = set()
suspended_unique_rows = set()

# Counters
creators_counter = 0
skipped_counter = 0
project_counter = 0
suspended_counter = 0


def reset_counters():
    """Zero the per-file counters and the per-state drop tallies.

    ``suspended_counter`` is not reset here, so it keeps accumulating
    across files.
    """
    global creators_counter, project_counter, skipped_counter, DROP_STATES
    creators_counter = project_counter = skipped_counter = 0
    DROP_STATES = {state: 0 for state in DROP_STATES}


def process_record(data):
    """Route one flattened project record to the right accumulator.

    Args:
        data: Flattened record (a dict) providing at least the keys
            ``'creator_id'``, ``'id'``, ``'state'`` and ``'launched_at'``.

    Effects, depending on the lower-cased ``state``:
        * ``'suspended'``: the first occurrence of the row is appended to
          ``suspended_rows``; repeats are ignored. The record never reaches
          ``rows``.
        * a key of ``DROP_STATES``: the record is discarded and that state's
          tally is incremented.
        * anything else: the record is appended to ``rows`` unless the same
          (creator, project, state, launched_at) tuple was already kept, in
          which case ``skipped_counter`` is incremented.
    """
    global creators_counter, project_counter, skipped_counter, suspended_counter

    creator_id = data['creator_id']
    project_id = data['id']
    state = data['state'].lower()
    launched_at = data['launched_at']

    # Built before any check: the suspended test below needs it, and it was
    # previously read there before being assigned.
    row_id = (creator_id, project_id, state, launched_at)

    if state == 'suspended':
        if row_id not in suspended_unique_rows:
            suspended_counter += 1
            suspended_creator_ids.add(creator_id)
            suspended_project_ids.add(project_id)
            suspended_unique_rows.add(row_id)
            suspended_rows.append(data)
        # Always stop here so a repeated suspended record cannot fall
        # through into the main `rows` list.
        return

    # Skip rows with drop states and update the per-state tally
    if state in DROP_STATES:
        DROP_STATES[state] += 1
        return

    # An identical row was already kept. This can only happen for a known
    # creator, because row_id embeds the creator id.
    if row_id in unique_rows:
        skipped_counter += 1
        return

    # Attribute the new row to a first-seen creator or to a new
    # project/state of a creator that is already known.
    if creator_id in creator_ids:
        project_counter += 1
    else:
        creators_counter += 1

    creator_ids.add(creator_id)
    project_ids.add(project_id)
    unique_rows.add(row_id)
    rows.append(data)


filepaths = glob(f"{path}/*.json")
# Process files in the order given by os.path.getctime
filepaths.sort(key=os.path.getctime)

# Iterate through JSON files (one JSON document per line)
for file in filepaths:
    print(f"Processing: {file}")

    with open(file, "r", encoding="utf8") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            if not line.strip():
                continue
            # Flatten the nested 'data' payload and route the record
            process_record(flatten(json.loads(line)['data']))

    # Display counters
    print(f"Added by Creators: {creators_counter}, Added by Project: {project_counter}, Skipped: {skipped_counter}")
    print(f"Skipped rows by state: {', '.join([f'{state}: {count}' for state, count in DROP_STATES.items()])}")
    print('\n')
    # Reset counters for the next file
    reset_counters()
    

Processing: D:/Downloads/Kickstarter\Kickstarter_2024-02-15T03_20_05_339Z.json
Added by Creators: 20922, Added by Project: 2541, Skipped: 562
Skipped rows by state: live: 612, started: 72, submitted: 871


Processing: D:/Downloads/Kickstarter\Kickstarter_2024-01-15T14_13_05_649Z.json
Added by Creators: 696, Added by Project: 182, Skipped: 23738
Skipped rows by state: live: 441, started: 73, submitted: 825


Processing: D:/Downloads/Kickstarter\Kickstarter_2024-03-14T03_20_05_926Z.json
Added by Creators: 293, Added by Project: 99, Skipped: 24049
Skipped rows by state: live: 653, started: 72, submitted: 901


Processing: D:/Downloads/Kickstarter\Kickstarter_2024-04-15T06_47_07_694Z.json
Added by Creators: 148115, Added by Project: 29403, Skipped: 47912
Skipped rows by state: live: 5635, started: 467, submitted: 9829


Processing: D:/Downloads/Kickstarter\Kickstarter_2024-05-15T08_20_53_524Z.json
Added by Creators: 1323, Added by Project: 778, Skipped: 223477
Skipped rows by state: live: 

In [None]:
# Build the main DataFrame from the list of flattened record dicts in `rows`
df = pd.DataFrame(rows)
# Last expression of the cell: displays (n_rows, n_columns)
df.shape

In [None]:
# Unbind the raw list of dicts; `df` is used from here on
# (presumably done to release memory - confirm nothing later needs `rows`)
del rows

In [None]:
# Build a separate DataFrame from the record dicts in `suspended_rows`
suspended_df = pd.DataFrame(suspended_rows)
# Last expression of the cell: displays (n_rows, n_columns)
suspended_df.shape


In [None]:
# Unbind the raw list of dicts; `suspended_df` is used from here on
# (presumably done to release memory - confirm nothing later needs it)
del suspended_rows

Dropping special cases on import

In [None]:
# One row per project id: order by id, then by state_changed_at (both
# ascending), and keep the final row of each id group, i.e. the one with
# the greatest state_changed_at.
df = (
    df.sort_values(by=['id', 'state_changed_at'])
      .drop_duplicates(subset='id', keep='last')
)
print(df.shape)

Export to JSON

In [None]:
# Serialise the main DataFrame as JSON Lines (one record object per line)
# and write the resulting text to disk as UTF-8.
output_path = r"C:\Users\d0tam\kickstarter-JMP\data\kickstarter.json"
json_str = df.to_json(orient='records', lines=True)
with open(output_path, mode='w', encoding='utf-8') as f:
    f.write(json_str)

In [None]:
# Serialise the suspended-projects DataFrame as JSON Lines (one record
# object per line) and write the resulting text to disk as UTF-8.
output_path = r"C:\Users\d0tam\kickstarter-JMP\data\suspended_data.json"
json_str = suspended_df.to_json(orient='records', lines=True)
with open(output_path, mode='w', encoding='utf-8') as f:
    f.write(json_str)