# Final Reader for Kickstarter Data

In [2]:
from glob import glob
from flatten_json import flatten
import json
import pandas as pd

In [3]:
# Directory that is searched for the Kickstarter *.json files
path = "D:/Downloads/Kickstarter"

# Flattened records that survive filtering and de-duplication
rows = []

# States to Drop, each mapped to the number of rows dropped for that state
DROP_STATES = dict.fromkeys(('live', 'started', 'submitted', 'suspended'), 0)

# Sets for Unique Creator and Project IDs, plus the row identifiers already kept
creator_ids, project_ids, unique_rows = set(), set(), set()

# Counters
creators_counter = skipped_counter = project_counter = 0

# Function to reset counters
def reset_counters():
    """Zero the three row tallies and every per-state drop count.

    All four values are module-level globals. ``DROP_STATES`` is rebound to
    a new dict holding the same states in the same order, so a dict obtained
    before the call is left unmodified.
    """
    global DROP_STATES, creators_counter, project_counter, skipped_counter
    creators_counter = 0
    project_counter = 0
    skipped_counter = 0
    DROP_STATES = dict.fromkeys(DROP_STATES, 0)

# Iterate through JSON files.
# Each file is read line by line; every line is expected to be a JSON object
# whose 'data' member describes one project. Rows in a drop state are counted
# and discarded, exact repeats of an already kept row are skipped, and all
# remaining rows are appended to `rows`.
#
# glob() returns paths in arbitrary, OS-dependent order. The first occurrence
# of a row is the one that is kept, so the files are sorted to make the
# result reproducible from run to run.
for file in sorted(glob(f"{path}/*.json")):
    print(f"Processing: {file}")

    with open(file, "r", encoding="utf8") as f:
        for line in f:
            # Blank lines carry no record; skip them rather than letting
            # json.loads raise on an empty document
            if not line.strip():
                continue

            # Flatten JSON and Load Data
            data = flatten(json.loads(line)['data'])
            creator_id = data['creator_id']
            project_id = data['id']
            state = data['state'].lower()
            launched_at = data['launched_at']

            # Skip rows with drop states and update counter
            if state in DROP_STATES:
                DROP_STATES[state] += 1
                continue

            # Create a unique row identifier
            row_id = (creator_id, project_id, state, launched_at)

            # Classify the row. A creator seen for the first time cannot have
            # a previously stored row, so only known creators need the
            # duplicate check.
            if creator_id not in creator_ids:
                creators_counter += 1
            elif row_id in unique_rows:
                skipped_counter += 1
                continue
            else:
                project_counter += 1

            # Record the row (identical bookkeeping for both "added" cases)
            creator_ids.add(creator_id)
            project_ids.add(project_id)
            unique_rows.add(row_id)
            rows.append(data)

    # Display counters
    print(f"Added by Creators: {creators_counter}, Added by Project: {project_counter}, Skipped: {skipped_counter}")
    print(f"Skipped rows by state: {', '.join([f'{state}: {count}' for state, count in DROP_STATES.items()])}")
    print('\n')
    # Reset counters for the next file
    reset_counters()
    

Processing: D:/Downloads/Kickstarter\Kickstarter_2015-11-01T14_09_04_557Z.json
Added by Creators: 45856, Added by Project: 6233, Skipped: 3
Skipped rows by state: live: 5867, started: 0, submitted: 0, suspended: 50


Processing: D:/Downloads/Kickstarter\Kickstarter_2015-12-17T12_09_06_107Z.json
Added by Creators: 96571, Added by Project: 13644, Skipped: 29291
Skipped rows by state: live: 4098, started: 0, submitted: 0, suspended: 659


Processing: D:/Downloads/Kickstarter\Kickstarter_2016-01-28T09_15_08_781Z.json
Added by Creators: 7268, Added by Project: 1352, Skipped: 134082
Skipped rows by state: live: 4041, started: 0, submitted: 0, suspended: 686


Processing: D:/Downloads/Kickstarter\Kickstarter_2016-03-22T07_41_08_591Z.json
Added by Creators: 8663, Added by Project: 1775, Skipped: 136774
Skipped rows by state: live: 4538, started: 0, submitted: 0, suspended: 705


Processing: D:/Downloads/Kickstarter\Kickstarter_2016-04-15T02_09_04_328Z.json
Added by Creators: 3344, Added by Pro

In [4]:
# Build a DataFrame from the collected records in `rows`, one row per record
df = pd.DataFrame(rows)
# Last expression of the cell: shows the (row count, column count) tuple
df.shape

(429467, 140)

In [5]:
# Unbind `rows`; the records remain available through `df`.
# NOTE(review): presumably done to free memory — confirm no later cell needs `rows`.
del rows

In [6]:
# Serialise the whole frame as line-delimited JSON: one record per line
json_str = df.to_json(lines=True, orient='records')

# Write the serialised text to the project's data file in a single call
output_path = r"C:\Users\d0tam\kickstarter-JMP\data\kickstarter.json"
with open(output_path, mode='w', encoding='utf-8') as f:
    f.write(json_str)