In [338]:
# ========================================
# IMPORT LIBRARIES
# ========================================
import pandas as pd
import numpy as np
import json
import dill 
from IPython.display import display
from datetime import datetime

In [None]:
# ========================================
# LOAD/SAVE SESSIONS
# ========================================
#filename = 'notebook_env_analysis.db'
#dill.dump_session(filename)

# and to load the session again:
#dill.load_session(filename)

In [3]:
# ========================================
# READ/EXTRACT RELEVANT DATA
# ========================================
with open("data/Kickstarter_2018-10-18T03_20_48_880Z/Kickstarter_2018-10-18T03_20_48_880Z.json",
          encoding="utf8") as json_file:
    json_obj = [json.loads(line) for line in json_file]

In [4]:
# ---- QUICK EXPLORATION ----

In [4]:
type(json_obj)

list

In [5]:
len(json_obj)

205696

In [6]:
type(json_obj[0])

dict

In [7]:
len(json_obj[0])

4

In [8]:
json_obj[len(json_obj)-1] # look at final entry

{'table_id': 'Kickstarter',
 'robot_id': 'Kickstarter',
 'run_id': 'Kickstarter_2018-10-18T03_20_48_880Z',
 'data': {'id': 1991284368,
  'photo': {'key': 'assets/011/796/998/9d7e9715ca5b8227f29de01f3c4abc8f_original.png',
   'full': 'https://ksr-ugc.imgix.net/assets/011/796/998/9d7e9715ca5b8227f29de01f3c4abc8f_original.png?ixlib=rb-1.1.0&crop=faces&w=560&h=315&fit=crop&v=1463696010&auto=format&frame=1&q=92&s=02368a096ce01fc6cae29ab85df52d53',
   'ed': 'https://ksr-ugc.imgix.net/assets/011/796/998/9d7e9715ca5b8227f29de01f3c4abc8f_original.png?ixlib=rb-1.1.0&crop=faces&w=352&h=198&fit=crop&v=1463696010&auto=format&frame=1&q=92&s=7c87714e96670269be821591da0ddbb1',
   'med': 'https://ksr-ugc.imgix.net/assets/011/796/998/9d7e9715ca5b8227f29de01f3c4abc8f_original.png?ixlib=rb-1.1.0&crop=faces&w=272&h=153&fit=crop&v=1463696010&auto=format&frame=1&q=92&s=7ce6128059b609c7603ab19d113e7c55',
   'little': 'https://ksr-ugc.imgix.net/assets/011/796/998/9d7e9715ca5b8227f29de01f3c4abc8f_original.png?i

In [9]:
json_obj[0].keys()

dict_keys(['table_id', 'robot_id', 'run_id', 'data'])

In [10]:
# The useful keys are 'data'

In [11]:
# Extract relevant data
# pre-allocate
json_obj2 = []
# append 'data' dictionary only
for x in range(0, len(json_obj)):
    json_obj2.append(json_obj[x]["data"])

In [12]:
len(json_obj2) - len(json_obj) # Check that all rows extracted

0

In [13]:
json_obj2[0]

{'id': 895922629,
 'photo': {'key': 'assets/011/911/160/2ff8292ff8ad7486f38ea865a9c030bb_original.jpg',
  'full': 'https://ksr-ugc.imgix.net/assets/011/911/160/2ff8292ff8ad7486f38ea865a9c030bb_original.jpg?ixlib=rb-1.1.0&crop=faces&w=560&h=315&fit=crop&v=1463713991&auto=format&frame=1&q=92&s=846b5db7dd4a36491881770969721fab',
  'ed': 'https://ksr-ugc.imgix.net/assets/011/911/160/2ff8292ff8ad7486f38ea865a9c030bb_original.jpg?ixlib=rb-1.1.0&crop=faces&w=352&h=198&fit=crop&v=1463713991&auto=format&frame=1&q=92&s=c2d76a75d1cd815488d3d3b651b49222',
  'med': 'https://ksr-ugc.imgix.net/assets/011/911/160/2ff8292ff8ad7486f38ea865a9c030bb_original.jpg?ixlib=rb-1.1.0&crop=faces&w=272&h=153&fit=crop&v=1463713991&auto=format&frame=1&q=92&s=f938afb8f27ea25f81b3f57bc8349d88',
  'little': 'https://ksr-ugc.imgix.net/assets/011/911/160/2ff8292ff8ad7486f38ea865a9c030bb_original.jpg?ixlib=rb-1.1.0&crop=faces&w=208&h=117&fit=crop&v=1463713991&auto=format&frame=1&q=92&s=69632b0cf7a30977d7eb58a05f3bd315',
 

In [14]:
# ---- EXTRACT RELEVANT COLUMNS ----
json_obj2[0].keys()

dict_keys(['id', 'photo', 'name', 'blurb', 'goal', 'pledged', 'state', 'slug', 'disable_communication', 'country', 'currency', 'currency_symbol', 'currency_trailing_code', 'deadline', 'state_changed_at', 'created_at', 'launched_at', 'staff_pick', 'is_starrable', 'backers_count', 'static_usd_rate', 'usd_pledged', 'converted_pledged_amount', 'fx_rate', 'current_currency', 'usd_type', 'creator', 'location', 'category', 'profile', 'spotlight', 'urls', 'source_url'])

In [15]:
# Remove probably useless keys as well as and nested-dictionary keys (to be added back later if desired): 
#   - Useless: photo, slug, urls, source_url
#   - Nested: creator, location, category, profile (to maybe be added later)
keys=('id', 'name', 'blurb', 'goal', 'pledged', 'state', 'disable_communication', 'country', 'currency', 'currency_symbol',
      'currency_trailing_code', 'deadline', 'state_changed_at', 'created_at', 'launched_at', 'staff_pick', 'is_starrable',
      'backers_count', 'static_usd_rate', 'usd_pledged', 'converted_pledged_amount', 'fx_rate', 'current_currency', 'usd_type',
      'spotlight')

# pre-allocate
json_obj3 = []
# Append
for x in range(0, len(json_obj2)):
    json_obj3.append({k:json_obj2[x][k] for k in keys})

In [16]:
# ==== ADD BACK USEFUL SUB-DICTIONARY VALUES ====
# Previously removed: 'creator', 'location', 'category', 'profile'

In [17]:
# ---- Explore 'creator' ----

In [18]:
# Grab useful previously removed sub key-value pairs:
json_obj2[0]["creator"].keys()

dict_keys(['id', 'name', 'slug', 'is_registered', 'chosen_currency', 'avatar', 'urls'])

In [19]:
json_obj2[0]["creator"]

{'id': 69089661,
 'name': 'AJ Smith and Brandon Maier (deleted)',
 'slug': 'joziijobs',
 'is_registered': True,
 'chosen_currency': None,
 'avatar': {'thumb': 'https://ksr-ugc.imgix.net/missing_user_avatar.png?ixlib=rb-1.1.0&w=40&h=40&fit=crop&v=&auto=format&frame=1&q=92&s=c8baefb239621e7b5b26957577e078db',
  'small': 'https://ksr-ugc.imgix.net/missing_user_avatar.png?ixlib=rb-1.1.0&w=160&h=160&fit=crop&v=&auto=format&frame=1&q=92&s=fea4448b425bb704dead957d7448b9cc',
  'medium': 'https://ksr-ugc.imgix.net/missing_user_avatar.png?ixlib=rb-1.1.0&w=160&h=160&fit=crop&v=&auto=format&frame=1&q=92&s=fea4448b425bb704dead957d7448b9cc'},
 'urls': {'web': {'user': 'https://www.kickstarter.com/profile/joziijobs'},
  'api': {'user': 'https://api.kickstarter.com/v1/users/69089661?signature=1539920420.2da0e34dfcb087a277252ace8e292e04f47e953d'}}}

In [20]:
# Useful keys are: 'is_registered'

In [25]:
# Add 'is_registered' to json_obj3
for x in range(0, len(json_obj3)):
    json_obj3[x]["creator_registered"] = json_obj2[x]["creator"]["is_registered"]

In [26]:
# ---- Explore 'location' ----

In [27]:
json_obj2[0]["location"].keys()

dict_keys(['id', 'name', 'slug', 'short_name', 'displayable_name', 'localized_name', 'country', 'state', 'type', 'is_root', 'urls'])

In [28]:
json_obj2[0]["location"]

{'id': 12589342,
 'name': 'Manhattan',
 'slug': 'manhattan-ny',
 'short_name': 'Manhattan, NY',
 'displayable_name': 'Manhattan, NY',
 'localized_name': 'Manhattan',
 'country': 'US',
 'state': 'NY',
 'type': 'County',
 'is_root': False,
 'urls': {'web': {'discover': 'https://www.kickstarter.com/discover/places/manhattan-ny',
   'location': 'https://www.kickstarter.com/locations/manhattan-ny'},
  'api': {'nearby_projects': 'https://api.kickstarter.com/v1/discover?signature=1539904398.aae3553d1e62a16aac55da44ac8c71966f64bb86&woe_id=12589342'}}}

In [29]:
# Useful keys: country, state

In [30]:
for x in range(0, len(json_obj3)):
    # Extract country and state data
    if "location" in json_obj2[x]:
        json_obj3[x]["loc_country"] = json_obj2[x]["location"]["country"]
        json_obj3[x]["loc_state"] = json_obj2[x]["location"]["state"]
    # Add NaN when country/state data does not exist
    else:
        json_obj3[x]["loc_country"] = float('NaN')
        json_obj3[x]["loc_state"] = float('NaN')

In [31]:
# ---- Explore 'category' ----

In [32]:
# Grab useful previously removed sub key-value pairs:
json_obj2[0]["category"].keys()

dict_keys(['id', 'name', 'slug', 'position', 'parent_id', 'color', 'urls'])

In [33]:
json_obj2[0]["category"]

{'id': 342,
 'name': 'Web',
 'slug': 'technology/web',
 'position': 15,
 'parent_id': 16,
 'color': 6526716,
 'urls': {'web': {'discover': 'http://www.kickstarter.com/discover/categories/technology/web'}}}

In [34]:
# Useful keys: name, slug, position, parent_id

In [35]:
# Check for null entries
x_category = 0
for x in range(0, len(json_obj2)):
    if "category" in json_obj2[x]:
        x_category = x_category + 1
    else:
        break
x_category - len(json_obj2)

0

In [36]:
json_obj2[0]["category"]

{'id': 342,
 'name': 'Web',
 'slug': 'technology/web',
 'position': 15,
 'parent_id': 16,
 'color': 6526716,
 'urls': {'web': {'discover': 'http://www.kickstarter.com/discover/categories/technology/web'}}}

In [37]:
x_name = 0
for x in range(0, len(json_obj2)):
    if "name" in json_obj2[x]["category"]:
        x_name = x_name + 1
x_name - len(json_obj2)

0

In [38]:
x_slug = 0
for x in range(0, len(json_obj2)):
    if "slug" in json_obj2[x]["category"]:
        x_slug = x_slug + 1
x_slug - len(json_obj2)

0

In [39]:
x_position = 0
for x in range(0, len(json_obj2)):
    if "position" in json_obj2[x]["category"]:
        x_position = x_position + 1
x_position - len(json_obj2)

0

In [40]:
x_parent_id = 0
for x in range(0, len(json_obj2)):
    if "parent_id" in json_obj2[x]["category"]:
        x_parent_id = x_parent_id + 1
x_parent_id - len(json_obj2)

-17378

In [41]:
# NOTE: parent_id has 17378 null values!

In [42]:
for x in range(0, len(json_obj3)):
    # Extract name, slug, position, and parent_id data
    json_obj3[x]["category_name"] = json_obj2[x]["category"]["name"]
    json_obj3[x]["category_slug"] = json_obj2[x]["category"]["slug"]
    json_obj3[x]["category_position"] = json_obj2[x]["category"]["position"]
    if "parent_id" in json_obj2[x]["category"]:
        json_obj3[x]["category_parent_id"] = json_obj2[x]["category"]["parent_id"]
    # Add NaN when data does not exist
    else:
        json_obj3[x]["category_parent_id"] = float('NaN')

In [43]:
json_obj3[0]

{'id': 895922629,
 'name': 'Jozii-College Jobs Made Easy',
 'blurb': 'Jozii is the premier service for full-time university students to find paid part-time work and internships.',
 'goal': 3500,
 'pledged': 50,
 'state': 'failed',
 'disable_communication': False,
 'country': 'US',
 'currency': 'USD',
 'currency_symbol': '$',
 'currency_trailing_code': True,
 'deadline': 1417150740,
 'state_changed_at': 1417150742,
 'created_at': 1413476386,
 'launched_at': 1414525374,
 'staff_pick': False,
 'is_starrable': False,
 'backers_count': 1,
 'static_usd_rate': 1,
 'usd_pledged': '50.0',
 'converted_pledged_amount': 50,
 'fx_rate': 1,
 'current_currency': 'USD',
 'usd_type': 'international',
 'spotlight': False,
 'creator_registered': True,
 'loc_country': 'US',
 'loc_state': 'NY',
 'category_name': 'Web',
 'category_slug': 'technology/web',
 'category_position': 15,
 'category_parent_id': 16}

In [44]:
# ---- Explore 'profile' ----

In [45]:
# Grab useful previously removed sub key-value pairs:
json_obj2[0]["profile"].keys()

dict_keys(['id', 'project_id', 'state', 'state_changed_at', 'name', 'blurb', 'background_color', 'text_color', 'link_background_color', 'link_text_color', 'link_text', 'link_url', 'show_feature_image', 'background_image_opacity', 'should_show_feature_image_section', 'feature_image_attributes'])

In [46]:
json_obj2[56037]["profile"]

{'id': 1841891,
 'project_id': 1841891,
 'state': 'inactive',
 'state_changed_at': 1429068314,
 'name': None,
 'blurb': None,
 'background_color': None,
 'text_color': None,
 'link_background_color': None,
 'link_text_color': None,
 'link_text': None,
 'link_url': None,
 'show_feature_image': False,
 'background_image_opacity': 0.8,
 'should_show_feature_image_section': True,
 'feature_image_attributes': {'image_urls': {'default': 'https://ksr-ugc.imgix.net/assets/012/107/897/fa251d5986d6744d80e917ce8402ae0b_original.jpg?ixlib=rb-1.1.0&crop=faces&w=1552&h=873&fit=crop&v=1463737460&auto=format&frame=1&q=92&s=3417203c5d3a4e034ffc31fa89c1c892',
   'baseball_card': 'https://ksr-ugc.imgix.net/assets/012/107/897/fa251d5986d6744d80e917ce8402ae0b_original.jpg?ixlib=rb-1.1.0&crop=faces&w=560&h=315&fit=crop&v=1463737460&auto=format&frame=1&q=92&s=ddce863a58a3fae73b17d0e6610a85a2'}}}

In [47]:
json_obj2[4]["profile"]["state"]

'active'

In [48]:
# check to see how many profile states are 'inactive'
x_inactive = 0
for x in range(0, len(json_obj2)):
    if json_obj2[x]["profile"]["state"] == 'inactive':
        x_inactive += 1
x_inactive

181513

In [49]:
# check to see how many profile states are 'active'
x_active = 0
for x in range(0, len(json_obj2)):
    if json_obj2[x]["profile"]["state"] == 'active':
        x_active += 1
x_active

24183

In [50]:
len(json_obj2) - x_active - x_inactive

0

In [51]:
# There are only two options for state: active and inactive - there are many inactive profile states

In [52]:
# Useful keys: none
# This is a bit of a guess. Most of the profile states are labeled as 'inactive'.
# My guess is that the profiles go latent once a projct is finished (perhaps regardless of whether it was successful or not)

In [53]:
# ---- CONVERT TO DATA FRAME ----

In [54]:
json_obj3[0]

{'id': 895922629,
 'name': 'Jozii-College Jobs Made Easy',
 'blurb': 'Jozii is the premier service for full-time university students to find paid part-time work and internships.',
 'goal': 3500,
 'pledged': 50,
 'state': 'failed',
 'disable_communication': False,
 'country': 'US',
 'currency': 'USD',
 'currency_symbol': '$',
 'currency_trailing_code': True,
 'deadline': 1417150740,
 'state_changed_at': 1417150742,
 'created_at': 1413476386,
 'launched_at': 1414525374,
 'staff_pick': False,
 'is_starrable': False,
 'backers_count': 1,
 'static_usd_rate': 1,
 'usd_pledged': '50.0',
 'converted_pledged_amount': 50,
 'fx_rate': 1,
 'current_currency': 'USD',
 'usd_type': 'international',
 'spotlight': False,
 'creator_registered': True,
 'loc_country': 'US',
 'loc_state': 'NY',
 'category_name': 'Web',
 'category_slug': 'technology/web',
 'category_position': 15,
 'category_parent_id': 16}

In [55]:
len(json_obj3)

205696

In [56]:
df = pd.DataFrame.from_records(json_obj3)
df.head()

Unnamed: 0,backers_count,blurb,category_name,category_parent_id,category_position,category_slug,converted_pledged_amount,country,created_at,creator_registered,...,loc_state,name,pledged,spotlight,staff_pick,state,state_changed_at,static_usd_rate,usd_pledged,usd_type
0,1,Jozii is the premier service for full-time uni...,Web,16.0,15,technology/web,50,US,1413476386,True,...,NY,Jozii-College Jobs Made Easy,50.0,False,False,failed,1417150742,1.0,50.0,international
1,63,Dropsy is an adventure game with a focus on ex...,Video Games,12.0,7,games/video games,1613,US,1317217452,True,...,FL,DROPSY: A different take on the old school adv...,1613.47,True,True,successful,1320259153,1.0,1613.47,international
2,113,We've bought a press and already found a studi...,Letterpress,18.0,7,publishing/letterpress,6633,US,1403324529,True,...,OR,Letra Chueca / Crooked Letter: a printshop & b...,6633.0,True,False,successful,1410502338,1.0,6633.0,international
3,1,"Premium, fashion sneaker brand intertwining hi...",Footwear,9.0,5,fashion/footwear,1,GB,1457103832,True,...,England,High End Premium Handmade Sneakers,1.0,False,False,failed,1462305856,1.407899,1.40789944,international
4,99,Made in Europe from the finest quality leather...,Footwear,9.0,5,fashion/footwear,20654,CA,1423776567,True,...,ON,The Domenico Sneakerboot by Luigi Sardo Shoes,25338.0,True,False,successful,1429678899,0.782122,19817.40571572,international


In [57]:
df.tail()

Unnamed: 0,backers_count,blurb,category_name,category_parent_id,category_position,category_slug,converted_pledged_amount,country,created_at,creator_registered,...,loc_state,name,pledged,spotlight,staff_pick,state,state_changed_at,static_usd_rate,usd_pledged,usd_type
205691,66,Re: Magazine is the Savannah College of Art an...,Periodicals,18.0,10,publishing/periodicals,4846,US,1329261163,True,...,GA,Re: Magazine,4846.79,True,True,successful,1330664342,1.0,4846.79,international
205692,113,"Due to popularity, demand, and based on feedba...",Events,3.0,3,comics/events,8596,GB,1389612547,True,...,England,Melksham Comic Con 2014 - The Expansion!,5186.5,True,False,successful,1396700229,1.668265,8652.45860083,international
205693,259,An epic and timeless hand illustrated deck of ...,Playing Cards,12.0,4,games/playing cards,7623,US,1409509864,True,...,FL,A Deck of Skeletons. Vintage Playing Cards.,7623.0,True,False,successful,1412891110,1.0,7623.0,international
205694,2,The slasher film has returned.,Horror,11.0,10,film & video/horror,80,US,1475388732,True,...,OH,I'm Scared - Inde Film,80.0,False,False,failed,1477985542,1.0,80.0,international
205695,3,We are opening what we hope will be a wonderfu...,Restaurants,10.0,9,food/restaurants,35,US,1405914894,True,...,TX,Get Hula Cowgirl Shaved Ice Over the Last Bump!,35.0,False,False,failed,1408057020,1.0,35.0,international


In [58]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205696 entries, 0 to 205695
Data columns (total 32 columns):
backers_count               205696 non-null int64
blurb                       205696 non-null object
category_name               205696 non-null object
category_parent_id          188318 non-null float64
category_position           205696 non-null int64
category_slug               205696 non-null object
converted_pledged_amount    205696 non-null int64
country                     205696 non-null object
created_at                  205696 non-null int64
creator_registered          205696 non-null bool
currency                    205696 non-null object
currency_symbol             205696 non-null object
currency_trailing_code      205696 non-null bool
current_currency            205696 non-null object
deadline                    205696 non-null int64
disable_communication       205696 non-null bool
fx_rate                     205696 non-null float64
goal                        205

In [59]:
df.describe()

Unnamed: 0,backers_count,category_parent_id,category_position,converted_pledged_amount,created_at,deadline,fx_rate,goal,id,launched_at,pledged,state_changed_at,static_usd_rate
count,205696.0,188318.0,205696.0,205696.0,205696.0,205696.0,205696.0,205696.0,205696.0,205696.0,205696.0,205696.0,205696.0
mean,146.106132,11.782331,7.137514,12233.41,1412305000.0,1418814000.0,1.003671,45134.57,1074607000.0,1415885000.0,13730.92,1418616000.0,1.017976
std,1033.31731,5.540618,4.575614,92569.58,70254630.0,70417520.0,0.197427,1141468.0,619090700.0,70564120.0,134613.5,70182380.0,0.201805
min,0.0,1.0,1.0,0.0,1240335000.0,1241334000.0,0.008886,0.01,18520.0,1240603000.0,0.0,1241334000.0,0.008771
25%,4.0,10.0,3.0,110.0,1358319000.0,1365907000.0,1.0,1500.0,537952800.0,1363131000.0,110.0,1365890000.0,1.0
50%,27.0,12.0,6.0,1558.0,1415725000.0,1423431000.0,1.0,5000.0,1077976000.0,1420672000.0,1560.32,1423275000.0,1.0
75%,85.0,16.0,10.0,6220.0,1463547000.0,1470328000.0,1.0,13400.0,1608983000.0,1467530000.0,6300.0,1470191000.0,1.0
max,105857.0,26.0,19.0,10266840.0,1539821000.0,1545013000.0,10.268501,100000000.0,2147476000.0,1539838000.0,23343870.0,1539839000.0,1.716408


In [62]:
# ========================================
# CLEAN DATA
# ========================================

In [63]:
# ---- REARRANGE COLUMNS ----
df.columns

Index(['backers_count', 'blurb', 'category_name', 'category_parent_id',
       'category_position', 'category_slug', 'converted_pledged_amount',
       'country', 'created_at', 'creator_registered', 'currency',
       'currency_symbol', 'currency_trailing_code', 'current_currency',
       'deadline', 'disable_communication', 'fx_rate', 'goal', 'id',
       'is_starrable', 'launched_at', 'loc_country', 'loc_state', 'name',
       'pledged', 'spotlight', 'staff_pick', 'state', 'state_changed_at',
       'static_usd_rate', 'usd_pledged', 'usd_type'],
      dtype='object')

In [64]:
json_obj3[0].keys()

dict_keys(['id', 'name', 'blurb', 'goal', 'pledged', 'state', 'disable_communication', 'country', 'currency', 'currency_symbol', 'currency_trailing_code', 'deadline', 'state_changed_at', 'created_at', 'launched_at', 'staff_pick', 'is_starrable', 'backers_count', 'static_usd_rate', 'usd_pledged', 'converted_pledged_amount', 'fx_rate', 'current_currency', 'usd_type', 'spotlight', 'creator_registered', 'loc_country', 'loc_state', 'category_name', 'category_slug', 'category_position', 'category_parent_id'])

In [None]:
# The df keys got rearranged into alphabetical order.

In [65]:
# Re-order columns
df = df[['id', 'name', 'blurb', 'category_name', 'category_slug', 'category_position', 'category_parent_id', 'goal', 'pledged', 
         'disable_communication', 'loc_country', 'loc_state', 'country', 'currency', 'currency_symbol', 'currency_trailing_code', 'deadline', 
         'state_changed_at', 'created_at', 'launched_at', 'staff_pick', 'is_starrable', 'backers_count', 'static_usd_rate', 
         'usd_pledged', 'converted_pledged_amount', 'fx_rate', 'current_currency', 'usd_type', 'spotlight', 
         'creator_registered', 'state']]

In [66]:
df.columns

Index(['id', 'name', 'blurb', 'category_name', 'category_slug',
       'category_position', 'category_parent_id', 'goal', 'pledged',
       'disable_communication', 'loc_country', 'loc_state', 'country',
       'currency', 'currency_symbol', 'currency_trailing_code', 'deadline',
       'state_changed_at', 'created_at', 'launched_at', 'staff_pick',
       'is_starrable', 'backers_count', 'static_usd_rate', 'usd_pledged',
       'converted_pledged_amount', 'fx_rate', 'current_currency', 'usd_type',
       'spotlight', 'creator_registered', 'state'],
      dtype='object')

In [67]:
len(df.columns)

32

In [68]:
# ---- REMOVE DUPLICATES ----

In [69]:
df.sort_values(by=["backers_count"],ascending=False)[["id","name","backers_count"]]

Unnamed: 0,id,name,backers_count
69279,557230947,"Bring Reading Rainbow Back for Every Child, Ev...",105857
188496,557230947,"Bring Reading Rainbow Back for Every Child, Ev...",105857
150213,1755266685,The Veronica Mars Movie Project,91585
13732,1929840910,Double Fine Adventure,87142
53281,1929840910,Double Fine Adventure,87142
129025,286165030,Torment: Tides of Numenera,74405
136791,286165030,Torment: Tides of Numenera,74405
186022,458565935,Project Eternity,73986
141451,458565935,Project Eternity,73986
174366,1194278206,Yooka-Laylee - A 3D Platformer Rare-vival!,73206


In [70]:
df.drop_duplicates(inplace=True)

In [71]:
df.shape

(189240, 32)

In [72]:
# We dropped 205696-189240=16456 duplicate rows

In [73]:
# Search for more duplicates
len(df["id"].unique()) - len(df)

-2166

In [74]:
# There are still 2166 duplicate IDs

In [75]:
# Explore the duplicate ID rows
pd.concat(g for _, g in df.groupby("id") if len(g) > 1)

Unnamed: 0,id,name,blurb,category_name,category_slug,category_position,category_parent_id,goal,pledged,disable_communication,...,backers_count,static_usd_rate,usd_pledged,converted_pledged_amount,fx_rate,current_currency,usd_type,spotlight,creator_registered,state
10531,422207,The Arduino Compatible Educational Sensor Lear...,This arduino / Raspberry Pi compatible board s...,Hardware,technology/hardware,8,16.0,1200.0,7728.00,False,...,180,0.911649,7045.22602224,6951,0.771892,USD,international,True,True,successful
69099,422207,The Arduino Compatible Educational Sensor Lear...,This arduino / Raspberry Pi compatible board s...,Hardware,technology/hardware,8,16.0,1200.0,7728.00,False,...,180,0.911649,7045.22602224,6951,0.766650,USD,international,True,True,successful
184333,1033120,Fields of Fire: Miniatures rules for modern co...,"With this simple, yet realistic rules compilat...",Tabletop Games,games/tabletop games,6,12.0,2500.0,2575.00,False,...,35,1.000000,2575.0,2575,1.000000,USD,international,True,True,successful
204054,1033120,Fields of Fire: Miniatures rules for modern co...,"With this simple, yet realistic rules compilat...",Tabletop Games,games/tabletop games,6,12.0,2500.0,2575.00,False,...,35,1.000000,2575.0,3358,1.304376,CAD,,True,True,successful
83960,2705005,"LOVE MEI waterproof case, protect iPhone X/Xs ...","This IP68 Waterproof Case for iPhone X/Xs, wit...",Gadgets,technology/gadgets,7,16.0,24000.0,1110.00,False,...,11,0.128052,142.1373093,141,0.127570,USD,international,False,True,live
203854,2705005,"LOVE MEI waterproof case, protect iPhone X/Xs ...","This IP68 Waterproof Case for iPhone X/Xs, wit...",Gadgets,technology/gadgets,7,16.0,24000.0,1110.00,False,...,11,0.128006,142.0867266,141,0.127570,USD,international,False,True,live
67679,4534946,'Skin Walkers' knit FMP,I'm a third year student currently designing &...,Couture,fashion/couture,4,9.0,50.0,85.00,False,...,6,1.683130,143.06609165,144,1.317483,USD,international,True,True,successful
160239,4534946,'Skin Walkers' knit FMP,I'm a third year student currently designing &...,Couture,fashion/couture,4,9.0,50.0,85.00,False,...,6,1.683130,143.06609165,145,1.708667,CAD,,True,True,successful
84917,5090855,Charmed Flying Broom,A spooky horror Halloween old school 3D platfo...,Video Games,games/video games,7,12.0,250000.0,418.80,False,...,2,0.053033,22.210417236,22,0.052969,USD,domestic,False,True,live
105652,5090855,Charmed Flying Broom,A spooky horror Halloween old school 3D platfo...,Video Games,games/video games,7,12.0,250000.0,418.80,False,...,2,0.053033,22.210417236,33,0.080902,NZD,,False,True,live


In [76]:
# It looks like some of the rows look different due to usd_pledged, converted_pledged_amount, fx_rate (which appears to be 
# exchange rate), and current_currency. Let's just delete the following currency-related columns:
# currency_symbol, static_usd_rate, convertd_pledged_amount, fx_rate, current_currency, usd_type
df.drop(columns=['currency_symbol','static_usd_rate','converted_pledged_amount','fx_rate','current_currency','usd_type'], 
        inplace=True)

In [77]:
df.shape

(189240, 26)

In [78]:
df.columns

Index(['id', 'name', 'blurb', 'category_name', 'category_slug',
       'category_position', 'category_parent_id', 'goal', 'pledged',
       'disable_communication', 'loc_country', 'loc_state', 'country',
       'currency', 'currency_trailing_code', 'deadline', 'state_changed_at',
       'created_at', 'launched_at', 'staff_pick', 'is_starrable',
       'backers_count', 'usd_pledged', 'spotlight', 'creator_registered',
       'state'],
      dtype='object')

In [79]:
df.drop_duplicates(inplace=True)

In [80]:
df.shape

(187367, 26)

In [81]:
# We dropped 189240-187367=1873 duplicate rows

In [82]:
# Search for more duplicates
len(df["id"].unique()) - len(df)

-293

In [83]:
# There are still 293 duplicate ID rows

In [84]:
pd.options.display.max_columns = None

In [85]:
df.loc[[171904,205007]]

Unnamed: 0,id,name,blurb,category_name,category_slug,category_position,category_parent_id,goal,pledged,disable_communication,loc_country,loc_state,country,currency,currency_trailing_code,deadline,state_changed_at,created_at,launched_at,staff_pick,is_starrable,backers_count,usd_pledged,spotlight,creator_registered,state
171904,8412272,The ultimate music performance machine for stu...,Pipes is a new audio platform which out-horsep...,Hardware,technology/hardware,8,16.0,100000.0,106609.0,False,US,CA,US,USD,True,1542234085,1539638486,1537803552,1539638485,True,True,243,106609.0,False,True,live
205007,8412272,The ultimate music performance machine for stu...,Pipes is a new audio platform which out-horsep...,Hardware,technology/hardware,8,16.0,100000.0,107407.0,False,US,CA,US,USD,True,1542234085,1539638486,1537803552,1539638485,True,True,245,107407.0,False,True,live


In [154]:
# It looks like there are differences with pledged, backers_count and usd_pledged

In [86]:
# usd_pledged is redundant with pledged - remove usd_pledged
df.drop(columns=['usd_pledged'], inplace=True)

In [87]:
df.shape

(187367, 25)

In [88]:
df.drop_duplicates(inplace=True)

In [89]:
df.shape

(187345, 25)

In [90]:
# we dropped 187367-187345=22 duplicate rows

In [91]:
# Search for more duplicates
len(df["id"].unique()) - len(df)

-271

In [92]:
# There are still 271 duplicates

In [93]:
# is_starrable is unclear and seems to be a problem with duplicates. Let's drop it
df.drop(columns=['is_starrable'], inplace=True)

In [94]:
df.shape

(187345, 24)

In [95]:
df.drop_duplicates(inplace=True)

In [96]:
df.shape

(187344, 24)

In [97]:
# Search for more duplicates
len(df["id"].unique()) - len(df)

-270

In [199]:
# There are still 270 duplicates

In [98]:
df.loc[[171904,205007]]

Unnamed: 0,id,name,blurb,category_name,category_slug,category_position,category_parent_id,goal,pledged,disable_communication,loc_country,loc_state,country,currency,currency_trailing_code,deadline,state_changed_at,created_at,launched_at,staff_pick,backers_count,spotlight,creator_registered,state
171904,8412272,The ultimate music performance machine for stu...,Pipes is a new audio platform which out-horsep...,Hardware,technology/hardware,8,16.0,100000.0,106609.0,False,US,CA,US,USD,True,1542234085,1539638486,1537803552,1539638485,True,243,False,True,live
205007,8412272,The ultimate music performance machine for stu...,Pipes is a new audio platform which out-horsep...,Hardware,technology/hardware,8,16.0,100000.0,107407.0,False,US,CA,US,USD,True,1542234085,1539638486,1537803552,1539638485,True,245,False,True,live


In [101]:
# Still differences between pledged and backers_count. Without some sort of a time stamp we don't know which one is the most
# updated row. Let's look for duplicate IDs, then keep the rows with the largest pledge values

In [102]:
df = df.sort_values('pledged', ascending=False).drop_duplicates('id').sort_index()

In [100]:
df.loc[df["id"] == 8412272].pledged

205007    107407.0
Name: pledged, dtype: float64

In [235]:
# Looks like it worked! There is only one entry for ID 8412272 and it is the larger pledge of the two initial entries.

In [108]:
# Search for more duplicates
len(df["id"].unique()) - len(df)

0

In [109]:
# No more duplicates!

In [110]:
df.shape

(187074, 24)

In [111]:
df.columns

Index(['id', 'name', 'blurb', 'category_name', 'category_slug',
       'category_position', 'category_parent_id', 'goal', 'pledged',
       'disable_communication', 'loc_country', 'loc_state', 'country',
       'currency', 'currency_trailing_code', 'deadline', 'state_changed_at',
       'created_at', 'launched_at', 'staff_pick', 'backers_count', 'spotlight',
       'creator_registered', 'state'],
      dtype='object')

In [116]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 187074 entries, 0 to 205695
Data columns (total 24 columns):
id                        187074 non-null int64
name                      187074 non-null object
blurb                     187074 non-null object
category_name             187074 non-null object
category_slug             187074 non-null object
category_position         187074 non-null int64
category_parent_id        169696 non-null float64
goal                      187074 non-null float64
pledged                   187074 non-null float64
disable_communication     187074 non-null bool
loc_country               186198 non-null object
loc_state                 186155 non-null object
country                   187074 non-null object
currency                  187074 non-null object
currency_trailing_code    187074 non-null bool
deadline                  187074 non-null int64
state_changed_at          187074 non-null int64
created_at                187074 non-null int64
launched_at  

In [117]:
# Re-index
df.reset_index(drop=True, inplace=True)

In [118]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 187074 entries, 0 to 187073
Data columns (total 24 columns):
id                        187074 non-null int64
name                      187074 non-null object
blurb                     187074 non-null object
category_name             187074 non-null object
category_slug             187074 non-null object
category_position         187074 non-null int64
category_parent_id        169696 non-null float64
goal                      187074 non-null float64
pledged                   187074 non-null float64
disable_communication     187074 non-null bool
loc_country               186198 non-null object
loc_state                 186155 non-null object
country                   187074 non-null object
currency                  187074 non-null object
currency_trailing_code    187074 non-null bool
deadline                  187074 non-null int64
state_changed_at          187074 non-null int64
created_at                187074 non-null int64
launched_at  

In [None]:
df.to_csv('analysis.csv', sep=',')

In [261]:
# ---- FURTHER REFINE VARIABLES ----

In [125]:
df.columns

Index(['id', 'name', 'blurb', 'category_name', 'category_slug',
       'category_position', 'category_parent_id', 'goal', 'pledged',
       'disable_communication', 'loc_country', 'loc_state', 'country',
       'currency', 'currency_trailing_code', 'deadline', 'state_changed_at',
       'created_at', 'launched_at', 'staff_pick', 'backers_count', 'spotlight',
       'creator_registered', 'state'],
      dtype='object')

# variable notes

## delete
* blurb - delete
* loc_state - delete (too granular)
* country - delete (not sure how it differs from loc_country; largely redundant)
* currency - delete (pledges is in usd)
* currency_trailing_code - delete (what is it?)
* state_changed_at - delete

## keep
* id - keep (primary key)
* name - keep (for reference)
* category_name - keep; change to sub_category
* category slug - keep; extract first word; change to category
* category_position - keep; change to sub_category_id; move before sub_category
* category_parent_id - keep; change to category_id; move before category
* goal - keep
* pledged - keep
* disable_communication - keep (but what is it??)
* loc_country - keep (where the project is launche from); rename as country
* deadline - keep; convert to datetime
* created_at - keep; convert to datetime
* launched_at - keep; convert to datetime
* staff_pick - keep
* backers_count - keep; move before pledged
* spotlight - keep (but what is it?)
* creator_registered - keep
* state - keep

In [130]:
df.drop(columns=['blurb','loc_state','country','currency','currency_trailing_code','state_changed_at'], inplace=True)

In [133]:
len(df.columns)

18

In [136]:
# Rename columns
df.rename(columns={'category_name':'sub_category', 'category_slug':'category', 'category_position':'sub_category_id', 
                  'category_parent_id':'category_id', 'loc_country':'country', 'state':'launch_state'}, inplace=True)

In [138]:
df.columns

Index(['id', 'name', 'sub_category', 'category', 'sub_category_id',
       'category_id', 'goal', 'pledged', 'disable_communication', 'country',
       'deadline', 'created_at', 'launched_at', 'staff_pick', 'backers_count',
       'spotlight', 'creator_registered', 'launch_state'],
      dtype='object')

In [139]:
# Re-arrange columns
df = df[['id', 'name', 'sub_category_id', 'sub_category', 'category_id', 'category', 'goal', 'backers_count', 'pledged', 
         'disable_communication', 'country','deadline', 'created_at', 'launched_at', 'staff_pick', 'spotlight', 
         'creator_registered', 'launch_state']]

In [140]:
df.columns

Index(['id', 'name', 'sub_category_id', 'sub_category', 'category_id',
       'category', 'goal', 'backers_count', 'pledged', 'disable_communication',
       'country', 'deadline', 'created_at', 'launched_at', 'staff_pick',
       'spotlight', 'creator_registered', 'launch_state'],
      dtype='object')

In [144]:
df.sort_values(by=['category_id','sub_category_id'])

Unnamed: 0,id,name,sub_category_id,sub_category,category_id,category,goal,backers_count,pledged,disable_communication,country,deadline,created_at,launched_at,staff_pick,spotlight,creator_registered,launch_state
203,710268515,Decker-Smith Pottery Returns!,1,Ceramics,1.0,art/ceramics,2425.00,103,7550.00,False,US,1425855600,1424404478,1424460276,True,True,True,successful
221,367139629,One Piece fan.. full collection!,1,Ceramics,1.0,art/ceramics,85.00,0,0.00,False,BE,1442427120,1439830504,1439835120,False,False,True,failed
237,2144529772,WilcoxsonBrooklynCeramics,1,Ceramics,1.0,art/ceramics,20000.00,9,276.00,False,US,1452962031,1449592381,1450370031,False,False,True,failed
252,703029125,Personalised & Hand-glazed ceramics.,1,Ceramics,1.0,art/ceramics,4500.00,23,533.00,False,GB,1501579528,1495788869,1498987528,False,False,True,failed
456,1480803309,A Commemorative Beer Stein - Inspired by Brett...,1,Ceramics,1.0,art/ceramics,10000.00,12,384.00,False,US,1541032247,1538439659,1538675280,False,False,True,live
481,1455271842,BRICK Ceramics Studio Opening!,1,Ceramics,1.0,art/ceramics,15000.00,119,15299.00,False,US,1430981940,1423575259,1428320006,False,True,True,successful
519,1460620306,Alisha's Pottery Studio,1,Ceramics,1.0,art/ceramics,700.00,8,345.00,False,US,1448156043,1443735046,1445560443,False,False,True,failed
627,995319524,JUMPING CREEK POTTERY STUDIO EXPANSION,1,Ceramics,1.0,art/ceramics,25000.00,215,26229.00,False,CA,1426120620,1422502366,1423532220,True,True,True,successful
649,2111622512,Michelle Dziadkowicz Ceramics Studio - A place...,1,Ceramics,1.0,art/ceramics,30000.00,14,1775.00,False,US,1431759540,1426282073,1426612843,False,False,True,canceled
658,829968813,Launching BK Clay Cafe In 60 Days + Coffee Hou...,1,Ceramics,1.0,art/ceramics,3000.00,8,482.00,False,US,1413202144,1409803618,1410610144,False,False,True,failed


In [149]:
len(df.sub_category_id.unique())

19

In [150]:
len(df.sub_category.unique())

159

In [153]:
len(df.category_id.unique())

16

In [154]:
len(df.category.unique())

169

In [155]:
# The number of category IDs does not match the category titles (same for sub-).

In [165]:
df.sort_values(by='sub_category_id').sub_category_id.unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19], dtype=int64)

In [170]:
df[df.sub_category_id == 1].sub_category.unique()

array(['Audio', 'Animals', 'Candles', 'Bacon', 'Accessories', 'Art',
       'Ceramics', 'Performances', 'Gaming Hardware', '3D Printing',
       'Action', 'Comedy', 'Anthologies', 'Academic', 'Blues',
       'Architecture'], dtype=object)

In [171]:
df[df.sub_category_id == 2].sub_category.unique()

array(['Animation', 'Experimental', 'Comics', 'Comic Books',
       'Anthologies', 'Apps', 'Conceptual Art', 'Residencies', 'Crochet',
       'Photo', 'Fine Art', 'Apparel', 'Civic Design',
       'Community Gardens', 'Chiptune', 'Live Games'], dtype=object)

In [172]:
df[df.sub_category_id == 3].sub_category.unique()

array(['Spaces', 'Crafts', 'Camera Equipment', 'Comedy', 'Nature', 'DIY',
       'Art Books', 'Digital Art', 'Mobile Games', 'Childrenswear',
       'Classical Music', 'Festivals', 'Events', 'Cookbooks', 'Print',
       'Graphic Design'], dtype=object)

In [173]:
df[df.sub_category_id == 4].sub_category.unique()

array(['Embroidery', 'Playing Cards', 'Illustration', 'Calendars',
       'Graphic Novels', 'DIY Electronics', 'Couture', 'Video', 'Dance',
       'People', 'Immersive', 'Documentary', 'Drinks',
       'Interactive Design', 'Workshops', 'Comedy'], dtype=object)

In [174]:
df[df.sub_category_id == 5].sub_category.unique()

array(['Footwear', 'Webcomics', 'Photobooks', 'Musical',
       "Children's Books", 'Country & Folk', 'Puzzles',
       'Fabrication Tools', 'Glass', 'Product Design', 'Design',
       'Installations', 'Drama', 'Events', 'Web'], dtype=object)

In [None]:
# there does not seem to be a strong correlation between sub_category_id and sub_category. Let's drop the sub_category_id 
# and (maybe) keep sub_category

In [198]:
df.sort_values(by='category_id').category_id.unique()

array([ 1.,  3.,  6.,  7.,  9., 10., 11., 12., 13., 14., 15., 16., 17.,
       18., 26., nan])

In [176]:
df[df.category_id == 1].category.unique()

array(['art/painting', 'art/textiles', 'art/illustration',
       'art/mixed media', 'art/ceramics', 'art/sculpture',
       'art/digital art', 'art/conceptual art', 'art/public art',
       'art/performance art', 'art/installations', 'art/video art'],
      dtype=object)

In [178]:
df[df.category_id == 3].category.unique()

array(['comics/webcomics', 'comics/comic books', 'comics/graphic novels',
       'comics/events', 'comics/anthologies'], dtype=object)

In [179]:
df[df.category_id == 6].category.unique()

array(['dance/spaces', 'dance/performances', 'dance/residencies',
       'dance/workshops'], dtype=object)

In [180]:
df[df.category_id == 7].category.unique()

array(['design/product design', 'design/civic design',
       'design/interactive design', 'design/typography',
       'design/graphic design', 'design/architecture'], dtype=object)

In [181]:
df[df.category_id == 9].category.unique()

array(['fashion/footwear', 'fashion/accessories', 'fashion/ready-to-wear',
       'fashion/couture', 'fashion/childrenswear', 'fashion/pet fashion',
       'fashion/apparel', 'fashion/jewelry'], dtype=object)

In [182]:
df[df.category_id == 10].category.unique()

array(['food/bacon', 'food/vegan', 'food/restaurants', 'food/food trucks',
       'food/cookbooks', 'food/small batch', 'food/farms', 'food/events',
       'food/drinks', 'food/spaces', 'food/community gardens',
       "food/farmer's markets"], dtype=object)

In [183]:
df[df.category_id == 11].category.unique()

array(['film & video/animation', 'film & video/family',
       'film & video/comedy', 'film & video/horror',
       'film & video/fantasy', 'film & video/music videos',
       'film & video/narrative film', 'film & video/movie theaters',
       'film & video/action', 'film & video/romance',
       'film & video/webseries', 'film & video/shorts',
       'film & video/festivals', 'film & video/drama',
       'film & video/documentary', 'film & video/thrillers',
       'film & video/television', 'film & video/experimental',
       'film & video/science fiction'], dtype=object)

In [184]:
df[df.category_id == 12].category.unique()

array(['games/video games', 'games/playing cards', 'games/tabletop games',
       'games/mobile games', 'games/gaming hardware', 'games/puzzles',
       'games/live games'], dtype=object)

In [185]:
df[df.category_id == 13].category.unique()

array(['journalism/audio', 'journalism/video', 'journalism/photo',
       'journalism/print', 'journalism/web'], dtype=object)

In [186]:
df[df.category_id == 14].category.unique()

array(['music/jazz', 'music/electronic music', 'music/faith', 'music/pop',
       'music/metal', 'music/r&b', 'music/kids', 'music/hip-hop',
       'music/country & folk', 'music/rock', 'music/classical music',
       'music/indie rock', 'music/world music', 'music/latin',
       'music/punk', 'music/blues', 'music/comedy', 'music/chiptune'],
      dtype=object)

In [187]:
df[df.category_id == 15].category.unique()

array(['photography/photobooks', 'photography/animals',
       'photography/places', 'photography/nature', 'photography/fine art',
       'photography/people'], dtype=object)

In [188]:
df[df.category_id == 16].category.unique()

array(['technology/web', 'technology/hardware',
       'technology/camera equipment', 'technology/software',
       'technology/wearables', 'technology/makerspaces',
       'technology/sound', 'technology/robots', 'technology/apps',
       'technology/diy electronics', 'technology/gadgets',
       'technology/3d printing', 'technology/fabrication tools',
       'technology/space exploration', 'technology/flight'], dtype=object)

In [189]:
df[df.category_id == 17].category.unique()

array(['theater/experimental', 'theater/spaces', 'theater/musical',
       'theater/plays', 'theater/festivals', 'theater/comedy',
       'theater/immersive'], dtype=object)

In [191]:
df[df.category_id == 18].category.unique()

array(['publishing/letterpress', 'publishing/periodicals',
       'publishing/radio & podcasts', "publishing/children's books",
       'publishing/nonfiction', 'publishing/anthologies',
       'publishing/fiction', 'publishing/poetry', 'publishing/calendars',
       'publishing/art books', 'publishing/literary journals',
       'publishing/translations', 'publishing/literary spaces',
       'publishing/zines', 'publishing/academic',
       'publishing/young adult', 'publishing/comedy'], dtype=object)

In [193]:
df[df.category_id == 26].category.unique()

array(['crafts/woodworking', 'crafts/quilts', 'crafts/embroidery',
       'crafts/knitting', 'crafts/candles', 'crafts/diy',
       'crafts/stationery', 'crafts/crochet', 'crafts/pottery',
       'crafts/glass', 'crafts/weaving', 'crafts/printing',
       'crafts/taxidermy'], dtype=object)

In [229]:
df[np.isnan(df.category_id)].category.unique()

array(['theater', 'photography', 'crafts', 'comics', 'technology', 'art',
       'publishing', 'music', 'dance', 'fashion', 'design', 'games',
       'film & video', 'journalism', 'food'], dtype=object)

In [232]:
df.sort_values('sub_category').sub_category.unique()

array(['3D Printing', 'Academic', 'Accessories', 'Action', 'Animals',
       'Animation', 'Anthologies', 'Apparel', 'Apps', 'Architecture',
       'Art', 'Art Books', 'Audio', 'Bacon', 'Blues', 'Calendars',
       'Camera Equipment', 'Candles', 'Ceramics', "Children's Books",
       'Childrenswear', 'Chiptune', 'Civic Design', 'Classical Music',
       'Comedy', 'Comic Books', 'Comics', 'Community Gardens',
       'Conceptual Art', 'Cookbooks', 'Country & Folk', 'Couture',
       'Crafts', 'Crochet', 'DIY', 'DIY Electronics', 'Dance', 'Design',
       'Digital Art', 'Documentary', 'Drama', 'Drinks',
       'Electronic Music', 'Embroidery', 'Events', 'Experimental',
       'Fabrication Tools', 'Faith', 'Family', 'Fantasy',
       "Farmer's Markets", 'Farms', 'Fashion', 'Festivals', 'Fiction',
       'Film & Video', 'Fine Art', 'Flight', 'Food', 'Food Trucks',
       'Footwear', 'Gadgets', 'Games', 'Gaming Hardware', 'Glass',
       'Graphic Design', 'Graphic Novels', 'Hardware', 'Hip-Ho

In [233]:
df.sort_values('category').category.unique()

array(['art', 'art/ceramics', 'art/conceptual art', 'art/digital art',
       'art/illustration', 'art/installations', 'art/mixed media',
       'art/painting', 'art/performance art', 'art/public art',
       'art/sculpture', 'art/textiles', 'art/video art', 'comics',
       'comics/anthologies', 'comics/comic books', 'comics/events',
       'comics/graphic novels', 'comics/webcomics', 'crafts',
       'crafts/candles', 'crafts/crochet', 'crafts/diy',
       'crafts/embroidery', 'crafts/glass', 'crafts/knitting',
       'crafts/pottery', 'crafts/printing', 'crafts/quilts',
       'crafts/stationery', 'crafts/taxidermy', 'crafts/weaving',
       'crafts/woodworking', 'dance', 'dance/performances',
       'dance/residencies', 'dance/spaces', 'dance/workshops', 'design',
       'design/architecture', 'design/civic design',
       'design/graphic design', 'design/interactive design',
       'design/product design', 'design/typography', 'fashion',
       'fashion/accessories', 'fashion/appa

# CATEGORY NOTES

So, what do we keep? It doesn't make much sense to keep sub-categories without the parent category. And for now, dealing with so many categories to begin with it might become overwhelming trying to manage sub-categories too. Let's start with just categories and see if we can make some good predictions (and remember that we can always beef up the algorithm later by adding in the sub-category variables).

As for the category variables, do we want to extract the parent category (eg 'technology') or do we want to keep the sub-categories tacked on (eg 'technology/3d printing' and 'technology/apps')? For now, as mentioned above, let's drop the sub-categories (eg 'technology/3d printing' and 'technology/apps' are both just considered 'technology'). 

We now have two choices: keep the category as categorical or use the IDs. For context, let's keep the categories as categorical and use dummy variables later as needed.

In [234]:
df.columns

Index(['id', 'name', 'sub_category_id', 'sub_category', 'category_id',
       'category', 'goal', 'backers_count', 'pledged', 'disable_communication',
       'country', 'deadline', 'created_at', 'launched_at', 'staff_pick',
       'spotlight', 'creator_registered', 'launch_state'],
      dtype='object')

In [237]:
df.drop(columns=['sub_category_id','sub_category','category_id'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [240]:
len(df.columns)

15

In [252]:
# Now let's extract the primary category from the category column
df.category[0]

'technology/web'

In [270]:
df.category[0].split('/')[0]

'technology'

In [275]:
df.category[24:28]

24         technology/web
25                theater
26            photography
27    technology/hardware
Name: category, dtype: object

In [276]:
[i.split('/')[0] for i in df.category][24:28]

['technology', 'theater', 'photography', 'technology']

In [277]:
df.category = [i.split('/')[0] for i in df.category]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [279]:
df.sort_values(by='category').category.unique()

array(['art', 'comics', 'crafts', 'dance', 'design', 'fashion',
       'film & video', 'food', 'games', 'journalism', 'music',
       'photography', 'publishing', 'technology', 'theater'], dtype=object)

In [282]:
len(df.category.unique())

15

In [285]:
# We are left with 15 categories.
df.columns

Index(['id', 'name', 'category', 'goal', 'backers_count', 'pledged',
       'disable_communication', 'country', 'deadline', 'created_at',
       'launched_at', 'staff_pick', 'spotlight', 'creator_registered',
       'launch_state'],
      dtype='object')

In [284]:
df.to_csv('analysis.csv', sep=',')

In [288]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 187074 entries, 0 to 187073
Data columns (total 15 columns):
id                       187074 non-null int64
name                     187074 non-null object
category                 187074 non-null object
goal                     187074 non-null float64
backers_count            187074 non-null int64
pledged                  187074 non-null float64
disable_communication    187074 non-null bool
country                  186198 non-null object
deadline                 187074 non-null int64
created_at               187074 non-null int64
launched_at              187074 non-null int64
staff_pick               187074 non-null bool
spotlight                187074 non-null bool
creator_registered       187074 non-null bool
launch_state             187074 non-null object
dtypes: bool(4), float64(2), int64(5), object(4)
memory usage: 16.4+ MB


In [300]:
df.deadline[0]

1417150740

In [351]:
# Convert relevant values to datetime 
# The default format of these values is in unix time format
df['deadline'] = df['deadline'].apply(datetime.utcfromtimestamp)
df['created_at'] = df['created_at'].apply(datetime.utcfromtimestamp)
df['launched_at'] = df['launched_at'].apply(datetime.utcfromtimestamp)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [357]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 187074 entries, 0 to 187073
Data columns (total 15 columns):
id                       187074 non-null int64
name                     187074 non-null object
category                 187074 non-null object
goal                     187074 non-null float64
backers_count            187074 non-null int64
pledged                  187074 non-null float64
disable_communication    187074 non-null bool
country                  186198 non-null object
deadline                 187074 non-null datetime64[ns]
created_at               187074 non-null datetime64[ns]
launched_at              187074 non-null datetime64[ns]
staff_pick               187074 non-null bool
spotlight                187074 non-null bool
creator_registered       187074 non-null bool
launch_state             187074 non-null object
dtypes: bool(4), datetime64[ns](3), float64(2), int64(2), object(4)
memory usage: 16.4+ MB


# Stopped here

In [None]:
# ---- Drop columns ----
to_drop = [
    "Edition Statement",
    "Corporate Author",
    "Corporate Contributors",
    "Former owner",
    "Engraver",
    "Contributors",
    "Issuance type",
    "Shelfmarks"]

df.drop(columns=to_drop, inplace=True)

In [None]:
df.head()

In [None]:
# ---- Change index ----
df["Identifier"].is_unique

In [None]:
df.set_index("Identifier", inplace=True)
df.head(10)

In [None]:
# ---- Clean specific columns ----
df.get_dtype_counts()

In [None]:
# ---- Dates ----
  # Extract first 4-digit numbers to be the year
  # This removes any punctuations, converts data ranges to start dates, and changes any non-conforming cells to NaN
extr = df["Date of Publication"].str.extract(r'^(\d{4})', expand=False)
df["Date of Publication"] = pd.to_numeric(extr)
df["Date of Publication"]

In [None]:
print(f"{round(df['Date of Publication'].isnull().sum() / len(df) * 100,1)}% of the data is null.")

In [None]:
# ---- Locations ----
  # Use np.where(condition, then, else)
df["Place of Publication"].head(10)

In [None]:
pub.nunique()

In [None]:
pub.unique()

In [None]:
pub = df["Place of Publication"]
london = pub.str.contains("London")
london[:10]

In [None]:
oxford = pub.str.contains("Oxford")
oxford[:10]

In [None]:
# Fix "London", "Oxford", and entries with "-"
df["Place of Publication"] = np.where(london, "London",
    np.where(oxford, "Oxford",
    pub.str.replace("-", " ")))
df["Place of Publication"].head(10)

In [None]:
df["Place of Publication"].nunique()

In [None]:
# ... Repeat for all data
df["Place of Publication"].unique()

In [None]:
coventry = pub.str.contains("Coventry")
merida_de_yucatan = pub.str.contains("Merida de Yucatan")
moscow = pub.str.contains("Moscow|Москва|Moskwa")

df["Place of Publication"] = np.where(coventry, "Coventry",
    np.where(merida_de_yucatan, "Merida de Yucatan",
    np.where(moscow, "Moscow",df["Place of Publication"])))

print(f"Number of unique entries: {df['Place of Publication'].nunique()}")
print(f"Unique entries: {df['Place of Publication'].unique()}")

In [None]:
paris = pub.str.contains("Paris")
lyon = pub.str.contains("Lyon")
moscow = pub.str.contains("Москвѣ")

df["Place of Publication"] = np.where(paris, "Paris",
    np.where(lyon, "Lyon",
    np.where(moscow, "Moscow", df["Place of Publication"])))

print(f"Number of unique entries: {df['Place of Publication'].nunique()}")
print(f"Unique entries: {df['Place of Publication'].unique()}")

In [None]:
nottingham = pub.str.contains("Nottingham")
viena = pub.str.contains("Viena")

df["Place of Publication"] = np.where(nottingham, "Nottingham",
    np.where(viena, "Viena", df["Place of Publication"]))

print(f"Number of unique entries: {df['Place of Publication'].nunique()}")
print(f"Unique entries: {df['Place of Publication'].unique()}")

In [None]:
df.groupby(["Place of Publication"]).describe()

In [None]:
# ---- Convert locations column to categorical ----
"""
place = pd.get_dummies(df["Place of Publication"],drop_first=True)
df.drop(["Place of Publication"],axis=1,inplace=True)
df = pd.concat([df,place],axis=1)
df.head(10)
"""

In [None]:
# ---- Clean entire dataset using applymap ----
!more Datasets\\university_towns.txt 

In [None]:
university_towns = []
with open("Datasets/university_towns.txt") as file:
    for line in file:
        if "[edit]" in line:
            # Remember this state until the next is found
            state = line
        else:
            # Otherwise we have a city - keep the state
            university_towns.append((state, line))
        
university_towns

In [None]:
towns_df = pd.DataFrame(university_towns, columns=["State", "RegionName"])
towns_df

In [None]:
# Function: Grab state and city data
def get_citystate(item):
    if " (" in item:
        return item[:item.find(" (")]
    elif "[" in item:
        return item[:item.find("[")]
    else:
        return item

In [None]:
# applymap the function
towns_df = towns_df.applymap(get_citystate)
towns_df

In [None]:
# ---- Organize column labels ----
olympics_df = pd.read_csv("Datasets/olympics.csv")
olympics_df

In [None]:
# Skip the top row and set the header
olympics_df = pd.read_csv("Datasets/olympics.csv", header=1)
olympics_df

In [None]:
# Rename the columns
olympics_df.columns

In [None]:
new_names = {'Unnamed: 0': 'Country',
            '? Summer': 'Summer Olympics',
            '01 !': 'Gold',
            '02 !': 'Silver',
            '03 !': 'Bronze',
            '? Winter': 'Winter Olympics',
            '01 !.1': 'Gold.1',
            '02 !.1': 'Silver.1',
            '03 !.1': 'Bronze.1',
            '? Games': '# Games',
            '01 !.2': 'Gold.2',
            '02 !.2': 'Silver.2',
            '03 !.2': 'Bronze.2'}

In [None]:
olympics_df.rename(columns=new_names, inplace=True)
olympics_df