# Set up notebook

## Import libraries

In [1]:
import pandas as pd 
import numpy as np
import scipy
from sklearn import decomposition, manifold, preprocessing, cluster
import string
from collections import Counter

import matplotlib.pyplot as plt
import seaborn as sns

from selenium import webdriver
from bs4 import BeautifulSoup
import requests

from gensim.models import Word2Vec
import nltk
from spacy.en import English

from ipywidgets import interact

%matplotlib inline

## Plot style

In [2]:
plt.style.use("fivethirtyeight")

## Dataframe options

In [3]:
pd.set_option('max_colwidth',500)

# Load data

In [4]:
df = pd.read_csv("https://raw.githubusercontent.com/socathie/paperplane/master/data/most_backed.csv").iloc[:,1:]

# Exploratory data analysis

## What does the data look like

In [5]:
df.head(3)

Unnamed: 0,amt.pledged,blurb,category,currency,goal,location,num.backers,num.backers.tier,pledge.tier,title,url
0,8782571.0,\nThis is a card game for people who are into kittens and explosions and laser beams and sometimes goats.\n,Tabletop Games,usd,10000.0,"Los Angeles, CA",219382,"[15505, 202934, 200, 5]","[20.0, 35.0, 100.0, 500.0]",Exploding Kittens,/projects/elanlee/exploding-kittens
1,6465690.0,"\nAn unusually addicting, high-quality desk toy designed to help you focus. Fidget at work, in class, and at home in style.\n",Product Design,usd,15000.0,"Denver, CO",154926,"[788, 250, 43073, 21796, 41727, 21627, 12215, 7608, 3290, 2189, 82, 85]","[1.0, 14.0, 19.0, 19.0, 35.0, 35.0, 79.0, 79.0, 129.0, 129.0, 849.0, 849.0]",Fidget Cube: A Vinyl Desk Toy,/projects/antsylabs/fidget-cube-a-vinyl-desk-toy
2,5408916.0,\nBring Reading Rainbow’s library of interactive books & video field trips to more platforms & provide free access to classrooms in need!\n,Web,usd,1000000.0,"Los Angeles, CA",105857,"[19639, 14343, 9136, 2259, 5666, 24512, 4957, 4359, 749, 1248, 500, 339, 282, 417, 1548, 517, 749, 155, 160, 60, 5, 10, 3, 100, 75, 12, 295, 69, 156, 1, 33, 1, 1, 8, 6, 30, 13, 39, 1, 4, 6, 21, 9, 3, 23, 11, 12, 20, 14, 11, 4, 14, 36, 10, 1, 1, 10, 2, 2, 8, 13, 10, 1, 1]","[5.0, 10.0, 25.0, 30.0, 35.0, 50.0, 75.0, 100.0, 110.0, 125.0, 140.0, 145.0, 150.0, 175.0, 200.0, 200.0, 250.0, 250.0, 275.0, 300.0, 300.0, 300.0, 300.0, 350.0, 350.0, 350.0, 375.0, 375.0, 400.0, 400.0, 400.0, 450.0, 500.0, 500.0, 500.0, 600.0, 700.0, 750.0, 750.0, 750.0, 750.0, 750.0, 750.0, 800.0, 800.0, 800.0, 800.0, 800.0, 800.0, 900.0, 1200.0, 1500.0, 1700.0, 2500.0, 2600.0, 3000.0, 3500.0, 3500.0, 3500.0, 3750.0, 5000.0, 10000.0, 10000.0, 10000.0]","Bring Reading Rainbow Back for Every Child, Everywhere!",/projects/readingrainbow/bring-reading-rainbow-back-for-every-child-everywh


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 11 columns):
amt.pledged         4000 non-null float64
blurb               4000 non-null object
category            4000 non-null object
currency            4000 non-null object
goal                4000 non-null float64
location            4000 non-null object
num.backers         4000 non-null int64
num.backers.tier    4000 non-null object
pledge.tier         4000 non-null object
title               4000 non-null object
url                 4000 non-null object
dtypes: float64(2), int64(1), object(8)
memory usage: 343.8+ KB


In [7]:
df.describe()

Unnamed: 0,amt.pledged,goal,num.backers
count,4000.0,4000.0,4000.0
mean,289841.0,61710.59,3582.65975
std,711973.4,126661.1,7316.163105
min,1925.0,1.0,1109.0
25%,67559.5,13000.0,1406.0
50%,120377.0,30000.0,1946.0
75%,259197.8,60000.0,3372.0
max,20338990.0,2000000.0,219382.0


In [8]:
df.describe(include=["O"])

Unnamed: 0,blurb,category,currency,location,num.backers.tier,pledge.tier,title,url
count,4000,4000,4000,4000,4000,4000,4000,4000
unique,3980,115,9,808,3998,3978,3995,3998
top,\n,Product Design,usd,"San Francisco, CA","[83, 669, 413, 60]","[90.0, 100.0]",Werewolf,/projects/1609568567/code-hardcore-the-coolest-2d-mecha-battle-game
freq,9,769,3437,280,2,4,2,2


# Data mining/ data wrangling

In [9]:
df2 = df.copy()

## Rename columns

In [10]:
df2.columns = [i.replace(".", "_").replace(" ", "_") for i in df2.columns]

## Standardizing dollar amounts

In [11]:
df2["currency"].unique()

array(['usd', 'gbp', 'cad', 'aud', 'eur', 'sek', 'nzd', 'dkk', 'chf'], dtype=object)

In [12]:
# Create exchange rate dictionary (note: exchange rates based on 11/9/16)
exchange_dict = {
    'usd': 1.00,
    'gbp': 1.25,
    'cad': 0.75,
    'eur': 1.09,
    'aud': 0.77,
    'sek': 0.11,
    'nzd': 0.73,
    'dkk': 0.15,
    'chf': 1.02,}

In [13]:
# Create a new column that maps out exchange rate based on currency
df2['exchange_rate'] = df2['currency'].map(exchange_dict)

# Create new column that converts all data to USD 
df2['pledged_USD'] = df2['amt_pledged'] * df2['exchange_rate']
df2['goal_USD'] = df2['goal'] * df2['exchange_rate']

## Percentage of goal received

In [14]:
# Create a new column that shows pledged/goal in percentage terms
df2['percent_received'] = df2['pledged_USD'] / df2['goal_USD'] * 100.

## Binning dollar amounts

In [15]:
levels = ["low", "medium", "high", "dreaming"]

### Goals

In [16]:
goal_bin = [(i,j) for i,j 
            in enumerate(zip([df2["goal_USD"].quantile(k) 
                              for k in [0.1, 0.33, 0.67, 1.0]], 
                             levels))]

In [17]:
def bin_goals_num(amt):
    for i in goal_bin:
        if amt <= i[1][0]:
            return i[0]

def bin_goals(amt):
    for i in goal_bin:
        if amt <= i[1][0]:
            return i[1][1]

In [18]:
df2["goal_level"] = df2["goal_USD"].map(bin_goals)
df2["goal_level_num"] = df2["goal_USD"].map(bin_goals_num)

### Pledges

In [19]:
pledge_bin = [(i,j) for i,j 
              in enumerate(zip([df2["pledged_USD"].quantile(k) 
                            for k in [0.1, 0.33, 0.67, 1.0]], levels))]

In [20]:
def bin_pledges(amt):
    for i in pledge_bin:
        if amt <= i[1][0]:
            return i[1][1]
        
def bin_pledges_num(amt):
    for i in pledge_bin:
        if amt <= i[1][0]:
            return i[0]

In [21]:
df2["pledge_level"] = df2["goal_USD"].map(bin_pledges)
df2["pledge_level_num"] = df2["goal_USD"].map(bin_pledges_num)

## Getting creator info

In [22]:
df2["creator"] = df2["url"].map(lambda x: x.split("/")[2])

### Some of the creators are just numbers

Scrape Kickstarter profile pages to get their names
- Scraper run outside of Jupyter Notebook for efficiency
    - See code on [GitHub]()

In [23]:
names = np.array(pd.read_csv("names_full.txt", delimiter="\n", header=None)[0])

In [24]:
names

array(['Elan Lee', 'Matthew and Mark McLachlan',
       'LeVar Burton & Reading Rainbow', ..., 'OFF LIFE', 'Rick Davidson',
       'Kevin Crawford'], dtype=object)

In [25]:
df2["creator_names"] = names

## Location

In [26]:
# Each location has a [primary], [secondary] location - we'll separate that
df2["loc1"] = df2["location"].map(lambda x: x.split(", ")[0].strip())
df2["loc2"] = df2["location"].map(lambda x: x.split(", ")[1].strip())

In [27]:
df2.head(2)

Unnamed: 0,amt_pledged,blurb,category,currency,goal,location,num_backers,num_backers_tier,pledge_tier,title,...,goal_USD,percent_received,goal_level,goal_level_num,pledge_level,pledge_level_num,creator,creator_names,loc1,loc2
0,8782571.0,\nThis is a card game for people who are into kittens and explosions and laser beams and sometimes goats.\n,Tabletop Games,usd,10000.0,"Los Angeles, CA",219382,"[15505, 202934, 200, 5]","[20.0, 35.0, 100.0, 500.0]",Exploding Kittens,...,10000.0,87825.71,medium,1,low,0,elanlee,Elan Lee,Los Angeles,CA
1,6465690.0,"\nAn unusually addicting, high-quality desk toy designed to help you focus. Fidget at work, in class, and at home in style.\n",Product Design,usd,15000.0,"Denver, CO",154926,"[788, 250, 43073, 21796, 41727, 21627, 12215, 7608, 3290, 2189, 82, 85]","[1.0, 14.0, 19.0, 19.0, 35.0, 35.0, 79.0, 79.0, 129.0, 129.0, 849.0, 849.0]",Fidget Cube: A Vinyl Desk Toy,...,15000.0,43104.6,medium,1,low,0,antsylabs,Matthew and Mark McLachlan,Denver,CO


In [28]:
# A lot of them had two-letter state representations for the secondary location
# Let's see what were the ones that didn't
df2["is_state"] = df2.loc2.map(lambda x: 1 if len(x) == 2 else 0)

In [29]:
df2["loc2"][df2["is_state"] == 0].unique()

array(['Canada', 'Japan', 'Czech Republic', 'China', 'Norway', 'Belgium',
       'Sweden', 'Germany', 'France', 'Manhattan', 'Poland', 'Slovenia',
       'India', 'Denmark', 'Spain', 'Luxembourg', 'Taiwan', 'Los Angeles',
       'Netherlands', 'Mexico', 'Queens', 'Russia', 'Thailand', 'Israel',
       'Italy', 'Austria', 'South Korea', 'Bulgaria', 'South Africa',
       'Afghanistan', 'Hong Kong', 'Brazil', 'Switzerland', 'Ukraine',
       'Chile', 'Argentina', 'United Arab Emirates', 'Finland', 'Belarus',
       'Turkey', 'Romania', 'Brooklyn', 'Malta', 'Greece', 'Iceland',
       'Slovakia', 'Singapore', 'Nashville', 'Macedonia', 'Philadelphia',
       'San Diego', 'Costa Rica', 'Bakersfield', 'Portland', 'Miami',
       'Philippines', 'Croatia', 'Egypt', 'Ireland', 'Thousand Oaks',
       'Virgin Islands', 'Crested Butte', 'Puerto Rico', 'Indonesia',
       'Cameroon', 'Framingham', 'Hungary', 'Colombia', 'Maplewood',
       'Latvia', 'Kenya'], dtype=object)

In [30]:
# Some of them look like they're in the USA
# Looked at Kickstarter for some of them, projects are allowed to choose sub-locations within their cities
# So we end up with [location, city, state] instead of just [city, state]
# But only the first 2 items get picked up
# For now, we'll manually pick these out
in_USA = ["Manhattan", "Brooklyn", "Nashville", "Philadelphia", "San Diego", "Portland", 
          "Miami", "Thousand Oaks", "Virgin Islands", "Crested Butte", "Framingham", "Maplewood"]

In [31]:
def get_country(loc):
    if loc in in_USA or len(loc) == 2:
        return "USA"
    else:
        return loc

In [32]:
df2["country"] = df2["loc2"].map(get_country)

In [33]:
df2.head(1)

Unnamed: 0,amt_pledged,blurb,category,currency,goal,location,num_backers,num_backers_tier,pledge_tier,title,...,goal_level,goal_level_num,pledge_level,pledge_level_num,creator,creator_names,loc1,loc2,is_state,country
0,8782571.0,\nThis is a card game for people who are into kittens and explosions and laser beams and sometimes goats.\n,Tabletop Games,usd,10000.0,"Los Angeles, CA",219382,"[15505, 202934, 200, 5]","[20.0, 35.0, 100.0, 500.0]",Exploding Kittens,...,medium,1,low,0,elanlee,Elan Lee,Los Angeles,CA,1,USA


## Duration of campaign

Scrape Kickstarter project pages to get details of the duration and dates

- Scraper run outside of Jupyter Notebook for efficiency
    - See code on [GitHub]()

In [34]:
dates = pd.read_csv("dates_info.csv").iloc[:,1:]

In [35]:
df2 = df2.merge(dates, on="url")

In [36]:
df2["start"] = pd.to_datetime(df2["start"])
df2["end"] = pd.to_datetime(df2["end"])

In [37]:
df2.head(2)

Unnamed: 0,amt_pledged,blurb,category,currency,goal,location,num_backers,num_backers_tier,pledge_tier,title,...,pledge_level_num,creator,creator_names,loc1,loc2,is_state,country,start,end,days
0,8782571.0,\nThis is a card game for people who are into kittens and explosions and laser beams and sometimes goats.\n,Tabletop Games,usd,10000.0,"Los Angeles, CA",219382,"[15505, 202934, 200, 5]","[20.0, 35.0, 100.0, 500.0]",Exploding Kittens,...,0,elanlee,Elan Lee,Los Angeles,CA,1,USA,2015-01-20,2015-02-19,30
1,6465690.0,"\nAn unusually addicting, high-quality desk toy designed to help you focus. Fidget at work, in class, and at home in style.\n",Product Design,usd,15000.0,"Denver, CO",154926,"[788, 250, 43073, 21796, 41727, 21627, 12215, 7608, 3290, 2189, 82, 85]","[1.0, 14.0, 19.0, 19.0, 35.0, 35.0, 79.0, 79.0, 129.0, 129.0, 849.0, 849.0]",Fidget Cube: A Vinyl Desk Toy,...,0,antsylabs,Matthew and Mark McLachlan,Denver,CO,1,USA,2016-08-30,2016-10-19,50


## Categories

In [38]:
categories = df2["category"].unique()
categories = [[i.lower()] for i in categories]

In [39]:
kcats = ["Art",
"Comics",
"Crafts",
"Dance",
"Design",
"Fashion",
"Film & Video",
"Food",
"Games",
"Journalism",
"Music",
"Photography",
"Publishing",
"Technology",
"Theater"]

In [40]:
kcats = [[i.lower()] for i in kcats]

In [41]:
categories.extend(kcats)

In [42]:
model = Word2Vec(categories, min_count=0)

In [43]:
for i in kcats:
    print i
    for j in model.most_similar(positive=i, negative=[x[0] for x in kcats if x != i],topn=5):
        print j
    print

['art']
('stationery', 0.18190565705299377)
('hip-hop', 0.17864373326301575)
('playing cards', 0.1732616126537323)
('sound', 0.1540769338607788)
('interactive design', 0.14903217554092407)

['comics']
('action', 0.18668465316295624)
('puzzles', 0.16805586218833923)
("children's books", 0.1503983736038208)
('playing cards', 0.14931637048721313)
('anthologies', 0.14254578948020935)

['crafts']
('rock', 0.21476006507873535)
('spaces', 0.18376420438289642)
('installations', 0.16720616817474365)
('people', 0.1659121960401535)
('anthologies', 0.15770873427391052)

['dance']
('country &amp; folk', 0.21552163362503052)
('spaces', 0.1784234344959259)
('action', 0.17598041892051697)
('video games', 0.17465408146381378)
('vegan', 0.15892380475997925)

['design']
('rock', 0.19560274481773376)
('video', 0.18019834160804749)
("children's books", 0.17965483665466309)
('camera equipment', 0.1769547164440155)
('spaces', 0.17587065696716309)

['fashion']
('thrillers', 0.1677640974521637)
('stationery', 

## Export to csv

In [44]:
df3 = df2.copy()

In [45]:
print list(df3.columns)

['amt_pledged', 'blurb', 'category', 'currency', 'goal', 'location', 'num_backers', 'num_backers_tier', 'pledge_tier', 'title', 'url', 'exchange_rate', 'pledged_USD', 'goal_USD', 'percent_received', 'goal_level', 'goal_level_num', 'pledge_level', 'pledge_level_num', 'creator', 'creator_names', 'loc1', 'loc2', 'is_state', 'country', 'start', 'end', 'days']


In [46]:
columns = ['title', 'blurb', 'category', 'location', 'goal_USD', 'pledged_USD', 'percent_received', 
           'pledged_USD', 'num_backers', 'num_backers_tier', 'pledge_tier', 'url', 
           'exchange_rate', 'goal_level', 'goal_level_num', 'pledge_level', 'pledge_level_num', 
           'creator', 'creator_names', 'country', 'start', 'end', 'days']

In [47]:
df3 = df3[columns]

In [49]:
def remove_n(x):
    try:
        return x.replace("\n", "")
    except:
        return x

In [50]:
df3 = df3.applymap(remove_n)

In [52]:
df3.to_csv("more_data.csv", sep="|")