In [1]:
# List the downloaded snapshot archives together with their sizes.
# NOTE(review): this is an absolute, machine-specific path — confirm it matches the
# relative "scraped_data/" directory used by the download cell.
ls -l "/home/ec2-user/PythonCode/kickstarter-data-analysis/scraped_data/"

total 4804700
-rw-rw-r-- 1 ec2-user ec2-user  43426268 Aug 21 10:56 [0m[01;31mKickstarter_2015-10-22T09_57_48_703Z.json.gz[0m
-rw-rw-r-- 1 ec2-user ec2-user  43224891 Aug 21 10:56 [01;31mKickstarter_2015-11-01T14_09_04_557Z.json.gz[0m
-rw-rw-r-- 1 ec2-user ec2-user 104776302 Aug 21 10:56 [01;31mKickstarter_2015-12-17T12_09_06_107Z.json.gz[0m
-rw-rw-r-- 1 ec2-user ec2-user 107195345 Aug 21 10:55 [01;31mKickstarter_2016-01-28T09_15_08_781Z.json.gz[0m
-rw-rw-r-- 1 ec2-user ec2-user 110967561 Aug 21 10:55 [01;31mKickstarter_2016-03-22T07_41_08_591Z.json.gz[0m
-rw-rw-r-- 1 ec2-user ec2-user 116191310 Aug 21 10:55 [01;31mKickstarter_2016-04-15T02_09_04_328Z.json.gz[0m
-rw-rw-r-- 1 ec2-user ec2-user 110977883 Aug 21 10:54 [01;31mKickstarter_2016-05-15T02_04_46_813Z.json.gz[0m
-rw-rw-r-- 1 ec2-user ec2-user 124758251 Aug 21 10:54 [01;31mKickstarter_2016-06-15T02_04_49_697Z.json.gz[0m
-rw-rw-r-- 1 ec2-user ec2-user 126454297 Aug 21 10:53 [01;31mKickstarter_2016-07-15T

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Read the Kickstarter projects CSV; the two timestamp columns are parsed as dates.
data = pd.read_csv(
    'data/ks-projects-2018.csv',
    sep=',',
    parse_dates=['deadline', 'launched'],
    dayfirst=True,
)
# Preview the first rows to confirm the load worked.
data.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:00,0.0,failed,0,GB,0.0,0.0,1533.95
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:00,2421.0,failed,15,US,100.0,2421.0,30000.0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:00,220.0,failed,3,US,220.0,220.0,45000.0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:00,1.0,failed,1,US,1.0,1.0,5000.0
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:00,1283.0,canceled,14,US,1283.0,1283.0,19500.0


# Scraping

The website https://webrobots.io/kickstarter-datasets/ hosts scraped Kickstarter data as gzipped JSON files, which I will use for this analysis.

<ul>
    <li>I will download all the .gz files and unzip them.</li>
    <li>For each project name that appears in the Kaggle-provided dataset, I will extract the matching scraped data into a new dataset.</li>
    <li>I will then enrich that data by adding the description for each file.</li>
    <li>From those descriptions I will remove any video references.</li>
    <li>I will create a bag of words for each row and identify the words most likely and least likely to be associated with Succeeded and Failed projects (potentially through a Naive Bayes model).</li>
    </ul>

In [2]:
import requests
import re

In [3]:
# Find all the json zips that have been scraped.
kickstarter_req = requests.get("https://webrobots.io/kickstarter-datasets/")
# Fail loudly on an HTTP error page instead of silently finding zero links.
kickstarter_req.raise_for_status()
# Raw string so the regex escapes are not read as (invalid) Python string escapes.
# [^"]* keeps every match inside a single href value, even when several links share
# one line of HTML (a greedy .* would run from the first href to the last .json.gz).
zip_to_down_ls = re.findall(
    r'href="([^"]*s3\.amazonaws[^"]*\.json\.gz)"',
    kickstarter_req.text,
)
zip_to_down_ls

['https://s3.amazonaws.com/weruns/forfun/Kickstarter/Kickstarter_2018-02-15T03_20_44_743Z.json.gz',
 'https://s3.amazonaws.com/weruns/forfun/Kickstarter/Kickstarter_2018-01-12T10_20_09_196Z.json.gz',
 'https://s3.amazonaws.com/weruns/forfun/Kickstarter/Kickstarter_2017-12-15T10_20_51_610Z.json.gz',
 'https://s3.amazonaws.com/weruns/forfun/Kickstarter/Kickstarter_2017-11-15T10_21_04_919Z.json.gz',
 'https://s3.amazonaws.com/weruns/forfun/Kickstarter/Kickstarter_2017-10-15T10_20_38_271Z.json.gz',
 'https://s3.amazonaws.com/weruns/forfun/Kickstarter/Kickstarter_2017-09-15T22_20_48_432Z.json.gz',
 'https://s3.amazonaws.com/weruns/forfun/Kickstarter/Kickstarter_2017-08-15T22_20_51_958Z.json.gz',
 'https://s3.amazonaws.com/weruns/forfun/Kickstarter/Kickstarter_2017-07-15T22_20_48_951Z.json.gz',
 'https://s3.amazonaws.com/weruns/forfun/Kickstarter/Kickstarter_2017-06-15T22_20_03_059Z.json.gz',
 'https://s3.amazonaws.com/weruns/forfun/Kickstarter/Kickstarter_2017-05-15T22_21_11_300Z.json.gz',


In [4]:
from os.path import isfile

In [5]:
# Directory (a relative path) where the downloaded archives and generated CSVs are stored.
location = "scraped_data/"
# Archive file names collected by download_and_save_file, in the order they were handled.
files_ls = []
# Download all of the files
def download_and_save_file(url):
    """Download ``url`` into ``location`` unless that archive is already on disk.

    The local file name is the last path segment of ``url``. The body is streamed
    to a temporary ``.part`` file and only renamed to its final name once the
    download has completed, so a failed or interrupted transfer never leaves a
    truncated archive that a later run would mistake for a finished one.

    Args:
        url: Address of the archive to fetch.

    Side effects:
        Writes the archive below ``location`` and appends its name to ``files_ls``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    import os  # local import so this cell does not depend on a later import cell

    name = url.split('/')[-1]
    target_path = location + name
    if not isfile(target_path):
        print("Downloading...")
        partial_path = target_path + ".part"
        try:
            # stream=True avoids holding a whole archive (100 MB+) in memory.
            with requests.get(url, stream=True) as file_down_resp:
                # Do not save an HTML error page under the archive's name.
                file_down_resp.raise_for_status()
                with open(partial_path, 'wb') as opened_file:
                    for chunk in file_down_resp.iter_content(chunk_size=1024 * 1024):
                        opened_file.write(chunk)
            os.replace(partial_path, target_path)
        except BaseException:
            # Also covers a kernel interrupt: remove the partial file, then re-raise.
            if isfile(partial_path):
                os.remove(partial_path)
            raise
    print("Done: " + name)
    files_ls.append(name)


# Download (or reuse, when already on disk) every archive found on the listing page.
for url in zip_to_down_ls:
    download_and_save_file(url)
    
# Show the archive names that download_and_save_file collected.
print(files_ls)

Done: Kickstarter_2018-02-15T03_20_44_743Z.json.gz
Done: Kickstarter_2018-01-12T10_20_09_196Z.json.gz
Done: Kickstarter_2017-12-15T10_20_51_610Z.json.gz
Done: Kickstarter_2017-11-15T10_21_04_919Z.json.gz
Done: Kickstarter_2017-10-15T10_20_38_271Z.json.gz
Done: Kickstarter_2017-09-15T22_20_48_432Z.json.gz
Done: Kickstarter_2017-08-15T22_20_51_958Z.json.gz
Done: Kickstarter_2017-07-15T22_20_48_951Z.json.gz
Done: Kickstarter_2017-06-15T22_20_03_059Z.json.gz
Done: Kickstarter_2017-05-15T22_21_11_300Z.json.gz
Done: Kickstarter_2017-04-15T22_21_18_122Z.json.gz
Done: Kickstarter_2017-03-15T22_20_55_874Z.json.gz
Done: Kickstarter_2017-02-15T22_22_48_377Z.json.gz
Done: Kickstarter_2017-01-15T22_21_04_985Z.json.gz
Done: Kickstarter_2016-12-15T22_20_52_411Z.json.gz
Done: Kickstarter_2016-11-12T22_20_03_295Z.json.gz
Done: Kickstarter_2016-10-15T02_04_11_689Z.json.gz
Done: Kickstarter_2016-09-15T02_04_03_474Z.json.gz
Done: Kickstarter_2016-08-15T02_04_03_829Z.json.gz
Done: Kickstarter_2016-07-15T02

In [6]:
import json
import gzip
import psycopg2

In [7]:
# For inputting the password
# The password is read from a local file so it is not hard-coded in the notebook.
password_str = ""
with open("dbpass.txt", mode="r") as pass_f:
    # readline() keeps the line terminator; strip it so a trailing newline in the
    # file does not become part of the password sent to the database.
    password_str = pass_f.readline().rstrip("\r\n")

In [8]:
# Convert every downloaded snapshot into a flat CSV (one row per project) that the
# database COPY step can load.
# NOTE(review): an earlier comment here said the file order should be reversed so the
# oldest snapshot goes up first and later ones overwrite it, but files_ls is walked in
# its existing order — confirm which order is intended.
import csv


def _clean_text(value):
    """Return ``value`` as text that is safe inside the comma-separated output.

    Commas, newlines, carriage returns and backslashes are removed. A value that
    is ``None`` (JSON ``null``) becomes the literal ``"NULL"``, the same marker
    used when the key is missing altogether.
    """
    if value is None:
        return "NULL"
    return str(value).replace(",", "").replace("\n", "").replace("\r", "").replace("\\", "")


def _project_row(project, nested=False):
    """Build one output row from a project dict.

    ``nested`` selects the rules of the fallback layout (project found under
    ``data.projects``): there ``slug`` is mandatory and ``state`` is optional,
    whereas the flat layout treats ``slug`` as optional and ``state`` as mandatory.

    Raises:
        KeyError: If a mandatory key is missing.
    """
    if nested:
        slug = project["slug"]
        state = project.get("state", "NULL")
    else:
        slug = project.get("slug", "NULL")
        state = project["state"]
    return (
        project["id"],
        project["urls"]["web"]["project"],
        project.get("goal", "NULL"),
        _clean_text(project.get("blurb", "NULL")),
        _clean_text(project.get("name", "NULL")),
        _clean_text(slug),
        state,
        _clean_text(project["creator"].get("name", "NULL")),
    )


def _rows_from_record(json_record):
    """Return the output rows for one parsed JSON line.

    The flat layout (project fields directly under ``data``) is tried first; when
    a key is missing the ``data.projects`` layout is used instead. The choice is
    made per line, so the fallback works without re-reading the input stream.

    Raises:
        KeyError: If neither layout matches.
    """
    payload = json_record["data"]
    try:
        return [_project_row(payload)]
    except KeyError:
        projects = payload["projects"]
    # presumably 'projects' is either one project object or a list of them —
    # TODO confirm against one of the older snapshots.
    if isinstance(projects, dict):
        projects = [projects]
    return [_project_row(project, nested=True) for project in projects]


files_with_index_error = []
resulting_files = []
for file_name in files_ls:
    result_path = location + "result_" + file_name.split('.')[0] + '.csv'
    # newline='' is what the csv module asks for, so row terminators are written verbatim.
    with gzip.open(location + file_name, mode="r") as f, \
            open(result_path, mode='w', newline='') as output_file:
        csv_writer = csv.writer(output_file, delimiter=',')
        resulting_files.append(result_path)
        print("Performing file: " + file_name)
        try:
            # Iterate lazily: readlines() would hold the whole decompressed file in memory.
            for line in f:
                if not line.strip():
                    continue
                csv_writer.writerows(_rows_from_record(json.loads(line)))
        except (OSError, EOFError) as exc:
            # If this occurs delete the file, and run the download script again
            # it will pick it up. A truncated archive raises EOFError, not OSError.
            print("An error occured unzipping the file: " + file_name)
        except KeyError as exc:
            print("One of the files still has an error {}".format(file_name))

Performing file: Kickstarter_2018-02-15T03_20_44_743Z.json.gz
Performing file: Kickstarter_2018-01-12T10_20_09_196Z.json.gz
Performing file: Kickstarter_2017-12-15T10_20_51_610Z.json.gz
Performing file: Kickstarter_2017-11-15T10_21_04_919Z.json.gz
Performing file: Kickstarter_2017-10-15T10_20_38_271Z.json.gz
Performing file: Kickstarter_2017-09-15T22_20_48_432Z.json.gz
Performing file: Kickstarter_2017-08-15T22_20_51_958Z.json.gz
Performing file: Kickstarter_2017-07-15T22_20_48_951Z.json.gz
Performing file: Kickstarter_2017-06-15T22_20_03_059Z.json.gz
Performing file: Kickstarter_2017-05-15T22_21_11_300Z.json.gz
Performing file: Kickstarter_2017-04-15T22_21_18_122Z.json.gz
Performing file: Kickstarter_2017-03-15T22_20_55_874Z.json.gz
Performing file: Kickstarter_2017-02-15T22_22_48_377Z.json.gz
Performing file: Kickstarter_2017-01-15T22_21_04_985Z.json.gz
Performing file: Kickstarter_2016-12-15T22_20_52_411Z.json.gz
Performing file: Kickstarter_2016-11-12T22_20_03_295Z.json.gz
Performi

In [9]:
# Load every generated CSV into the kickstart_primordial table with PostgreSQL COPY.
# Keyword arguments let psycopg2 quote each value itself, so a password containing
# quotes, spaces or backslashes cannot corrupt the connection string.
conn = psycopg2.connect(
    host='kickstarter-assignment.ciwj1qtwzsou.ap-southeast-2.rds.amazonaws.com',
    dbname='sora',
    user='sora',
    password=password_str,
)
conn.autocommit = True
try:
    cur = conn.cursor()
    for file_name in resulting_files:
        with open(file_name) as csv_file:
            # "NULL" is the marker the conversion step writes for missing values.
            cur.copy_from(csv_file,
                          "kickstart_primordial",
                          sep=",",
                          null="NULL",
                          columns=["id", "urlproject", "goal", "blurb", "name", "slug", "state", "creator_name"])
finally:
    # Release the connection even when one of the COPY operations fails.
    conn.close()

In [26]:
# Show the first three rows of the loaded dataset.
data.head(n=3)

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:00,0.0,failed,0,GB,0.0,0.0,1533.95
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:00,2421.0,failed,15,US,100.0,2421.0,30000.0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:00,220.0,failed,3,US,220.0,220.0,45000.0


In [58]:
# Clean the loaded dataset, write it as a tab-separated file and COPY it into the
# kickstarter_actual table.
print(data.name)
data_clean = data.copy()
# Tabs and backslashes are removed from names: tab is the COPY separator below.
# NOTE(review): str(x) turns a missing name into the text "nan", so the dropna()
# below does not remove such rows — confirm that is intended.
data_clean.loc[:, "name"] = data.loc[:, "name"].map(lambda x: str(x).replace("\t", "").replace("\\", ""))
# Treat empty strings in "usd pledged" as missing so dropna() removes those rows.
data_clean.loc[:, "usd pledged"] = data_clean.loc[:, "usd pledged"].map(lambda x: np.nan if x == "" else x)
# Keep only projects whose state is "failed" or "successful".
data_clean = data_clean.loc[data_clean.loc[:, "state"].isin(["failed", "successful"])]
data_clean = data_clean.dropna(how='any')

# NOTE(review): to_csv quotes fields that contain a double quote, and COPY's text
# format would load those quote characters literally — verify a few loaded names.
data_clean.to_csv("data/cleaned_for_copy.csv", sep="\t", index=False)

# Keyword arguments let psycopg2 quote each value itself, so a password containing
# quotes, spaces or backslashes cannot corrupt the connection string.
conn = psycopg2.connect(
    host='kickstarter-assignment.ciwj1qtwzsou.ap-southeast-2.rds.amazonaws.com',
    dbname='sora',
    user='sora',
    password=password_str,
)
conn.autocommit = True
try:
    cur = conn.cursor()
    with open('data/cleaned_for_copy.csv', mode='r') as data_file:
        # Skip the header line; COPY expects data rows only.
        data_file.readline()
        cur.copy_from(data_file, "kickstarter_actual", sep="\t", null='NULL')
finally:
    # Release the connection even when the COPY fails.
    conn.close()
data_clean

0                           The Songs of Adelaide & Abullah
1             Greeting From Earth: ZGAC Arts Capsule For ET
2                                            Where is Hank?
3         ToshiCapital Rekordz Needs Help to Complete Album
4         Community Film Project: The Art of Neighborhoo...
5                                      Monarch Espresso Bar
6         Support Solar Roasted Coffee & Green Energy!  ...
7         Chaser Strips. Our Strips make Shots their B*tch!
8         SPIN - Premium Retractable In-Ear Headphones w...
9         STUDIO IN THE SKY - A Documentary Feature Film...
10                                      Of Jesus and Madmen
11                                         Lisa Lim New CD!
12                                       The Cottage Market
13        G-Spot Place for Gamers to connect with eachot...
14        Tombstone: Old West tabletop game and miniatur...
15                                           Survival Rings
16                                      

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:00,0.00,failed,0,GB,0.00,0.00,1533.95
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:00,2421.00,failed,15,US,100.00,2421.00,30000.00
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:00,220.00,failed,3,US,220.00,220.00,45000.00
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:00,1.00,failed,1,US,1.00,1.00,5000.00
5,1000014025,Monarch Espresso Bar,Restaurants,Food,USD,2016-04-01,50000.0,2016-02-26 13:38:00,52375.00,successful,224,US,52375.00,52375.00,50000.00
6,1000023410,Support Solar Roasted Coffee & Green Energy! ...,Food,Food,USD,2014-12-21,1000.0,2014-12-01 18:30:00,1205.00,successful,16,US,1205.00,1205.00,1000.00
7,1000030581,Chaser Strips. Our Strips make Shots their B*tch!,Drinks,Food,USD,2016-03-17,25000.0,2016-02-01 20:05:00,453.00,failed,40,US,453.00,453.00,25000.00
10,100004721,Of Jesus and Madmen,Nonfiction,Publishing,CAD,2013-10-09,2500.0,2013-09-09 18:19:00,0.00,failed,0,CA,0.00,0.00,2406.39
11,100005484,Lisa Lim New CD!,Indie Rock,Music,USD,2013-04-08,12500.0,2013-03-09 06:42:00,12700.00,successful,100,US,12700.00,12700.00,12500.00
12,1000055792,The Cottage Market,Crafts,Crafts,USD,2014-10-02,5000.0,2014-09-02 17:11:00,0.00,failed,0,US,0.00,0.00,5000.00


In [57]:
# Spot-check one cleaned row, selected by its position (not its index label).
data_clean.iloc[307547]

ID                                        859648640
name                Barbecupid Restaurant Expansion
category                                Restaurants
main_category                                  Food
currency                                        CAD
deadline                        2016-12-02 00:00:00
goal                                           5000
launched                        2016-11-02 21:51:00
pledged                                        5445
state                                    successful
backers                                          71
country                                          CA
usd pledged                                  2782.3
usd_pledged_real                            4093.06
usd_goal_real                               3758.55
Name: 351287, dtype: object