In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sb

In [39]:
# Load the Kickstarter projects export (January 2018 snapshot) and preview it
project = pd.read_csv(filepath_or_buffer='ks-projects-201801.csv')
project.head(5)

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.0


In [10]:
# Column dtypes, non-null counts and memory footprint of the frame.
project.info()
# The dataset has 378661 rows
# At a glance, only the attributes 'name' and 'usd pledged' are missing data
# (NOTE(review): the captured output below shows 'name' fully populated, so it
#  was presumably produced after the missing names were filled - confirm)
# 43.3 MB!!!!
# (NOTE(review): the captured output reports 40.4+ MB; the 43.3 MB figure
#  presumably dates from a run that still included the ID column - confirm)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 378661 entries, 0 to 378660
Data columns (total 14 columns):
name                378661 non-null object
category            378661 non-null object
main_category       378661 non-null object
currency            378661 non-null object
deadline            378661 non-null object
goal                378661 non-null float64
launched            378661 non-null object
pledged             378661 non-null float64
state               378661 non-null object
backers             378661 non-null int64
country             378661 non-null object
usd_pledged         374864 non-null float64
usd_pledged_real    378661 non-null float64
usd_goal_real       378661 non-null float64
dtypes: float64(5), int64(1), object(8)
memory usage: 40.4+ MB


In [11]:
# Summary statistics for the numeric columns.
project.describe()
# TODO: the max row is fascinating (e.g. a goal of 100,000,000) - find the
# entries that correspond to those maxima later.

Unnamed: 0,goal,pledged,backers,usd_pledged,usd_pledged_real,usd_goal_real
count,378661.0,378661.0,378661.0,374864.0,378661.0,378661.0
mean,49080.79,9682.979,105.617476,7036.729,9058.924,45454.4
std,1183391.0,95636.01,907.185035,78639.75,90973.34,1152950.0
min,0.01,0.0,0.0,0.0,0.0,0.01
25%,2000.0,30.0,2.0,16.98,31.0,2000.0
50%,5200.0,620.0,12.0,394.72,624.33,5500.0
75%,16000.0,4076.0,56.0,3034.09,4050.0,15500.0
max,100000000.0,20338990.0,219382.0,20338990.0,20338990.0,166361400.0


### Attribute Descriptions:

- __ID__: Project ID
- __name__: Kickstarter Project name
- __category__: Sub-category to main_category
- __main_category__: Parent category; each one is split into the sub-categories listed in `category`
- __currency__: Currency of the project
- __deadline__: Deadline for the project to be fully funded
- __goal__: Goal amount in project currency
- __launched__: Date project was launched on kickstarter
- __pledged__: Amount pledged in the project currency
- __state__: The state of the project backing (failed, successful, canceled, undefined, suspended)
- __backers__: Number of backers that have pledged/supported the project
- __country__: Country origin of project
- __usd_pledged__: Pledged amount in USD (conversion by KS)
- __usd_pledged_real__: Pledged amount in USD (conversion by the Fixer.io API)
- __usd_goal_real__: Goal amount in USD

## Data Cleanup

#### Things to do still
1. Fill the missing (NA) name values with 'category X' <font color=blue>DONE</font>
2. There are over 3000 usd_pledged na values <font color=blue>DONE, removed usd_pledged column as values look inaccurate</font>
    - See if there is any difference between usd_pledged and usd_pledged_real 
    - If there is no difference, I can probably remove the usd_pledged column
3. There are a number of country values that have the value N,0"
    - See if there are any instances where currency != country
    - If each currency == country, then we can simply make the missing country values the currency country
4. There are a number of projects that have a state of undefined
    - Confirm the deadline has passed; if pledged >= goal, set the state to successful
        if pledged < goal, set it to failed
        if the deadline is past January 2018 (I believe), confirm what to put... see if this is a viable case.

In [3]:
# Remove the ID column as it provides no value in the analysis, and rename
# 'usd pledged' to 'usd_pledged' for consistency in headings.
#
# The frame is rebuilt instead of mutated with inplace=True, and the drop uses
# errors='ignore', so the cell can be executed more than once: previously a
# second run raised KeyError because 'ID' had already been removed.
# rename() silently skips labels that are absent, so it needs no extra guard.
project = (
    project
    .drop(columns='ID', errors='ignore')
    .rename(columns={'usd pledged': 'usd_pledged'})
)
project.columns

Index(['name', 'category', 'main_category', 'currency', 'deadline', 'goal',
       'launched', 'pledged', 'state', 'backers', 'country', 'usd_pledged',
       'usd_pledged_real', 'usd_goal_real'],
      dtype='object')

In [4]:
# Collect the rows whose project name is missing
na_name = project.loc[project['name'].isnull()]
na_name

Unnamed: 0,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd_pledged,usd_pledged_real,usd_goal_real
166851,,Narrative Film,Film & Video,USD,2012-02-29,200000.0,2012-01-01 12:35:31,100.0,failed,1,US,100.0,100.0,200000.0
307234,,Video Games,Games,GBP,2013-01-06,2000.0,2012-12-19 23:57:48,196.0,failed,12,GB,317.73,316.05,3224.97
309991,,Product Design,Design,USD,2016-07-18,2500.0,2016-06-18 05:01:47,0.0,suspended,0,US,0.0,0.0,2500.0
338931,,Painting,Art,USD,2011-12-05,35000.0,2011-11-06 23:55:55,220.0,failed,5,US,220.0,220.0,35000.0


In [6]:
# Only 4 projects lack a name and each sits in a different main category, so
# every one gets a placeholder built from its sub-category: '<category> X'.
# Rows that already have a name are kept as they are.
project['name'] = project['name'].where(project['name'].notna(), project['category'] + ' X')

In [9]:
# Spot-check two of the previously unnamed rows to confirm the placeholder names
project.loc[[166851, 307234], :]

Unnamed: 0,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd_pledged,usd_pledged_real,usd_goal_real
166851,Narrative Film X,Narrative Film,Film & Video,USD,2012-02-29,200000.0,2012-01-01 12:35:31,100.0,failed,1,US,100.0,100.0,200000.0
307234,Video Games X,Video Games,Games,GBP,2013-01-06,2000.0,2012-12-19 23:57:48,196.0,failed,12,GB,317.73,316.05,3224.97


In [13]:
# Should come back empty now that every missing name has been filled
project.loc[project['name'].isnull()]

Unnamed: 0,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd_pledged,usd_pledged_real,usd_goal_real


## 2: Fill NAs for usd_pledged

In [36]:
# Rows with no Kickstarter-converted USD pledge amount
na_usd_pledged = project.loc[project['usd_pledged'].isnull()]
na_usd_pledged.head(5)

Unnamed: 0,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd_pledged,usd_pledged_real,usd_goal_real
169,STREETFIGHTERZ WHEELIE MURICA,Film & Video,Film & Video,USD,2014-09-20,6500.0,2014-08-06 21:28:36,555.0,undefined,0,"N,0""",,555.0,6500.0
328,Duncan Woods - Chameleon EP,Music,Music,AUD,2015-08-25,4500.0,2015-08-04 12:05:17,4767.0,undefined,0,"N,0""",,3402.08,3211.53
632,The Making of Ashley Kelley's Debut Album,Music,Music,USD,2015-04-09,3500.0,2015-03-10 20:06:13,3576.0,undefined,0,"N,0""",,3576.0,3500.0
647,Butter Side Down Debut Album,Music,Music,USD,2015-11-26,6000.0,2015-11-02 22:09:19,7007.8,undefined,0,"N,0""",,7007.8,6000.0
749,Chase Goehring debut EP,Music,Music,USD,2016-03-21,3000.0,2016-02-23 03:09:49,3660.38,undefined,0,"N,0""",,3660.38,3000.0


In [37]:
# Number of rows where usd_pledged is missing
na_count = project['usd_pledged'].isnull().sum()
na_count

3797

In [43]:
# Look at non-US projects that attracted at least one backer.
# The USD pledged column seems to provide incorrect converted values,
# therefore the column will be removed for accuracy purposes.
project.loc[project['country'].ne('US') & project['backers'].gt(0)].head(5)

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
14,1000057089,Tombstone: Old West tabletop game and miniatur...,Tabletop Games,Games,GBP,2017-05-03,5000.0,2017-04-05 19:44:18,94175.0,successful,761,GB,57763.78,121857.33,6469.73
21,1000081649,MikeyJ clothing brand fundraiser,Childrenswear,Fashion,AUD,2017-09-07,2500.0,2017-08-08 01:20:20,1.0,failed,1,AU,0.0,0.81,2026.1
23,1000087442,Mountain brew: A quest for alcohol sustainability,Drinks,Food,NOK,2015-02-25,500.0,2015-01-26 19:17:33,48.0,failed,3,NO,6.18,6.29,65.55
27,1000104688,Permaculture Skills,Webseries,Film & Video,CAD,2014-12-14,17757.0,2014-11-14 18:02:00,48905.0,successful,571,CA,43203.25,42174.03,15313.04
28,1000104953,Rebel Army Origins: The Heroic Story Of Major ...,Comics,Comics,GBP,2016-01-28,100.0,2015-12-29 16:59:29,112.38,successful,27,GB,167.7,160.6,142.91


In [44]:
# Drop the Kickstarter-converted USD column; usd_pledged_real is kept instead.
#
# An earlier cell renames 'usd pledged' to 'usd_pledged', so on a top-to-bottom
# run the original label no longer exists and drop('usd pledged') raises
# KeyError. Listing both spellings with errors='ignore' removes whichever one
# is present and also makes the cell safe to re-run.
project = project.drop(columns=['usd pledged', 'usd_pledged'], errors='ignore')
project.columns

Index(['ID', 'name', 'category', 'main_category', 'currency', 'deadline',
       'goal', 'launched', 'pledged', 'state', 'backers', 'country',
       'usd_pledged_real', 'usd_goal_real'],
      dtype='object')

## 3: Fill NAs for Country

In [63]:
# Rows whose country field holds the malformed placeholder value 'N,0"'
project.loc[project['country'].eq('N,0"')]

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd_pledged_real,usd_goal_real
169,1000694855,STREETFIGHTERZ WHEELIE MURICA,Film & Video,Film & Video,USD,2014-09-20,6500.0,2014-08-06 21:28:36,555.00,undefined,0,"N,0""",555.00,6500.00
328,100149523,Duncan Woods - Chameleon EP,Music,Music,AUD,2015-08-25,4500.0,2015-08-04 12:05:17,4767.00,undefined,0,"N,0""",3402.08,3211.53
632,1003023003,The Making of Ashley Kelley's Debut Album,Music,Music,USD,2015-04-09,3500.0,2015-03-10 20:06:13,3576.00,undefined,0,"N,0""",3576.00,3500.00
647,1003130892,Butter Side Down Debut Album,Music,Music,USD,2015-11-26,6000.0,2015-11-02 22:09:19,7007.80,undefined,0,"N,0""",7007.80,6000.00
749,1003629045,Chase Goehring debut EP,Music,Music,USD,2016-03-21,3000.0,2016-02-23 03:09:49,3660.38,undefined,0,"N,0""",3660.38,3000.00
824,1004013077,Spencer Capier Instrumental Project 2015,Music,Music,CAD,2014-10-10,4000.0,2014-09-10 22:51:01,4760.00,undefined,0,"N,0""",4251.14,3572.39
844,1004126342,LUKAS LIGETI'S 50TH BIRTHDAY FESTIVAL: ORIGINA...,Music,Music,USD,2015-06-11,5000.0,2015-05-15 02:12:35,6370.00,undefined,0,"N,0""",6370.00,5000.00
864,1004255433,The Battle For Breukelen: A Neighborhood Epic,Film & Video,Film & Video,USD,2015-11-07,6000.0,2015-10-10 18:45:24,6695.00,undefined,0,"N,0""",6695.00,6000.00
870,1004298993,"""Tamboura Plays Violin"" - a collection of Pop ...",Music,Music,USD,2015-03-28,2750.0,2015-02-11 19:28:27,2926.00,undefined,0,"N,0""",2926.00,2750.00
890,1004402863,Nightingale Noel - An A Cappella Holiday CD,Music,Music,USD,2015-12-11,1200.0,2015-11-11 21:02:55,1585.00,undefined,0,"N,0""",1585.00,1200.00
