In [82]:
# Import the needed modules
import os

import pandas as pd

In [83]:
def read_all_csvs(data_dir='./data/', csv_name='Kickstarter0', n_files=56):
    '''
    Function to read a numbered series of CSVs into one pandas data frame.

    The files are read from ``<data_dir>/<csv_name><NN>.csv`` where ``NN`` is
    the file number padded with a leading zero to at least two digits
    (00, 01, ...). With the default arguments this reads
    ./data/Kickstarter000.csv up to ./data/Kickstarter055.csv.

    Parameters:
        data_dir: Directory that contains the CSV files.
        csv_name: File name prefix placed in front of the two-digit number.
        n_files:  Number of files to read, numbered 0 to n_files - 1.

    Returns:
        A data frame with the rows of all files stacked on top of each other.
        Every file keeps its own row labels, so index values repeat across
        files. An empty data frame is returned when n_files is 0.
    '''
    # Read every file first and concatenate once at the end; concatenating
    # inside the loop would copy the accumulated rows again for every file.
    frames = [
        pd.read_csv(os.path.join(data_dir, f'{csv_name}{num:02d}.csv'))
        for num in range(n_files)
    ]
    # pd.concat raises on an empty list, so handle "no files" explicitly.
    if not frames:
        return pd.DataFrame()
    # Merge all csv data to one data frame.
    return pd.concat(frames, axis=0)

In [84]:
# Load every numbered Kickstarter CSV from ./data/ into a single data frame.
df = read_all_csvs()

In [85]:
# Preview the first rows of the combined raw data.
df.head()

Unnamed: 0,backers_count,blurb,category,converted_pledged_amount,country,created_at,creator,currency,currency_symbol,currency_trailing_code,...,slug,source_url,spotlight,staff_pick,state,state_changed_at,static_usd_rate,urls,usd_pledged,usd_type
0,21,2006 was almost 7 years ago.... Can you believ...,"{""id"":43,""name"":""Rock"",""slug"":""music/rock"",""po...",802,US,1387659690,"{""id"":1495925645,""name"":""Daniel"",""is_registere...",USD,$,True,...,new-final-round-album,https://www.kickstarter.com/discover/categorie...,True,False,successful,1391899046,1.0,"{""web"":{""project"":""https://www.kickstarter.com...",802.0,international
1,97,An adorable fantasy enamel pin series of princ...,"{""id"":54,""name"":""Mixed Media"",""slug"":""art/mixe...",2259,US,1549659768,"{""id"":1175589980,""name"":""Katherine"",""slug"":""fr...",USD,$,True,...,princess-pals-enamel-pin-series,https://www.kickstarter.com/discover/categorie...,True,False,successful,1551801611,1.0,"{""web"":{""project"":""https://www.kickstarter.com...",2259.0,international
2,88,Helping a community come together to set the s...,"{""id"":280,""name"":""Photobooks"",""slug"":""photogra...",29638,US,1477242384,"{""id"":1196856269,""name"":""MelissaThomas"",""is_re...",USD,$,True,...,their-life-through-their-lens-the-amish-and-me...,https://www.kickstarter.com/discover/categorie...,True,True,successful,1480607932,1.0,"{""web"":{""project"":""https://www.kickstarter.com...",29638.0,international
3,193,Every revolution starts from the bottom and we...,"{""id"":266,""name"":""Footwear"",""slug"":""fashion/fo...",49158,IT,1540369920,"{""id"":1569700626,""name"":""WAO"",""slug"":""wearewao...",EUR,€,False,...,wao-the-eco-effect-shoes,https://www.kickstarter.com/discover/categorie...,True,False,successful,1544309940,1.136525,"{""web"":{""project"":""https://www.kickstarter.com...",49075.15252,international
4,20,Learn to build 10+ Applications in this comple...,"{""id"":51,""name"":""Software"",""slug"":""technology/...",549,US,1425706517,"{""id"":1870845385,""name"":""Kalpit Jain"",""is_regi...",USD,$,True,...,apple-watch-development-course,https://www.kickstarter.com/discover/categorie...,False,False,failed,1428511019,1.0,"{""web"":{""project"":""https://www.kickstarter.com...",549.0,domestic


In [86]:
# Count the missing values in every column.
df.isnull().sum()

backers_count                    0
blurb                            8
category                         0
converted_pledged_amount         0
country                          0
created_at                       0
creator                          0
currency                         0
currency_symbol                  0
currency_trailing_code           0
current_currency                 0
deadline                         0
disable_communication            0
friends                     208922
fx_rate                          0
goal                             0
id                               0
is_backing                  208922
is_starrable                     0
is_starred                  208922
launched_at                      0
location                       226
name                             0
permissions                 208922
photo                            0
pledged                          0
profile                          0
slug                             0
source_url          

We can drop some columns that will probably have no impact on the model later on. The dropped columns are: `blurb`, `converted_pledged_amount`, `currency_symbol`, `currency_trailing_code`, `friends`, `fx_rate`, `is_backing`, `is_starred`, `permissions`, `photo`, `profile`, `slug`, `source_url`, `static_usd_rate`, `urls`.

In particular, `friends`, `is_backing`, `is_starred` and `permissions` are almost completely empty (208,922 missing values each), so dropping them loses no information.

The 226 and 480 rows with missing values for `location` and `usd_type` are so few that we can drop these rows as well.

We also rename the `currency` column to `original_currency`, since this is more informative of its contents.

In [87]:
# Rename the currency column, drop the unused columns and remove every row
# that still has a missing value. The steps are chained into one assignment
# instead of three in-place mutations, and the drop ignores columns that are
# already gone, so re-running this cell does not fail with a KeyError.
df = (
    df
    # Rename the currency column.
    .rename(columns={'currency': 'original_currency'})
    # Drop the listed columns.
    .drop(columns=[
        'blurb',
        'converted_pledged_amount',
        'currency_symbol',
        'currency_trailing_code',
        'friends',
        'fx_rate',
        'is_backing',
        'is_starred',
        'permissions',
        'photo',
        'profile',
        'slug',
        'source_url',
        'static_usd_rate',
        'urls'
    ], errors='ignore')
    # Drop all rows that contain at least one missing value.
    .dropna(axis=0)
)

In [88]:
# Re-count the missing values per column after the drop/dropna step.
df.isnull().sum()

backers_count            0
category                 0
country                  0
created_at               0
creator                  0
original_currency        0
current_currency         0
deadline                 0
disable_communication    0
goal                     0
id                       0
is_starrable             0
launched_at              0
location                 0
name                     0
pledged                  0
spotlight                0
staff_pick               0
state                    0
state_changed_at         0
usd_pledged              0
usd_type                 0
dtype: int64

We then convert `created_at`, `state_changed_at` and `deadline` to datetime types.
With these we can calculate the number of days from a project's creation to its state change, and from its creation to its deadline.

In [89]:
def convert_dates(df):
    '''
    Function to convert the epoch time columns and derive day spans.

    The columns `created_at`, `state_changed_at` and `deadline` are parsed as
    seconds since the Unix epoch and converted to datetime types. Two integer
    columns are added:

    days_till_change: calendar days between `created_at` and `state_changed_at`.
    days_total:       calendar days between `created_at` and `deadline`.

    Both spans compare dates only (the time of day is ignored), so 23:00 on
    one day to 01:00 on the next day counts as 1 day.

    Parameters:
        df: Data frame that contains the three time columns.

    Returns:
        A new data frame with the converted and added columns. The frame that
        was passed in is left unchanged.
    '''
    # Work on a copy so the caller's frame is not modified.
    out = df.copy()
    # Convert the time columns to datetime types.
    for col in ('created_at', 'state_changed_at', 'deadline'):
        out[col] = pd.to_datetime(out[col], unit='s')
    # Truncate to midnight so the differences are whole calendar days. This
    # stays in datetime64/timedelta64 instead of subtracting Python `date`
    # objects element by element.
    created_day = out['created_at'].dt.normalize()
    # Calculate the time spans and convert the days to ints.
    out['days_till_change'] = (out['state_changed_at'].dt.normalize() - created_day).dt.days
    out['days_total'] = (out['deadline'].dt.normalize() - created_day).dt.days
    return out

In [90]:
# Convert the time columns and add the days_till_change / days_total columns.
df = convert_dates(df)

In [91]:
# Preview the first rows after the date conversion.
df.head()

Unnamed: 0,backers_count,category,country,created_at,creator,original_currency,current_currency,deadline,disable_communication,goal,...,name,pledged,spotlight,staff_pick,state,state_changed_at,usd_pledged,usd_type,days_till_change,days_total
0,21,"{""id"":43,""name"":""Rock"",""slug"":""music/rock"",""po...",US,2013-12-21 21:01:30,"{""id"":1495925645,""name"":""Daniel"",""is_registere...",USD,USD,2014-02-08 22:37:26,False,200.0,...,New Final Round Album,802.0,True,False,successful,2014-02-08 22:37:26,802.0,international,49,49
1,97,"{""id"":54,""name"":""Mixed Media"",""slug"":""art/mixe...",US,2019-02-08 21:02:48,"{""id"":1175589980,""name"":""Katherine"",""slug"":""fr...",USD,USD,2019-03-05 16:00:11,False,400.0,...,Princess Pals Enamel Pin Series,2259.0,True,False,successful,2019-03-05 16:00:11,2259.0,international,25,25
2,88,"{""id"":280,""name"":""Photobooks"",""slug"":""photogra...",US,2016-10-23 17:06:24,"{""id"":1196856269,""name"":""MelissaThomas"",""is_re...",USD,USD,2016-12-01 15:58:50,False,27224.0,...,Their Life Through Their Lens-the Amish and Me...,29638.0,True,True,successful,2016-12-01 15:58:52,29638.0,international,39,39
3,193,"{""id"":266,""name"":""Footwear"",""slug"":""fashion/fo...",IT,2018-10-24 08:32:00,"{""id"":1569700626,""name"":""WAO"",""slug"":""wearewao...",EUR,USD,2018-12-08 22:59:00,False,40000.0,...,WAO: THE ECO EFFECT SHOES,43180.0,True,False,successful,2018-12-08 22:59:00,49075.15252,international,45,45
4,20,"{""id"":51,""name"":""Software"",""slug"":""technology/...",US,2015-03-07 05:35:17,"{""id"":1870845385,""name"":""Kalpit Jain"",""is_regi...",USD,USD,2015-04-08 16:36:57,False,1000.0,...,Apple Watch Development Course,549.0,False,False,failed,2015-04-08 16:36:59,549.0,domestic,32,32


In [93]:
# Count the projects whose days_till_change equals days_total, i.e. whose
# state changed on the same calendar day as their deadline.
df.loc[df['days_till_change'].eq(df['days_total']), 'id'].count()

192653

A lot of projects change state on the same day as their deadline. Maybe they were only marked as finished at the deadline, after the goal had already been reached.

In [94]:
# Summarize the columns: non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 208516 entries, 0 to 964
Data columns (total 24 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   backers_count          208516 non-null  int64         
 1   category               208516 non-null  object        
 2   country                208516 non-null  object        
 3   created_at             208516 non-null  datetime64[ns]
 4   creator                208516 non-null  object        
 5   original_currency      208516 non-null  object        
 6   current_currency       208516 non-null  object        
 7   deadline               208516 non-null  datetime64[ns]
 8   disable_communication  208516 non-null  bool          
 9   goal                   208516 non-null  float64       
 10  id                     208516 non-null  int64         
 11  is_starrable           208516 non-null  bool          
 12  launched_at            208516 non-null  int64  

In [95]:
# Count the distinct values in every column.
df.nunique()

backers_count              3237
category                    169
country                      22
created_at               181898
creator                  207859
original_currency            14
current_currency              1
deadline                 170611
disable_communication         2
goal                       5103
id                       182004
is_starrable                  2
launched_at              181849
location                  15235
name                     181421
pledged                   44293
spotlight                     2
staff_pick                    2
state                         5
state_changed_at         171788
usd_pledged               79114
usd_type                      2
days_till_change           1501
days_total                 1497
dtype: int64