In [2]:
# Analyzing a subset of Kickstarter data 
# Goal: Find the top 7 Kickstarter categories from the dataset. 
# Build a predictive model for success for each category. Determine which one is most useful.

In [3]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as sm
location = 'Kickstartercombo.csv'
df = pd.read_csv(location)
df.head(10)

Unnamed: 0.1,Unnamed: 0,ID,name,main_category,category,country,state,goal,pledged,currency,launched,deadline,backers,usd pledged,usd_goal_real,usd_pledged_real
0,0,1000002330,The Songs of Adelaide & Abullah,Publishing,Poetry,GB,failed,1000.0,0.0,GBP,2015-08-11 12:12:00,2015-10-09 11:36:00,0,0.0,,
1,1,1000004038,Where is Hank?,Film & Video,Narrative Film,US,failed,45000.0,220.0,USD,2013-01-12 00:20:00,2013-02-26 00:20:00,3,220.0,,
2,2,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,US,failed,5000.0,1.0,USD,2012-03-17 03:24:00,2012-04-16 04:24:00,1,1.0,,
3,3,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,US,canceled,19500.0,1283.0,USD,2015-07-04 08:35:00,2015-08-29 01:00:00,14,1283.0,,
4,4,1000014025,Monarch Espresso Bar,Food,Restaurants,US,successful,50000.0,52375.0,USD,2016-02-26 13:38:00,2016-04-01 13:38:00,224,52375.0,,
5,5,1000023410,Support Solar Roasted Coffee & Green Energy! ...,Food,Food,US,successful,1000.0,1205.0,USD,2014-12-01 18:30:00,2014-12-21 18:30:00,16,1205.0,,
6,6,1000030581,Chaser Strips. Our Strips make Shots their B*tch!,Food,Drinks,US,failed,25000.0,453.0,USD,2016-02-01 20:05:00,2016-03-17 19:05:00,40,453.0,,
7,7,1000034518,SPIN - Premium Retractable In-Ear Headphones w...,Design,Product Design,US,canceled,125000.0,8233.0,USD,2014-04-24 18:14:00,2014-05-29 18:14:00,58,8233.0,,
8,8,100004195,STUDIO IN THE SKY - A Documentary Feature Film...,Film & Video,Documentary,US,canceled,65000.0,6240.57,USD,2014-07-11 21:55:00,2014-08-10 21:55:00,43,6240.57,,
9,9,100004721,Of Jesus and Madmen,Publishing,Nonfiction,CA,failed,2500.0,0.0,CAD,2013-09-09 18:19:00,2013-10-09 18:19:00,0,0.0,,


In [4]:
#create new column with numeric value for success of kickstarter, so that statsmodels regression can be used
df['is_success'] = np.where(df['state'] == 'successful', 1, 0)
df.head(10)

Unnamed: 0.1,Unnamed: 0,ID,name,main_category,category,country,state,goal,pledged,currency,launched,deadline,backers,usd pledged,usd_goal_real,usd_pledged_real,is_success
0,0,1000002330,The Songs of Adelaide & Abullah,Publishing,Poetry,GB,failed,1000.0,0.0,GBP,2015-08-11 12:12:00,2015-10-09 11:36:00,0,0.0,,,0
1,1,1000004038,Where is Hank?,Film & Video,Narrative Film,US,failed,45000.0,220.0,USD,2013-01-12 00:20:00,2013-02-26 00:20:00,3,220.0,,,0
2,2,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,US,failed,5000.0,1.0,USD,2012-03-17 03:24:00,2012-04-16 04:24:00,1,1.0,,,0
3,3,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,US,canceled,19500.0,1283.0,USD,2015-07-04 08:35:00,2015-08-29 01:00:00,14,1283.0,,,0
4,4,1000014025,Monarch Espresso Bar,Food,Restaurants,US,successful,50000.0,52375.0,USD,2016-02-26 13:38:00,2016-04-01 13:38:00,224,52375.0,,,1
5,5,1000023410,Support Solar Roasted Coffee & Green Energy! ...,Food,Food,US,successful,1000.0,1205.0,USD,2014-12-01 18:30:00,2014-12-21 18:30:00,16,1205.0,,,1
6,6,1000030581,Chaser Strips. Our Strips make Shots their B*tch!,Food,Drinks,US,failed,25000.0,453.0,USD,2016-02-01 20:05:00,2016-03-17 19:05:00,40,453.0,,,0
7,7,1000034518,SPIN - Premium Retractable In-Ear Headphones w...,Design,Product Design,US,canceled,125000.0,8233.0,USD,2014-04-24 18:14:00,2014-05-29 18:14:00,58,8233.0,,,0
8,8,100004195,STUDIO IN THE SKY - A Documentary Feature Film...,Film & Video,Documentary,US,canceled,65000.0,6240.57,USD,2014-07-11 21:55:00,2014-08-10 21:55:00,43,6240.57,,,0
9,9,100004721,Of Jesus and Madmen,Publishing,Nonfiction,CA,failed,2500.0,0.0,CAD,2013-09-09 18:19:00,2013-10-09 18:19:00,0,0.0,,,0


In [5]:
df.describe()

Unnamed: 0.1,Unnamed: 0,ID,goal,pledged,backers,usd pledged,usd_goal_real,usd_pledged_real,is_success
count,702411.0,702411.0,702411.0,702411.0,702411.0,694817.0,378661.0,378661.0,702411.0
mean,351205.0,1074829000.0,48176.42,9223.828,103.726589,7405.969,45454.4,9058.924,0.351936
std,202768.734302,619200600.0,1162920.0,92860.16,919.654297,81440.94,1152950.0,90973.34,0.477575
min,0.0,5971.0,0.01,0.0,0.0,0.0,0.01,0.0,0.0
25%,175602.5,537959400.0,2000.0,30.0,2.0,20.24,2000.0,31.0,0.0
50%,351205.0,1075562000.0,5000.0,615.0,12.0,456.0,5500.0,624.33,0.0
75%,526807.5,1610544000.0,15056.0,4020.0,55.0,3261.0,15500.0,4050.0,1.0
max,702410.0,2147476000.0,100000000.0,20338990.0,219382.0,20338990.0,166361400.0,20338990.0,1.0


In [6]:
# calculate general probability of success for a given project in the dataset
print('total projects =', len(df))
print('total successful =', df['is_success'].sum())
print('probability of success =', df['is_success'].mean())
p_success_all = df['is_success'].mean()
# 0.35193640190714554
# approx. 35.19% of projects in this dataset were successful 

total projects = 702411
total successful = 247204
probability of success = 0.35193640190714554


In [7]:
df.corr()

Unnamed: 0.1,Unnamed: 0,ID,goal,pledged,backers,usd pledged,usd_goal_real,usd_pledged_real,is_success
Unnamed: 0,1.0,-0.183689,0.002181,0.004774,0.001886,-0.003919,0.002557,0.000528,0.00179
ID,-0.183689,1.0,0.001706,0.00026,0.000586,-0.000567,0.001854,-2.5e-05,-0.000932
goal,0.002181,0.001706,1.0,0.00712,0.00445,0.005871,0.942692,0.005104,-0.024246
pledged,0.004774,0.00026,0.00712,1.0,0.728756,0.896962,0.005024,0.952843,0.108506
backers,0.001886,0.000586,0.00445,0.728756,1.0,0.705763,0.004517,0.752539,0.124514
usd pledged,-0.003919,-0.000567,0.005871,0.896962,0.705763,1.0,0.006172,0.907743,0.09945
usd_goal_real,0.002557,0.001854,0.942692,0.005024,0.004517,0.006172,1.0,0.005596,-0.023052
usd_pledged_real,0.000528,-2.5e-05,0.005104,0.952843,0.752539,0.907743,0.005596,1.0,0.110704
is_success,0.00179,-0.000932,-0.024246,0.108506,0.124514,0.09945,-0.023052,0.110704,1.0


In [8]:
# OLS regression for dependent variable 'is_success', independent variables: goal, pledged, backers
result = sm.ols(formula='is_success ~ goal + pledged + backers', data=df).fit()
result.summary()
# Adj. R-squared: 0.017
# Intercept: 0.3454, this implies there is still a 35% chance of success for a project with goal, pledged and backers all = 0
# intercept will be removed from the model in next regression analysis

0,1,2,3
Dep. Variable:,is_success,R-squared:,0.017
Model:,OLS,Adj. R-squared:,0.017
Method:,Least Squares,F-statistic:,4001.0
Date:,"Sat, 14 Jul 2018",Prob (F-statistic):,0.0
Time:,02:25:31,Log-Likelihood:,-471620.0
No. Observations:,702411,AIC:,943300.0
Df Residuals:,702407,BIC:,943300.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.3454,0.001,606.773,0.000,0.344,0.347
goal,-1.025e-08,4.86e-10,-21.087,0.000,-1.12e-08,-9.29e-09
pledged,1.959e-07,8.89e-09,22.048,0.000,1.78e-07,2.13e-07
backers,5.03e-05,8.97e-07,56.064,0.000,4.85e-05,5.21e-05

0,1,2,3
Omnibus:,30234.531,Durbin-Watson:,1.999
Prob(Omnibus):,0.0,Jarque-Bera (JB):,34967.697
Skew:,0.506,Prob(JB):,0.0
Kurtosis:,3.414,Cond. No.,1170000.0


In [88]:
# OLS regression for dependent variable 'is_success', independent variables: goal, pledged, backers, and intercept is removed
result = sm.ols(formula='is_success ~ goal + pledged + backers - 1', data=df).fit()
Radj_all = result.rsquared_adj
result.summary()
# Adj. R-squared: 0.029, increased from model with intercept, but still very low
# goal P-value: 0.005, therefore there is a slight probability that the relationship between goal and success in the model is by chance

0,1,2,3
Dep. Variable:,is_success,R-squared:,0.029
Model:,OLS,Adj. R-squared:,0.029
Method:,Least Squares,F-statistic:,6953.0
Date:,"Sat, 14 Jul 2018",Prob (F-statistic):,0.0
Time:,03:19:05,Log-Likelihood:,-619630.0
No. Observations:,702411,AIC:,1239000.0
Df Residuals:,702408,BIC:,1239000.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
goal,1.689e-09,5.99e-10,2.818,0.005,5.15e-10,2.86e-09
pledged,3.286e-07,1.1e-08,29.962,0.000,3.07e-07,3.5e-07
backers,8.221e-05,1.11e-06,74.343,0.000,8e-05,8.44e-05

0,1,2,3
Omnibus:,149570.667,Durbin-Watson:,1.323
Prob(Omnibus):,0.0,Jarque-Bera (JB):,6453381.182
Skew:,0.035,Prob(JB):,0.0
Kurtosis:,17.849,Cond. No.,1850.0


In [10]:
# find all the unique main category values
categories = df['main_category'].unique()
categories

array(['Publishing', 'Film & Video', 'Music', 'Food', 'Design', 'Crafts',
       'Games', 'Comics', 'Fashion', 'Theater', 'Art', 'Photography',
       'Technology', 'Dance', 'Journalism', 'Graphic Novels'],
      dtype=object)

In [11]:
# find categories with greatest number of projects
total_projects = len(df)
print('total projects =',total_projects)
projects = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]

x = 0
while x < 16:
    projects[x] = len(df[df['main_category'] == categories[x]])
    print(categories[x],'projects =',len(df[df['main_category'] == categories[x]]))
    x += 1
categorytotalslist = list(zip(categories,projects))
categorytotalsdf = pd.DataFrame(data = categorytotalslist, columns=['category','total'])
categorytotalsdf

total projects = 702411
Publishing projects = 74220
Film & Video projects = 121331
Music projects = 98710
Food projects = 45844
Design projects = 53979
Crafts projects = 16001
Games projects = 63272
Comics projects = 19575
Fashion projects = 41232
Theater projects = 20888
Art projects = 52177
Photography projects = 20468
Technology projects = 58730
Dance projects = 7145
Journalism projects = 8838
Graphic Novels projects = 1


Unnamed: 0,category,total
0,Publishing,74220
1,Film & Video,121331
2,Music,98710
3,Food,45844
4,Design,53979
5,Crafts,16001
6,Games,63272
7,Comics,19575
8,Fashion,41232
9,Theater,20888


In [12]:
categorytotalsdf = categorytotalsdf.sort_values('total',ascending=False)
categorytotalsdf = categorytotalsdf.reset_index(level=0, drop=True)
categorytotalsdf
# Top 7 categories (with total projects > 50,000):
# Film & Video = 121,331
# Music = 98,710
# Publishing = 74,220
# Games = 63,272
# Technology = 58,730
# Design = 53,979
# Art = 52,177

Unnamed: 0,category,total
0,Film & Video,121331
1,Music,98710
2,Publishing,74220
3,Games,63272
4,Technology,58730
5,Design,53979
6,Art,52177
7,Food,45844
8,Fashion,41232
9,Theater,20888


In [13]:
# Create individual dataframes for each of top 7 categories
dffilm = df[df['main_category'] == 'Film & Video']
dfmusic = df[df['main_category'] == 'Music']
dfpub = df[df['main_category'] == 'Publishing']
dfgames = df[df['main_category'] == 'Games']
dftech = df[df['main_category'] == 'Technology']
dfdesign = df[df['main_category'] == 'Design']
dfart = df[df['main_category'] == 'Art']

In [14]:
# Category: Film & Video

In [15]:
# calculate general probability of success for a given project in the category
print('total Film & Video projects =', len(dffilm))
print('total successful =', dffilm['is_success'].sum())
print('probability of success =', dffilm['is_success'].mean())
p_success_film = dffilm['is_success'].mean()
# 0.3712653814771163
# approx. 37.13% of projects in this category were successful vs approx. 35.19% of projects in the total dataset

total Film & Video projects = 121331
total successful = 45046
probability of success = 0.3712653814771163


In [16]:
dffilm.corr()
# goal & is_success: -0.032207, slight negative correlation
# pledged & is_success: 0.141474, positive correlation, 
# however amount pledged may not be a useful variable because the total amount pledged is not known until the end of the kickstarter
# backers & is_success: 0.120688, positive correlation, 
# total number of backers is not known until the end of the kickstarter, 
# however it may still be useful to know if a project's chance of success increases at a certain minimum number of backers


Unnamed: 0.1,Unnamed: 0,ID,goal,pledged,backers,usd pledged,usd_goal_real,usd_pledged_real,is_success
Unnamed: 0,1.0,-0.181566,0.005732,0.001993,0.000419,-0.005152,0.004777,8e-05,-0.001067
ID,-0.181566,1.0,0.003829,0.006359,0.005033,0.005956,0.005096,0.005954,-0.002034
goal,0.005732,0.003829,1.0,0.008795,0.007171,0.008201,0.96892,0.006973,-0.032207
pledged,0.001993,0.006359,0.008795,1.0,0.92267,0.975874,0.006632,0.977061,0.141474
backers,0.000419,0.005033,0.007171,0.92267,1.0,0.931216,0.006107,0.935119,0.120688
usd pledged,-0.005152,0.005956,0.008201,0.975874,0.931216,1.0,0.007112,0.987046,0.132633
usd_goal_real,0.004777,0.005096,0.96892,0.006632,0.006107,0.007112,1.0,0.006984,-0.031244
usd_pledged_real,8e-05,0.005954,0.006973,0.977061,0.935119,0.987046,0.006984,1.0,0.144749
is_success,-0.001067,-0.002034,-0.032207,0.141474,0.120688,0.132633,-0.031244,0.144749,1.0


In [75]:
# OLS regression for dependent variable 'is_success', independent variables: goal, pledged, backers
result = sm.ols(formula='is_success ~ goal + pledged + backers', data=dffilm).fit()
result.summary()
# Adj. R-squared: 0.022
# Intercept: 0.3615, this implies there is still a 36% chance of success for a project with goal, pledged and backers all = 0
# intercept will be removed from the model in next regression analysis

0,1,2,3
Dep. Variable:,is_success,R-squared:,0.022
Model:,OLS,Adj. R-squared:,0.022
Method:,Least Squares,F-statistic:,900.9
Date:,"Sat, 14 Jul 2018",Prob (F-statistic):,0.0
Time:,03:12:53,Log-Likelihood:,-82563.0
No. Observations:,121331,AIC:,165100.0
Df Residuals:,121327,BIC:,165200.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.3615,0.001,260.232,0.000,0.359,0.364
goal,-1.004e-08,8.51e-10,-11.803,0.000,-1.17e-08,-8.37e-09
pledged,2.298e-06,8.33e-08,27.575,0.000,2.13e-06,2.46e-06
backers,-5.821e-05,6.45e-06,-9.022,0.000,-7.09e-05,-4.56e-05

0,1,2,3
Omnibus:,7807.493,Durbin-Watson:,1.99
Prob(Omnibus):,0.0,Jarque-Bera (JB):,23729.557
Skew:,0.321,Prob(JB):,0.0
Kurtosis:,5.069,Cond. No.,1640000.0


In [18]:
# OLS regression for dependent variable 'is_success', independent variables: goal, pledged, backers, and intercept is removed
result = sm.ols(formula='is_success ~ goal + pledged + backers - 1', data=dffilm).fit()
result.summary()
# Adj. R-squared: 0.042, increased from model with intercept
# goal P-value: 0.742, therefore there is a high probability that the relationship between goal and success in the model is by chance
# goal will be removed from independent variables in next regression analysis

0,1,2,3
Dep. Variable:,is_success,R-squared:,0.042
Model:,OLS,Adj. R-squared:,0.042
Method:,Least Squares,F-statistic:,1759.0
Date:,"Sat, 14 Jul 2018",Prob (F-statistic):,0.0
Time:,02:25:37,Log-Likelihood:,-109470.0
No. Observations:,121331,AIC:,218900.0
Df Residuals:,121328,BIC:,219000.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
goal,3.49e-10,1.06e-09,0.329,0.742,-1.73e-09,2.43e-09
pledged,4.292e-06,1.04e-07,41.436,0.000,4.09e-06,4.5e-06
backers,-0.0001,8.05e-06,-15.422,0.000,-0.000,-0.000

0,1,2,3
Omnibus:,46810.333,Durbin-Watson:,1.295
Prob(Omnibus):,0.0,Jarque-Bera (JB):,8082370.635
Skew:,-0.754,Prob(JB):,0.0
Kurtosis:,42.956,Cond. No.,7590.0


In [77]:
# OLS regression for dependent variable 'is_success', independent variables: pledged, backers, remove intercept
result = sm.ols(formula='is_success ~ pledged + backers - 1', data=dffilm).fit()
Radj_film = result.rsquared_adj
result.summary()
# Adj. R-squared: 0.042, same as previous model including goal in independent variables, this is still very low
# however it is greater than the adj. R-squared of 0.029 for the regression model of the total dataset
# therefore success can be predicted slightly more accurately for projects in this category than for the total dataset

0,1,2,3
Dep. Variable:,is_success,R-squared:,0.042
Model:,OLS,Adj. R-squared:,0.042
Method:,Least Squares,F-statistic:,2638.0
Date:,"Sat, 14 Jul 2018",Prob (F-statistic):,0.0
Time:,03:13:18,Log-Likelihood:,-109470.0
No. Observations:,121331,AIC:,218900.0
Df Residuals:,121329,BIC:,219000.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
pledged,4.293e-06,1.04e-07,41.442,0.000,4.09e-06,4.5e-06
backers,-0.0001,8.05e-06,-15.424,0.000,-0.000,-0.000

0,1,2,3
Omnibus:,46819.327,Durbin-Watson:,1.295
Prob(Omnibus):,0.0,Jarque-Bera (JB):,8087443.493
Skew:,-0.755,Prob(JB):,0.0
Kurtosis:,42.968,Cond. No.,203.0


In [20]:
# Category: Music

In [21]:
# calculate general probability of success for a given project in the category
print('total Music projects =', len(dfmusic))
print('total successful =', dfmusic['is_success'].sum())
print('probability of success =', dfmusic['is_success'].mean())
p_success_music = dfmusic['is_success'].mean()
# 0.4658595886941546
# approx. 46.59% of projects in this category were successful vs approx. 35.19% of projects in the total dataset

total Music projects = 98710
total successful = 45985
probability of success = 0.4658595886941546


In [22]:
dfmusic.corr()
# goal & is_success: -0.025812, slight negative correlation
# pledged & is_success: 0.228890, positive correlation, 
# however amount pledged may not be a useful variable because the total amount pledged is not known until the end of the kickstarter
# backers & is_success: 0.231500, positive correlation, 
# total number of backers is not known until the end of the kickstarter, 
# however it may still be useful to know if a project's chance of success increases at a certain minimum number of backers


Unnamed: 0.1,Unnamed: 0,ID,goal,pledged,backers,usd pledged,usd_goal_real,usd_pledged_real,is_success
Unnamed: 0,1.0,-0.182529,0.000678,0.008235,0.005633,-0.009254,-0.000519,0.005695,-0.001643
ID,-0.182529,1.0,0.002701,-0.002607,-0.002443,-0.000313,0.002883,0.000321,0.000157
goal,0.000678,0.002701,1.0,0.021044,0.005451,0.008186,0.997687,0.008639,-0.025812
pledged,0.008235,-0.002607,0.021044,1.0,0.808462,0.800357,0.007986,0.916276,0.22889
backers,0.005633,-0.002443,0.005451,0.808462,1.0,0.846359,0.004177,0.860632,0.2315
usd pledged,-0.009254,-0.000313,0.008186,0.800357,0.846359,1.0,0.006568,0.842536,0.266943
usd_goal_real,-0.000519,0.002883,0.997687,0.007986,0.004177,0.006568,1.0,0.007597,-0.025653
usd_pledged_real,0.005695,0.000321,0.008639,0.916276,0.860632,0.842536,0.007597,1.0,0.249787
is_success,-0.001643,0.000157,-0.025812,0.22889,0.2315,0.266943,-0.025653,0.249787,1.0


In [23]:
# OLS regression for dependent variable 'is_success', independent variables: goal, pledged, backers
result = sm.ols(formula='is_success ~ goal + pledged + backers', data=dfmusic).fit()
result.summary()
# Adj. R-squared: 0.059
# Intercept: 0.4324, this implies there is still a 43% chance of success for a project with goal, pledged and backers all = 0
# intercept will be removed from the model in next regression analysis

0,1,2,3
Dep. Variable:,is_success,R-squared:,0.059
Model:,OLS,Adj. R-squared:,0.059
Method:,Least Squares,F-statistic:,2080.0
Date:,"Sat, 14 Jul 2018",Prob (F-statistic):,0.0
Time:,02:25:37,Log-Likelihood:,-68386.0
No. Observations:,98710,AIC:,136800.0
Df Residuals:,98706,BIC:,136800.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.4324,0.002,269.904,0.000,0.429,0.436
goal,-4.044e-08,4.29e-09,-9.424,0.000,-4.88e-08,-3.2e-08
pledged,4.31e-06,1.86e-07,23.227,0.000,3.95e-06,4.67e-06
backers,0.0003,1.31e-05,25.378,0.000,0.000,0.000

0,1,2,3
Omnibus:,21530.536,Durbin-Watson:,2.003
Prob(Omnibus):,0.0,Jarque-Bera (JB):,542793.924
Skew:,-0.453,Prob(JB):,0.0
Kurtosis:,14.452,Cond. No.,374000.0


In [24]:
# OLS regression for dependent variable 'is_success', independent variables: goal, pledged, backers, and intercept is removed
result = sm.ols(formula='is_success ~ goal + pledged + backers - 1', data=dfmusic).fit()
result.summary()
# Adj. R-squared: 0.127, increased from model with intercept
# goal P-value: 0.584, therefore there is a high probability that the relationship between goal and success in the model is by chance
# goal will be removed from independent variables in next regression analysis

0,1,2,3
Dep. Variable:,is_success,R-squared:,0.127
Model:,OLS,Adj. R-squared:,0.127
Method:,Least Squares,F-statistic:,4780.0
Date:,"Sat, 14 Jul 2018",Prob (F-statistic):,0.0
Time:,02:25:37,Log-Likelihood:,-95668.0
No. Observations:,98710,AIC:,191300.0
Df Residuals:,98707,BIC:,191400.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
goal,3.095e-09,5.65e-09,0.547,0.584,-7.99e-09,1.42e-08
pledged,9.806e-06,2.43e-07,40.329,0.000,9.33e-06,1.03e-05
backers,0.0005,1.73e-05,31.182,0.000,0.001,0.001

0,1,2,3
Omnibus:,103118.871,Durbin-Watson:,1.219
Prob(Omnibus):,0.0,Jarque-Bera (JB):,124247750.845
Skew:,-4.246,Prob(JB):,0.0
Kurtosis:,176.6,Cond. No.,3060.0


In [78]:
# OLS regression for dependent variable 'is_success', independent variables: pledged, backers, remove intercept
result = sm.ols(formula='is_success ~ pledged + backers - 1', data=dfmusic).fit()
Radj_music = result.rsquared_adj
result.summary()
# Adj. R-squared: 0.127, same as previous model including goal in independent variables, this is still low,
# however it is greater than the adj. R-squared of 0.029 for the regression model of the total dataset
# therefore success can be predicted more accurately for projects in this category than for the total dataset

0,1,2,3
Dep. Variable:,is_success,R-squared:,0.127
Model:,OLS,Adj. R-squared:,0.127
Method:,Least Squares,F-statistic:,7170.0
Date:,"Sat, 14 Jul 2018",Prob (F-statistic):,0.0
Time:,03:13:38,Log-Likelihood:,-95668.0
No. Observations:,98710,AIC:,191300.0
Df Residuals:,98708,BIC:,191400.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
pledged,9.811e-06,2.43e-07,40.369,0.000,9.33e-06,1.03e-05
backers,0.0005,1.73e-05,31.177,0.000,0.001,0.001

0,1,2,3
Omnibus:,103138.28,Durbin-Watson:,1.219
Prob(Omnibus):,0.0,Jarque-Bera (JB):,124361555.194
Skew:,-4.247,Prob(JB):,0.0
Kurtosis:,176.68,Cond. No.,125.0


In [26]:
# Category: Publishing

In [27]:
# calculate general probability of success for a given project in the category
print('total Publishing projects =', len(dfpub))
print('total successful =', dfpub['is_success'].sum())
print('probability of success =', dfpub['is_success'].mean())
p_success_pub = dfpub['is_success'].mean()
# 0.3042441390460792
# approx. 30.42% of projects in this category were successful vs approx. 35.19% of projects in the total dataset

total Publishing projects = 74220
total successful = 22581
probability of success = 0.3042441390460792


In [28]:
dfpub.corr()
# goal & is_success: -0.013843, slight negative correlation
# pledged & is_success: 0.245385, positive correlation, 
# however amount pledged may not be a useful variable because the total amount pledged is not known until the end of the kickstarter
# backers & is_success: 0.230898, positive correlation, 
# total number of backers is not known until the end of the kickstarter, 
# however it may still be useful to know if a project's chance of success increases at a certain minimum number of backers


Unnamed: 0.1,Unnamed: 0,ID,goal,pledged,backers,usd pledged,usd_goal_real,usd_pledged_real,is_success
Unnamed: 0,1.0,-0.183445,0.006146,0.013353,0.008156,-0.005786,0.012393,0.006523,0.007086
ID,-0.183445,1.0,-0.008004,-0.004486,-0.00091,-0.004871,-0.005671,-0.002507,0.004047
goal,0.006146,-0.008004,1.0,0.003502,-0.000807,-0.000196,0.89687,8e-05,-0.013843
pledged,0.013353,-0.004486,0.003502,1.0,0.769246,0.787567,0.000513,0.858166,0.245385
backers,0.008156,-0.00091,-0.000807,0.769246,1.0,0.855251,-0.000256,0.894551,0.230898
usd pledged,-0.005786,-0.004871,-0.000196,0.787567,0.855251,1.0,0.000469,0.872801,0.261903
usd_goal_real,0.012393,-0.005671,0.89687,0.000513,-0.000256,0.000469,1.0,0.000599,-0.013506
usd_pledged_real,0.006523,-0.002507,8e-05,0.858166,0.894551,0.872801,0.000599,1.0,0.274385
is_success,0.007086,0.004047,-0.013843,0.245385,0.230898,0.261903,-0.013506,0.274385,1.0


In [29]:
# OLS regression for dependent variable 'is_success', independent variables: goal, pledged, backers
result = sm.ols(formula='is_success ~ goal + pledged + backers', data=dfpub).fit()
result.summary()
# Adj. R-squared: 0.065
# Intercept: 0.2799, this implies there is still a 28% chance of success for a project with goal, pledged and backers all = 0
# intercept will be removed from the model in next regression analysis

0,1,2,3
Dep. Variable:,is_success,R-squared:,0.065
Model:,OLS,Adj. R-squared:,0.065
Method:,Least Squares,F-statistic:,1713.0
Date:,"Sat, 14 Jul 2018",Prob (F-statistic):,0.0
Time:,02:25:38,Log-Likelihood:,-45209.0
No. Observations:,74220,AIC:,90430.0
Df Residuals:,74216,BIC:,90460.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.2799,0.002,167.675,0.000,0.277,0.283
goal,-5.86e-09,1.45e-09,-4.040,0.000,-8.7e-09,-3.02e-09
pledged,4.493e-06,1.5e-07,29.903,0.000,4.2e-06,4.79e-06
backers,0.0002,8.84e-06,18.555,0.000,0.000,0.000

0,1,2,3
Omnibus:,5956.59,Durbin-Watson:,2.011
Prob(Omnibus):,0.0,Jarque-Bera (JB):,10992.693
Skew:,0.571,Prob(JB):,0.0
Kurtosis:,4.499,Cond. No.,1150000.0


In [30]:
# OLS regression for dependent variable 'is_success', independent variables: goal, pledged, backers, and intercept is removed
result = sm.ols(formula='is_success ~ goal + pledged + backers - 1', data=dfpub).fit()
result.summary()
# Adj. R-squared: 0.103, increased from model with intercept
# goal P-value: 0.846, therefore there is a high probability that the relationship between goal and success in the model is by chance
# goal will be removed from independent variables in next regression analysis

0,1,2,3
Dep. Variable:,is_success,R-squared:,0.103
Model:,OLS,Adj. R-squared:,0.103
Method:,Least Squares,F-statistic:,2835.0
Date:,"Sat, 14 Jul 2018",Prob (F-statistic):,0.0
Time:,02:25:38,Log-Likelihood:,-57130.0
No. Observations:,74220,AIC:,114300.0
Df Residuals:,74217,BIC:,114300.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
goal,3.313e-10,1.7e-09,0.195,0.846,-3.01e-09,3.67e-09
pledged,6.802e-06,1.76e-07,38.709,0.000,6.46e-06,7.15e-06
backers,0.0002,1.04e-05,22.516,0.000,0.000,0.000

0,1,2,3
Omnibus:,15927.405,Durbin-Watson:,1.482
Prob(Omnibus):,0.0,Jarque-Bera (JB):,597293.554
Skew:,-0.227,Prob(JB):,0.0
Kurtosis:,16.89,Cond. No.,6090.0


In [79]:
# OLS regression for dependent variable 'is_success', independent variables: pledged, backers, remove intercept
result = sm.ols(formula='is_success ~ pledged + backers - 1', data=dfpub).fit()
Radj_pub = result.rsquared_adj
result.summary()
# Adj. R-squared: 0.103, same as previous model including goal in independent variables, this is still very low
# however it is greater than the adj. R-squared of 0.029 for the regression model of the total dataset
# therefore success can be predicted more accurately for projects in this category than for the total dataset

0,1,2,3
Dep. Variable:,is_success,R-squared:,0.103
Model:,OLS,Adj. R-squared:,0.103
Method:,Least Squares,F-statistic:,4252.0
Date:,"Sat, 14 Jul 2018",Prob (F-statistic):,0.0
Time:,03:14:10,Log-Likelihood:,-57130.0
No. Observations:,74220,AIC:,114300.0
Df Residuals:,74218,BIC:,114300.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
pledged,6.802e-06,1.76e-07,38.713,0.000,6.46e-06,7.15e-06
backers,0.0002,1.04e-05,22.515,0.000,0.000,0.000

0,1,2,3
Omnibus:,15928.099,Durbin-Watson:,1.482
Prob(Omnibus):,0.0,Jarque-Bera (JB):,597364.403
Skew:,-0.227,Prob(JB):,0.0
Kurtosis:,16.891,Cond. No.,93.9


In [32]:
# Category: Games

In [33]:
# calculate general probability of success for a given project in the category
print('total Games projects =', len(dfgames))
print('total successful =', dfgames['is_success'].sum())
print('probability of success =', dfgames['is_success'].mean())
p_success_games = dfgames['is_success'].mean()
# 0.34634593501074723
# approx. 34.63% of projects in this category were successful vs approx. 35.19% of projects in the total dataset

total Games projects = 63272
total successful = 21914
probability of success = 0.34634593501074723


In [34]:
dfgames.corr()
# goal & is_success: -0.025160, slight negative correlation
# pledged & is_success: 0.150675, positive correlation, 
# however amount pledged may not be a useful variable because the total amount pledged is not known until the end of the kickstarter
# backers & is_success: 0.166576, positive correlation, 
# total number of backers is not known until the end of the kickstarter, 
# however it may still be useful to know if a project's chance of success increases at a certain minimum number of backers


Unnamed: 0.1,Unnamed: 0,ID,goal,pledged,backers,usd pledged,usd_goal_real,usd_pledged_real,is_success
Unnamed: 0,1.0,-0.181743,-0.002446,0.000806,-0.001343,-0.009685,0.001257,-0.002537,0.013969
ID,-0.181743,1.0,0.003614,-0.002108,0.003084,-0.002846,0.003173,-0.00094,0.00518
goal,-0.002446,0.003614,1.0,0.016912,0.013874,0.016002,0.878847,0.014075,-0.02516
pledged,0.000806,-0.002108,0.016912,1.0,0.777541,0.91852,0.017122,0.974431,0.150675
backers,-0.001343,0.003084,0.013874,0.777541,1.0,0.81596,0.018169,0.778208,0.166576
usd pledged,-0.009685,-0.002846,0.016002,0.91852,0.81596,1.0,0.020915,0.916488,0.140777
usd_goal_real,0.001257,0.003173,0.878847,0.017122,0.018169,0.020915,1.0,0.018131,-0.025874
usd_pledged_real,-0.002537,-0.00094,0.014075,0.974431,0.778208,0.916488,0.018131,1.0,0.146192
is_success,0.013969,0.00518,-0.02516,0.150675,0.166576,0.140777,-0.025874,0.146192,1.0


In [35]:
# OLS regression for dependent variable 'is_success', independent variables: goal, pledged, backers
result = sm.ols(formula='is_success ~ goal + pledged + backers', data=dfgames).fit()
result.summary()
# Adj. R-squared: 0.030
# Intercept: 0.3351, this implies there is still a 34% chance of success for a project with goal, pledged and backers all = 0
# intercept will be removed from the model in next regression analysis

0,1,2,3
Dep. Variable:,is_success,R-squared:,0.03
Model:,OLS,Adj. R-squared:,0.03
Method:,Least Squares,F-statistic:,644.5
Date:,"Sat, 14 Jul 2018",Prob (F-statistic):,0.0
Time:,02:25:39,Log-Likelihood:,-41832.0
No. Observations:,63272,AIC:,83670.0
Df Residuals:,63268,BIC:,83710.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.3351,0.002,177.709,0.000,0.331,0.339
goal,-1.227e-08,1.73e-09,-7.099,0.000,-1.57e-08,-8.88e-09
pledged,1.555e-07,1.8e-08,8.659,0.000,1.2e-07,1.91e-07
backers,2.669e-05,1.33e-06,20.076,0.000,2.41e-05,2.93e-05

0,1,2,3
Omnibus:,2948.311,Durbin-Watson:,2.0
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3261.901
Skew:,0.541,Prob(JB):,0.0
Kurtosis:,2.745,Cond. No.,1090000.0


In [36]:
# OLS regression for dependent variable 'is_success', independent variables: goal, pledged, backers, and intercept is removed
result = sm.ols(formula='is_success ~ goal + pledged + backers - 1', data=dfgames).fit()
result.summary()
# Adj. R-squared: 0.049, increased from model with intercept
# goal P-value: 0.356, therefore there is a high probability that the relationship between goal and success in the model is by chance
# goal will be removed from independent variables in next regression analysis

0,1,2,3
Dep. Variable:,is_success,R-squared:,0.049
Model:,OLS,Adj. R-squared:,0.049
Method:,Least Squares,F-statistic:,1090.0
Date:,"Sat, 14 Jul 2018",Prob (F-statistic):,0.0
Time:,02:25:39,Log-Likelihood:,-54641.0
No. Observations:,63272,AIC:,109300.0
Df Residuals:,63269,BIC:,109300.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
goal,1.952e-09,2.11e-09,0.923,0.356,-2.19e-09,6.1e-09
pledged,2.38e-07,2.2e-08,10.830,0.000,1.95e-07,2.81e-07
backers,4.309e-05,1.62e-06,26.540,0.000,3.99e-05,4.63e-05

0,1,2,3
Omnibus:,10068.458,Durbin-Watson:,1.349
Prob(Omnibus):,0.0,Jarque-Bera (JB):,189991.742
Skew:,0.094,Prob(JB):,0.0
Kurtosis:,11.487,Cond. No.,768.0


In [83]:
# OLS regression for dependent variable 'is_success', independent variables: pledged, backers, remove intercept
result = sm.ols(formula='is_success ~ pledged + backers - 1', data=dfgames).fit()
Radj_games = result.rsquared_adj
result.summary()
# Adj. R-squared: 0.049, same as previous model including goal in independent variables, this is still very low
# however it is greater than the adj. R-squared of 0.029 for the regression model of the total dataset
# therefore success can be predicted slightly more accurately for projects in this category than for the total dataset

0,1,2,3
Dep. Variable:,is_success,R-squared:,0.049
Model:,OLS,Adj. R-squared:,0.049
Method:,Least Squares,F-statistic:,1634.0
Date:,"Sat, 14 Jul 2018",Prob (F-statistic):,0.0
Time:,03:14:53,Log-Likelihood:,-54642.0
No. Observations:,63272,AIC:,109300.0
Df Residuals:,63270,BIC:,109300.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
pledged,2.382e-07,2.2e-08,10.841,0.000,1.95e-07,2.81e-07
backers,4.31e-05,1.62e-06,26.545,0.000,3.99e-05,4.63e-05

0,1,2,3
Omnibus:,10079.534,Durbin-Watson:,1.348
Prob(Omnibus):,0.0,Jarque-Bera (JB):,190753.783
Skew:,0.093,Prob(JB):,0.0
Kurtosis:,11.504,Cond. No.,118.0


In [38]:
# Category: Technology

In [39]:
# calculate general probability of success for a given project in the category
print('total Technology projects =', len(dftech))
print('total successful =', dftech['is_success'].sum())
print('probability of success =', dftech['is_success'].mean())
p_success_tech = dftech['is_success'].mean()
# 0.19584539417674102
# approx. 19.58% of projects in this category were successful vs approx. 35.19% of projects in the total dataset

total Technology projects = 58730
total successful = 11502
probability of success = 0.19584539417674102


In [40]:
dftech.corr()
# goal & is_success: -0.025215, slight negative correlation
# pledged & is_success: 0.276321, positive correlation, 
# however amount pledged may not be a useful variable because the total amount pledged is not known until the end of the kickstarter
# backers & is_success: 0.238011, positive correlation, 
# total number of backers is not known until the end of the kickstarter, 
# however it may still be useful to know if a project's chance of success increases at a certain minimum number of backers


Unnamed: 0.1,Unnamed: 0,ID,goal,pledged,backers,usd pledged,usd_goal_real,usd_pledged_real,is_success
Unnamed: 0,1.0,-0.186484,0.002914,0.004704,0.000176,-0.01203,0.005819,-0.001301,0.00037
ID,-0.186484,1.0,0.010891,-0.00142,-0.002058,-0.004481,0.009732,-0.001933,-0.010029
goal,0.002914,0.010891,1.0,0.003229,-0.001261,0.002797,0.888233,0.000788,-0.025215
pledged,0.004704,-0.00142,0.003229,1.0,0.697542,0.83344,0.001869,0.936212,0.276321
backers,0.000176,-0.002058,-0.001261,0.697542,1.0,0.673214,-0.000133,0.725736,0.238011
usd pledged,-0.01203,-0.004481,0.002797,0.83344,0.673214,1.0,0.005511,0.851316,0.260027
usd_goal_real,0.005819,0.009732,0.888233,0.001869,-0.000133,0.005511,1.0,0.003306,-0.023943
usd_pledged_real,-0.001301,-0.001933,0.000788,0.936212,0.725736,0.851316,0.003306,1.0,0.281649
is_success,0.00037,-0.010029,-0.025215,0.276321,0.238011,0.260027,-0.023943,0.281649,1.0


In [41]:
# OLS regression for dependent variable 'is_success', independent variables: goal, pledged, backers
result = sm.ols(formula='is_success ~ goal + pledged + backers', data=dftech).fit()
result.summary()
# Adj. R-squared: 0.081
# Intercept: 0.1773, this implies there is still an 18% chance of success for a project with goal, pledged and backers all = 0
# intercept will be removed from the model in next regression analysis

0,1,2,3
Dep. Variable:,is_success,R-squared:,0.081
Model:,OLS,Adj. R-squared:,0.081
Method:,Least Squares,F-statistic:,1726.0
Date:,"Sat, 14 Jul 2018",Prob (F-statistic):,0.0
Time:,02:25:39,Log-Likelihood:,-26575.0
No. Observations:,58730,AIC:,53160.0
Df Residuals:,58726,BIC:,53190.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.1773,0.002,111.103,0.000,0.174,0.180
goal,-5.789e-09,8.88e-10,-6.521,0.000,-7.53e-09,-4.05e-09
pledged,6.505e-07,1.67e-08,38.949,0.000,6.18e-07,6.83e-07
backers,3.032e-05,1.9e-06,15.937,0.000,2.66e-05,3.41e-05

0,1,2,3
Omnibus:,14409.319,Durbin-Watson:,2.021
Prob(Omnibus):,0.0,Jarque-Bera (JB):,44849.194
Skew:,1.259,Prob(JB):,0.0
Kurtosis:,6.462,Cond. No.,1800000.0


In [42]:
# OLS regression for dependent variable 'is_success', independent variables: goal, pledged, backers, and intercept is removed
result = sm.ols(formula='is_success ~ goal + pledged + backers - 1', data=dftech).fit()
result.summary()
# Adj. R-squared: 0.106, increased from model with intercept
# goal P-value: 0.498, therefore there is a high probability that the relationship between goal and success in the model is by chance
# goal will be removed from independent variables in next regression analysis

0,1,2,3
Dep. Variable:,is_success,R-squared:,0.106
Model:,OLS,Adj. R-squared:,0.106
Method:,Least Squares,F-statistic:,2313.0
Date:,"Sat, 14 Jul 2018",Prob (F-statistic):,0.0
Time:,02:25:39,Log-Likelihood:,-32178.0
No. Observations:,58730,AIC:,64360.0
Df Residuals:,58727,BIC:,64390.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
goal,6.602e-10,9.74e-10,0.677,0.498,-1.25e-09,2.57e-09
pledged,8.194e-07,1.83e-08,44.784,0.000,7.84e-07,8.55e-07
backers,3.825e-05,2.09e-06,18.289,0.000,3.42e-05,4.24e-05

0,1,2,3
Omnibus:,15301.6,Durbin-Watson:,1.681
Prob(Omnibus):,0.0,Jarque-Bera (JB):,209359.36
Skew:,0.875,Prob(JB):,0.0
Kurtosis:,12.083,Cond. No.,2150.0


In [82]:
# OLS regression for dependent variable 'is_success', independent variables: pledged, backers, remove intercept
result = sm.ols(formula='is_success ~ pledged + backers - 1', data=dftech).fit()
Radj_tech = result.rsquared_adj
result.summary()
# Adj. R-squared: 0.106, same as previous model including goal in independent variables, this is still very low
# however it is greater than the adj. R-squared of 0.029 for the regression model of the total dataset
# therefore success can be predicted more accurately for projects in this category than for the total dataset

0,1,2,3
Dep. Variable:,is_success,R-squared:,0.106
Model:,OLS,Adj. R-squared:,0.106
Method:,Least Squares,F-statistic:,3469.0
Date:,"Sat, 14 Jul 2018",Prob (F-statistic):,0.0
Time:,03:14:47,Log-Likelihood:,-32178.0
No. Observations:,58730,AIC:,64360.0
Df Residuals:,58728,BIC:,64380.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
pledged,8.195e-07,1.83e-08,44.795,0.000,7.84e-07,8.55e-07
backers,3.825e-05,2.09e-06,18.287,0.000,3.41e-05,4.23e-05

0,1,2,3
Omnibus:,15301.596,Durbin-Watson:,1.681
Prob(Omnibus):,0.0,Jarque-Bera (JB):,209525.715
Skew:,0.874,Prob(JB):,0.0
Kurtosis:,12.086,Cond. No.,161.0


In [44]:
# Category: Design

In [45]:
# calculate general probability of success for a given project in the category
print('total Design projects =', len(dfdesign))
print('total successful =', dfdesign['is_success'].sum())
print('probability of success =', dfdesign['is_success'].mean())
p_success_design = dfdesign['is_success'].mean()
# 0.34309638933659387
# approx. 34.31% of projects in this category were successful vs approx. 35.19% of projects in the total dataset

total Design projects = 53979
total successful = 18520
probability of success = 0.34309638933659387


In [46]:
dfdesign.corr()
# goal & is_success: -0.023216, slight negative correlation
# pledged & is_success: 0.129266, positive correlation, 
# however amount pledged may not be a useful variable because the total amount pledged is not known until the end of the kickstarter
# backers & is_success: 0.173753, positive correlation, 
# total number of backers is not known until the end of the kickstarter, 
# however it may still be useful to know if a project's chance of success increases at a certain minimum number of backers


Unnamed: 0.1,Unnamed: 0,ID,goal,pledged,backers,usd pledged,usd_goal_real,usd_pledged_real,is_success
Unnamed: 0,1.0,-0.183487,0.003565,0.00819,0.000321,-0.003386,0.00762,0.005139,0.014983
ID,-0.183487,1.0,-0.005619,0.003557,0.003321,0.002464,-0.005571,0.001882,0.001238
goal,0.003565,-0.005619,1.0,0.009593,0.001291,0.005552,0.967529,0.006083,-0.023216
pledged,0.00819,0.003557,0.009593,1.0,0.753255,0.907716,0.005997,0.951258,0.129266
backers,0.000321,0.003321,0.001291,0.753255,1.0,0.638787,0.001747,0.777847,0.173753
usd pledged,-0.003386,0.002464,0.005552,0.907716,0.638787,1.0,0.006181,0.919446,0.104955
usd_goal_real,0.00762,-0.005571,0.967529,0.005997,0.001747,0.006181,1.0,0.006295,-0.022489
usd_pledged_real,0.005139,0.001882,0.006083,0.951258,0.777847,0.919446,0.006295,1.0,0.131462
is_success,0.014983,0.001238,-0.023216,0.129266,0.173753,0.104955,-0.022489,0.131462,1.0


In [47]:
# OLS regression for dependent variable 'is_success', independent variables: goal, pledged, backers
result = sm.ols(formula='is_success ~ goal + pledged + backers', data=dfdesign).fit()
result.summary()
# Adj. R-squared: 0.031
# Intercept: 0.3312, this implies there is still a 33% chance of success for a project with goal, pledged and backers all = 0
# intercept will be removed from the model in next regression analysis

0,1,2,3
Dep. Variable:,is_success,R-squared:,0.031
Model:,OLS,Adj. R-squared:,0.031
Method:,Least Squares,F-statistic:,570.7
Date:,"Sat, 14 Jul 2018",Prob (F-statistic):,0.0
Time:,02:25:40,Log-Likelihood:,-35537.0
No. Observations:,53979,AIC:,71080.0
Df Residuals:,53975,BIC:,71120.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.3312,0.002,162.551,0.000,0.327,0.335
goal,-1.17e-08,2.12e-09,-5.524,0.000,-1.59e-08,-7.55e-09
pledged,-6.765e-09,1.33e-08,-0.507,0.612,-3.29e-08,1.94e-08
backers,5.293e-05,1.93e-06,27.354,0.000,4.91e-05,5.67e-05

0,1,2,3
Omnibus:,3033.453,Durbin-Watson:,1.987
Prob(Omnibus):,0.0,Jarque-Bera (JB):,4790.515
Skew:,0.474,Prob(JB):,0.0
Kurtosis:,4.109,Cond. No.,963000.0


In [48]:
# OLS regression for dependent variable 'is_success', independent variables: goal, pledged, backers, and intercept is removed
result = sm.ols(formula='is_success ~ goal + pledged + backers - 1', data=dfdesign).fit()
result.summary()
# Adj. R-squared: 0.052, increased from model with intercept
# goal P-value: 0.038, pledged P-value: 0.409, therefore there is a high probability that the relationship between pledged and success in the model is by chance
# pledged will be removed from independent variables in next regression analysis

0,1,2,3
Dep. Variable:,is_success,R-squared:,0.052
Model:,OLS,Adj. R-squared:,0.052
Method:,Least Squares,F-statistic:,978.9
Date:,"Sat, 14 Jul 2018",Prob (F-statistic):,0.0
Time:,02:25:40,Log-Likelihood:,-46291.0
No. Observations:,53979,AIC:,92590.0
Df Residuals:,53976,BIC:,92620.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
goal,5.362e-09,2.58e-09,2.076,0.038,3.01e-10,1.04e-08
pledged,-1.346e-08,1.63e-08,-0.826,0.409,-4.54e-08,1.85e-08
backers,8.459e-05,2.35e-06,36.003,0.000,8e-05,8.92e-05

0,1,2,3
Omnibus:,12985.23,Durbin-Watson:,1.35
Prob(Omnibus):,0.0,Jarque-Bera (JB):,732382.135
Skew:,-0.222,Prob(JB):,0.0
Kurtosis:,21.04,Cond. No.,910.0


In [84]:
# OLS regression for dependent variable 'is_success', independent variables: goal, backers, remove intercept
result = sm.ols(formula='is_success ~ goal + backers - 1', data=dfdesign).fit()
Radj_design = result.rsquared_adj
result.summary()
# Adj. R-squared: 0.052, same as previous model including goal in independent variables, this is still very low
# however it is greater than the adj. R-squared of 0.029 for the regression model of the total dataset
# therefore success can be predicted more accurately for projects in this category than for the total dataset
# Also of note, this model uses the independent variables goal and backers, while the models for the total dataset 
# and the other categories analyzed so far use the independent variables pledged and backers
# However the correlation coefficient of goal & is_success for this category was similar to the other categories

0,1,2,3
Dep. Variable:,is_success,R-squared:,0.052
Model:,OLS,Adj. R-squared:,0.052
Method:,Least Squares,F-statistic:,1468.0
Date:,"Sat, 14 Jul 2018",Prob (F-statistic):,0.0
Time:,03:15:19,Log-Likelihood:,-46292.0
No. Observations:,53979,AIC:,92590.0
Df Residuals:,53977,BIC:,92600.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
goal,5.335e-09,2.58e-09,2.066,0.039,2.74e-10,1.04e-08
backers,8.312e-05,1.54e-06,54.125,0.000,8.01e-05,8.61e-05

0,1,2,3
Omnibus:,12748.341,Durbin-Watson:,1.35
Prob(Omnibus):,0.0,Jarque-Bera (JB):,682139.651
Skew:,-0.21,Prob(JB):,0.0
Kurtosis:,20.41,Cond. No.,595.0


In [50]:
# Category: Art

In [51]:
# calculate general probability of success for a given project in the category
print('total Art projects =', len(dfart))
print('total successful =', dfart['is_success'].sum())
print('probability of success =', dfart['is_success'].mean())
p_success_art = dfart['is_success'].mean()
# 0.4060601414416314
# approx. 40.61% of projects in this category were successful vs approx. 35.19% of projects in the total dataset

total Art projects = 52177
total successful = 21187
probability of success = 0.4060601414416314


In [52]:
dfart.corr()
# goal & is_success: -0.022361, slight negative correlation
# pledged & is_success: 0.116895, positive correlation, 
# however amount pledged may not be a useful variable because the total amount pledged is not known until the end of the kickstarter
# backers & is_success: 0.213721, positive correlation, 
# total number of backers is not known until the end of the kickstarter, 
# however it may still be useful to know if a project's chance of success increases at a certain minimum number of backers


Unnamed: 0.1,Unnamed: 0,ID,goal,pledged,backers,usd pledged,usd_goal_real,usd_pledged_real,is_success
Unnamed: 0,1.0,-0.182385,-0.001861,0.006215,0.008245,-0.00791,-0.005315,-0.002613,0.003576
ID,-0.182385,1.0,0.005161,0.005817,-0.000575,0.000935,0.000531,0.004009,0.003108
goal,-0.001861,0.005161,1.0,0.002805,-0.001721,0.004301,0.991579,0.002229,-0.022361
pledged,0.006215,0.005817,0.002805,1.0,0.592788,0.58349,0.001309,0.758102,0.116895
backers,0.008245,-0.000575,-0.001721,0.592788,1.0,0.579155,-0.001617,0.720229,0.213721
usd pledged,-0.00791,0.000935,0.004301,0.58349,0.579155,1.0,0.004508,0.736136,0.174439
usd_goal_real,-0.005315,0.000531,0.991579,0.001309,-0.001617,0.004508,1.0,0.00271,-0.022055
usd_pledged_real,-0.002613,0.004009,0.002229,0.758102,0.720229,0.736136,0.00271,1.0,0.142728
is_success,0.003576,0.003108,-0.022361,0.116895,0.213721,0.174439,-0.022055,0.142728,1.0


In [53]:
# OLS regression for dependent variable 'is_success', independent variables: goal, pledged, backers
result = sm.ols(formula='is_success ~ goal + pledged + backers', data=dfart).fit()
result.summary()
# Adj. R-squared: 0.046
# Intercept: 0.3825, this implies there is still a 38% chance of success for a project with goal, pledged and backers all = 0
# intercept will be removed from the model in next regression analysis

0,1,2,3
Dep. Variable:,is_success,R-squared:,0.046
Model:,OLS,Adj. R-squared:,0.046
Method:,Least Squares,F-statistic:,844.4
Date:,"Sat, 14 Jul 2018",Prob (F-statistic):,0.0
Time:,02:25:40,Log-Likelihood:,-35695.0
No. Observations:,52177,AIC:,71400.0
Df Residuals:,52173,BIC:,71430.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.3825,0.002,177.597,0.000,0.378,0.387
goal,-8.526e-09,1.66e-09,-5.131,0.000,-1.18e-08,-5.27e-09
pledged,-2.588e-07,9.18e-08,-2.821,0.005,-4.39e-07,-7.9e-08
backers,0.0006,1.44e-05,41.923,0.000,0.001,0.001

0,1,2,3
Omnibus:,8390.659,Durbin-Watson:,2.012
Prob(Omnibus):,0.0,Jarque-Bera (JB):,164921.525
Skew:,-0.061,Prob(JB):,0.0
Kurtosis:,11.709,Cond. No.,1300000.0


In [54]:
# OLS regression for dependent variable 'is_success', independent variables: goal, pledged, backers, and intercept is removed
result = sm.ols(formula='is_success ~ goal + pledged + backers - 1', data=dfart).fit()
result.summary()
# Adj. R-squared: 0.091, increased from model with intercept
# goal P-value: 0.837, therefore there is a high probability that the relationship between goal and success in the model is by chance
# goal will be removed from independent variables in next regression analysis

0,1,2,3
Dep. Variable:,is_success,R-squared:,0.091
Model:,OLS,Adj. R-squared:,0.091
Method:,Least Squares,F-statistic:,1744.0
Date:,"Sat, 14 Jul 2018",Prob (F-statistic):,0.0
Time:,02:25:41,Log-Likelihood:,-48031.0
No. Observations:,52177,AIC:,96070.0
Df Residuals:,52174,BIC:,96090.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
goal,4.338e-10,2.1e-09,0.206,0.837,-3.69e-09,4.56e-09
pledged,-5.328e-07,1.16e-07,-4.585,0.000,-7.61e-07,-3.05e-07
backers,0.0011,1.79e-05,60.476,0.000,0.001,0.001

0,1,2,3
Omnibus:,39244.357,Durbin-Watson:,1.294
Prob(Omnibus):,0.0,Jarque-Bera (JB):,26350157.325
Skew:,-2.37,Prob(JB):,0.0
Kurtosis:,112.99,Cond. No.,8520.0


In [85]:
# OLS regression for dependent variable 'is_success', independent variables: pledged, backers, remove intercept
result = sm.ols(formula='is_success ~ pledged + backers - 1', data=dfart).fit()
Radj_art = result.rsquared_adj
result.summary()
# Adj. R-squared: 0.091, same as previous model including goal in independent variables, this is still very low
# however it is greater than the adj. R-squared of 0.029 for the regression model of the total dataset
# therefore success can be predicted slightly more accurately for projects in this category than for the total dataset

0,1,2,3
Dep. Variable:,is_success,R-squared:,0.091
Model:,OLS,Adj. R-squared:,0.091
Method:,Least Squares,F-statistic:,2616.0
Date:,"Sat, 14 Jul 2018",Prob (F-statistic):,0.0
Time:,03:15:36,Log-Likelihood:,-48031.0
No. Observations:,52177,AIC:,96070.0
Df Residuals:,52175,BIC:,96080.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
pledged,-5.327e-07,1.16e-07,-4.584,0.000,-7.61e-07,-3.05e-07
backers,0.0011,1.79e-05,60.477,0.000,0.001,0.001

0,1,2,3
Omnibus:,39247.321,Durbin-Watson:,1.294
Prob(Omnibus):,0.0,Jarque-Bera (JB):,26357702.236
Skew:,-2.37,Prob(JB):,0.0
Kurtosis:,113.006,Cond. No.,193.0


In [91]:
# Summary of results
print('All Categories')
print('projects:', len(df))
print('success:', p_success_all)
print('adjusted R-squared:', Radj_all)
print('')
print('Category: Film & Video')
print('projects:', len(dffilm))
print('success:', p_success_film)
print('adjusted R-squared:', Radj_film)
print('')
print('Category: Music')
print('projects:', len(dfmusic))
print('success:', p_success_music)
print('adjusted R-squared:', Radj_music)
print('')
print('Category: Publishing')
print('projects:', len(dfpub))
print('success:', p_success_pub)
print('adjusted R-squared:', Radj_pub)
print('')
print('Category: Games')
print('projects:', len(dfgames))
print('success:', p_success_games)
print('adjusted R-squared:', Radj_games)
print('')
print('Category: Technology')
print('projects:', len(dftech))
print('success:', p_success_tech)
print('adjusted R-squared:', Radj_tech)
print('')
print('Category: Design')
print('projects:', len(dfdesign))
print('success:', p_success_design)
print('adjusted R-squared:', Radj_design)
print('')
print('Category: Art')
print('projects:', len(dfart))
print('success:', p_success_art)
print('adjusted R-squared:', Radj_art)

All Categories
projects: 702411
success: 0.35193640190714554
adjusted R-squared: 0.028837092431782674

Category: Film & Video
projects: 121331
success: 0.3712653814771163
adjusted R-squared: 0.041656745488558244

Category: Music
projects: 98710
success: 0.4658595886941546
adjusted R-squared: 0.1268321264535024

Category: Publishing
projects: 74220
success: 0.3042441390460792
adjusted R-squared: 0.10278246094153198

Category: Games
projects: 63272
success: 0.34634593501074723
adjusted R-squared: 0.04908743868927645

Category: Technology
projects: 58730
success: 0.19584539417674102
adjusted R-squared: 0.10561494298299123

Category: Design
projects: 53979
success: 0.34309638933659387
adjusted R-squared: 0.05155220173341235

Category: Art
projects: 52177
success: 0.4060601414416314
adjusted R-squared: 0.0910935727442197



In [None]:
# Conclusions: 
# Music is the category with the greatest proportion of successful projects, and the best predictive model for success.
# More analysis would be needed to determine whether these two conditions are related.
# Technology is the category with the smallest proportion of successful projects.
# Film & Video has the worst predictive model, but is still slightly better than the model for all categories.
# None of the models are very useful, with adjusted R-squared values ranging from approx. 0.03 to 0.13
# Perhaps there are other variables that are better predictors of successful projects, that were not available in this dataset. 