In [1]:
import random
import warnings
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression,LassoCV

# Seed every RNG the notebook can draw from. Python's `random` alone is not
# enough: scikit-learn's train_test_split falls back to NumPy's global RNG when
# no random_state is passed, so NumPy must be seeded for reproducible runs.
random.seed(1)
np.random.seed(1)

In the following chunk of code, I am importing the initial cleaned dataset, but we will need to do additional cleaning. Steps that we will be taking:

1. Dropping the 'Unnamed: 0' column as it contains the original index of the cleaned dataset (which we do not need)
2. Filtering the dataset to only include the entries with successful ('1') or failed ('0') projects
3. Filtering the dataset to keep only projects whose main category is Games, for a more specific analysis. Games was the category with the most backers.

In [2]:
# Read the pre-cleaned Kickstarter export (path is relative to the notebook's
# working directory) and discard 'Unnamed: 0', the index column left over from
# the earlier cleaning step.
kickstarter_clean = (
    pd.read_csv('kickstarter_cleaned_index.csv')
    .drop(columns=['Unnamed: 0'])
)
# Disabled: derive the launch year from the first four characters of `launched`.
#kickstarter_clean['year_launched'] = kickstarter_clean.launched.apply(lambda x:int(x[0:4]))

FileNotFoundError: [Errno 2] File b'kickstarter_cleaned_index.csv' does not exist: b'kickstarter_cleaned_index.csv'

In [None]:
# Keep only projects with a decided outcome: state '1' (successful) or '0' (failed).
# The state column holds strings at this point, hence the quoted values.
has_final_state = kickstarter_clean['state'].isin(['1', '0'])
kickstarter_clean_fail_successful = kickstarter_clean[has_final_state]

# Narrow the analysis to the Games main category.
is_games_project = kickstarter_clean_fail_successful['main_category'] == 'Games'
kickstarter_games = kickstarter_clean_fail_successful[is_games_project]

print(kickstarter_games.head())

We will now split the dataset into training and testing. We will also convert the y of the train and test sets into integers to run our logistic regression using smf.

In [None]:
# Columns carried into the train/test frames. Not all of them are used as
# predictors; the model formula picks its own subset.
feature_columns = [
    'ID', 'name', 'category', 'main_category', 'currency', 'deadline', 'goal',
    'launched', 'backers', 'country', 'usd_pledged_real', 'usd_goal_real',
    'project_length', 'pledged_per_backer',
]

# 70/30 split. random_state pins the shuffle so the fitted coefficients and the
# reported test accuracy are identical on every Restart & Run All; without it
# train_test_split draws from NumPy's unseeded global RNG.
X_train, X_test, y_train, y_test = train_test_split(
    kickstarter_games[feature_columns],
    kickstarter_games[['state']],
    test_size=0.3,
    random_state=1,
)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

# `state` is stored as the strings '0'/'1'; the logit formula needs a numeric
# response. assign() builds new frames instead of writing into the frames
# returned by the split, which avoids pandas' SettingWithCopyWarning.
y_train = y_train.assign(state=y_train['state'].astype(int))
y_test = y_test.assign(state=y_test['state'].astype(int))

In [None]:
# NOTE(review): everything in this cell is commented out, so nothing executes.
# It is scratch work for a scikit-learn LogisticRegression on goal and
# project_length. The smf.logit line refers to `f` and `hgc`, which are not
# defined in the visible cells of this notebook.
#SKLEARN STUFF
#logreg_model = LogisticRegression()
#VARIABLES WE WANT
#X_train_goal_length = X_train[['goal','project_length']]
#X_test_goal_length = X_test[['goal','project_length']]
#logitfit = smf.logit(formula = str(f), data = hgc).fit()
#logreg_model.fit(X_train_goal_length,y_train)
#print(logreg_model.coef_)
#print(logreg_model.intercept_)

We will now run our model using the X_train and y_train datasets.
We can observe from the model summary that Tabletop Games appear to have the highest project success rate, while Mobile Games appear to have the lowest.

In [None]:
# The formula interface expects predictors and response in a single frame, so
# glue the two training frames back together column-wise (they share a row index).
training_frame = pd.concat([X_train, y_train], axis=1)

# Response: state (1 = successful, 0 = failed). `category` is a string column,
# so the formula treats it as categorical.
# NOTE(review): pledged_per_backer looks like a quantity only known once a
# campaign has run; confirm it is legitimate as a predictor.
logit_formula = 'state ~ category + usd_goal_real + project_length + pledged_per_backer'

logitmodel = smf.logit(formula=logit_formula, data=training_frame)
results = logitmodel.fit()
print(results.summary())

We obtain an accuracy of ~70% on the test dataset. TO DO: add cross-validation to verify this result (so far, re-running this code multiple times has given an accuracy that holds steady at about 70%).

In [None]:
# Predicted success probabilities for the held-out projects.
predicted_probability = results.predict(X_test)

# Classify as successful (1) when the probability exceeds 0.5, otherwise failed (0).
testresult_logit = np.asarray(predicted_probability > 0.5).astype(int)
actual_state = y_test.state.to_numpy()

print(testresult_logit)
print(actual_state)

# Share of test projects whose predicted class matches the actual outcome.
np.mean(testresult_logit == actual_state) #~70% accuracy on test dataset


Failed projects make up about 54% of our test data, so a naive model that predicts 'failed' for every project would be about 54% accurate. Our model's ~70% test accuracy is therefore a noticeable improvement over that baseline.

In [None]:
# Baseline accuracy: the share of test projects that failed (state == 0), i.e.
# the accuracy of predicting 'failed' for every project. Selecting the class by
# value rather than by value_counts() position keeps this correct even if
# failures are not the majority, and it does not break when only one class is
# present in the test set.
(y_test.state == 0).mean()

Out of interest, I also counted the number of projects in each subcategory of the 'Games' main category. Tabletop Games were indeed the most common type of Games project on Kickstarter, perhaps as a result of the higher success rate of such projects on the website. We also observe that successful projects have a higher average amount pledged per backer than failed ones, which contradicts my hypothesis that many backers making small donations would lead to more successful projects.

In [None]:
# Mean pledge per backer for failed ('0') vs. successful ('1') Games projects.
# A group mean does not depend on row order, so the frame is not sorted first.
print(kickstarter_games.groupby('state')[['pledged_per_backer']].mean())

# TODO: bar chart of mean pledged_per_backer by state (sketched earlier, never enabled).

# Number of projects in each Games subcategory (rendered as the cell output).
kickstarter_games.category.value_counts()