## 0.1 Import libraries

In [None]:
# general
import pandas as pd
import numpy as np
from pandas import to_datetime


# plot libarys
import seaborn as sns
import matplotlib.pyplot as plt

# Model preperation
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV, cross_validate
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler, LabelEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer, make_column_selector
 

# Classifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, export_graphviz  
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier

# Model Metrics
from sklearn.metrics import precision_score, recall_score, roc_auc_score, roc_curve, confusion_matrix, classification_report, plot_confusion_matrix

# for merging the dataframes
import os, glob
import json

# further libarys
import itertools
from sklearn.tree import export_graphviz

%matplotlib inline

## 0.2 Merging the data frames and loading the data frame

In [None]:
#path = 'data/'
#all_files = glob.glob(os.path.join(path, '*.csv'))

#df_from_each_file = (pd.read_csv(f) for f in all_files)
#df_merged   = pd.concat(df_from_each_file, ignore_index=True)
#df_merged.to_csv( "data/Kickstarter.csv")

In [None]:
# Load the merged Kickstarter CSV; the first CSV column is used as the row index.
df = pd.read_csv('data/Kickstarter.csv', index_col = [0])

# 1. Data Cleaning

The following columns were dropped because they hold no useful and/or interesting data. The decision was based on a simple inspection of the data frame.

For further information about the columns please see columns.md

In [None]:
# Columns judged uninformative for this analysis (see columns.md).
out = [
    'urls', 'source_url', 'currency_symbol', 'currency_trailing_code',
    'creator', 'location', 'slug', 'usd_type', 'photo', 'name', 'blurb',
    'profile',
]
df.drop(columns=out, inplace=True)

Dropping the following columns because they only hold about 300 values or empty entries; the rest are NaNs.

In [None]:
# Columns that are almost entirely empty / NaN.
out = [
    'friends',
    'is_backing',
    'is_starred',
    'permissions',
]
df.drop(columns=out, inplace=True)

Calculate the datetimes, given in seconds:

In [None]:
# The four timestamp columns are given in seconds; convert each one to datetime.
for ts_col in ('created_at', 'launched_at', 'state_changed_at', 'deadline'):
    df[ts_col] = pd.to_datetime(df[ts_col], unit='s')

Define the categorical columns and transform their type from object to category

In [None]:
# Columns with a small, fixed set of values are stored as pandas categoricals.
categorical = [
    'country',
    'currency',
    'current_currency',
    'spotlight',
    'staff_pick',
    'state',
    'disable_communication',
    'is_starrable',
]
df[categorical] = df[categorical].astype("category")


# 2. Feature engineering

### 2.1 Extract data in dictionary in category column into separate columns with leading `"category_"`.

The category column contained a dictionary with various information. We extracted the parent category, added this information to the dataframe and dropped the category column

In [None]:
# Parse the JSON string in every "category" cell and spread its keys into new
# columns named "category_<key>" (joined back on the row index).
category_records = df["category"].apply(json.loads).to_list()
category_frame = pd.DataFrame(category_records).add_prefix("category_")
df = df.join(category_frame)

# drop the raw JSON column together with the extracted fields that are not needed
category_out = ["category_id", "category_color", "category_position", "category_urls"]
df.drop(columns=["category"] + category_out, inplace=True)

# the remaining extracted columns become categoricals
category_categorical = ["category_parent_id", "category_name", "category_slug"]
df[category_categorical] = df[category_categorical].astype("category")

In [None]:
# Title-case the slug and split it on "/" into parent and subcategory names.
slug_parts = df.category_slug.str.title().str.split("/", expand=True)
df_cat = slug_parts.rename(columns={0: "parent_category_name", 1: "subcategory_name"})
df = df.join(df_cat)

#df.pivot_table(index=["parent_category_name"], columns=["state"], values=["backers_count", "pledged_average"])

In [None]:
# astype() returns a new Series, so the result has to be assigned back;
# the bare expression left the column as object dtype.
df['parent_category_name'] = df['parent_category_name'].astype("category")

### 2.2 Analyse duplicates
Id is a uniquely identifying id for each project on kickstarter. Therefore, we can check for duplicates based on their id.

In [None]:
# How often does each project id occur? (most frequent first)
df.id.value_counts()

We have observations with duplicated ids. How many are there?

In [None]:
# Number of ids that occur more than once. Comparing with "> 1" instead of
# "== 2" also counts ids that appear three or more times.
(df.id.value_counts() > 1).sum()

How many real duplicates, i.e. completely identical rows, do we have?

In [None]:
# Number of rows that are an exact copy of an earlier row (all columns equal)
df.duplicated().sum()

Duplicates do not give additional information, therefore remove them.

In [None]:
# Remove fully identical rows, keeping the first occurrence of each.
df = df.drop_duplicates()

In [None]:
# Column overview (dtypes, non-null counts) after de-duplication
df.info()

### 2.3 Adding additional columns calculated with the original given data

Projects can be launched for different time spans. We calculated the duration each project was online (based on launch date and deadline). Further, the set up of a project may take some time. We calculated the preparation time of each project based on the date the project was created and when it was eventually launched.

In [None]:
# Whole days the campaign was online (launch -> deadline)
online_span = df["deadline"] - df["launched_at"]
df['duration'] = online_span.dt.days.astype('int')

# Whole days between creating the project on Kickstarter and launching it
setup_span = df["launched_at"] - df["created_at"]
df['prep_time'] = setup_span.dt.days.astype('int')

In [None]:
# Summary statistics of the preparation time (days)
df.prep_time.describe()

In [None]:
# Distinct preparation times that occur in the data
df.prep_time.unique()

### 2.4 Conversion of usd_goal and creating additional goal and pledged related data columns

The projects are not solely US-based. To be able to compare the various project goals we transformed the goal based on the given static USD rate. We also computed the ratio between the pledged amount and the number of backers for each project.

In [None]:
# Goal converted to USD with each project's static exchange rate, plus its log10
df["usd_goal"] = df["goal"] * df["static_usd_rate"]
df["log_usd_goal"] = np.log10(df["usd_goal"])

# Pledged USD per backer and its log10
# NOTE(review): projects without backers divide by zero here (NaN / inf) - confirm this is intended
df["pledged_average"] = df["usd_pledged"] / df["backers_count"]
df["log_pledged_average"] = np.log10(df["pledged_average"])

### 2.5 Preparing the time related data for visualization 

Subdivision of launched, deadline, changed and created into hours (_H), days (_D), months (_M) and years (_Y)

In [None]:
df['launched_Y'] = df.launched_at.dt.year.astype('int')
df['launched_M'] = df.launched_at.dt.month.astype('int')
df['launched_D'] = df.launched_at.dt.day.astype('int')
df['launched_H'] = df.launched_at.dt.hour.astype('int')

In [None]:
df['deadline_Y'] = df.deadline.dt.year.astype('int')
df['deadline_M'] = df.deadline.dt.month.astype('int')
df['deadline_D'] = df.deadline.dt.day.astype('int')
df['deadline_H'] = df.deadline.dt.hour.astype('int')

In [None]:
df['changed_Y'] = df.state_changed_at.dt.year.astype('int')
df['changed_M'] = df.state_changed_at.dt.month.astype('int')
df['changed_D'] = df.state_changed_at.dt.day.astype('int')
df['changed_H'] = df.state_changed_at.dt.hour.astype('int')

In [None]:
df['created_Y'] = df.created_at.dt.year.astype('int')
df['created_M'] = df.created_at.dt.month.astype('int')
df['created_D'] = df.created_at.dt.day.astype('int')
df['created_H'] = df.created_at.dt.hour.astype('int')

Projects were online between 1 and 93 days. We subdivided the duration into the following 'duration_bins': '1 day', '3 days', '1 week', '2 weeks', '1 month', '2 months' and '3 months'. Apart from '1 day', all other bins should be understood as "up to".

In [None]:
def dur_kick(x):
    """Map a campaign duration in days to its 'duration_bins' label.

    Every label is an upper bound ("up to"): e.g. 2 and 3 days both map to
    '3 days', 4-7 days to '1 week'. Durations of less than one full day
    (x <= 1) fall into '1 day', matching how `prep` bins preparation times.

    Args:
        x (int): Duration between launch and deadline in whole days.

    Returns:
        str or None: The bin label, or None for durations above 93 days
        (these become NaN once wrapped in a pandas Categorical).
    """
    if x <= 1:
        return '1 day'
    if x <= 3:
        return '3 days'
    if x <= 7:
        return '1 week'
    if x <= 14:
        return '2 weeks'
    if x <= 30:
        return '1 month'
    if x <= 60:
        return '2 months'
    if x <= 93:
        return '3 months'
    # Outside the known range: make the "no bin" result explicit.
    return None

# Ordered list of bin labels; it fixes the category order of the new column.
duration_labels = ['1 day', '3 days', '1 week', '2 weeks', '1 month', '2 months', '3 months']
df["duration_bins"] = pd.Categorical(df.duration.apply(dur_kick), categories=duration_labels)

The preparation time of the projects varied considerably. Thus, we created another bin 'prep_bins' ('1 day', '3 days', '1 week', '2 weeks', '1 month', '2 months', '3 months', '6 months', '1 year' and '> 1 year'). Here, each given bin (except '> 1 year') should be read as "up to".

In [None]:
def prep(x):
    """Map a preparation time in days to its 'prep_bins' label.

    Each label is the smallest upper bound that x does not exceed;
    anything above 360 days is labelled '> 1 year'.
    """
    upper_bounds = (
        (1, '1 day'),
        (3, '3 days'),
        (7, '1 week'),
        (14, '2 weeks'),
        (30, '1 month'),
        (60, '2 months'),
        (90, '3 months'),
        (180, '6 months'),
        (360, '1 year'),
    )
    for limit, label in upper_bounds:
        if x <= limit:
            return label
    return '> 1 year'

df["prep_bins"] = pd.Categorical(df.prep_time.apply(prep), 
                ['1 day', '3 days', '1 week', '2 weeks', '1 month', '2 months','3 months', '6 months', '1 year', '> 1 year'])

In [None]:
#checking for NaNs
df.prep_bins.isnull().sum()

As we calculated the date for each project, we might as well assign weekdays for the launch day and the deadline.

In [None]:
wday = [
    'Monday',
    'Tuesday',
    'Wednesday',
    'Thursday',
    'Friday',
    'Saturday',
    'Sunday',
]


def weekday(x):
    """Return the weekday name stored at position x of wday (0 is Monday)."""
    return wday[x]

In [None]:
df['launch_day'] = df.launched_at.dt.to_period('D').dt.weekday
df['launch_day'] = pd.Categorical(df.launch_day.apply(weekday), 
                ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'])  

In [None]:
df['deadline_day'] = df.deadline.dt.to_period('D').dt.weekday
df['deadline_day'] = pd.Categorical(df.deadline_day.apply(weekday), 
                ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'])

In [None]:
df['changed_day'] = df.state_changed_at.dt.to_period('D').dt.weekday
df['changed_day'] = pd.Categorical(df.changed_day.apply(weekday), 
                ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'])

# 4. EDA
## 4.1. The influence of time related data on the success

### 4.1.1 Closer look at the launch related data

In [None]:
# global figure size and font size for all plots
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['font.size'] = 14

sns.set_theme(palette='pastel', font_scale=1.25)

# one colour per project state, assigned in the order the states first appear in df.state
state = df.state.unique().tolist()
state_colors = [
    '#fa9fb5',
    '#7a0177',
    '#8c96c6',
    '#f768a1',
    '#fcc5c0',
]
COLOR_STATE = dict(zip(state, state_colors))

# single colours used by the count / rate plots
COLOR_TIME = '#084594'      # dark blue-ish
COLOR_COUNTRY = '#6baed6'   # blue-ish
COLOR_CATEGORY = '#2171b5'  # different blue
COLOR_SUCCESS = '#7a0177'   # dark purple



In [None]:
# Number of projects per launch year
df.launched_Y.value_counts()

In [None]:
# Number of projects per launch month within each launch year
print(df['launched_M'].groupby(df['launched_Y']).value_counts())

Let's look at the overall number of projects per year. Since its start in 2009, Kickstarter has had an increasing number of projects on its platform. The highest number was reached in 2015 with almost 38,000 projects - so far. In the last years there have been a bit more than 27,000 projects. However, we do expect an uptick in projects as numbers have risen in 2018 again and in 2019 there are already more than 8,000 projects in the first three months of the year.

In [None]:
# Number of projects launched per year
year_plot = sns.countplot(x=df.launched_Y.sort_values(), color=COLOR_TIME)
year_plot.set(xlabel='Year', ylabel='Number of projects')

#plt.savefig("images/projects_year.png",  bbox_inches="tight")
plt.show()

The success rate of projects was fairly high in the first years of Kickstarter (almost 80%). However, it dropped below 50% in 2014. After another low in 2015, the success rate has increased slightly over the last years.

In [None]:
# Share of each project state per launch year (bars filled to 100 %)
ax = sns.histplot(data=df, x='launched_Y', hue='state', stat='probability',
                  multiple="fill", palette=COLOR_STATE)
ax.set(xlabel="Year", ylabel="Percent")

# show the 0-1 fractions on the y axis as whole percentages
ax.set_yticklabels([f'{tick:,.0f}' for tick in ax.get_yticks() * 100])

# NOTE(review): legend labels are hard-coded - confirm they match the plotted hue order
plt.legend(labels=['live', 'successful', 'failed', 'canceled', 'suspended'],
           loc='upper left', bbox_to_anchor=(1.05, 1), borderaxespad=0.,
           labelspacing=1.2)

#plt.savefig("images/state_year.png", bbox_inches="tight")
plt.show()

In [None]:
# Number of projects by the day of the month on which they were launched
ax = sns.countplot(x=df.launched_D.sort_values(), color=COLOR_TIME)
ax.set(xlabel='Day of the month (launch)', ylabel='Number of projects')

# thousands separators on the count axis
ax.set_yticklabels([f'{tick:,.0f}' for tick in ax.get_yticks()])

#plt.savefig("images/projects_month.png" , bbox_inches="tight")
plt.show()

As you can see above, projects were launched on all days of the months and fairly evenly distributed. The first, the 15th and the 31st do stick out. But it's more interesting to see whether the launching day does indeed impact the success of a project. However, when looking at the state of the projects (below) we can see that it does not seem to be important on which day of the week a project is launched.

In [None]:
# Share of each project state per launch weekday (bars filled to 100 %)
ax = sns.histplot(data=df, x='launch_day', hue='state', stat='probability',
                  multiple="fill", palette=COLOR_STATE)
#ax.set_xticklabels(ax.get_xticklabels(),rotation=40)
ax.set(xlabel="Weekday", ylabel="Percent")

# show the 0-1 fractions on the y axis as whole percentages
ax.set_yticklabels([f'{tick:,.0f}' for tick in ax.get_yticks() * 100])

plt.legend(labels=['live', 'successful', 'failed', 'canceled', 'suspended'],
           loc='upper left', bbox_to_anchor=(1.05, 1), borderaxespad=0.,
           labelspacing=1.2)

#plt.savefig("images/state_weekday.png", bbox_inches="tight")
plt.show()

### 4.1.2 Exploration of the influence of the project duration on kickstarter

When preparing a project it is also interesting to know for how long your project should be online to be successful - and if duration has an effect on your success. The previous projects were mostly online for 1 month, followed by 3 months.

In [None]:
# Number of projects per duration bin (time between launch and deadline)
ax = sns.countplot(x=df.duration_bins, color=COLOR_TIME)
ax.set(xlabel='Duration between launch and deadline', ylabel='Number of projects')

# thousands separators on the count axis
ax.set_yticklabels([f'{tick:,.0f}' for tick in ax.get_yticks()])

#plt.savefig("images/duration_bin_counts.png", bbox_inches="tight")
plt.show()

So does it have an effect on the success?

It depends. Projects that have been online for at least 2 weeks or at least 3 months were more successful than other projects online for shorter (or longer) time spans. Also projects that were online for up to 1 week were more likely to be suspended.

In [None]:
# Share of each project state per duration bin (bars filled to 100 %)
ax = sns.histplot(data=df, x='duration_bins', hue='state', stat='probability',
                  multiple="fill", palette=COLOR_STATE)
ax.set(xlabel="Duration between launch and deadline", ylabel="Percent")

# show the 0-1 fractions on the y axis as whole percentages
ax.set_yticklabels([f'{tick:,.0f}' for tick in ax.get_yticks() * 100])

plt.legend(labels=['live', 'successful', 'failed', 'canceled', 'suspended'],
           loc='upper left', bbox_to_anchor=(1.05, 1), borderaxespad=0.,
           labelspacing=1.2)

#plt.savefig("images/duration_bin_state.png", bbox_inches="tight")
plt.show()

Looking at the requested goal of a project, we can see that the duration of "3 months" may be distorting our plot above. It is likely that these projects were successful not because they were online for a longer period but rather because they were asking for a lower goal (compared to the projects in the two months bin).

In [None]:
# Funding goal (USD) of every project against its duration bin
ax = sns.scatterplot(x=df.duration_bins, y=df.usd_goal, color=COLOR_TIME)
ax.set(xlabel="Duration between launch and deadline", ylabel="Goal ins US Dollar")

# thousands separators on the goal axis
ax.set_yticklabels([f'{tick:,.0f}' for tick in ax.get_yticks()])

#plt.savefig("images/duration_bin_goal.png", bbox_inches="tight")
plt.show()

### 4.1.3 Preparation time
How long did it take between projects being created and actually launched on Kickstarter, and how did this affect their success? 

In [None]:
# Number of projects per preparation-time bin
ax = sns.countplot(x=df.prep_bins, color=COLOR_TIME)
ax.set(xlabel="Preparation time (duration between creation and launch)",
       ylabel="Number of projects")

# tilt the bin labels so they do not overlap
ax.set_xticklabels(ax.get_xticklabels(), rotation=40)

# thousands separators on the count axis
ax.set_yticklabels([f'{tick:,.0f}' for tick in ax.get_yticks()])

#plt.savefig("images/preparation_bin_count.png", bbox_inches="tight")
plt.show()

A lot of the projects on kickstarter were created and put online within 24 hours. Another higher number of projects were prepared for up to one month. The numbers of project decline with a longer preparation time, but even projects that were worked on for more than a year were brought online eventually.

So did this preparation time have an effect on the success (and maybe other possible states) of these projects?

As you can see below projects that were prepared for one to three months were more likely to be successful. A longer preparation did not seem to have a positive effect on the outcome. And: projects that were launched very fast were more likely to be suspended.

In [None]:
# Share of each project state per preparation-time bin (bars filled to 100 %)
ax = sns.histplot(data=df, x='prep_bins', hue='state', stat='probability',
                  multiple="fill", palette=COLOR_STATE)
#ax.set_xticklabels(ax.get_xticklabels(),rotation=40)
ax.set(xlabel="Duration between creation and launch", ylabel="Percent")

# show the 0-1 fractions on the y axis as whole percentages
ax.set_yticklabels([f'{tick:,.0f}' for tick in ax.get_yticks() * 100])

plt.legend(labels=['live', 'successful', 'failed', 'canceled', 'suspended'],
           loc='upper left', bbox_to_anchor=(1.05, 1), borderaxespad=0.,
           labelspacing=1.2)

#plt.savefig("images/prep_bin_state.png", bbox_inches="tight")
plt.show()

### 4.1.4 Deadline

Let's take a closer look at the deadline. The data shows it doesn't seem to have an effect when a project is ending. The success rate seems fairly evenly distributed over the months of an year (in terms of deadline).

In [None]:
# Share of each project state per deadline month (bars filled to 100 %)
ax = sns.histplot(data=df, x='deadline_M', hue='state', stat='probability',
                  multiple="fill", palette=COLOR_STATE)
#ax.set_xticklabels(ax.get_xticklabels(),rotation=40)
ax.set(xlabel="Month (in numbers)", ylabel="Percent")

# show the 0-1 fractions on the y axis as whole percentages
ax.set_yticklabels([f'{tick:,.0f}' for tick in ax.get_yticks() * 100])

plt.legend(labels=['live', 'successful', 'failed', 'canceled', 'suspended'],
           loc='upper left', bbox_to_anchor=(1.05, 1), borderaxespad=0.,
           labelspacing=1.2)

#plt.savefig("images/deadline_M_state.png", bbox_inches="tight")
plt.show()

In [None]:
# Number of projects by the day of the month on which their deadline falls
ax = sns.countplot(x=df.deadline_D.sort_values(), color=COLOR_TIME)
ax.set(xlabel="Day of the month of the deadline", ylabel="Number of projects")

# thousands separators on the count axis
ax.set_yticklabels([f'{tick:,.0f}' for tick in ax.get_yticks()])

#plt.savefig("images/deadline_D_count.png", bbox_inches="tight")
plt.show()

So maybe it has an effect on which days of the month the most deadlines were. In fact, the plot looks fairly similar to the plot of the launch day. The first, 15th/16th and the 30th of a month are popular deadlines. And just like the launch day the weekday does not seem to have an effect on the outcome of a project:

In [None]:
# Visualization of the state of a project in relation to the weekday of the deadline
ax = sns.histplot(x='deadline_day',
                 hue= 'state',
                 stat = 'probability',
                 data=df,
                 multiple="fill",
                 palette = COLOR_STATE
                 )
#ax.set_xticklabels(ax.get_xticklabels(),rotation=40)
# The x axis shows deadline weekdays; the previous label ("Duration between
# creation and launch") was copied from the preparation-time plot.
ax.set(xlabel="Weekday of the deadline")
ax.set(ylabel="Percent")

# show the 0-1 fractions on the y axis as whole percentages
ylabels = ['{:,.0f}'.format(y) for y in ax.get_yticks()*100]
ax.set_yticklabels(ylabels)

plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0., 
           labels =['live', 'successful', 'failed', 'canceled', 'suspended'], labelspacing=1.2)

#plt.savefig("images/deadline_D_state.png", bbox_inches="tight")
plt.show()

## 4.2 Influence of the location on the success

In [None]:
# Number of projects per country
df.groupby(df['country']).country.value_counts()

Most projects are created in the US, by far, which is not surprising as Kickstarter is based in Brooklyn, N.Y.C. However, quite a large number also come from Great Britain and Canada.

In [None]:
# Number of projects per country
ax = sns.countplot(x=df.country.sort_values(), color=COLOR_COUNTRY)
ax.set(xlabel="Country", ylabel="Number of projects")

# thousands separators on the count axis
ax.set_yticklabels([f'{tick:,.0f}' for tick in ax.get_yticks()])

#plt.savefig("images/country_count.png", bbox_inches="tight")
plt.show()

To check whether the country where a project is based has an impact, we created a column that displays the success rate of projects depending on the location. For this we treat every state other than successful (failed, canceled, suspended, live) as not successful.

In [None]:
# Boolean success flag, then each country's mean success rate attached to every row
df['success'] = df['state'] == 'successful'
s_country = (
    df.groupby('country')['success']
      .mean()
      .reset_index(name='success_country')
)

df = df.merge(s_country, how='outer', left_on='country', right_on='country')

The most successful projects were launched in Hong Kong and Luxembourg (above 60% success rate), Great Britain and Japan being the runners-up. Remarkably, projects from Italy are rarely successful (below 30%).

In [None]:
# Success rate per country
ax = sns.barplot(x=df.country, y=df.success_country, color=COLOR_SUCCESS)
ax.set(xlabel="Country", ylabel="Success in percent")

# show the 0-1 rates on the y axis as whole percentages
ax.set_yticklabels([f'{tick:,.0f}' for tick in ax.get_yticks() * 100])

#plt.savefig("images/country_success.png", bbox_inches="tight")
plt.show()

## 4.3 Influence of the categorical data columns

Projects can belong to very different categories. The most popular ones - in terms of number of projects - are "Music" and "Film & Video", followed by "Publishing", "Art" and "Technology".

In [None]:
# Number of projects per parent category (horizontal bars)
ax = sns.countplot(y=df.parent_category_name, color=COLOR_CATEGORY)
ax.set(ylabel="", xlabel="Number of projects")

# thousands separators on the count axis
ax.set_xticklabels([f'{tick:,.0f}' for tick in ax.get_xticks()])

#plt.savefig("images/category_count.png", bbox_inches="tight")
plt.show()

Are certain categories more successful than others? For the sake of simplicity of the next plot we also created a success rate column in relation for each category.

In [None]:
# Mean success rate per parent category, attached to every row and shown as a table
s_category = (
    df.groupby('parent_category_name')['success']
      .mean()
      .reset_index(name='success_category')
)
df = df.merge(s_category, how='outer',
              left_on='parent_category_name', right_on='parent_category_name')
s_category

In [None]:
# Success rate per parent category (horizontal bars)
ax = sns.barplot(y=df.parent_category_name, x=df.success_category, color=COLOR_SUCCESS)
ax.set(ylabel="", xlabel="Success in percent")

# show the 0-1 rates on the x axis as whole percentages
ax.set_xticklabels([f'{tick:,.0f}' for tick in ax.get_xticks() * 100])

#plt.savefig("images/category_success.png", bbox_inches="tight")
plt.show()

This plot looks very different from the above count of projects per category. The most successful projects are in the categories "Comics" and "Dance" (above 70%). In both categories the number of projects is fairly low compared to other categories. The least successful categories are Food, Journalism and Technology (below 35%).

When looking at the number of backers we see something slightly different. Most backers support projects in the category "Games" - by far! Next is Design, closely followed by Technology and Comics.

In [None]:
# Backers per project, aggregated by parent category (horizontal bars)
ax = sns.barplot(y=df.parent_category_name, x=df.backers_count, color=COLOR_CATEGORY)
ax.set(ylabel="", xlabel="Number of backers")

#plt.savefig("images/category_backers.png", bbox_inches="tight")
plt.show()

When looking at the averaged pledged amount per category, we can see that Technology projects are very popular. However, it does not result in a successful outcome - as seen above.

In [None]:
# Pledged amount per backer, aggregated by parent category (horizontal bars)
ax = sns.barplot(y = df.parent_category_name, x= df.pledged_average, color = COLOR_CATEGORY)

ax.set(ylabel="")
# pledged_average is usd_pledged / backers_count, i.e. USD per backer - it is not
# a pledged-to-goal ratio, so the axis label states what is actually plotted.
ax.set(xlabel="Average pledged amount per backer (USD)")

#plt.savefig("images/category_pledged_av.png", bbox_inches="tight")
plt.show()

## 4.4 Staff pick

When a project is picked by Kickstarter staff, it is put in the spotlight and highlighted on the website. We would think this should have an effect on the success rate. Let's see.

In [None]:
# Mean success rate for staff picks vs. other projects, attached to every row
s_pick = (
    df.groupby('staff_pick')['success']
      .mean()
      .reset_index(name='success_pick')
)
df = df.merge(s_pick, how='outer', left_on='staff_pick', right_on='staff_pick')
s_pick

In [None]:
# Counts of successful / not successful projects split by staff-pick status
df.groupby(df['staff_pick']).success.value_counts()

In [None]:
# NOTE(review): this cell repeats the previous one verbatim - possibly a leftover; confirm
df.groupby(df['staff_pick']).success.value_counts()

Indeed! Just about 11% of all projects were staff picks. However, these were then very successful - with a rate of about 87%. Projects that weren't picked showed a success rate of about 48%.

In [None]:
fig, ax = plt.subplots()
labels = [' ', 
         'Staff pick']
percentages = [89.3, 10.7]
explode=(0.1,0)
ax.pie(percentages, explode=explode, labels=labels, autopct='%1.0f%%', 
       shadow=False, startangle=0,  colors = ['#9ecae1', '#08306b'] ,
       pctdistance=1.2,labeldistance=1.4)
ax.axis('equal')

#plt.savefig("images/staffpick.png", bbox_inches="tight")
plt.show()

In [None]:
fig, ax = plt.subplots()
labels = ['Successful', 
         '']
percentages = [86.7, 13.3]
explode=(0.1,0)
plt.title('Staff pick')
ax.pie(percentages, explode=explode, labels=labels, autopct='%1.0f%%', 
       colors = ['#7a0177', '#8c96c6', ],
       shadow=False, startangle=0,   
       pctdistance=1.2,labeldistance=1.4)
ax.axis('equal')

#plt.savefig("images/staffpick_success.png", bbox_inches="tight")
plt.show()

Particularly projects from the categories "Film & Video" and "Publishing" are picked by Kickstarter staff, whereas projects in "Crafts" and "Journalism" are less likely to be picked.

In [None]:
# Number of projects per parent category, split by staff-pick status
df.groupby(df['staff_pick']).parent_category_name.value_counts()

## 4.5 Disable communication

Something we haven't looked at so far is a feature that seems to be minor: disable communication. Let's calculate the success rate ...

In [None]:
s_comm = df.groupby(df['disable_communication']).success.mean().reset_index().rename(columns={"success":"success_comm"})
df = df.merge(s_comm, how = 'outer', left_on = 'disable_communication', right_on = 'disable_communication')

s_comm

In [None]:
df.groupby(df['disable_communication']).state.value_counts()

Above you can see that when communication is disabled there is no success at all. How come?

When we look at the state of projects in relation to disabled communication we see that all projects where communication was disabled have been suspended by Kickstarter!

## 4.6 Projects that are "starrable"

Another - maybe minor - feature is "is_starrable". What information does it hold, i.e. does it affect the success of a project?

In [None]:
s_star = df.groupby(df['is_starrable']).success.mean().reset_index().rename(columns={"success":"success_star"})
df = df.merge(s_star, how = 'outer', left_on = 'is_starrable', right_on = 'is_starrable')

s_star

In [None]:
df.groupby(df['is_starrable']).state.value_counts()

All projects that are starrable are still live and we cannot assume the success by this feature.

# 5. Prepare data for model training 
## 5.1 Define target:

The aim of this project is to help potential project creators assess whether or not Kickstarter is a good funding option for them. Therefore, we want to model the chances to successfully raise enough money on Kickstarter. Kickstarter allows backers to cancel their pledge and creators to cancel funding while the project is live. According to these cancellation policies, live or canceled projects could still miss the funding goal at the deadline although they had reached the funding goal earlier. Therefore, we only include successful, failed, and suspended projects in our analysis, treating failed and suspended both as not successful.

Let's check the distribution of state classes before removing live and canceled data:

In [None]:
# Number of projects per state (counts the non-null backers_count entries in each group)
df.groupby("state").backers_count.count()

Most of the projects are recorded as failed or successful. What percentage of data would we drop by removing live and canceled projects?

In [None]:
# Share (in percent) of rows whose state is live or canceled
excluded = df.query("state in ['live', 'canceled']")
len(excluded) / len(df) * 100

We drop ~7.3% of the data.

We generate a new target column with 1 for successful projects and 0 for failed and suspended projects.

In [None]:
def target(row):
    """Encode a project's state as the binary modelling target.

    Returns 1 for "successful", 0 for "failed" or "suspended", and NaN for
    any other state so that those rows can be dropped afterwards.
    """
    outcome_by_state = {"successful": 1, "failed": 0, "suspended": 0}
    return outcome_by_state.get(row.state, np.nan)

In [None]:
# Row-wise encoding of the state into the new target column
df["successful"] = df.apply(target, axis=1)

In [None]:
# Drop live and canceled projects, i.e. the rows whose target is NaN.
# The check is restricted to the target column: a plain dropna() would also
# remove rows that have a NaN in any other column (for example pledged_average
# can be NaN when a project has no backers, and subcategory_name can be empty
# when the slug has no "/"), which would silently discard valid projects.
df.dropna(axis=0, subset=["successful"], inplace=True)

In [None]:
## 5.2 Check for imbalance

Let's check the class distribution of our target variable:

In [None]:
# Share of each target class relative to the number of remaining rows
df.successful.value_counts() / df.shape[0]

In [None]:
# Ids occurring more than once ("> 1" also catches ids that appear three or more times)
print(f"Number of projects with duplicate ID: {(df.id.value_counts() > 1).sum()}")
print(f"Number of observations: {df.shape[0]}")

With a class distribution of 60% successful and 39% unsuccessful we have an almost balanced dataset

# 6. Modeling
Although we found differences in success depending on the year of the launch, including it in the modeling could lead to overfitting to old economic situations. Information such as "your project would have been successful in 2013" is not relevant for our stakeholder who wants to realize their project now.

We include features in our model, that are known/decided on project creation such as the funding goal in US Dollar, whether or not communication with the creator is enabled, the country of the project, the duration of the funding, the duration of project preparation, the name of the parent category, the name of the subcategory, and whether or not the project can be starred by users.

As target we choose whether or not a project was successful, as prepared above.

## 6.1 Define features (X) and target (y)

In [None]:
# Select features
features = ["usd_goal", "disable_communication", "country", "duration", "prep_time", 
            "parent_category_name", "category_name"]
X = df[features]

# Select target
y = df.successful

In [None]:
X.info()

## 6.2 Spliting the data in train and test sets
To be able to choose a model based on its performance on unseen data, we split our dataset into training and test set. We choose a random seed to have a reproducible split and no data leakage in our model selection process.

The split was realized with sklearn's train_test_split function, with a 70/30 ratio, random_state = 42 and stratify = target (the latter ensures that the proportion of target values is the same in the training and the test set)

In [None]:
# 70/30 split, stratified on the target, with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

## 6.3 Function definitions
For model evaluation, ................

In [None]:
def pred_eval_plot_model(X_train, X_test, y_train, y_test, clf, cv=None):
    """Train a single model and print evaluation metrics.

    Fits ``clf`` on the training data, optionally cross-validates it on the
    training data, then prints the model parameters, the classification
    report and the confusion matrix for the test data and plots the
    confusion matrix.

    Args:
        X_train (pd.DataFrame, np.array): Features of the training set
        X_test (pd.DataFrame, np.array): Features of the test set
        y_train (pd.Series, np.array): Target of the training set
        y_test (pd.Series, np.array): Target of the test set
        clf (sklearn.base.BaseEstimator): Estimator to train and use
        cv (int, None): Number of cross-validation folds. Cross-validation
            is skipped when None (default).

    Returns:
        model (sklearn.base.BaseEstimator): The trained model
    """
    model = clf.fit(X_train, y_train)

    if cv:
        # Cross-validate the estimator and data passed to this function with
        # the requested number of folds (not notebook globals / a fixed 5).
        cv_results = cross_validate(clf, X_train, y_train, cv=cv, verbose=5)
        # The reported value is the mean test score over all folds.
        print(f"Mean cross-validated score: {cv_results['test_score'].mean()}")

    # Test predictions are the only predictions needed for the report below
    y_pred = model.predict(X_test)

    print(f"--- MODEL PARAMETERS {'-'*10}")
    print(json.dumps(model.get_params(), indent=4))
    print(f"--- CLASSIFICATION REPORT {'-'*10}")
    print(classification_report(y_test, y_pred))
    print(f"--- CONFUSION MATRIX {'-'*10}")
    print(confusion_matrix(y_test, y_pred))
    plot_confusion_matrix(model, X_test, y_test)
    return model

def _pred_eval_plot_grid(X_train, X_test, y_train, y_test, gs):
    """Fit a parameter search and report the performance of its best estimator.

    Prints the best parameters and best cross-validated score of the search,
    followed by the classification report and confusion matrix of the best
    estimator on the test data, and plots the confusion matrix.

    Args:
        X_train (pd.DataFrame, np.array): Features of the training set
        X_test (pd.DataFrame, np.array): Features of the test set
        y_train (pd.Series, np.array): Target of the training set
        y_test (pd.Series, np.array): Target of the test set
        gs (BaseSearchCV): SearchCV to train and use

    Returns:
        model (BaseSearchCV): The trained grid search
    """
    fitted_search = gs.fit(X_train, y_train)
    best_model = fitted_search.best_estimator_

    # Predict on the test set with the best estimator found by the search
    test_predictions = best_model.predict(X_test)

    print(f"--- GRID SEARCH RESULTS {'-'*10}")
    print(f"Best model: {fitted_search.best_params_}")
    print(f"Best cross-validated score: {fitted_search.best_score_}")

    print(f"--- CLASSIFICATION REPORT {'-'*10}")
    print(classification_report(y_test, test_predictions))

    print(f"--- CONFUSION MATRIX {'-'*10}")
    print(confusion_matrix(y_test, test_predictions))
    plot_confusion_matrix(best_model, X_test, y_test)

    return fitted_search
    

def run_rand_grid_search(X_train, X_test, y_train, y_test, clf, params_grid, n_iter=10, cv=5,
                         random_state=24):
    """Perform a randomized grid search and calculate performance metrics.

    Args:
        X_train (pd.DataFrame, np.array): Features of the training set
        X_test (pd.DataFrame, np.array): Features of the test set
        y_train (pd.Series, np.array): Target of the training set
        y_test (pd.Series, np.array): Target of the test set
        clf (sklearn.base.BaseEstimator): Estimator to train and use
        params_grid (dict): Dictionary defining the parameters for the grid search
        n_iter (int): Number of parameter combinations to try, default=10
        cv (int): Number of cross-validation folds, default=5
        random_state (int, None): Seed used to sample the parameter
            combinations, default=24 (the value that was previously fixed)

    Returns:
        model (BaseSearchCV): The trained grid search
    """
    gs = RandomizedSearchCV(
        clf,
        params_grid,
        n_iter=n_iter,
        cv=cv,
        random_state=random_state,
        verbose=5,
    )
    return _pred_eval_plot_grid(X_train, X_test, y_train, y_test, gs)
    
def run_grid_search(X_train, X_test, y_train, y_test, clf, params_grid, cv=5):
    """Run an exhaustive grid search and report its performance metrics.

    Args:
        X_train (pd.DataFrame, np.array): Features of the training set
        X_test (pd.DataFrame, np.array): Features of the test set
        y_train (pd.Series, np.array): Target of the training set
        y_test (pd.Series, np.array): Target of the test set
        clf (sklearn.base.BaseEstimator): Estimator to train and use
        params_grid (dict): Dictionary defining the parameters for the grid search
        cv (int): Number of cross-validation folds, default=5

    Returns:
        model (BaseSearchCV): The trained grid search
    """
    search = GridSearchCV(estimator=clf, param_grid=params_grid, cv=cv, verbose=5)
    return _pred_eval_plot_grid(X_train, X_test, y_train, y_test, search)
    

## 6.4 Preparation of Data Scaling and Category Encoding

In [None]:
# encoder for categories
# drop="first" leaves out the first category of every encoded feature
onehot = OneHotEncoder(drop="first")

In [None]:
# Scalers for numerical features: min-max scaling and standardization
mms, ss = MinMaxScaler(), StandardScaler()

In [None]:
# Prepare selectors for numerical and categorical columns; they pick the
# matching columns by dtype from the DataFrame the transformer is applied to
# NOTE(review): a column that is neither numeric nor of dtype "category"
# (possibly a boolean disable_communication; dtype not visible here, TODO
# confirm) matches neither selector and would be dropped by a
# ColumnTransformer that keeps the default remainder setting.
num_cols = make_column_selector(dtype_include=np.number)
cat_cols = make_column_selector(dtype_include="category")

# 7. Modeltesting

## 7.1 Logistic Regression
Data Transformation
For Logistic Regression we need to scale our data and encode categorical data. As the categories are not ordinal, we use one hot encoding.

### 7.1.1 Simple Logistic Regression with Standard Scaling

In [None]:
# Standard-scale the numerical columns and one-hot encode the categorical ones
transformer = ColumnTransformer(
    transformers=[
        ("scale", ss, num_cols),
        ("encode", onehot, cat_cols),
    ]
)

# Fit on the training data only, then apply the fitted transformer to the test data
X_train_trans = transformer.fit_transform(X_train)
X_test_trans = transformer.transform(X_test)

In [None]:
# Logistic regression (max_iter set to 400) trained and evaluated on the
# standard-scaled, one-hot encoded data
logreg_ss = LogisticRegression(max_iter=400)
m_logreg_ss = pred_eval_plot_model(X_train_trans, X_test_trans, y_train, y_test, logreg_ss)

The basic model with standard scaling of the numerical features achieves an accuracy of 75% and a precision of 78% on successful projects.

In [None]:
# Re-evaluate the fitted model with a stricter decision threshold.
# Column 1 of predict_proba is the probability of the second class in classes_.
y_probs = m_logreg_ss.predict_proba(X_test_trans)[:, 1]

# Only probabilities above 0.9 count as a positive prediction
y_pred = y_probs > 0.9

print(f"--- CLASSIFICATION REPORT {'-'*10}")
print(classification_report(y_test, y_pred))

print(f"--- CONFUSION MATRIX {'-'*10}")
print(confusion_matrix(y_test, y_pred))

Adjusting the threshold to 0.9, the precision for successful projects can be increased to 99%, reducing the accuracy to 65%.

### 7.1.2 Simple Logistic Regression with MinMax Scaling

In [None]:
# Min-max scale the numerical columns and one-hot encode the categorical ones
transformer = ColumnTransformer(
    transformers=[
        ("scale", mms, num_cols),
        ("encode", onehot, cat_cols),
    ]
)

# Fit on the training data only, then apply the fitted transformer to the test data
X_train_trans = transformer.fit_transform(X_train)
X_test_trans = transformer.transform(X_test)

In [None]:
# Calculate the LogisticRegression
# (same settings as before, now on the min-max scaled data)
logreg = LogisticRegression(max_iter=400)
m_logreg_mm = pred_eval_plot_model(X_train_trans, X_test_trans, y_train, y_test, logreg)

Using MinMax scaling, the model has an accuracy of 74% and precision of 79%.

In [None]:
# Re-evaluate the fitted model with a stricter decision threshold.
# Column 1 of predict_proba is the probability of the second class in classes_.
y_probs = m_logreg_mm.predict_proba(X_test_trans)[:, 1]

# Only probabilities above 0.9 count as a positive prediction
y_pred = y_probs > 0.9

print(f"--- CLASSIFICATION REPORT {'-'*10}")
print(classification_report(y_test, y_pred))

print(f"--- CONFUSION MATRIX {'-'*10}")
print(confusion_matrix(y_test, y_pred))

The same results can be achieved by adjusting the threshold value: Decrease in accuracy to 65% for an increase in precision to 99%.

Hence, we can not say, that one scaling outperforms the other in case of logistic regression.

### 7.1.3 With Randomized Grid Search
Let's try different regularization weights and types to improve the performance of the logistic regression:

In [None]:
# Search space: elastic-net regularized logistic regression with the saga solver
params_grid = dict(
    penalty=["elasticnet"],
    C=np.logspace(-3, 3, 7),
    max_iter=[200],
    l1_ratio=np.arange(0, 1, 0.25),
    solver=["saga"],
)

# Try 20 sampled combinations with 3-fold cross-validation
rs_logreg = run_rand_grid_search(
    X_train_trans, X_test_trans, y_train, y_test, logreg, params_grid, n_iter=20, cv=3
)

The best model with randomized search is achieved with {'solver': 'saga', 'penalty': 'elasticnet', 'max_iter': 200, 'l1_ratio': 0.25, 'class_weight': None, 'C': 1000.0} With a cross-validated score of 0.7391006496832911.

The accuracy of the best model is 74% and precision is 79%. Therefore, we could not find a parameter combination that improves precision.

In [None]:
# Re-evaluate the best estimator of the search with a stricter decision threshold.
# Column 1 of predict_proba is the probability of the second class in classes_.
y_probs = rs_logreg.best_estimator_.predict_proba(X_test_trans)[:, 1]

# Only probabilities above 0.9 count as a positive prediction
y_pred = y_probs > 0.9

print(f"--- CLASSIFICATION REPORT {'-'*10}")
print(classification_report(y_test, y_pred))

print(f"--- CONFUSION MATRIX {'-'*10}")
print(confusion_matrix(y_test, y_pred))

Unsurprisingly, changing the threshold value gives the same results as before.

## 7.2 KNN

### Data Transformation
KNN compares observations based on a similarity measure. Therefore, we need to scale numerical features and use one-hot-encoding for our categorical features. Using one-hot encoding creates a sparse matrix and reduces KNN efficiency. Therefore, we remove category_name from our features to reduce the number of features.

In [None]:
# Remove category_name to limit the number of one-hot columns; drop() returns
# new DataFrames, so X_train / X_test themselves stay untouched
X_train_trans = X_train.drop(columns="category_name")
X_test_trans = X_test.drop(columns="category_name")

# Define transformer
transformer = ColumnTransformer([
    ("scale", ss, num_cols),
    ("encode", onehot, make_column_selector(dtype_include="category")),
])

# Transform the reduced feature sets (not the original X_train / X_test,
# which still contain category_name); fit on the training data only
X_train_trans = transformer.fit_transform(X_train_trans)
X_test_trans = transformer.transform(X_test_trans)

### 7.2.1 Simple KNN
We will use the manhattan distance for similarity as our data is sparse.

In [None]:
# Define Classifier
# p=1 selects the Manhattan distance; n_jobs=-1 uses all available processors
knn = KNeighborsClassifier(p=1, n_jobs=-1)

In [None]:
# Train the KNN classifier and print/plot its evaluation on the test set
pred_eval_plot_model(X_train_trans , X_test_trans, y_train, y_test, knn)

The model achieves a precision of 75% for successful projects. Training the KNN took very long and did not achieve large differences in precision. Therefore, we will not optimize KNN parameters with a grid search.

## 7.3 Decision Tree

### Data Transformation

For Decision Trees numerical data doesn't need to be scaled. Categorical data needs to be encoded. As One-Hot-Encoding leads to sparse data and decreases the performance of decision trees, we encode the categories numerically.

In [None]:
# Encode categorical features (with more than two classes) as integer codes.
# Work on copies so X_train / X_test keep their categorical columns.
X_train_trans = X_train.copy()
X_test_trans = X_test.copy()
for cat in ["country", "parent_category_name", "category_name"]:
    # Assign by single column label: this always replaces the column, whereas
    # assigning a Series to a one-column list selection (df[[cat]] = ...)
    # depends on pandas-version-specific setitem behaviour.
    X_train_trans[cat] = X_train_trans[cat].cat.codes
    X_test_trans[cat] = X_test_trans[cat].cat.codes

### 7.3.1 Simple Decision Tree

In [None]:
# Define Classifier (fixed random_state for reproducible results)
dtree = DecisionTreeClassifier(random_state=42)

In [None]:
# Train the decision tree on the code-encoded data and evaluate it on the test set
m_dtree = pred_eval_plot_model(X_train_trans, X_test_trans, y_train, y_test, dtree)

The accuracy of 71% and precision of 73% for being successful needs to be improved further. Let's do a grid search:

### 7.3.2 Decision Tree with randomized grid search

In [None]:
# Search space for tree depth and minimum leaf size
params_grid = dict(
    max_depth=np.arange(10, 50, 2),
    min_samples_leaf=np.arange(10, 30, 2),
)

# Try 30 sampled combinations (default 5-fold cross-validation)
rs_dtree = run_rand_grid_search(
    X_train_trans, X_test_trans, y_train, y_test, dtree, params_grid, n_iter=30
)

The currently best DecisionTree can be trained with {'min_samples_leaf': 33, 'max_depth': 28}. The model has an accuracy of 74% and precision of 76%.

In [None]:
# Write the best decision tree found by the search to tree.dot,
# so the tree can be visualized easily anywhere
export_graphviz(decision_tree=rs_dtree.best_estimator_, out_file="tree.dot")

## 7.4 Random Forest

### Data Transformation
For Random Forests we use the same data scaling and encoding as for decision trees.

In [None]:
# Encode categorical features (with more than two classes) as integer codes.
# Work on copies so X_train / X_test keep their categorical columns.
X_train_trans = X_train.copy()
X_test_trans = X_test.copy()
for cat in ["country", "parent_category_name", "category_name"]:
    # Assign by single column label: this always replaces the column, whereas
    # assigning a Series to a one-column list selection (df[[cat]] = ...)
    # depends on pandas-version-specific setitem behaviour.
    X_train_trans[cat] = X_train_trans[cat].cat.codes
    X_test_trans[cat] = X_test_trans[cat].cat.codes

### 7.4.1 Simple Random Forest

In [None]:
# Define Classifier
# Fixed random_state so the forest (and the metrics quoted below) can be
# reproduced, consistent with the seeded decision tree and data split
rf = RandomForestClassifier(random_state=42)

In [None]:
# Train the random forest on the code-encoded data and evaluate it on the test set
m_rf = pred_eval_plot_model(X_train_trans, X_test_trans, y_train, y_test, rf)

Random Forest Classifier with default parameters has an accuracy of 75% and precision of 76%.

In [None]:
# Size statistics of the individual trees in the fitted random forest
n_nodes = [ind_tree.tree_.node_count for ind_tree in m_rf.estimators_]
max_depths = [ind_tree.tree_.max_depth for ind_tree in m_rf.estimators_]

# Report the averages over all trees, truncated to whole numbers
print(f'Average number of nodes {int(np.mean(n_nodes))}')
print(f'Average maximum depth {int(np.mean(max_depths))}')

### 7.4.2 Grid Search on random forest

In [None]:
# Compare the two split criteria with an exhaustive grid search
params_grid = dict(criterion=["gini", "entropy"])
rs_rf = run_grid_search(X_train_trans, X_test_trans, y_train, y_test, rf, params_grid)

According to the Grid Search, entropy is the better criterion to select features and split-values, but accuracy and precision could not be improved.

## 7.5 ExtraTree

### Data Transformation
Still working with trees, we keep the same data transformation:

In [None]:
# Encode categorical features as integer codes on copies of the split data
X_train_trans = X_train.copy()
X_test_trans = X_test.copy()
for cat in ["country", "parent_category_name", "category_name"]:
    # Assign by single column label: this always replaces the column, whereas
    # assigning a Series to a one-column list selection (df[[cat]] = ...)
    # depends on pandas-version-specific setitem behaviour.
    X_train_trans[cat] = X_train_trans[cat].cat.codes
    X_test_trans[cat] = X_test_trans[cat].cat.codes

### 7.5.1 Simple ExtraTreeClassifier

In [None]:
# Fixed random_state so the extra-trees ensemble (and the metrics quoted
# below) can be reproduced, consistent with the seeded decision tree and split
etree = ExtraTreesClassifier(random_state=42)
m_etree = pred_eval_plot_model(X_train_trans, X_test_trans, y_train, y_test, etree)

ExtraTreeClassifier has an accuracy of 74% and precision of 77% out of the box.

## 7.6 XGBoost

### Data Transformation
We scale numerical features and encode categorical features with one-hot-encoding.

In [None]:
# Standard-scale the numerical columns and one-hot encode the categorical ones
transformer = ColumnTransformer(
    transformers=[
        ("scale", ss, num_cols),
        ("encode", onehot, cat_cols),
    ]
)

# Fit on the training data only, then apply the fitted transformer to the test data
X_train_trans = transformer.fit_transform(X_train)
X_test_trans = transformer.transform(X_test)

### 7.6.1 Simple XGB Classifier

In [None]:
# Define Classifier (XGBoost with its default parameters)
xgb = XGBClassifier()

In [None]:
# Train the XGBoost classifier and print/plot its evaluation on the test set
pred_eval_plot_model(X_train_trans, X_test_trans, y_train, y_test, xgb)

XGBoost has an accuracy of 77% and precision of 79%, showing the best prediction results so far.

# 8. Future work

### Data Cleaning

* Remove duplicates (based on id)
* Extract more detailed location information (?)

### Modeling

* Stacked model (?)

# 5. Prepare data for model training 
## 5.1 Define target and features:

In [None]:
# Select the target
# NOTE(review): this cell rebinds X and y; run after the modeling sections it
# replaces the feature/target selection made there.
y = df.state

# Select the features
# from KS_simple:
# features = ["backers_count", "converted_pledged_amount" , "goal", "disable_communication", "country", "staff_pick", "duration"]

# from KS_model
# NOTE(review): backers_count and converted_pledged_amount look like values
# that only exist after a campaign has started, which would conflict with the
# stated goal of using only information known at project creation. TODO confirm.
features = ["backers_count", "converted_pledged_amount" , "goal", "disable_communication", "country_trans", "staff_pick", "duration", "prep_time"]
X = df[features]

## Spliting the data in test and train

Using the train_test_split method, with a 70/30 ratio, random_state = 42 and stratify = target (the latter ensures that the proportion of target values is the same in the training and the test set)

In [None]:
# Stratified 70/30 split with a fixed seed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

## Scaling and getting dummies for some models:

In [None]:
# Dummies
X_train_dummies = pd.get_dummies(X_train, drop_first=True)
# Encoding the test set on its own can yield different dummy columns (a value
# missing from one split, or a different first level being dropped). Align the
# test columns with the training columns: missing ones are filled with 0,
# columns unknown to the training set are discarded.
X_test_dummies = pd.get_dummies(X_test, drop_first=True).reindex(
    columns=X_train_dummies.columns, fill_value=0
)

In [None]:
# Scaling
# The scaler is fitted on the training data only and then applied to the test data.
# NOTE(review): X_train_mms / X_test_mms are not used by any of the model
# cells visible below. Confirm whether they are still needed.
mms = MinMaxScaler()
X_train_mms = mms.fit_transform(X_train)
X_test_mms = mms.transform(X_test)

# X. Creation, testing and comparing different models

## Define function to calculate the model metrics and plot the confusion matrix:

The function takes the split train and test data and the model classifier.
It calculates the model predictions, prints the classification_report with the model metrics and the confusion matrix, and also plots the latter.

In [None]:
def pred_eval_plot_model(X_train, X_test, y_train, y_test, clf):
    """Train a single model and print evaluation metrics.

    Fits ``clf`` on the training data, prints the classification report and
    the confusion matrix for the test data and plots the confusion matrix.

    Args:
        X_train (pd.DataFrame, np.array): Features of the training set
        X_test (pd.DataFrame, np.array): Features of the test set
        y_train (pd.Series, np.array): Target of the training set
        y_test (pd.Series, np.array): Target of the test set
        clf (sklearn.base.BaseEstimator): Estimator to train and use

    Returns:
        model (sklearn.base.BaseEstimator): The trained model
    """
    model = clf.fit(X_train, y_train)

    # Testing predictions (to determine performance)
    y_pred = model.predict(X_test)

    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))
    plot_confusion_matrix(model, X_test, y_test)

    # Hand the fitted model back so callers can keep using it
    # (e.g. ``m = pred_eval_plot_model(...)``) instead of receiving None
    return model
    

## X.1 Decision tree

In [None]:
# Decision tree with a fixed seed, trained and evaluated on the unencoded split
dtree = DecisionTreeClassifier(random_state = 42)
pred_eval_plot_model(X_train, X_test, y_train, y_test, dtree)

## 4.2 Random forest

In [None]:
# Random forest with a fixed seed, trained and evaluated on the unencoded split
rf = RandomForestClassifier(random_state = 42)
pred_eval_plot_model(X_train, X_test, y_train, y_test, rf)

## 4.3 KNN

Using the default numbers of neighbors (K = 5)

In [None]:
# KNN with default parameters on the dummy-encoded data
# NOTE(review): the dummy-encoded features are not scaled here, so features
# with large value ranges may dominate the distance. Confirm this is intended.
knn = KNeighborsClassifier()
pred_eval_plot_model(X_train_dummies , X_test_dummies, y_train, y_test, knn)

## 4.4 XGBoost

In [None]:
# XGBoost with default parameters on the dummy-encoded data
xgb = XGBClassifier()
pred_eval_plot_model(X_train_dummies, X_test_dummies, y_train, y_test, xgb)