# Predicting the State of a Kickstarter Project

In [1]:
# Import Dependencies
%matplotlib inline

# Start Python Imports
import math, time, random, datetime

# Data Manipulation
import numpy as np
import pandas as pd

# Visualization 
import matplotlib.pyplot as plt
import missingno
import seaborn as sns
plt.style.use('seaborn-whitegrid')

# Preprocessing
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, label_binarize

# Machine learning
from sklearn.model_selection import train_test_split
from sklearn import model_selection, tree, preprocessing, metrics, linear_model
from sklearn.svm import LinearSVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LinearRegression, LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier


In [2]:
# Import the entire dataset into a dataframe
df = pd.read_csv('data/ks-projects-201801.csv')

df.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.0


In [3]:
#check datatypes of attributes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 378661 entries, 0 to 378660
Data columns (total 15 columns):
ID                  378661 non-null int64
name                378657 non-null object
category            378661 non-null object
main_category       378661 non-null object
currency            378661 non-null object
deadline            378661 non-null object
goal                378661 non-null float64
launched            378661 non-null object
pledged             378661 non-null float64
state               378661 non-null object
backers             378661 non-null int64
country             378661 non-null object
usd pledged         374864 non-null float64
usd_pledged_real    378661 non-null float64
usd_goal_real       378661 non-null float64
dtypes: float64(5), int64(2), object(8)
memory usage: 43.3+ MB


In [4]:
#sum count of nulls per attribute
df.isnull().sum()

ID                     0
name                   4
category               0
main_category          0
currency               0
deadline               0
goal                   0
launched               0
pledged                0
state                  0
backers                0
country                0
usd pledged         3797
usd_pledged_real       0
usd_goal_real          0
dtype: int64

In [5]:
#percentage of nulls per attribute
(df.isnull().sum().values / df.count())*100

ID                  0.000000
name                0.001056
category            0.000000
main_category       0.000000
currency            0.000000
deadline            0.000000
goal                0.000000
launched            0.000000
pledged             0.000000
state               0.000000
backers             0.000000
country             0.000000
usd pledged         1.012901
usd_pledged_real    0.000000
usd_goal_real       0.000000
dtype: float64

In [6]:
#value counts of state attribute
df['state'].value_counts()

failed        197719
successful    133956
canceled       38779
undefined       3562
live            2799
suspended       1846
Name: state, dtype: int64

## Data Cleaning and EDA

In [7]:
#converting launch string to datetime type
df['launched_timestamp'] = pd.to_datetime(df['launched'], infer_datetime_format=True)
df.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real,launched_timestamp
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95,2015-08-11 12:12:28
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0,2017-09-02 04:43:57
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0,2013-01-12 00:20:50
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0,2012-03-17 03:24:11
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.0,2015-07-04 08:35:03


In [8]:
#converting deadline string to datatime type
df['deadline_timestamp'] = pd.to_datetime(df['deadline'], infer_datetime_format=True)
df.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real,launched_timestamp,deadline_timestamp
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95,2015-08-11 12:12:28,2015-10-09
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0,2017-09-02 04:43:57,2017-11-01
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0,2013-01-12 00:20:50,2013-02-26
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0,2012-03-17 03:24:11,2012-04-16
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.0,2015-07-04 08:35:03,2015-08-29


In [9]:
#calculating duration of each project in days
df['duration_days'] = pd.Series(delta.days for delta in (df['deadline_timestamp'] - df['launched_timestamp']))
df.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real,launched_timestamp,deadline_timestamp,duration_days
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95,2015-08-11 12:12:28,2015-10-09,58
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0,2017-09-02 04:43:57,2017-11-01,59
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0,2013-01-12 00:20:50,2013-02-26,44
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0,2012-03-17 03:24:11,2012-04-16,29
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.0,2015-07-04 08:35:03,2015-08-29,55


In [10]:
#view column headers in list form
list(df.columns.values.tolist())

['ID',
 'name',
 'category',
 'main_category',
 'currency',
 'deadline',
 'goal',
 'launched',
 'pledged',
 'state',
 'backers',
 'country',
 'usd pledged',
 'usd_pledged_real',
 'usd_goal_real',
 'launched_timestamp',
 'deadline_timestamp',
 'duration_days']

In [11]:
#dropping unneccesary columns
#dropping usd pledged since converion has errors
#dropping goal since usd goal real matches all other usd attributes
df.drop(['goal','deadline', 'launched', 'usd pledged', 'launched_timestamp', 'deadline_timestamp'], axis=1, inplace=True)
df.head()

Unnamed: 0,ID,name,category,main_category,currency,pledged,state,backers,country,usd_pledged_real,usd_goal_real,duration_days
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,0.0,failed,0,GB,0.0,1533.95,58
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2421.0,failed,15,US,2421.0,30000.0,59
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,220.0,failed,3,US,220.0,45000.0,44
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,1.0,failed,1,US,1.0,5000.0,29
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,1283.0,canceled,14,US,1283.0,19500.0,55


In [12]:
#drop all nulls in the dataset
df.dropna(inplace=True)
#sum count of nulls per attribute
df.isnull().sum()

ID                  0
name                0
category            0
main_category       0
currency            0
pledged             0
state               0
backers             0
country             0
usd_pledged_real    0
usd_goal_real       0
duration_days       0
dtype: int64

In [13]:
#df.drop(['name'], axis=1, inplace=True)
#df.head()

In [14]:
#dropping more unncessary columns
#dropping currency since we are only keeping all usd type attributes for consistency
#dropping pledge related attributes since if greater or equal to goal will always result in success
df.drop(['pledged', 'currency', 'usd_pledged_real'], axis=1, inplace=True)
df.head()

Unnamed: 0,ID,name,category,main_category,state,backers,country,usd_goal_real,duration_days
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,failed,0,GB,1533.95,58
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,failed,15,US,30000.0,59
2,1000004038,Where is Hank?,Narrative Film,Film & Video,failed,3,US,45000.0,44
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,failed,1,US,5000.0,29
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,canceled,14,US,19500.0,55


In [15]:
#label encode the state attribute
le = LabelEncoder()
le.fit(df['state'])
df['state_encoded'] = le.transform(df['state'])
df['state_encoded'].value_counts()

1    197716
3    133956
0     38779
5      3562
2      2799
4      1845
Name: state_encoded, dtype: int64

In [16]:
#view dataset
df.head()

Unnamed: 0,ID,name,category,main_category,state,backers,country,usd_goal_real,duration_days,state_encoded
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,failed,0,GB,1533.95,58,1
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,failed,15,US,30000.0,59,1
2,1000004038,Where is Hank?,Narrative Film,Film & Video,failed,3,US,45000.0,44,1
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,failed,1,US,5000.0,29,1
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,canceled,14,US,19500.0,55,0


In [17]:
# One hot encode the categorical columns
df_category_onehot = pd.get_dummies(df['category'], 
                                     prefix='category')

df_main_category_onehot = pd.get_dummies(df['main_category'], 
                                prefix='main_cat')

df_country_onehot = pd.get_dummies(df['country'], 
                                   prefix='country')


In [18]:
#concat the original df with the one hot encoded dfs
df_encoded = pd.concat([df, 
                        df_category_onehot, 
                        df_main_category_onehot, 
                        df_country_onehot], axis=1)

df_encoded.shape

(378657, 207)

In [19]:
df_orig = df_encoded[['ID', 'name', 'state', 'state_encoded']]
df_orig.head()

Unnamed: 0,ID,name,state,state_encoded
0,1000002330,The Songs of Adelaide & Abullah,failed,1
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,failed,1
2,1000004038,Where is Hank?,failed,1
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,failed,1
4,1000011046,Community Film Project: The Art of Neighborhoo...,canceled,0


In [20]:
#drop more attributes to finalize the dataset ready for training
df_encoded.drop(['ID', 'name', 'category', 'main_category', 'country'], axis=1, inplace=True)
df_encoded.head()

Unnamed: 0,state,backers,usd_goal_real,duration_days,state_encoded,category_3D Printing,category_Academic,category_Accessories,category_Action,category_Animals,...,country_JP,country_LU,country_MX,"country_N,0""",country_NL,country_NO,country_NZ,country_SE,country_SG,country_US
0,failed,0,1533.95,58,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,failed,15,30000.0,59,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,failed,3,45000.0,44,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,failed,1,5000.0,29,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,canceled,14,19500.0,55,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


## Machine Learning

In [21]:
df_cleaned = df_encoded

In [22]:
train_set, test_set = train_test_split(df_cleaned, test_size = .2, random_state=42)

In [23]:
#splitting to data and labels
X_train = train_set.drop(['state', 'state_encoded'], axis=1) # data
y_train = train_set.state_encoded # labels

In [24]:
X_train.shape

(302925, 200)

In [25]:
y_train.shape

(302925,)

In [26]:
#create function to fit model, cross validate, and get accuracies
def fit_algo(algo, X_train, y_train, cv):
    
    
    model = algo.fit(X_train, y_train)
    acc = round(model.score(X_train, y_train) * 100, 2)
    
    
    train_pred = model_selection.cross_val_predict(algo, 
                                                  X_train, 
                                                  y_train, 
                                                  cv=cv, 
                                                  n_jobs = -1)
    
    acc_cv = round(metrics.accuracy_score(y_train, train_pred) * 100, 2)
    
    return train_pred, acc, acc_cv

In [27]:
#create instance of log reg 
#lr_model = LogisticRegression(multi_class='multinomial', solver='newton-cg')
lr_model = LogisticRegression()

In [28]:
# Logistic Regression
start_time = time.time()
train_pred_log, acc_log, acc_cv_log = fit_algo(lr_model, X_train, y_train, 10)
log_time = (time.time() - start_time)


print("Accuracy: %s" % acc_log)
print("Accuracy CV 10-Fold: %s" % acc_cv_log)
print("Running Time: %s" % datetime.timedelta(seconds=log_time))

Accuracy: 77.24
Accuracy CV 10-Fold: 77.19
Running Time: 0:02:21.563680


In [29]:
models = pd.DataFrame({
    'Model_Name': ['Logistic Regression'],
    'Reg_Accuracy_Score': [ 
        acc_log,  
    ]})
print("---Reuglar Accuracy Scores---")
models.sort_values(by='Reg_Accuracy_Score', ascending=False)

---Reuglar Accuracy Scores---


Unnamed: 0,Model_Name,Reg_Accuracy_Score
0,Logistic Regression,77.24


In [30]:
cv_models = pd.DataFrame({
    'Model_Name': ['Logistic Regression'],
    'CV_Accuracy_Score': [ 
        acc_cv_log,      
    ]})
print('---Cross-validation Accuracy Scores---')
cv_models.sort_values(by='CV_Accuracy_Score', ascending=False)

---Cross-validation Accuracy Scores---


Unnamed: 0,Model_Name,CV_Accuracy_Score
0,Logistic Regression,77.19


In [32]:
#splitting test data to data and labels
X_test = test_set.drop(['state', 'state_encoded'], axis=1) # data
y_test = test_set.state_encoded # labels

In [33]:
X_test.shape

(75732, 200)

In [34]:
y_test.shape

(75732,)

In [36]:
predictions = lr_model.predict(X_test)
df_predictions = pd.DataFrame({
                               "State": test_set['state'],
                               "Actual_State_Encoded": y_test,
                               "Prediction": predictions})

df_predictions



Unnamed: 0,State,Actual_State_Encoded,Prediction
266682,failed,1,1
179492,successful,3,3
179755,successful,3,3
195374,failed,1,1
161413,successful,3,3
281802,failed,1,1
357031,successful,3,3
66123,suspended,4,1
212120,canceled,0,1
235566,successful,3,3


In [37]:
test_accuracy = round(metrics.accuracy_score(y_test, predictions) * 100, 2)
test_accuracy

77.25

In [38]:
print(metrics.classification_report(y_test, predictions))

             precision    recall  f1-score   support

          0       0.22      0.00      0.00      7814
          1       0.72      0.96      0.82     39359
          2       0.00      0.00      0.00       531
          3       0.90      0.76      0.82     26942
          4       0.04      0.00      0.00       394
          5       0.94      0.51      0.66       692

avg / total       0.72      0.77      0.73     75732

