In [1]:
import os

# Switch the working directory to the project root so that the relative paths
# used by later cells (e.g. 'data/raw/...') resolve.
# The location can be overridden through the KICKSTARTER_PROJECT_ROOT
# environment variable; the fallback is the original hard-coded checkout path,
# so existing behaviour is unchanged when the variable is not set.
# NOTE(review): the later `from settings import *` presumably also relies on
# this directory being the CWD - confirm.
_project_root = os.environ.get(
    'KICKSTARTER_PROJECT_ROOT',
    '/home/potusvn/Projects/kickstarter_prediction',
)
os.chdir(_project_root)

In [2]:
import pandas as pd
import pickle
import numpy as np

from os.path import join
from settings import *

In [3]:
# Load the raw Kickstarter projects export into `df`.
#   encoding='latin1' : decode the file as Latin-1
#   low_memory=False  : parse the file in one pass instead of in chunks, so
#                       column dtypes are inferred from the whole column
df = pd.read_csv(
    'data/raw/ks-projects-201801.csv',
    encoding='latin1',
    low_memory=False,
)

In [4]:
# Keep only the rows whose `state` is one of the two outcomes used for the
# binary label; every other state value is dropped.
df = df.loc[df['state'].isin(['successful', 'failed'])]

In [5]:
# Binary target: 1 when the campaign state is 'successful', otherwise 0.
df['success'] = (df['state'] == 'successful').astype(int)

In [6]:
# Display the column names of the filtered frame, which now also contains the
# derived `success` label.
df.columns

Index(['ID', 'name', 'category', 'main_category', 'currency', 'deadline',
       'goal', 'launched', 'pledged', 'state', 'backers', 'country',
       'usd pledged', 'usd_pledged_real', 'usd_goal_real', 'success'],
      dtype='object')

In [7]:
# Remove columns that no later cell uses. `state` is redundant now that it has
# been encoded into `success`.
df = df.drop(columns=['ID', 'pledged', 'usd pledged', 'state', 'goal'])

In [8]:
# Normalise column dtypes.
#
# Columns are replaced with `df[col] = ...` rather than `df.loc[:, col] = ...`:
# on pandas >= 2.0 the `.loc[:, col]` form writes the new values *into* the
# existing column and keeps its old dtype, which silently discards the
# float32/int downcast and leaves the date columns as non-datetime objects
# (breaking the `.dt` accessor used when computing `duration`).

# Monetary columns: unparseable entries become NaN, then downcast to the
# smallest float type that fits.
df['usd_pledged_real'] = pd.to_numeric(df['usd_pledged_real'], downcast='float', errors='coerce')
df['usd_goal_real'] = pd.to_numeric(df['usd_goal_real'], downcast='float', errors='coerce')
# Backer count: must be fully numeric (raises otherwise), downcast to the
# smallest integer type that fits.
df['backers'] = pd.to_numeric(df['backers'], errors='raise', downcast='integer')

# Convert launched, and deadline to datetime objects (unparseable values -> NaT)
for col in ['launched', 'deadline']:
    df[col] = pd.to_datetime(df[col], errors='coerce')

In [9]:
# Build the feature frame from the rows that have a project name, adding two
# derived columns. Both derived Series are computed on `df` and matched to the
# kept rows by index label.
#   name_length : number of characters in the project name
#   duration    : whole days between launch and deadline, plus one
features = (
    df[df.name.notna()]
    .assign(
        name_length=df.name.str.len(),
        duration=(df.deadline - df.launched).dt.days + 1,
    )
)

In [10]:
# Restrict to the model inputs plus the `success` target, in this column order.
new_dataset = features.loc[
    :,
    [
        'backers',
        'usd_goal_real',
        'name_length',
        'duration',
        'main_category',
        'country',
        'success',
    ],
]

In [11]:
# One-hot encode the two categorical columns. `drop_first=True` drops the first
# level of each column; `dummy_na=False` adds no indicator column for missing
# values.
# The dtype is pinned to uint8 (0/1): since pandas 2.0 the default dummy dtype
# is bool, and a frame mixing bool with numeric columns converts to an
# object-dtype array, which the Keras `fit` call further down cannot consume.
new_dataset = pd.get_dummies(
    new_dataset,
    columns=['main_category', 'country'],
    dummy_na=False,
    drop_first=True,
    dtype=np.uint8,
)


In [12]:
# Display the column names after one-hot encoding.
# NOTE(review): the recorded output contains a `country_N,0"` column, i.e. the
# raw `country` field holds a malformed value `N,0"`. Consider cleaning those
# rows before encoding.
new_dataset.columns

Index(['backers', 'usd_goal_real', 'name_length', 'duration', 'success',
       'main_category_Comics', 'main_category_Crafts', 'main_category_Dance',
       'main_category_Design', 'main_category_Fashion',
       'main_category_Film & Video', 'main_category_Food',
       'main_category_Games', 'main_category_Journalism',
       'main_category_Music', 'main_category_Photography',
       'main_category_Publishing', 'main_category_Technology',
       'main_category_Theater', 'country_AU', 'country_BE', 'country_CA',
       'country_CH', 'country_DE', 'country_DK', 'country_ES', 'country_FR',
       'country_GB', 'country_HK', 'country_IE', 'country_IT', 'country_JP',
       'country_LU', 'country_MX', 'country_N,0"', 'country_NL', 'country_NO',
       'country_NZ', 'country_SE', 'country_SG', 'country_US'],
      dtype='object')

In [13]:
# Separate the target column from the feature matrix.
y = new_dataset['success']
X = new_dataset.drop(columns='success')

In [14]:
from sklearn.model_selection import train_test_split

# Shuffled 80/20 train/test split; the fixed random_state makes it repeatable.
train_x, test_x, train_y, test_y = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=0,
)

In [None]:
# NOTE(review): this cell contains only commented-out code and was never
# executed. `train` and `test` are not defined in any cell shown in this
# notebook, so it appears to be a leftover from a different split approach.
# Candidate for deletion.
# train_x = train.drop(['success'], axis=1)
# train_y = train.success

# test_x = test.drop(['success'], axis=1)
# test_y = test.success

In [17]:
# Preview the first 10 rows of the training feature matrix.
train_x.head(10)

Unnamed: 0,backers,usd_goal_real,name_length,duration,main_category_Comics,main_category_Crafts,main_category_Dance,main_category_Design,main_category_Fashion,main_category_Film & Video,...,country_JP,country_LU,country_MX,"country_N,0""",country_NL,country_NO,country_NZ,country_SE,country_SG,country_US
323096,25,5000.0,60.0,30,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
136074,3,18000.0,16.0,35,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
14614,103,9863.509766,15.0,24,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
340105,0,6000.0,32.0,30,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
100164,25,1000.0,41.0,30,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
24270,36,1500.0,34.0,31,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
125383,30,1500.0,46.0,30,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
12008,164,5000.0,60.0,83,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
306032,51,15000.0,47.0,28,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
109995,6,50000.0,53.0,60,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [34]:
import tensorflow as tf
from keras import layers, optimizers
from keras import models

# Fully connected binary classifier trained on all features (including
# `backers`).
# The input width is taken from the training matrix instead of a hard-coded 40,
# so the cell keeps working if the set of encoded columns changes.
# NOTE(review): features are fed unscaled (e.g. `usd_goal_real` next to 0/1
# dummy columns); consider standardising them - TODO confirm impact.
model = models.Sequential()
# First hidden layer (also declares the input width)
model.add(layers.Dense(180, activation="relu", input_shape=(train_x.shape[1],)))
# Second hidden layer
model.add(layers.Dense(80, activation="relu"))
# Output layer: one sigmoid unit, matching the binary cross-entropy loss
model.add(layers.Dense(1, activation=tf.nn.sigmoid))
model.summary()
# compiling the model
model.compile(
    optimizer='adam',
    loss="binary_crossentropy",
    metrics=["accuracy"]
)
# NOTE(review): the held-out test split is passed as validation data, so the
# per-epoch validation metrics in `results.history` are computed on the test
# set.
results = model.fit(
    train_x, train_y,
    epochs=100,
    batch_size=180,
    validation_data=(test_x, test_y)
)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_24 (Dense)             (None, 180)               7380      
_________________________________________________________________
dense_25 (Dense)             (None, 80)                14480     
_________________________________________________________________
dense_26 (Dense)             (None, 1)                 81        
Total params: 21,941
Trainable params: 21,941
Non-trainable params: 0
_________________________________________________________________
Train on 265337 samples, validate on 66335 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/

Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
Test-Accuracy: 0.8915860381641807


## Retrain without the `backers` feature

In [35]:
# Same column selection as the first experiment, minus `backers`.
new_dataset2 = features.loc[
    :,
    [
        'usd_goal_real',
        'name_length',
        'duration',
        'main_category',
        'country',
        'success',
    ],
]

In [36]:
# One-hot encode the two categorical columns. `drop_first=True` drops the first
# level of each column; `dummy_na=False` adds no indicator column for missing
# values.
# The dtype is pinned to uint8 (0/1): since pandas 2.0 the default dummy dtype
# is bool, and a frame mixing bool with numeric columns converts to an
# object-dtype array, which the Keras `fit` call further down cannot consume.
new_dataset2 = pd.get_dummies(
    new_dataset2,
    columns=['main_category', 'country'],
    dummy_na=False,
    drop_first=True,
    dtype=np.uint8,
)


In [37]:
# Separate the target column from the feature matrix (this rebinds the `X` and
# `y` names used for the first experiment).
y = new_dataset2['success']
X = new_dataset2.drop(columns='success')

In [38]:
from sklearn.model_selection import train_test_split

# Shuffled 80/20 train/test split; the fixed random_state makes it repeatable.
train_x2, test_x2, train_y2, test_y2 = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=0,
)

In [39]:
# Show (rows, feature columns) of the training matrix built without `backers`.
train_x2.shape

(265337, 39)

In [40]:
import tensorflow as tf
from keras import layers, optimizers
from keras import models

# Fully connected binary classifier trained on the feature set without
# `backers`.
# The input width is taken from the training matrix instead of a hard-coded 39,
# so the cell keeps working if the set of encoded columns changes.
# NOTE(review): features are fed unscaled (e.g. `usd_goal_real` next to 0/1
# dummy columns); consider standardising them - TODO confirm impact.
model = models.Sequential()
# First hidden layer (also declares the input width)
model.add(layers.Dense(180, activation="relu", input_shape=(train_x2.shape[1],)))
# Second hidden layer
model.add(layers.Dense(80, activation="relu"))
# Output layer: one sigmoid unit, matching the binary cross-entropy loss
model.add(layers.Dense(1, activation=tf.nn.sigmoid))
model.summary()
# compiling the model
model.compile(
    optimizer='adam',
    loss="binary_crossentropy",
    metrics=["accuracy"]
)
# NOTE(review): the held-out test split is passed as validation data, so the
# per-epoch validation metrics in `results.history` are computed on the test
# set.
results = model.fit(
    train_x2, train_y2,
    epochs=100,
    batch_size=180,
    validation_data=(test_x2, test_y2)
)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_27 (Dense)             (None, 180)               7200      
_________________________________________________________________
dense_28 (Dense)             (None, 80)                14480     
_________________________________________________________________
dense_29 (Dense)             (None, 1)                 81        
Total params: 21,761
Trainable params: 21,761
Non-trainable params: 0
_________________________________________________________________
Train on 265337 samples, validate on 66335 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/

Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [43]:
# Best validation accuracy reached in any epoch of the most recent
# `model.fit` run (`results`).
# Keras >= 2.3 records the metric under 'val_accuracy' instead of 'val_acc'
# when compiled with metrics=["accuracy"], so accept either key.
_val_acc_key = 'val_acc' if 'val_acc' in results.history else 'val_accuracy'
np.max(results.history[_val_acc_key])

0.6059998516095847