In [18]:
import os
# Work from the project root so the relative data path used when loading the CSV resolves.
# The location can be overridden with the KICKSTARTER_PROJECT_DIR environment variable;
# when that variable is unset, the original hard-coded path is used unchanged.
os.chdir(os.environ.get('KICKSTARTER_PROJECT_DIR', '/home/potusvn/Projects/kickstarter_prediction'))

In [19]:
import pandas as pd
import pickle
import numpy as np

from os.path import join
from settings import *

In [20]:
# Load the raw projects CSV (path is relative to the current working directory).
# The file is decoded as latin1 and read with low_memory=False.
df = pd.read_csv(
    'data/raw/ks-projects-201801.csv',
    low_memory=False,
    encoding='latin1',
)

In [21]:
# Keep only rows whose state is 'successful' or 'failed'.
# .copy() makes the result an independent frame, so the column assignments in the
# following cells do not write into a slice of the originally loaded data.
df = df[df.state.isin(['successful', 'failed'])].copy()

In [22]:
# Binary label: 1 where state is 'successful', 0 otherwise.
df['success'] = (df.state == 'successful').astype(int)

In [23]:
# Drop the listed columns; none of them is referenced again in the cells that follow
# (the outcome in `state` has already been encoded as `success`).
df = df.drop(
    columns=['ID', 'pledged', 'usd pledged', 'state', 'goal'],
)

In [24]:
# Plain column assignment is used here instead of `df.loc[:, col] = ...`:
# setting through `.loc` with a full-row slice writes into the existing column and can
# keep that column's previous dtype, which would throw away the downcast / datetime
# conversion (and later break the `.dt` accessor on launched/deadline).

# Money columns -> float; values that cannot be parsed become NaN (errors='coerce').
for col in ['usd_pledged_real', 'usd_goal_real']:
    df[col] = pd.to_numeric(df[col], downcast='float', errors='coerce')

# Backer counts -> integer; errors='raise' aborts on any value that is not numeric.
df['backers'] = pd.to_numeric(df['backers'], errors='raise', downcast='integer')

# Convert launched, and deadline to datetime objects (unparseable values become NaT)
for col in ['launched', 'deadline']:
    df[col] = pd.to_datetime(df[col], errors='coerce')

In [25]:
def devide_by_zero(a, b):
    """Divide ``a`` by ``b``, returning 0 when either operand equals zero.

    Args:
        a: Numerator.
        b: Denominator.

    Returns:
        ``0`` if ``b == 0`` or ``a == 0``; otherwise ``a / b``.
    """
    return 0 if (b == 0 or a == 0) else a / b

In [26]:
# Build the feature frame from rows that have a project name.
# Filtering first and copying afterwards gives an independent frame, so the column
# assignments below do not write into a slice of another DataFrame. Derived columns are
# computed from `features` itself, so they only ever see the rows that were kept.
features = df[df['name'].notna()].copy()

# Post features
# Engineer for name
# features['contain_special_symbols'] = pd.get_dummies(df.name.str.contains(r'[.,:!?#*]'), drop_first=True)
features['name_length'] = features['name'].str.len()
# features['num_vowels'] = df.name.str.count(r'[aeiouywAEIOUYW]')
# features['num_cap'] = df.name.str.count(r'[A-Z]')
# features['num_whitespace'] = df.name.str.count(r'\s')
# features['contain_bad_word'] = df.name.apply(lambda row: TextPreprocess(row).contain_bad_words(google_bad_words_list, no_swearing_words_list))

# Sentiment from TextBlob
# features[['subjectivity', 'polarity']] = df.apply(lambda row: pd.Series(TextPreprocess(row['name']).get_sentiment_value()), axis=1)
# features['subjectivity'] = features['subjectivity'].round(3)
# features['polarity'] = features['polarity'].round(3)

# Datetime features
# features['launch_month'] = df.launched.dt.month
# Whole days between launch and deadline, plus one.
features['duration'] = (features['deadline'] - features['launched']).dt.days + 1


# Deduction features
# features['pledged_per_backer'] = features.apply(lambda row: devide_by_zero(row.usd_pledged_real, row.backers), axis=1)
# features['pledged_per_backer'] = features['pledged_per_backer'].round(2)

# features['required_backers'] = features.apply(lambda row: devide_by_zero(row.usd_goal_real, row['pledged_per_backer']), axis=1)
# features['required_backers'] = features['required_backers'].round()

# features['required_backers_per_day'] = features.apply(lambda row: devide_by_zero(row['required_backers'], row['duration']), axis=1)
# features['required_backers_per_day'] = features['required_backers_per_day'].round()

In [27]:
# Keep only the predictors and the `success` label used for modelling.
# NOTE(review): `backers` is presumably only known once a campaign has run — confirm it
# is intended as a predictor.
new_dataset = features.loc[
    :,
    [
        'backers',
        'usd_goal_real',
        'name_length',
        'duration',
        'main_category',
        'country',
        'success',
    ],
]

In [28]:
# One-hot encode the two categorical columns; the first level of each is dropped and
# no separate indicator column is created for missing values.
new_dataset = pd.get_dummies(
    new_dataset,
    columns=['main_category', 'country'],
    drop_first=True,
    dummy_na=False,
)


In [29]:
# Display the column labels of the encoded dataset.
# NOTE(review): the saved output below lists 'pledged_per_backer', while the selection
# cell now picks 'backers' — the output looks stale; re-run to refresh it.
new_dataset.columns

Index(['pledged_per_backer', 'usd_goal_real', 'name_length', 'duration',
       'success', 'main_category_Comics', 'main_category_Crafts',
       'main_category_Dance', 'main_category_Design', 'main_category_Fashion',
       'main_category_Film & Video', 'main_category_Food',
       'main_category_Games', 'main_category_Journalism',
       'main_category_Music', 'main_category_Photography',
       'main_category_Publishing', 'main_category_Technology',
       'main_category_Theater', 'country_AU', 'country_BE', 'country_CA',
       'country_CH', 'country_DE', 'country_DK', 'country_ES', 'country_FR',
       'country_GB', 'country_HK', 'country_IE', 'country_IT', 'country_JP',
       'country_LU', 'country_MX', 'country_N,0"', 'country_NL', 'country_NO',
       'country_NZ', 'country_SE', 'country_SG', 'country_US'],
      dtype='object')

In [30]:
from sklearn.model_selection import train_test_split

# Shuffled 80/20 split of the encoded dataset; random_state=0 fixes the shuffle.
train, test = train_test_split(
    new_dataset,
    test_size=0.2,
    random_state=0,
    shuffle=True,
)

In [32]:
# Separate the predictors from the `success` label for both partitions.
train_x, train_y = train.drop(columns=['success']), train['success']
test_x, test_y = test.drop(columns=['success']), test['success']

In [33]:
# Display the (rows, columns) shape of the training predictors.
train_x.shape

(265337, 40)

In [34]:
import tensorflow as tf
from keras import layers, optimizers
from keras import models

# Take the input width from the data instead of hard-coding it, so the network keeps
# matching `train_x` when the feature list changes.
n_features = train_x.shape[1]

model = models.Sequential()
# Input - Layer
model.add(layers.Dense(180, activation="relu", input_shape=(n_features,)))
# Hidden - Layers
model.add(layers.Dense(180, activation="relu"))
model.add(layers.Dense(80, activation="relu"))
model.add(layers.Dense(180, activation="relu"))
# Output- Layer: one sigmoid unit for the binary `success` label
model.add(layers.Dense(1, activation=tf.nn.sigmoid))
model.summary()
# compiling the model
model.compile(
    optimizer='adam',
    loss="binary_crossentropy",
    metrics=["accuracy"]
)
# NOTE(review): the held-out test split is passed as validation data, so the figure
# printed below is a validation metric on that split — confirm this is intended.
results = model.fit(
    train_x, train_y,
    epochs=100,
    batch_size=30,
    validation_data=(test_x, test_y)
)

# Keras releases differ in the history key used for validation accuracy
# ("val_acc" in older versions, "val_accuracy" in newer ones); accept either.
history = results.history
val_acc_key = "val_accuracy" if "val_accuracy" in history else "val_acc"
# The printed value is the mean validation accuracy over all epochs that were run.
print("Test-Accuracy:", np.mean(history[val_acc_key]))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_6 (Dense)              (None, 180)               7380      
_________________________________________________________________
dense_7 (Dense)              (None, 180)               32580     
_________________________________________________________________
dense_8 (Dense)              (None, 80)                14480     
_________________________________________________________________
dense_9 (Dense)              (None, 180)               14580     
_________________________________________________________________
dense_10 (Dense)             (None, 1)                 181       
Total params: 69,201
Trainable params: 69,201
Non-trainable params: 0
_________________________________________________________________
Train on 265337 samples, validate on 66335 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epo

KeyboardInterrupt: 