In [5]:
import numpy as np
import pandas as pd
# Load the raw project table. index_col=0 turns the first CSV column into the
# row index (displayed as 'project_id' in the frame previews).
df = pd.read_csv(
    'data.csv',
    index_col=0,
)

In [6]:
# Structural overview of the freshly loaded frame: column names, per-column
# dtypes and the (rows, columns) shape. Each report is rendered first, then
# all three are emitted with a single print call.
columns_report = 'Columns:\n{}'.format(df.columns.values)
dtypes_report = 'Data Types:\n{}'.format(df.dtypes)
shape_report = 'Shape:\n{}'.format(df.shape)
print(columns_report, dtypes_report, shape_report, sep='\n')

Columns:
['name' 'desc' 'goal' 'keywords' 'disable_communication' 'country'
 'currency' 'deadline' 'state_changed_at' 'created_at' 'launched_at'
 'backers_count' 'final_status']
Data Types:
name                      object
desc                      object
goal                     float64
keywords                  object
disable_communication       bool
country                   object
currency                  object
deadline                   int64
state_changed_at           int64
created_at                 int64
launched_at                int64
backers_count              int64
final_status               int64
dtype: object
Shape:
(108129, 13)


In [7]:
# Trim two columns to reduce dimensionality:
#   - 'currency' is treated as a proxy for 'country', so it adds little;
#   - 'created_at' is judged to be of low importance.
redundant_cols = ['created_at', 'currency']
df = df.drop(columns=redundant_cols)
df.head()

Unnamed: 0_level_0,name,desc,goal,keywords,disable_communication,country,deadline,state_changed_at,launched_at,backers_count,final_status
project_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
kkst1451568084,drawing for dollars,I like drawing pictures. and then i color them...,20.0,drawing-for-dollars,False,US,1241333999,1241334017,1240602723,3,1
kkst1474482071,Sponsor Dereck Blackburn (Lostwars) Artist in ...,"I, Dereck Blackburn will be taking upon an inc...",300.0,sponsor-dereck-blackburn-lostwars-artist-in-re...,False,US,1242429000,1242432018,1240975592,2,0
kkst183622197,Mr. Squiggles,So I saw darkpony's successfully funded drawin...,30.0,mr-squiggles,False,US,1243027560,1243027818,1242164398,0,0
kkst597742710,Help me write my second novel.,Do your part to help out starving artists and ...,500.0,help-me-write-my-second-novel,False,US,1243555740,1243556121,1240966730,18,1
kkst1913131122,Support casting my sculpture in bronze,"I'm nearing completion on a sculpture, current...",2000.0,support-casting-my-sculpture-in-bronze,False,US,1243769880,1243770317,1241180541,1,0


In [8]:
# For every epoch-seconds column add a parsed datetime twin named '<col>_dt',
# so calendar parts can be extracted through the .dt accessor in later cells.
unix_time_columns = ['deadline', 'state_changed_at', 'launched_at']
df = df.assign(**{
    col + '_dt': pd.to_datetime(df[col], unit='s')
    for col in unix_time_columns
})

In [9]:
# feature engineering 1 (duration)
# Only differences matter here, so the raw epoch-second integers are
# subtracted directly; both results are therefore expressed in seconds.
# NOTE(review): 'state_changed_at' is presumably only known once a campaign has
# finished, in which case 'actual_duration' leaks the outcome -- confirm before
# using it as a predictor.
df = df.assign(
    expected_duration=df['deadline'] - df['launched_at'],
    actual_duration=df['state_changed_at'] - df['launched_at'],
)

In [10]:
# feature engineering 2 (launch times)
# Break the launch timestamp into its calendar parts.
# NOTE(review): the '_dt' columns were parsed without a timezone, so
# 'launch_hour' is presumably a UTC hour rather than project-local -- confirm.
launched = df['launched_at_dt'].dt
df = df.assign(
    launch_year=launched.year,
    launch_month=launched.month,
    launch_day=launched.day,
    launch_hour=launched.hour,
)

In [11]:
# feature engineering 3 (deadline times)
# Same decomposition as for the launch timestamp, minus the hour component.
for part in ('year', 'month', 'day'):
    df['deadline_' + part] = getattr(df['deadline_dt'].dt, part)

In [12]:
# The raw epoch columns and their '<col>_dt' datetime twins have been fully
# mined for features; remove all of them in one pass.
helper_cols = unix_time_columns + [name + '_dt' for name in unix_time_columns]
df = df.drop(columns=helper_cols)

In [13]:
# Re-inspect the frame after feature engineering: the engineered duration and
# calendar columns should be present and the raw timestamp columns gone.
overview = [
    'Columns:\n{}'.format(df.columns.values),
    'Data Types:\n{}'.format(df.dtypes),
    'Shape:\n{}'.format(df.shape),
]
print('\n'.join(overview))
df.head()

Columns:
['name' 'desc' 'goal' 'keywords' 'disable_communication' 'country'
 'backers_count' 'final_status' 'expected_duration' 'actual_duration'
 'launch_year' 'launch_month' 'launch_day' 'launch_hour' 'deadline_year'
 'deadline_month' 'deadline_day']
Data Types:
name                      object
desc                      object
goal                     float64
keywords                  object
disable_communication       bool
country                   object
backers_count              int64
final_status               int64
expected_duration          int64
actual_duration            int64
launch_year                int64
launch_month               int64
launch_day                 int64
launch_hour                int64
deadline_year              int64
deadline_month             int64
deadline_day               int64
dtype: object
Shape:
(108129, 17)


Unnamed: 0_level_0,name,desc,goal,keywords,disable_communication,country,backers_count,final_status,expected_duration,actual_duration,launch_year,launch_month,launch_day,launch_hour,deadline_year,deadline_month,deadline_day
project_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
kkst1451568084,drawing for dollars,I like drawing pictures. and then i color them...,20.0,drawing-for-dollars,False,US,3,1,731276,731294,2009,4,24,19,2009,5,3
kkst1474482071,Sponsor Dereck Blackburn (Lostwars) Artist in ...,"I, Dereck Blackburn will be taking upon an inc...",300.0,sponsor-dereck-blackburn-lostwars-artist-in-re...,False,US,2,0,1453408,1456426,2009,4,29,3,2009,5,15
kkst183622197,Mr. Squiggles,So I saw darkpony's successfully funded drawin...,30.0,mr-squiggles,False,US,0,0,863162,863420,2009,5,12,21,2009,5,22
kkst597742710,Help me write my second novel.,Do your part to help out starving artists and ...,500.0,help-me-write-my-second-novel,False,US,18,1,2589010,2589391,2009,4,29,0,2009,5,29
kkst1913131122,Support casting my sculpture in bronze,"I'm nearing completion on a sculpture, current...",2000.0,support-casting-my-sculpture-in-bronze,False,US,1,0,2589339,2589776,2009,5,1,12,2009,5,31


In [14]:
# One-hot encode the country code and every calendar-part feature.
dummy_cols = ['country',
              'launch_hour','launch_day','launch_month','launch_year',
              'deadline_day','deadline_month','deadline_year']
# A single get_dummies call replaces each listed column with its
# '<column>_<value>' indicator columns, appended in the order given above.
# This avoids re-joining the whole frame on its index once per column, which
# copies the data every time and would multiply rows if the index ever
# contained duplicate labels.
df = pd.get_dummies(df, columns=dummy_cols)
print('Columns:\n{}'.format(df.columns.values))
print('Data Types:\n{}'.format(df.dtypes))
print('Shape:\n{}'.format(df.shape))

Columns:
['name' 'desc' 'goal' 'keywords' 'disable_communication' 'backers_count'
 'final_status' 'expected_duration' 'actual_duration' 'country_AU'
 'country_CA' 'country_DE' 'country_DK' 'country_GB' 'country_IE'
 'country_NL' 'country_NO' 'country_NZ' 'country_SE' 'country_US'
 'launch_hour_0' 'launch_hour_1' 'launch_hour_2' 'launch_hour_3'
 'launch_hour_4' 'launch_hour_5' 'launch_hour_6' 'launch_hour_7'
 'launch_hour_8' 'launch_hour_9' 'launch_hour_10' 'launch_hour_11'
 'launch_hour_12' 'launch_hour_13' 'launch_hour_14' 'launch_hour_15'
 'launch_hour_16' 'launch_hour_17' 'launch_hour_18' 'launch_hour_19'
 'launch_hour_20' 'launch_hour_21' 'launch_hour_22' 'launch_hour_23'
 'launch_day_1' 'launch_day_2' 'launch_day_3' 'launch_day_4' 'launch_day_5'
 'launch_day_6' 'launch_day_7' 'launch_day_8' 'launch_day_9'
 'launch_day_10' 'launch_day_11' 'launch_day_12' 'launch_day_13'
 'launch_day_14' 'launch_day_15' 'launch_day_16' 'launch_day_17'
 'launch_day_18' 'launch_day_19' 'launch_day_20'

In [15]:
# Free-text fields are set aside for now; only the numeric and one-hot
# columns go forward.
text_cols = ['name', 'desc', 'keywords']
df = df.drop(columns=text_cols)

In [16]:
# Hold out 33% of the rows for evaluation; random_state pins the shuffle so
# the split is reproducible.
from sklearn.model_selection import train_test_split

# Target is 'final_status'; every remaining column is used as a feature.
# NOTE(review): the features still include 'backers_count' and
# 'actual_duration', which presumably are only known after a campaign ends --
# confirm they are legitimate predictors.
y = df['final_status']
X = df.drop(columns='final_status')

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=.33,
    random_state=42,
)

In [40]:
from sklearn.naive_bayes import BernoulliNB

# Baseline 1: Bernoulli naive Bayes on the full feature matrix.
# The interactive-prompt markers ('>>>' / '...') that were pasted in with this
# snippet have been removed so the cell is plain Python and survives export to
# a script.
# NOTE(review): with its default binarize=0.0, BernoulliNB thresholds every
# feature at zero, so the continuous columns (goal, backers_count, durations)
# are reduced to "> 0" flags -- confirm this is intended.
# The estimator keeps the name 'gnb' so any cell that refers to it still works,
# even though it holds a Bernoulli (not Gaussian) model here.
gnb = BernoulliNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)

# Report the absolute number of test rows whose prediction differs from truth.
print("Number of mislabeled points out of a total %d points : %d"
      % (X_test.shape[0],(y_test != y_pred).sum()))

Number of mislabeled points out of a total 35683 points : 13249


In [41]:
from sklearn.naive_bayes import GaussianNB

# Baseline 2: Gaussian naive Bayes on the same train/test split.
# The interactive-prompt markers ('>>>' / '...') that were pasted in with this
# snippet have been removed so the cell is plain Python and survives export to
# a script.
# NOTE(review): the recorded run mislabels roughly half of the test rows, so
# this model is presumably no better than guessing on these features --
# confirm against a majority-class baseline.
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)

# Report the absolute number of test rows whose prediction differs from truth.
print("Number of mislabeled points out of a total %d points : %d"
      % (X_test.shape[0],(y_test != y_pred).sum()))

Number of mislabeled points out of a total 35683 points : 17843


0.37129725639660344