## Program takes data.csv file as input which contains raw data 
## Then it performs
          1.Data cleaning, 
          2.Categorical value to numerical value conversion
          3.Data Normalization
          4.Data Partition
## During Data partition, it splits data into
           1.Training data     It contains 75% of the whole dataset
           2.Validation data   It contains 15% of the whole dataset
           3.Test data         It contains 10% of the whole dataset
## Then the program uses Keras to build a neural network, train the network and evaluate the network
## Result:
            1. Training Accuracy = 97.41%
            2. Validation Accuracy = 97.35%
            3. Test Accuracy = 97.27%
## On average, accuracy lies around 97%

In [405]:
import pandas as pd
import numpy as np
import time
import datetime
from keras.models import Sequential
from keras.layers import Dense
from keras.layers.core import Dropout
from keras.utils import to_categorical
from keras.regularizers import l2
from keras.models import load_model

In [406]:
# Load the raw dataset from 'data.csv' (path is relative to the working
# directory) into a pandas DataFrame.
data = pd.read_csv('data.csv')

### Data Visualization

In [407]:
data.head(10)

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.0
5,1000014025,Monarch Espresso Bar,Restaurants,Food,USD,2016-04-01,50000.0,2016-02-26 13:38:27,52375.0,successful,224,US,52375.0,52375.0,50000.0
6,1000023410,Support Solar Roasted Coffee & Green Energy! ...,Food,Food,USD,2014-12-21,1000.0,2014-12-01 18:30:44,1205.0,successful,16,US,1205.0,1205.0,1000.0
7,1000030581,Chaser Strips. Our Strips make Shots their B*tch!,Drinks,Food,USD,2016-03-17,25000.0,2016-02-01 20:05:12,453.0,failed,40,US,453.0,453.0,25000.0
8,1000034518,SPIN - Premium Retractable In-Ear Headphones w...,Product Design,Design,USD,2014-05-29,125000.0,2014-04-24 18:14:43,8233.0,canceled,58,US,8233.0,8233.0,125000.0
9,100004195,STUDIO IN THE SKY - A Documentary Feature Film...,Documentary,Film & Video,USD,2014-08-10,65000.0,2014-07-11 21:55:48,6240.57,canceled,43,US,6240.57,6240.57,65000.0


In [408]:
# I used data.info() and data.describe() to see the distribution of the data

In [409]:
# Used groupby from pandas to count the distinct values of each categorical column
# (state, main_category, currency and country)

In [410]:
# Group the rows by project state so the number of distinct states can be counted
group_state = data.groupby('state')

In [411]:
len(group_state.groups)

6

In [412]:
# Group the rows by main category so the number of distinct categories can be counted
group_main_category = data.groupby('main_category')

In [413]:
len(group_main_category.groups)

15

In [414]:
# Group the rows by currency so the number of distinct currencies can be counted
group_currency = data.groupby('currency')

In [415]:
len(group_currency.groups)

14

In [416]:
# Group the rows by country so the number of distinct country values can be counted
groups_country = data.groupby('country')

In [417]:
# Number of distinct country values (same form as the other group counts)
len(groups_country.groups)

23

## Get successful and failed data from the whole dataset

In [418]:
# Keep only projects whose outcome is known. Projects in the canceled, live,
# suspended or undefined state are dropped, because they tell us nothing about
# whether a project ends up successful or not.
has_known_outcome = data['state'].isin(['successful', 'failed'])
success_fail_data = data.loc[has_known_outcome]
# 331,675 rows remain after this filter

In [419]:
success_fail_data.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0
5,1000014025,Monarch Espresso Bar,Restaurants,Food,USD,2016-04-01,50000.0,2016-02-26 13:38:27,52375.0,successful,224,US,52375.0,52375.0,50000.0


## Filter the required columns which might have the high impact on output

In [420]:
# Keep only the columns that might have a high impact on the outcome;
# ID and name are assumed not to matter much for a project's success.
selected_columns = ['main_category', 'currency', 'state', 'backers',
                    'country', 'usd_pledged_real', 'usd_goal_real']
filtered_data = success_fail_data.filter(items=selected_columns)

In [421]:
filtered_data.head()

Unnamed: 0,main_category,currency,state,backers,country,usd_pledged_real,usd_goal_real
0,Publishing,GBP,failed,0,GB,0.0,1533.95
1,Film & Video,USD,failed,15,US,2421.0,30000.0
2,Film & Video,USD,failed,3,US,220.0,45000.0
3,Music,USD,failed,1,US,1.0,5000.0
5,Food,USD,successful,224,US,52375.0,50000.0


## Convert string datatype to integer datatype

In [422]:
# Integer codes for the categorical columns. Labels are numbered from 1 in the
# order they are listed below; the target column uses 0 (failed) / 1 (successful).
Main_category = {
    label: code
    for code, label in enumerate(
        ('Art', 'Comics', 'Crafts', 'Dance', 'Design', 'Fashion',
         'Film & Video', 'Food', 'Games', 'Journalism', 'Music',
         'Photography', 'Publishing', 'Technology', 'Theater'),
        start=1)
}
Currency = {
    label: code
    for code, label in enumerate(
        ('AUD', 'CAD', 'CHF', 'DKK', 'EUR', 'GBP', 'HKD', 'JPY', 'MXN',
         'NOK', 'NZD', 'SEK', 'SGD', 'USD'),
        start=1)
}
# NOTE(review): 'N,0"' is kept verbatim as a country label with its own code;
# presumably it is a malformed value present in the raw data -- confirm.
Country = {
    label: code
    for code, label in enumerate(
        ('AT', 'AU', 'BE', 'CA', 'CH', 'DE', 'DK', 'ES', 'FR', 'GB', 'HK',
         'IE', 'IT', 'JP', 'LU', 'MX', 'N,0"', 'NL', 'NO', 'NZ', 'SE',
         'SG', 'US'),
        start=1)
}
State = dict(failed=0, successful=1)

## Normalization of the data

In [423]:
# Encode the categorical columns as numbers and scale every feature to at most 1.
#
# Work on an explicit copy: filtered_data was derived from another frame, and
# assigning columns into such a frame can trigger pandas' SettingWithCopyWarning.
filtered_data = filtered_data.copy()


def _encode(labels, codes):
    """Return *labels* mapped through *codes*.

    Raises KeyError when a label has no entry in *codes*, so an unexpected
    category fails loudly instead of silently becoming NaN.
    """
    unknown = set(labels.unique()) - set(codes)
    if unknown:
        raise KeyError('no code defined for: {}'.format(sorted(map(str, unknown))))
    return labels.map(codes)


# Target column: failed -> 0, successful -> 1
filtered_data['state'] = _encode(filtered_data['state'], State)

# Categorical features: integer code divided by the number of codes
# (15 main categories, 14 currencies, 23 countries), so the largest code maps to 1.0.
filtered_data['main_category'] = _encode(filtered_data['main_category'], Main_category) / len(Main_category)
filtered_data['currency'] = _encode(filtered_data['currency'], Currency) / len(Currency)
filtered_data['country'] = _encode(filtered_data['country'], Country) / len(Country)

# Numeric features: divide by the column maximum. The maxima are taken over the
# whole filtered data set, i.e. before it is split into train/validation/test.
# NOTE(review): backers and usd_pledged_real are presumably only known once a
# campaign has run, and together with usd_goal_real may largely determine
# 'state' -- confirm that they are meant to be model inputs.
max_backers = filtered_data['backers'].max()
max_usd_pledged_real = filtered_data['usd_pledged_real'].max()
max_usd_goal_real = filtered_data['usd_goal_real'].max()
filtered_data['backers'] = filtered_data['backers'] / max_backers
filtered_data['usd_pledged_real'] = filtered_data['usd_pledged_real'] / max_usd_pledged_real
filtered_data['usd_goal_real'] = filtered_data['usd_goal_real'] / max_usd_goal_real

In [424]:
filtered_data.head()

Unnamed: 0,main_category,currency,state,backers,country,usd_pledged_real,usd_goal_real
0,0.866667,0.428571,0,0.0,0.434783,0.0,9e-06
1,0.466667,1.0,0,6.8e-05,1.0,0.0001190325,0.00018
2,0.466667,1.0,0,1.4e-05,1.0,1.081666e-05,0.00027
3,0.733333,1.0,0,5e-06,1.0,4.916666e-08,3e-05
5,0.533333,1.0,1,0.001021,1.0,0.002575104,0.000301


In [425]:
filtered_data.describe()

Unnamed: 0,main_category,currency,state,backers,country,usd_pledged_real,usd_goal_real
count,331675.0,331675.0,331675.0,331675.0,331675.0,331675.0,331675.0
mean,0.565561,0.862945,0.403877,0.00053,0.86934,0.000489,0.000249517
std,0.262098,0.277041,0.490674,0.004401,0.269297,0.004756,0.006665787
min,0.066667,0.071429,0.0,0.0,0.043478,0.0,6.01101e-11
25%,0.4,1.0,0.0,9e-06,1.0,2e-06,1.202202e-05
50%,0.533333,1.0,0.0,6.8e-05,1.0,3.9e-05,3.005505e-05
75%,0.733333,1.0,1.0,0.000287,1.0,0.000227,9.016515e-05
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Split failed project and successful project to balance the data set

In [426]:
# Separate the two classes using the encoded target: 1 = successful, 0 = failed
success_project = filtered_data[filtered_data['state'].eq(1)]
failed_project = filtered_data[filtered_data['state'].eq(0)]

## Split test, validation and training data from individual failed and successful project

In [427]:
# Shuffle the successful projects, then cut them by position:
#   first 10%      -> test
#   next 15%       -> validation (rows between the 10% and 25% marks)
#   remaining 75%  -> training
success_project = success_project.sample(frac=1)
test_length_success = int(0.1 * len(success_project))
# Cumulative boundary: test (10%) + validation (15%) end at the 25% mark
validation_length_success = int(0.25 * len(success_project))
success_test_data = success_project.iloc[:test_length_success]
success_validation_data = success_project.iloc[test_length_success:validation_length_success]
success_train_data = success_project.iloc[validation_length_success:]

In [428]:
# Row counts of the successful-project test, validation and training splits
for split in (success_test_data, success_validation_data, success_train_data):
    print(len(split))

13395
20094
100467


In [429]:
print(len(failed_project))

197719


## This data set is unbalanced: there are 133,956 successful projects and 197,719 failed
## (unsuccessful) projects. So I balanced the data set using the following steps

In [430]:
# Balance the classes: randomly draw exactly as many failed projects as there
# are successful ones (the draw also shuffles the rows). Passing n= instead of
# a computed frac= keeps the sample size exact rather than dependent on
# floating-point rounding of the ratio.
failed_project = failed_project.sample(n=len(success_project))
print('Now new length of failed_project is=', len(failed_project))
# Cut the balanced failed projects by position:
#   first 10%      -> test
#   next 15%       -> validation (rows between the 10% and 25% marks)
#   remaining 75%  -> training
test_length_failed = int(len(failed_project)*0.1)
# Cumulative boundary: test (10%) + validation (15%) end at the 25% mark
validation_length_failed = int(len(failed_project)*0.25)
failed_test_data = failed_project.iloc[:test_length_failed]
failed_validation_data = failed_project.iloc[test_length_failed:validation_length_failed]
failed_train_data = failed_project.iloc[validation_length_failed:]

Now new length of failed_project is= 133956


In [431]:
# Row counts of the failed-project test, validation and training splits
for split in (failed_test_data, failed_validation_data, failed_train_data):
    print(len(split))

13395
20094
100467


## Merge test data from the failed project to the test data from the successful project

In [432]:
# Stack the successful and failed test rows, then shuffle them so the two
# classes are mixed together
final_test_data = pd.concat([success_test_data, failed_test_data]).sample(frac=1)

## Merge validation data from the failed project to the validation data from the successful project

In [433]:
# Stack the successful and failed validation rows, then shuffle them so the
# two classes are mixed together
final_validation_data = pd.concat(
    [success_validation_data, failed_validation_data]
).sample(frac=1)

## Merge training data from the failed project to the training data from the successful project

In [434]:
# Stack the successful and failed training rows, then shuffle them so the two
# classes are mixed together
final_training_data = pd.concat([success_train_data, failed_train_data]).sample(frac=1)

## Write the final test data, final validation data and final training data to the csv file

In [435]:
import os

# to_csv does not create missing directories, so make sure the output folder
# exists before writing (no error if it is already there).
os.makedirs('processed_data', exist_ok=True)
final_test_data.to_csv('processed_data/final_test_data.csv')
final_validation_data.to_csv('processed_data/final_validation_data.csv')
final_training_data.to_csv('processed_data/final_training_data.csv')

## Separate inputs from labels

In [436]:
# Split the training frame into the six feature columns and the 'state' label
input_columns = ['main_category', 'currency', 'backers',
                 'country', 'usd_pledged_real', 'usd_goal_real']
train_inputs = final_training_data.filter(items=input_columns)
train_labels = final_training_data.filter(items=['state'])

In [437]:
train_inputs.head()

Unnamed: 0,main_category,currency,backers,country,usd_pledged_real,usd_goal_real
113607,0.933333,0.357143,9e-06,0.652174,1.07675e-07,6.575444e-07
363815,0.733333,1.0,0.000492,1.0,0.0005971291,6.01101e-05
304484,0.866667,0.428571,0.00052,0.434783,0.000416861,0.0001364449
213322,0.733333,1.0,5e-06,1.0,7.374999e-07,7.213212e-06
28016,0.866667,1.0,0.000324,1.0,0.0001947,5.409909e-06


In [438]:
train_labels.head()

Unnamed: 0,state
113607,0
363815,1
304484,0
213322,0
28016,1


In [439]:
# Split the test and validation frames into the six feature columns and the
# 'state' label, using the same column selection for both
feature_columns = ['main_category', 'currency', 'backers',
                   'country', 'usd_pledged_real', 'usd_goal_real']
label_columns = ['state']

test_inputs = final_test_data.filter(items=feature_columns)
test_labels = final_test_data.filter(items=label_columns)

validation_inputs = final_validation_data.filter(items=feature_columns)
validation_labels = final_validation_data.filter(items=label_columns)

In [440]:
test_inputs.head()

Unnamed: 0,main_category,currency,backers,country,usd_pledged_real,usd_goal_real
91947,0.533333,1.0,5e-06,1.0,2e-06,0.00015
92035,0.866667,1.0,0.000191,1.0,0.000108,9.6e-05
254103,0.6,1.0,3.6e-05,1.0,0.000263,0.00015
313233,0.533333,1.0,2.7e-05,1.0,1.5e-05,1e-06
19180,0.866667,1.0,0.000292,1.0,0.00062,0.000102


## Convert pandas dataframes to numpy ndarrays for the training

In [441]:
# Convert each DataFrame to a numpy ndarray for Keras.
# DataFrame.as_matrix() was deprecated and later removed from pandas; .values
# returns the same ndarray and is available on both old and new versions.
numpy_train_inputs = train_inputs.values
numpy_train_labels = train_labels.values
numpy_test_inputs = test_inputs.values
numpy_test_labels = test_labels.values
numpy_validation_inputs = validation_inputs.values
numpy_validation_labels = validation_labels.values

In [442]:
numpy_train_labels[:5]

array([[0],
       [1],
       [0],
       [0],
       [1]])

### Convert numpy array labels to categorical (one-hot) values
#### The first (top) output node represents label 0: if the top node outputs a value near 1,
#### that particular project is an unsuccessful one.
#### The same principle applies to the success node, the bottom node.

In [443]:
# One-hot encode the 0/1 labels. num_classes is fixed to 2 so the encoded width
# always matches the two-unit softmax output, even if a split happened to
# contain only label 0 (otherwise the width is inferred from the largest label).
train_label_categ = to_categorical(numpy_train_labels, num_classes=2)
test_label_categ = to_categorical(numpy_test_labels, num_classes=2)
validation_label_categ = to_categorical(numpy_validation_labels, num_classes=2)

In [444]:
train_label_categ[:5]

array([[1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.]], dtype=float32)

In [445]:
type(numpy_train_inputs)

numpy.ndarray

In [446]:
numpy_train_inputs[0]

array([9.33333333e-01, 3.57142857e-01, 9.11651822e-06, 6.52173913e-01,
       1.07674983e-07, 6.57544395e-07])

In [447]:
numpy_train_labels.shape

(200934, 1)

# Train Model Using keras

In [448]:
# Fully connected network: 6 input features -> 12 ReLU units -> 12 ReLU units
# -> 2 softmax outputs (one per class)
model = Sequential()
model.add(Dense(12, activation='relu', input_shape=(6,)))
model.add(Dense(12, activation='relu'))
model.add(Dense(2, activation='softmax'))

In [449]:
# Configure training: Adam optimizer, categorical cross-entropy loss on the
# one-hot labels, and accuracy reported as the metric
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [450]:
# Train for 200 epochs in batches of 128 rows, evaluating on the validation
# split after every epoch
model.fit(
    x=numpy_train_inputs,
    y=train_label_categ,
    batch_size=128,
    epochs=200,
    validation_data=(numpy_validation_inputs, validation_label_categ),
)

Train on 200934 samples, validate on 40188 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200


Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78/200
Epoch 79/200
Epoch 80/200
Epoch 81/200
Epoch 82/200
Epoch 83/200
Epoch 84/200
Epoch 85/200
Epoch 86/200
Epoch 87/200
Epoch 88/200
Epoch 89/200
Epoch 90/200
Epoch 91/200
Epoch 92/200
Epoch 93/200
Epoch 94/200
Epoch 95/200
Epoch 96/200
Epoch 97/200
Epoch 98/200
Epoch 99/200
Epoch 100/200
Epoch 101/200
Epoch 102/200
Epoch 103/200
Epoch 104/200
Epoch 105/200
Epoch 106/200
Epoch 107/200
Epoch 108/200
Epoch 109/200
Epoch 110/200
Epoch 111/200
Epoch 112/200
Epoch 113/200
Epoch 114/200
Epoch 115/200
Epoch 116/200
Epoch 117/200


Epoch 118/200
Epoch 119/200
Epoch 120/200
Epoch 121/200
Epoch 122/200
Epoch 123/200
Epoch 124/200
Epoch 125/200
Epoch 126/200
Epoch 127/200
Epoch 128/200
Epoch 129/200
Epoch 130/200
Epoch 131/200
Epoch 132/200
Epoch 133/200
Epoch 134/200
Epoch 135/200
Epoch 136/200
Epoch 137/200
Epoch 138/200
Epoch 139/200
Epoch 140/200
Epoch 141/200
Epoch 142/200
Epoch 143/200
Epoch 144/200
Epoch 145/200
Epoch 146/200
Epoch 147/200
Epoch 148/200
Epoch 149/200
Epoch 150/200
Epoch 151/200
Epoch 152/200
Epoch 153/200
Epoch 154/200
Epoch 155/200
Epoch 156/200
Epoch 157/200
Epoch 158/200
Epoch 159/200
Epoch 160/200
Epoch 161/200
Epoch 162/200
Epoch 163/200
Epoch 164/200
Epoch 165/200
Epoch 166/200
Epoch 167/200
Epoch 168/200
Epoch 169/200
Epoch 170/200
Epoch 171/200
Epoch 172/200
Epoch 173/200
Epoch 174/200
Epoch 175/200


Epoch 176/200
Epoch 177/200
Epoch 178/200
Epoch 179/200
Epoch 180/200
Epoch 181/200
Epoch 182/200
Epoch 183/200
Epoch 184/200
Epoch 185/200
Epoch 186/200
Epoch 187/200
Epoch 188/200
Epoch 189/200
Epoch 190/200
Epoch 191/200
Epoch 192/200
Epoch 193/200
Epoch 194/200
Epoch 195/200
Epoch 196/200
Epoch 197/200
Epoch 198/200
Epoch 199/200
Epoch 200/200


<keras.callbacks.History at 0x7fb7860063c8>

In [453]:
# Score the trained model on the held-out test split; evaluate returns the loss
# followed by the compiled metric (accuracy)
test_loss, test_acc = model.evaluate(x=numpy_test_inputs, y=test_label_categ)
print('test accuracy=', test_acc)

test accuracy= 0.9727510265024263


## Save the model architecture and weights

In [454]:
# Write the trained model to 'trained.h5' in the working directory
model.save('trained.h5')