In [1]:
# Dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf

# Importing data into pandas
application_df = pd.read_csv("Resources/charity_data.csv")
application_df.head()

Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,10520599,BLUE KNIGHTS MOTORCYCLE CLUB,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,10547893,ST CLOUD PROFESSIONAL FIREFIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,10553066,SOUTHSIDE ATHLETIC ASSOCIATION,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,10556103,GENETIC RESEARCH INSTITUTE OF THE DESERT,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1


In [2]:
# Dropping columns that won't be used
application_df = application_df.drop(columns=['EIN','NAME'])

# Dropping all rows with a 'STATUS' of 0
application_df = application_df[application_df.STATUS != 0]
application_df.head()

Unnamed: 0,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1


In [None]:
# Checking data
#application_df['APPLICATION_TYPE'].value_counts()
#application_df['AFFILIATION'].value_counts()
#application_df['CLASSIFICATION'].value_counts()
#application_df['INCOME_AMT'].value_counts()
#application_df['SPECIAL_CONSIDERATIONS'].value_counts()
#application_df['ASK_AMT'].value_counts()

In [3]:
# Creating list of application types to combine, according to our cutoff value of 500+
application_types_to_replace = ['T10','T9','T13','T12','T2','T25','T14','T29','T15','T17']

# Iterating through specified values to replace dataframe values with 'Other'
for app in application_types_to_replace:
    application_df['APPLICATION_TYPE'] = application_df['APPLICATION_TYPE'].replace(app,"Other")

# Viewing results
application_df['APPLICATION_TYPE'].value_counts()

T3       27032
T4        1542
T6        1216
T5        1173
T19       1065
Other      804
T8         737
T7         725
Name: APPLICATION_TYPE, dtype: int64

In [4]:
# Creating list of classifications to combine, according to cutoff of 700+
classifications_to_replace = ['C7000', 'C1700', 'C4000',
       'C5000', 'C1270', 'C2700', 'C2800', 'C7100', 'C1300', 'C1280', 'C1230',
       'C1400', 'C7200', 'C2300', 'C1240', 'C8000', 'C7120', 'C1500', 'C1800',
       'C6000', 'C1250', 'C8200', 'C1238', 'C1278', 'C1235', 'C1237', 'C7210',
       'C2400', 'C1720', 'C4100', 'C1257', 'C1600', 'C1260', 'C2710', 'C0',
       'C3200', 'C1234', 'C1246', 'C1267', 'C1256', 'C2190', 'C4200', 'C2600',
       'C5200', 'C1370', 'C1248', 'C6100', 'C1820', 'C1900', 'C1236', 'C3700',
       'C2570', 'C1580', 'C1245', 'C2500', 'C1570', 'C1283', 'C2380', 'C1732',
       'C1728', 'C2170', 'C4120', 'C8210', 'C2561', 'C4500', 'C2150']

# Iterating through specified values to replace dataframe values with 'Other'
for cls in classifications_to_replace:
    application_df['CLASSIFICATION'] = application_df['CLASSIFICATION'].replace(cls,"Other")

# Viewing results
application_df['CLASSIFICATION'].value_counts()

C1000    17323
C2000     6073
C1200     4837
Other     2261
C3000     1918
C2100     1882
Name: CLASSIFICATION, dtype: int64

In [5]:
# Creating bins for ask amount
bin00 = application_df[application_df['ASK_AMT'] == 5000]
bin01 = application_df[(application_df['ASK_AMT'] > 5000) & (application_df['ASK_AMT'] <= 100000)]
bin02 = application_df[(application_df['ASK_AMT'] > 100000) & (application_df['ASK_AMT'] <= 1000000)]
bin03 = application_df[(application_df['ASK_AMT'] > 1000000) & (application_df['ASK_AMT'] <= 10000000)]
bin04 = application_df[application_df['ASK_AMT'] > 10000000]

print(bin00.shape, bin01.shape, bin02.shape, bin03.shape, bin04.shape)

(25394, 10) (4369, 10) (2954, 10) (1165, 10) (412, 10)


In [6]:
# Iterating through values to replace dataframe values
for b1 in list(bin01['ASK_AMT']):
    application_df['ASK_AMT'] = application_df['ASK_AMT'].replace(b1,"5K_100K")

for b2 in list(bin02['ASK_AMT']):
    application_df['ASK_AMT'] = application_df['ASK_AMT'].replace(b2,'100K_1M')

for b3 in list(bin03['ASK_AMT']):
    application_df['ASK_AMT'] = application_df['ASK_AMT'].replace(b3,'1M_10M')

for b4 in list(bin04['ASK_AMT']):
    application_df['ASK_AMT'] = application_df['ASK_AMT'].replace(b4,'over10M')

# Viewing results
application_df['ASK_AMT'].value_counts()

5000       25394
5K_100K     4369
100K_1M     2954
1M_10M      1165
over10M      412
Name: ASK_AMT, dtype: int64

In [7]:
# Previewing dataframe
application_df.head()

Unnamed: 0,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,Other,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,100K_1M,1
2,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,5K_100K,1
4,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,100K_1M,1


In [8]:
# Converting categorical data into numerical data
df = pd.get_dummies(application_df)
df.head()

Unnamed: 0,STATUS,IS_SUCCESSFUL,APPLICATION_TYPE_Other,APPLICATION_TYPE_T19,APPLICATION_TYPE_T3,APPLICATION_TYPE_T4,APPLICATION_TYPE_T5,APPLICATION_TYPE_T6,APPLICATION_TYPE_T7,APPLICATION_TYPE_T8,...,INCOME_AMT_25000-99999,INCOME_AMT_50M+,INCOME_AMT_5M-10M,SPECIAL_CONSIDERATIONS_N,SPECIAL_CONSIDERATIONS_Y,ASK_AMT_5000,ASK_AMT_100K_1M,ASK_AMT_1M_10M,ASK_AMT_5K_100K,ASK_AMT_over10M
0,1,1,1,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0
1,1,1,0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0
2,1,0,0,0,0,0,1,0,0,0,...,0,0,0,1,0,1,0,0,0,0
3,1,1,0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
4,1,1,0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0


In [9]:
# Splitting data into features and targets
y = df['IS_SUCCESSFUL']
X = df.drop('IS_SUCCESSFUL', axis=1)

# Splitting data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [10]:
# Setting up StandardScaler
scaler = StandardScaler()

# Fitting training data
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [14]:
# Defining neural network model
nn = tf.keras.models.Sequential()

In [15]:
# Adding input layer
nn.add(tf.keras.layers.Dense(units=200, activation='relu', input_dim=46))

# Adding hidden layers
nn.add(tf.keras.layers.Dense(units=100, activation='relu'))
nn.add(tf.keras.layers.Dense(units=100, activation='relu'))

# Adding output layer
nn.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

# Checking structure of model
nn.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_4 (Dense)             (None, 200)               9400      
                                                                 
 dense_5 (Dense)             (None, 100)               20100     
                                                                 
 dense_6 (Dense)             (None, 100)               10100     
                                                                 
 dense_7 (Dense)             (None, 1)                 101       
                                                                 
Total params: 39,701
Trainable params: 39,701
Non-trainable params: 0
_________________________________________________________________


In [16]:
# Compiling and training the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

fit_model = nn.fit(X_train_scaled, y_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [17]:
# Evaluating model using test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled, y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

268/268 - 0s - loss: 0.5489 - accuracy: 0.7358 - 290ms/epoch - 1ms/step
Loss: 0.5488824844360352, Accuracy: 0.73582923412323


In [18]:
# Creating function that creates a new sequential model with variable hyperparameters
def create_model(hp):

    nn_optimized = tf.keras.models.Sequential()

    activation = hp.Choice('activation',['relu','tanh','sigmoid'])

    nn_optimized.add(tf.keras.layers.Dense(units=hp.Int('first_units',
        min_value=1,
        max_value=200,
        step=2), activation=activation, input_dim=46))
    
    for i in range(hp.Int('num_layers', 1, 3)):
        nn_optimized.add(tf.keras.layers.Dense(units=hp.Int('units_' + str(i),
            min_value=1,
            max_value=100,
            step=2),
            activation=activation))
    
    nn_optimized.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

    nn_optimized.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    return nn_optimized

In [19]:
# Importing keras tuner
import keras_tuner as kt

# Initializing tuner
tuner = kt.Hyperband(
    create_model,
    objective='val_accuracy',
    max_epochs=20,
    hyperband_iterations=2)

In [20]:
# Searching for the best hyperparameters
tuner.search(X_train_scaled,y_train,epochs=20,validation_data=(X_test_scaled,y_test))

Trial 60 Complete [00h 00m 22s]
val_accuracy: 0.7394448518753052

Best val_accuracy So Far: 0.7407277822494507
Total elapsed time: 00h 08m 07s
INFO:tensorflow:Oracle triggered exit


In [21]:
# Viewing best values
best_hyper = tuner.get_best_hyperparameters(1)[0]
best_hyper.values

{'activation': 'tanh',
 'first_units': 119,
 'num_layers': 1,
 'units_0': 39,
 'units_1': 59,
 'units_2': 37,
 'tuner/epochs': 20,
 'tuner/initial_epoch': 0,
 'tuner/bracket': 0,
 'tuner/round': 0}

In [23]:
# Evaluate best model against full test data
best_model = tuner.get_best_models(1)[0]
best_model_loss, best_model_accuracy = best_model.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {best_model_loss}, Accuracy: {best_model_accuracy}")

268/268 - 0s - loss: 0.5474 - accuracy: 0.7407 - 284ms/epoch - 1ms/step
Loss: 0.5474126935005188, Accuracy: 0.7407277822494507


In [24]:
# Exporting preliminary results to hdf5 file
import numpy as np
import h5py

# Initializing hdf5 file
hf = h5py.File('AlphabetSoupCharity_Optimization.h5', 'w')

# Writing in results
hf.create_dataset('optimized_model_loss', data=best_model_loss)
hf.create_dataset('optimized_model_accuracy', data=best_model_accuracy)

# Close and write file
hf.close()