## Preprocessing

In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf

#  Import and read the charity_data.csv.
import pandas as pd 
application_df = pd.read_csv("https://static.bc-edx.com/data/dl-1-2/m21/lms/starter/charity_data.csv")
application_df.head()

Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,10520599,BLUE KNIGHTS MOTORCYCLE CLUB,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,10547893,ST CLOUD PROFESSIONAL FIREFIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,10553066,SOUTHSIDE ATHLETIC ASSOCIATION,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,10556103,GENETIC RESEARCH INSTITUTE OF THE DESERT,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1


In [2]:
# Drop the non-beneficial ID columns, 'EIN' and 'NAME'.
application_df.drop(["EIN", "NAME"], axis=1, inplace=True)
application_df.head(10)

Unnamed: 0,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1
5,T3,Independent,C1200,Preservation,Trust,1,0,N,5000,1
6,T3,Independent,C1000,Preservation,Trust,1,100000-499999,N,31452,1
7,T3,Independent,C2000,Preservation,Trust,1,10M-50M,N,7508025,1
8,T7,Independent,C1000,ProductDev,Trust,1,1-9999,N,94389,1
9,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0


In [3]:
# Determine the number of unique values in each column.
application_df.nunique()

APPLICATION_TYPE            17
AFFILIATION                  6
CLASSIFICATION              71
USE_CASE                     5
ORGANIZATION                 4
STATUS                       2
INCOME_AMT                   9
SPECIAL_CONSIDERATIONS       2
ASK_AMT                   8747
IS_SUCCESSFUL                2
dtype: int64

In [4]:
def createOtherColumn(df, col, cutoff, rep_str="Other"):
    vc = df[col].value_counts()
    to_replace = list()
    for ind in list(vc.index):
        if vc[ind] < cutoff:
            to_replace.append(ind)

    # Replace in dataframe
    for r in to_replace:
        df[col] = df[col].replace(r, rep_str)

In [5]:
# Look at APPLICATION_TYPE value counts for binning
app_type_vc = application_df["APPLICATION_TYPE"].value_counts()
print(app_type_vc)
createOtherColumn(application_df, 'APPLICATION_TYPE', 200)
print(application_df['APPLICATION_TYPE'].value_counts())

T3     27037
T4      1542
T6      1216
T5      1173
T19     1065
T8       737
T7       725
T10      528
T9       156
T13       66
T12       27
T2        16
T25        3
T14        3
T29        2
T15        2
T17        1
Name: APPLICATION_TYPE, dtype: int64
T3       27037
T4        1542
T6        1216
T5        1173
T19       1065
T8         737
T7         725
T10        528
Other      276
Name: APPLICATION_TYPE, dtype: int64


In [6]:
# Look at CLASSIFICATION value counts for binning
class_vc = application_df["CLASSIFICATION"].value_counts()
print(class_vc[class_vc > 1])
createOtherColumn(application_df, 'CLASSIFICATION', 1000)
print(application_df['CLASSIFICATION'].value_counts())

C1000    17326
C2000     6074
C1200     4837
C3000     1918
C2100     1883
C7000      777
C1700      287
C4000      194
C5000      116
C1270      114
C2700      104
C2800       95
C7100       75
C1300       58
C1280       50
C1230       36
C1400       34
C7200       32
C2300       32
C1240       30
C8000       20
C7120       18
C1500       16
C1800       15
C6000       15
C1250       14
C8200       11
C1238       10
C1278       10
C1235        9
C1237        9
C7210        7
C2400        6
C1720        6
C4100        6
C1257        5
C1600        5
C1260        3
C2710        3
C0           3
C3200        2
C1234        2
C1246        2
C1267        2
C1256        2
Name: CLASSIFICATION, dtype: int64
C1000    17326
C2000     6074
C1200     4837
Other     2261
C3000     1918
C2100     1883
Name: CLASSIFICATION, dtype: int64


In [7]:
ask_amt_vc = application_df["ASK_AMT"].value_counts()
value_dict = dict()
for ind in ask_amt_vc.index:
    cur_val = ask_amt_vc[ind]
    if cur_val not in value_dict:
        value_dict[cur_val] = 1
    else:
        value_dict[cur_val] += 1
print(value_dict)
createOtherColumn(application_df, "ASK_AMT", 10, "GT_5000")
print(application_df["ASK_AMT"].value_counts())

{25398: 1, 3: 4, 2: 147, 1: 8595}


5000       25398
GT_5000     8901
Name: ASK_AMT, dtype: int64


In [8]:
ask_amt_vc = application_df["ASK_AMT"].value_counts()
for ind in ask_amt_vc.index:
    print(ind)
    print(application_df[application_df["ASK_AMT"] == ind]["IS_SUCCESSFUL"].value_counts())

5000
1    13303
0    12095
Name: IS_SUCCESSFUL, dtype: int64
GT_5000
1    4958
0    3943
Name: IS_SUCCESSFUL, dtype: int64


In [9]:
income_vc = application_df["INCOME_AMT"].value_counts()
print(income_vc)
for ind in income_vc.index:
    print(ind)
    print(application_df[application_df["INCOME_AMT"] == ind]["IS_SUCCESSFUL"].value_counts())

0                24388
25000-99999       3747
100000-499999     3374
1M-5M              955
1-9999             728
10000-24999        543
10M-50M            240
5M-10M             185
50M+               139
Name: INCOME_AMT, dtype: int64
0
1    12577
0    11811
Name: IS_SUCCESSFUL, dtype: int64
25000-99999
1    2135
0    1612
Name: IS_SUCCESSFUL, dtype: int64
100000-499999
1    1952
0    1422
Name: IS_SUCCESSFUL, dtype: int64
1M-5M
0    510
1    445
Name: IS_SUCCESSFUL, dtype: int64
1-9999
1    553
0    175
Name: IS_SUCCESSFUL, dtype: int64
10000-24999
1    368
0    175
Name: IS_SUCCESSFUL, dtype: int64
10M-50M
0    143
1     97
Name: IS_SUCCESSFUL, dtype: int64
5M-10M
0    98
1    87
Name: IS_SUCCESSFUL, dtype: int64
50M+
0    92
1    47
Name: IS_SUCCESSFUL, dtype: int64


In [10]:
use_case_vc = application_df["USE_CASE"].value_counts()
print(use_case_vc)
for ind in use_case_vc.index:
    print(ind)
    print(application_df[application_df["USE_CASE"] == ind]["IS_SUCCESSFUL"].value_counts())

Preservation     28095
ProductDev        5671
CommunityServ      384
Heathcare          146
Other                3
Name: USE_CASE, dtype: int64
Preservation
1    15117
0    12978
Name: IS_SUCCESSFUL, dtype: int64
ProductDev
1    2944
0    2727
Name: IS_SUCCESSFUL, dtype: int64
CommunityServ
0    250
1    134
Name: IS_SUCCESSFUL, dtype: int64
Heathcare
0    83
1    63
Name: IS_SUCCESSFUL, dtype: int64
Other
1    3
Name: IS_SUCCESSFUL, dtype: int64


In [11]:
organization_vc = application_df["ORGANIZATION"].value_counts()
print(organization_vc)
for ind in organization_vc.index:
    print(ind)
    print(application_df[application_df["ORGANIZATION"] == ind]["IS_SUCCESSFUL"].value_counts())

Trust           23515
Association     10255
Co-operative      486
Corporation        43
Name: ORGANIZATION, dtype: int64
Trust
1    13808
0     9707
Name: IS_SUCCESSFUL, dtype: int64
Association
0    6202
1    4053
Name: IS_SUCCESSFUL, dtype: int64
Co-operative
1    367
0    119
Name: IS_SUCCESSFUL, dtype: int64
Corporation
1    33
0    10
Name: IS_SUCCESSFUL, dtype: int64


In [12]:
status_vc = application_df["STATUS"].value_counts()
print(status_vc)
for ind in status_vc.index:
    print(ind)
    print(application_df[application_df["STATUS"] == ind]["IS_SUCCESSFUL"].value_counts())

1    34294
0        5
Name: STATUS, dtype: int64
1
1    18258
0    16036
Name: IS_SUCCESSFUL, dtype: int64
0
1    3
0    2
Name: IS_SUCCESSFUL, dtype: int64


This shows that the `STATUS` column is in excess, we don't need it, so we should drop it

Also, with the `USE_CASE` column, most of the results per type of use case are pretty evenly split, so we'll also remove this column to see what happens

In [13]:
application_df.drop(["STATUS"], axis=1, inplace=True)
application_df.head(5)

Unnamed: 0,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,T10,Independent,C1000,ProductDev,Association,0,N,5000,1
1,T3,Independent,C2000,Preservation,Co-operative,1-9999,N,GT_5000,1
2,T5,CompanySponsored,C3000,ProductDev,Association,0,N,5000,0
3,T3,CompanySponsored,C2000,Preservation,Trust,10000-24999,N,GT_5000,1
4,T3,Independent,C1000,Heathcare,Trust,100000-499999,N,GT_5000,1


In [14]:
# Convert categorical data to numeric with `pd.get_dummies`
full_df = pd.get_dummies(application_df)
full_df.head(10)

Unnamed: 0,IS_SUCCESSFUL,APPLICATION_TYPE_Other,APPLICATION_TYPE_T10,APPLICATION_TYPE_T19,APPLICATION_TYPE_T3,APPLICATION_TYPE_T4,APPLICATION_TYPE_T5,APPLICATION_TYPE_T6,APPLICATION_TYPE_T7,APPLICATION_TYPE_T8,...,INCOME_AMT_100000-499999,INCOME_AMT_10M-50M,INCOME_AMT_1M-5M,INCOME_AMT_25000-99999,INCOME_AMT_50M+,INCOME_AMT_5M-10M,SPECIAL_CONSIDERATIONS_N,SPECIAL_CONSIDERATIONS_Y,ASK_AMT_5000,ASK_AMT_GT_5000
0,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
1,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
2,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,1,0
3,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
4,1,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,1
5,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
6,1,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,1
7,1,0,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,1
8,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,1
9,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,1,0


In [15]:
# Split our preprocessed data into our features and target arrays
X = full_df.drop(["IS_SUCCESSFUL"], axis=1)
y = full_df[["IS_SUCCESSFUL"]]

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 50)

In [16]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Compile, Train and Evaluate the Model

In [31]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
nn = tf.keras.models.Sequential()
initializer = tf.keras.initializers.Ones()

# First hidden layer
nn.add(tf.keras.layers.Dense(units=80, activation="relu", input_dim=len(X.columns), kernel_initializer=initializer))

nn.add(tf.keras.layers.Dense(units=110, activation="sigmoid", kernel_initializer=initializer))

nn.add(tf.keras.layers.Dense(units=50, activation="relu", kernel_initializer=initializer))

nn.add(tf.keras.layers.Dense(units=20, activation="relu", kernel_initializer=initializer))
# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="relu", kernel_initializer=initializer))

# Check the structure of the model
nn.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_22 (Dense)            (None, 80)                3520      
                                                                 
 dense_23 (Dense)            (None, 110)               8910      
                                                                 
 dense_24 (Dense)            (None, 50)                5550      
                                                                 
 dense_25 (Dense)            (None, 20)                1020      
                                                                 
 dense_26 (Dense)            (None, 1)                 21        
                                                                 
Total params: 19,021
Trainable params: 19,021
Non-trainable params: 0
_________________________________________________________________


In [32]:
# Compile the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [33]:
# Train the model
nn_fit = nn.fit(X_train_scaled, y_train, epochs=150)

Epoch 1/150
Epoch 2/150

KeyboardInterrupt: 

In [27]:
# Evaluate the model using the test
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

268/268 - 0s - loss: 0.6367 - accuracy: 0.7373 - 496ms/epoch - 2ms/step
Loss: 0.6367349028587341, Accuracy: 0.7372594475746155


In [None]:
# 80 relu 100 sigmoid 30 relu 1 relu 150 epochs gave 0.738
# 80 relu 110 sigmoid 30 relu 1 relu 150 epochs gave 0.7377
# 80 relu 100 sigmoid 50 relu 1 relu 150 epochs gave 0.7378
# 80 relu 100 sigmoid 50 relu 20 relu 1 relu 150 epochs gave 0.738
# 80 relu 100 sigmoid 50 relu 20 relu 1 relu 200 epochs gave 0.737
# 80 relu 100 sigmoid 50 relu 30 relu 1 relu 150 epochs gave 0.735
# 80 relu 90 sigmoid 50 relu 20 relu 1 relu 150 epochs gave 0.7367
# 80 relu 110 sigmoid 50 relu 20 relu 1 relu 150 epochs gave 0.7403 ***
# 80 relu 110 sigmoid 60 relu 20 relu 1 relu 150 epochs gave 0.736
# 80 relu 110 sigmoid 50 relu 30 relu 1 relu 150 epochs gave 0.7361
# 80 relu 110 sigmoid 50 relu 20 relu 1 relu 300 epochs gave 0.736
# 80 relu 110 sigmoid 50 relu 20 relu 20 sigmoid 1 relu 150 epochs gave 0.7392 ***
# 80 relu 110 sigmoid 50 relu 30 relu 20 sigmoid 1 relu 150 epochs gave 0.7356
# 80 relu 110 sigmoid 50 relu 20 sigmoid 20 sigmoid 1 relu 150 epochs gave 0.7383
# 80 relu 110 sigmoid 50 relu 20 sigmoid 20 relu 1 relu 150 epochs gave 0.7341
# 80 relu 110 sigmoid 50 relu 20 sigmoid 20 relu 1 relu 200 epochs gave 0.7384
# 80 relu 110 sigmoid 50 relu 20 sigmoid 20 relu 1 relu 250 epochs gave 0.7366
# 80 relu 110 sigmoid 50 relu 20 relu 30 sigmoid 1 relu 150 epochs gave 0.7368

# BEFORE THIS COMMENT, THESE WERE BEFORE MAKING ASK_AMT CHANGES

# 80 relu 110 sigmoid 50 relu 20 relu 1 relu 150 epochs gave 0.7358
# 80 relu 110 sigmoid 50 relu 20 relu 20 sigmoid 1 relu 150 epochs gave 0.7374

# BEFORE THIS COMMENT, WE DIDN'T DROP ASK_AMT

# 80 relu 110 sigmoid 50 relu 20 relu 20 sigmoid 1 relu 150 epochs gave 0.7364

# NO ASK_AMT changes, and STATUS removed

# 80 relu 110 sigmoid 50 relu 20 relu 1 relu 150 epochs gave 0.7394
# 80 relu 110 sigmoid 50 relu 20 relu 1 relu 150 epochs ask_amt GT_5000 gave 0.7373



In [80]:
# Export our model to HDF5 file
nn.save(filepath="AlphabetSoupCharity_Optimization.h5", save_format="h5")