## Preprocessing

In [173]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf

# Import and read the charity_data.csv.
# pandas is already imported as `pd` at the top of this cell, so it is not
# imported a second time here. The CSV is fetched over HTTP on every run.
application_df = pd.read_csv("https://static.bc-edx.com/data/dla-1-2/m21/lms/starter/charity_data.csv")

# Preview the first rows to confirm the columns loaded as expected.
application_df.head()

Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,10520599,BLUE KNIGHTS MOTORCYCLE CLUB,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,10547893,ST CLOUD PROFESSIONAL FIREFIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,10553066,SOUTHSIDE ATHLETIC ASSOCIATION,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,10556103,GENETIC RESEARCH INSTITUTE OF THE DESERT,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1


In [174]:
# Remove the columns that are not used as model inputs: the identifier
# columns 'EIN' and 'NAME', plus 'SPECIAL_CONSIDERATIONS'.
application_df = application_df.drop(
    columns=['EIN', 'NAME', 'SPECIAL_CONSIDERATIONS']
)


In [175]:
# Count the distinct values held by every remaining column (column-wise).
unique_values_count = application_df.nunique(axis=0)
print(unique_values_count)


APPLICATION_TYPE      17
AFFILIATION            6
CLASSIFICATION        71
USE_CASE               5
ORGANIZATION           4
STATUS                 2
INCOME_AMT             9
ASK_AMT             8747
IS_SUCCESSFUL          2
dtype: int64


In [177]:
# Collect the names of the columns holding more than 10 distinct values.
# A single nunique() pass is filtered, which keeps the original column order.
columns_with_more_than_10_unique = (
    application_df.nunique()
    .loc[lambda distinct: distinct > 10]
    .index
    .tolist()
)

# For each of those columns, print how many rows carry each distinct value.
for col in columns_with_more_than_10_unique:
    unique_values_count = application_df[col].value_counts()
    print(f"Column '{col}':")
    print(unique_values_count)


Column 'APPLICATION_TYPE':
T3     27037
T4      1542
T6      1216
T5      1173
T19     1065
T8       737
T7       725
T10      528
T9       156
T13       66
T12       27
T2        16
T25        3
T14        3
T29        2
T15        2
T17        1
Name: APPLICATION_TYPE, dtype: int64
Column 'CLASSIFICATION':
C1000    17326
C2000     6074
C1200     4837
C3000     1918
C2100     1883
         ...  
C4120        1
C8210        1
C2561        1
C4500        1
C2150        1
Name: CLASSIFICATION, Length: 71, dtype: int64
Column 'ASK_AMT':
5000        25398
10478           3
15583           3
63981           3
6725            3
            ...  
5371754         1
30060           1
43091152        1
18683           1
36500179        1
Name: ASK_AMT, Length: 8747, dtype: int64


In [179]:
# Determine the cutoff value: application types seen in fewer rows than this
# are merged into a single 'Other' bucket.
# NOTE(review): the output recorded under this cell shows an 'Other' bucket of
# 120 rows with T13 (66), T12 (27) and T2 (16) merged into it, which a cutoff
# of 10 cannot produce -- confirm the intended threshold before re-running.
cutoff_value = 10

# Get value counts for 'APPLICATION_TYPE'
application_type_counts = application_df['APPLICATION_TYPE'].value_counts()

# Create a list of application types to be replaced with 'Other'
application_types_to_replace = application_type_counts[application_type_counts < cutoff_value].index.tolist()

# Replace values in the DataFrame with a vectorised membership test instead
# of a row-by-row `apply`: rows whose type is rare become 'Other', every other
# value (including missing values) is kept unchanged.
is_rare_application_type = application_df['APPLICATION_TYPE'].isin(application_types_to_replace)
application_df['APPLICATION_TYPE'] = application_df['APPLICATION_TYPE'].where(~is_rare_application_type, 'Other')

# Check the value counts after binning
print(application_df['APPLICATION_TYPE'].value_counts())


T3       27037
T4        1542
T6        1216
T5        1173
T19       1065
T8         737
T7         725
T10        528
T9         156
Other      120
Name: APPLICATION_TYPE, dtype: int64


In [181]:
# Tally how many rows carry each CLASSIFICATION code; the result is inspected
# to choose a binning threshold.
classification_counts = application_df.loc[:, 'CLASSIFICATION'].value_counts()
classification_counts


C1000    17326
C2000     6074
C1200     4837
C3000     1918
C2100     1883
         ...  
C4120        1
C8210        1
C2561        1
C4500        1
C2150        1
Name: CLASSIFICATION, Length: 71, dtype: int64

In [182]:
# Determine the cutoff value: classifications seen in fewer rows than this
# are merged into a single 'Other' bucket.
cutoff_value = 10

# Get value counts for 'CLASSIFICATION'
classification_counts = application_df['CLASSIFICATION'].value_counts()

# Create a list of classifications to be replaced with 'Other'
classifications_to_replace = classification_counts[classification_counts < cutoff_value].index.tolist()

# Replace values in the DataFrame with a vectorised membership test instead
# of a row-by-row `apply`: rows whose classification is rare become 'Other',
# every other value (including missing values) is kept unchanged.
is_rare_classification = application_df['CLASSIFICATION'].isin(classifications_to_replace)
application_df['CLASSIFICATION'] = application_df['CLASSIFICATION'].where(~is_rare_classification, 'Other')

# Check the value counts after binning
print(application_df['CLASSIFICATION'].value_counts())


C1000    17326
C2000     6074
C1200     4837
C3000     1918
C2100     1883
C7000      777
C1700      287
C4000      194
C5000      116
C1270      114
C2700      104
Other       98
C2800       95
C7100       75
C1300       58
C1280       50
C1230       36
C1400       34
C7200       32
C2300       32
C1240       30
C8000       20
C7120       18
C1500       16
C1800       15
C6000       15
C1250       14
C8200       11
C1238       10
C1278       10
Name: CLASSIFICATION, dtype: int64


In [183]:
# Convert categorical data to numeric with `pd.get_dummies`
# Select categorical columns
categorical_columns = ['APPLICATION_TYPE', 'AFFILIATION', 'CLASSIFICATION', 'USE_CASE', 'ORGANIZATION', 'INCOME_AMT']

# Convert categorical columns to dummies. `dtype=int` pins the indicator
# columns to 0/1 integers; without it, newer pandas releases emit booleans
# while older ones emit small unsigned integers, so the encoded frame would
# depend on the installed pandas version.
application_df_encoded = pd.get_dummies(application_df, columns=categorical_columns, dtype=int)

# Display the new DataFrame with encoded categorical columns
application_df_encoded.head()


Unnamed: 0,STATUS,ASK_AMT,IS_SUCCESSFUL,APPLICATION_TYPE_Other,APPLICATION_TYPE_T10,APPLICATION_TYPE_T19,APPLICATION_TYPE_T3,APPLICATION_TYPE_T4,APPLICATION_TYPE_T5,APPLICATION_TYPE_T6,...,ORGANIZATION_Trust,INCOME_AMT_0,INCOME_AMT_1-9999,INCOME_AMT_10000-24999,INCOME_AMT_100000-499999,INCOME_AMT_10M-50M,INCOME_AMT_1M-5M,INCOME_AMT_25000-99999,INCOME_AMT_50M+,INCOME_AMT_5M-10M
0,1,5000,1,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,1,108590,1,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,1,5000,0,0,0,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,0
3,1,6692,1,0,0,0,1,0,0,0,...,1,0,0,1,0,0,0,0,0,0
4,1,142590,1,0,0,0,1,0,0,0,...,1,0,0,0,1,0,0,0,0,0


In [185]:
# Split our preprocessed data into our features and target arrays
target_column = 'IS_SUCCESSFUL'

# Features: All columns except the target variable
features = application_df_encoded.drop(columns=[target_column])

# Target variable array
target = application_df_encoded[target_column]

# Display the shapes of the features and target arrays
print("Features shape:", features.shape)
print("Target shape:", target.shape)

# Split the preprocessed data into a training and testing dataset.
# `train_test_split` is already imported in the notebook's first cell, so it
# is not re-imported here.
# Split the data into 90% training and 10% testing; the fixed random_state
# makes the partition reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.1, random_state=42)

# Display the shapes of the resulting datasets
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)


Features shape: (34299, 66)
Target shape: (34299,)
X_train shape: (30869, 66)
X_test shape: (3430, 66)
y_train shape: (30869,)
y_test shape: (3430,)


In [186]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Learn the scaling statistics from the training features only and scale
# them in the same step.
X_train_scaled = scaler.fit_transform(X_train)

# `fit` hands back the scaler itself, so X_scaler is the same fitted object.
X_scaler = scaler

# Scale the test features with the statistics learned from the training data
X_test_scaled = X_scaler.transform(X_test)

## Compile, Train and Evaluate the Model

In [187]:
# Feed-forward binary classifier: three ReLU hidden layers (180 -> 20 -> 10
# units) followed by a single sigmoid output unit.
nn = tf.keras.models.Sequential()

# Number of input features
input_features = len(X_train.columns)

# Declare the input width with an explicit Input object. Passing `input_dim`
# to the first Dense layer is deprecated in current Keras releases.
nn.add(tf.keras.Input(shape=(input_features,)))

# First hidden layer
nn.add(tf.keras.layers.Dense(units=180, activation='relu'))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=20, activation='relu'))

# Third hidden layer
nn.add(tf.keras.layers.Dense(units=10, activation='relu'))

# Output layer: one sigmoid unit, matching the binary target
nn.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

# Check the structure of the model
nn.summary()


Model: "sequential_37"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_119 (Dense)           (None, 180)               12060     
                                                                 
 dense_120 (Dense)           (None, 20)                3620      
                                                                 
 dense_121 (Dense)           (None, 10)                210       
                                                                 
 dense_122 (Dense)           (None, 1)                 11        
                                                                 
Total params: 15901 (62.11 KB)
Trainable params: 15901 (62.11 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [190]:
# Configure training: binary cross-entropy loss, Adam optimizer, and accuracy
# reported as the tracked metric.
nn.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)


In [191]:
# Fit the network on the scaled training features for 150 epochs and keep the
# object returned by `fit` in `fit_model`.
fit_model = nn.fit(
    x=X_train_scaled,
    y=y_train,
    epochs=150,
)


Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78

In [192]:
# Score the trained network on the scaled test split and report the results
model_loss, model_accuracy = nn.evaluate(
    x=X_test_scaled,
    y=y_test,
    verbose=2,
)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

108/108 - 0s - loss: 0.5760 - accuracy: 0.7195 - 293ms/epoch - 3ms/step
Loss: 0.575965940952301, Accuracy: 0.7195335030555725


In [193]:
# Second, deeper attempt: four ReLU hidden layers (128 -> 60 -> 20 -> 10
# units) followed by a single sigmoid output unit. This rebinds `nn`, so the
# previously built model is no longer reachable through that name.
nn = tf.keras.models.Sequential()

# Number of input features
input_features = len(X_train.columns)

# Declare the input width with an explicit Input object. Passing `input_dim`
# to the first Dense layer is deprecated in current Keras releases.
nn.add(tf.keras.Input(shape=(input_features,)))

# First hidden layer
nn.add(tf.keras.layers.Dense(units=128, activation='relu'))

# Second, third and fourth hidden layers
nn.add(tf.keras.layers.Dense(units=60, activation='relu'))
nn.add(tf.keras.layers.Dense(units=20, activation='relu'))
nn.add(tf.keras.layers.Dense(units=10, activation='relu'))

# Output layer: one sigmoid unit, matching the binary target
nn.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

# Check the structure of the model
nn.summary()

Model: "sequential_39"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_126 (Dense)           (None, 128)               8576      
                                                                 
 dense_127 (Dense)           (None, 60)                7740      
                                                                 
 dense_128 (Dense)           (None, 20)                1220      
                                                                 
 dense_129 (Dense)           (None, 10)                210       
                                                                 
 dense_130 (Dense)           (None, 1)                 11        
                                                                 
Total params: 17757 (69.36 KB)
Trainable params: 17757 (69.36 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [194]:
# Configure training for the second network: binary cross-entropy loss,
# Adam optimizer, accuracy as the tracked metric.
nn.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [197]:
# Fit the second network on the scaled training features for 500 epochs and
# keep the object returned by `fit` in `fit_model`.
fit_model = nn.fit(
    x=X_train_scaled,
    y=y_train,
    epochs=500,
)

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

In [199]:
# Score the second network on the scaled test split and report the results
model_loss, model_accuracy = nn.evaluate(
    x=X_test_scaled,
    y=y_test,
    verbose=2,
)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

108/108 - 0s - loss: 0.6621 - accuracy: 0.7286 - 184ms/epoch - 2ms/step
Loss: 0.6620940566062927, Accuracy: 0.7285714149475098


In [200]:
# Persist the model currently bound to `nn` as an HDF5 file in the working
# directory.
nn.save(filepath="trained_model2.h5")


  saving_api.save_model(
