In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

# Load the charity dataset from the local resources folder into a DataFrame.
application_df = pd.read_csv(filepath_or_buffer="./resources/charity_data.csv")

# PRE-PROCESSING

In [2]:
# Drop the non-beneficial ID columns, 'EIN' and 'NAME'.
# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which
# was deprecated in pandas 1.3 and raises a TypeError from pandas 2.0 onward.
application_df = application_df.drop(columns=['EIN', 'NAME'])

In [3]:
# Tally how many rows fall into each INCOME_AMT category (to decide on binning)
i_amts = application_df["INCOME_AMT"].value_counts()

### Leaving INCOME_AMT unchanged, as it has only 9 bins

In [4]:
# Tally how many rows share each distinct ASK_AMT value (to decide on binning)
a_amts = application_df["ASK_AMT"].value_counts()

In [5]:
# Bin rare ASK_AMT values into coarse ranges.
#
# Only amounts that occur fewer than 1000 times are binned; more frequent
# amounts keep their own value. Every entry ends up as a string, so the column
# has object dtype afterwards.
replace_ask = list(a_amts[a_amts<1000].index)

# Bin edges and labels. Upper edges are inclusive (pd.cut defaults to
# right=True), which matches the `amt <= edge` comparisons this replaces.
ask_bin_edges = [float("-inf"), 49999, 99999, 499999, 999999, 4999999, float("inf")]
ask_bin_labels = ['5000-49999', '50000-99999', '100000-499999', '500000-999999', '1M-5M', '5M+']

# Coerce to numeric so the cell also survives a re-run after ASK_AMT has been
# converted to strings: existing bin labels become NaN and are left untouched.
ask_numeric = pd.to_numeric(application_df.ASK_AMT, errors="coerce")
is_rare_ask = ask_numeric.isin(replace_ask)
ask_binned = pd.cut(ask_numeric, bins=ask_bin_edges, labels=ask_bin_labels).astype(str)

# One vectorized pass instead of a full-column replace() per distinct amount:
# keep the stringified original where the amount is frequent, else use its bin label.
application_df.ASK_AMT = application_df.ASK_AMT.astype(str).where(~is_rare_ask, ask_binned)

In [6]:
# Count how often each APPLICATION_TYPE occurs
application_counts = application_df['APPLICATION_TYPE'].value_counts()

# Application types seen fewer than 500 times are treated as rare
replace_application = application_counts.loc[lambda counts: counts < 500].index.tolist()

# Collapse every rare application type into a single "Other" category
application_df.APPLICATION_TYPE = application_df.APPLICATION_TYPE.replace(replace_application, "Other")

# Count how often each CLASSIFICATION occurs
class_count = application_df['CLASSIFICATION'].value_counts()

# Classifications seen fewer than 1000 times are treated as rare
replace_class = class_count.loc[lambda counts: counts < 1000].index.tolist()

# Collapse every rare classification into a single "Other" category
application_df.CLASSIFICATION = application_df.CLASSIFICATION.replace(replace_class, "Other")
    
# Collect the names of all object-dtype (string) columns; these get one-hot encoded
application_cat = application_df.dtypes[application_df.dtypes == 'object'].index.tolist()

# Create a OneHotEncoder instance.
# The dense-output flag was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 and the old name was removed in 1.4, so the default (sparse)
# output is used and densified explicitly; this works on every version.
enc = OneHotEncoder()

# Fit and transform the OneHotEncoder using the categorical variable list
encoded_values = enc.fit_transform(application_df[application_cat]).toarray()

# `get_feature_names` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out`; fall back to the old name only when the new one is absent.
feature_namer = getattr(enc, "get_feature_names_out", None) or enc.get_feature_names

# Build the encoded frame with readable column names. Reusing application_df's
# index keeps the rows aligned for a later index-on-index merge.
encode_df = pd.DataFrame(
    encoded_values,
    columns=feature_namer(application_cat),
    index=application_df.index,
)

In [7]:
# Attach the one-hot encoded columns by row index, then remove the original
# categorical columns they were derived from.
merged_df = application_df.merge(encode_df, left_index=True, right_index=True)
application_df = merged_df.drop(columns=application_cat)

In [8]:
# Split our preprocessed data into our features and target arrays
y = application_df['IS_SUCCESSFUL']
# `columns=` replaces the positional axis argument (`drop(label, 1)`), which
# was deprecated in pandas 1.3 and raises a TypeError from pandas 2.0 onward.
X = application_df.drop(columns='IS_SUCCESSFUL')

# Split the preprocessed data into a training and testing dataset.
# stratify=y keeps the class proportions of IS_SUCCESSFUL the same in both
# splits; random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=25, stratify=y)

In [9]:
# Standardize the features. The scaler is fitted on the training split only,
# and those same fitted statistics are then applied to both splits.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [10]:
# Build a 500-tree random forest (fixed seed for reproducibility) and train it
# on the scaled training data.
rf_model = RandomForestClassifier(n_estimators=500, random_state=78).fit(
    X_train_scaled, y_train
)

# Predict the target for the held-out test set and show the result.
predictions = rf_model.predict(X_test_scaled)
predictions

array([0, 0, 0, ..., 0, 0, 1], dtype=int64)

In [11]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Confusion matrix of true labels against model predictions
cm = confusion_matrix(y_test, predictions)

# Wrap the matrix in a labelled DataFrame so it reads clearly when displayed
actual_labels = ["Actual 0","Actual 1"]
predicted_labels = ["Predicted 0", "Predicted 1"]
cm_df = pd.DataFrame(data=cm, index=actual_labels, columns=predicted_labels)

# Fraction of test rows predicted correctly; shown as the cell output
acc_score = accuracy_score(y_test, predictions)
acc_score

0.7289795918367347

In [12]:
# Report the confusion matrix, the overall accuracy and the per-class metrics
report_text = classification_report(y_test, predictions)

print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(report_text)

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,2697,1313
Actual 1,1011,3554


Accuracy Score : 0.7289795918367347
Classification Report
              precision    recall  f1-score   support

           0       0.73      0.67      0.70      4010
           1       0.73      0.78      0.75      4565

    accuracy                           0.73      8575
   macro avg       0.73      0.73      0.73      8575
weighted avg       0.73      0.73      0.73      8575

