In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
df = pd.read_csv('charity_navigator_clean.csv', index_col = 0)

In [3]:
n = 0
for excess_or_deficit in df['excess_or_deficit_for_year']:
    if df.loc[n, 'excess_or_deficit_for_year'] > 0:
        df.loc[n, 'excess_or_deficit_id'] = str(1)
        n += 1
    elif df.loc[n, 'excess_or_deficit_for_year'] <= 0:
        df.loc[n, 'excess_or_deficit_id'] = str(0)
        n += 1

In [4]:
df2 = df[df['compensation_leader_compensation'].isnull() == False]
org_type_dummies = pd.get_dummies(df2.org_type)
df2 = pd.concat([df2, org_type_dummies], axis=1, join = 'outer')
df2.drop(['charity_name', 'charity_url', 
                     'city', 'cn_advisory', 
                     'state', 'org_type',
                     'org_category', 'compensation_leader_title'], 
                    axis = 1, inplace = True)
df2.head()

Unnamed: 0,accountability_score,administrative_expenses,compensation_leader_compensation,compensation_leader_expense_percent,excess_or_deficit_for_year,financial_score,fundraising_expenses,net_assets,other_revenue,overall_score,...,"Arts, Culture, Humanities",Community Development,Education,Environment,Health,Human Services,Human and Civil Rights,International,Religion,Research and Public Policy
0,89.0,164590.0,53463.0,3.11,349718.0,90.56,111522,1350382,49634.0,89.75,...,0,0,0,0,0,0,1,0,0,0
1,86.0,1001560.0,73500.0,1.47,1175965.0,85.92,54613,14773920,382540.0,85.95,...,0,0,1,0,0,0,0,0,0,0
2,85.0,93957.0,85000.0,0.99,-461502.0,77.65,248833,-770370,0.0,80.96,...,0,0,0,0,0,0,0,1,0,0
3,86.0,346867.0,61220.0,0.78,1872733.0,97.5,384550,11460087,-81726.0,89.94,...,0,0,1,0,0,0,0,0,0,0
4,97.0,135195.0,74244.0,5.41,-103940.0,87.08,87436,723772,32436.0,90.62,...,0,0,0,0,0,0,0,0,1,0


In [5]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn import metrics

In [6]:
X = df2[['accountability_score', 'financial_score',
         'overall_score', 'administrative_expenses',
         'fundraising_expenses', 'net_assets',
         'other_revenue', 'payments_to_affiliates',
         'compensation_leader_compensation', 
         'compensation_leader_expense_percent',
         'Animals', 'Arts, Culture, Humanities',
         'Community Development', 'Education',
         'Environment', 'Health', 'Human Services',
         'Human and Civil Rights', 'International',
         'Religion']]
y = df2['excess_or_deficit_id']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 101)

In [7]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [8]:
svc_model = SVC()
svc_model.fit(X_train_scaled, y_train)
y_pred = svc_model.predict(X_test_scaled)
# print('svc_model.coef_\n', svc_model.coef_)
# print('svc_model.intercept_\n', svc_model.intercept_)

In [9]:
df_scaled_features = pd.DataFrame(X_train_scaled, columns = X.columns)
df_scaled_features2 = pd.DataFrame(X_test_scaled, columns = X.columns)
df_scaled_features = pd.concat([df_scaled_features, df_scaled_features2], axis=1, join = 'outer')
df_scaled_features.head()

Unnamed: 0,accountability_score,financial_score,overall_score,administrative_expenses,fundraising_expenses,net_assets,other_revenue,payments_to_affiliates,compensation_leader_compensation,compensation_leader_expense_percent,...,Animals,"Arts, Culture, Humanities",Community Development,Education,Environment,Health,Human Services,Human and Civil Rights,International,Religion
0,0.523977,0.794084,0.977527,1.416871,0.885268,0.290553,-0.089992,-0.075257,0.007558,-1.39452,...,-0.243064,-0.413244,-0.325487,-0.283784,-0.235588,-0.336121,1.560762,-0.207767,-0.268927,-0.237206
1,0.971347,0.136413,0.490523,-0.150319,0.024736,-0.116007,-0.299356,-0.075257,0.614274,0.028344,...,-0.243064,2.419876,-0.325487,-0.283784,-0.235588,-0.336121,-0.640713,-0.207767,-0.268927,-0.237206
2,0.971347,-0.35511,0.046528,-0.101158,-0.10647,-0.118849,-0.136323,-0.075257,-0.021811,0.197229,...,-0.243064,2.419876,-0.325487,-0.283784,-0.235588,-0.336121,-0.640713,-0.207767,-0.268927,-0.237206
3,-3.166823,-2.782065,-3.394119,-0.120767,-0.074416,-0.14777,-0.153193,-0.075257,-0.230468,1.856533,...,-0.243064,-0.413244,-0.325487,-0.283784,-0.235588,2.975122,-0.640713,-0.207767,-0.268927,-0.237206
4,0.18845,0.008835,0.227415,0.377612,0.032822,0.547113,0.20907,-0.075257,2.738364,-0.41076,...,-0.243064,2.419876,-0.325487,-0.283784,-0.235588,-0.336121,-0.640713,-0.207767,-0.268927,-0.237206


In [10]:
print(metrics.classification_report(y_test, y_pred))
print(metrics.confusion_matrix(y_test, y_pred))

             precision    recall  f1-score   support

          0       0.49      0.10      0.16      1245
          1       0.58      0.92      0.71      1671

avg / total       0.54      0.57      0.48      2916

[[ 123 1122]
 [ 129 1542]]


In [11]:
from sklearn.model_selection import GridSearchCV

In [12]:
param_grid = {'C': [0.1, 1, 10, 100, 1000], 'gamma': [1, 0.1, 0.01, 0.001]}
grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 2)
grid.fit(X_train_scaled, y_train)
grid_predictions = grid.predict(X_test_scaled)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] C=0.1, gamma=1 ..................................................
[CV] ................................... C=0.1, gamma=1, total=   1.3s
[CV] C=0.1, gamma=1 ..................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.5s remaining:    0.0s


[CV] ................................... C=0.1, gamma=1, total=   0.7s
[CV] C=0.1, gamma=1 ..................................................
[CV] ................................... C=0.1, gamma=1, total=   0.6s
[CV] C=0.1, gamma=0.1 ................................................
[CV] ................................. C=0.1, gamma=0.1, total=   0.9s
[CV] C=0.1, gamma=0.1 ................................................
[CV] ................................. C=0.1, gamma=0.1, total=   0.6s
[CV] C=0.1, gamma=0.1 ................................................
[CV] ................................. C=0.1, gamma=0.1, total=   0.7s
[CV] C=0.1, gamma=0.01 ...............................................
[CV] ................................ C=0.1, gamma=0.01, total=   0.5s
[CV] C=0.1, gamma=0.01 ...............................................
[CV] ................................ C=0.1, gamma=0.01, total=   0.5s
[CV] C=0.1, gamma=0.01 ...............................................
[CV] .

[CV] .............................. C=1000, gamma=0.001, total=   0.9s


[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:  1.8min finished


In [13]:
print(metrics.classification_report(y_test, grid_predictions))
print(metrics.confusion_matrix(y_test, grid_predictions))

             precision    recall  f1-score   support

          0       0.48      0.13      0.21      1245
          1       0.58      0.89      0.70      1671

avg / total       0.54      0.57      0.49      2916

[[ 163 1082]
 [ 177 1494]]


In [14]:
X = df2[['accountability_score', 'financial_score',
         'overall_score', 'administrative_expenses',
         'fundraising_expenses', 'net_assets',
         'other_revenue', 'payments_to_affiliates',
         'compensation_leader_compensation', 
         'compensation_leader_expense_percent',
         'excess_or_deficit_for_year']]
y = df2['org_type_id']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 101)

In [15]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [16]:
svc_model = SVC()
svc_model.fit(X_train_scaled, y_train)
y_pred = svc_model.predict(X_test_scaled)
# print('svc_model.coef_\n', svc_model.coef_)
# print('svc_model.intercept_\n', svc_model.intercept_)

In [17]:
df_scaled_features = pd.DataFrame(X_train_scaled, columns = X.columns)
df_scaled_features2 = pd.DataFrame(X_test_scaled, columns = X.columns)
df_scaled_features = pd.concat([df_scaled_features, df_scaled_features2], axis=1, join = 'outer')
df_scaled_features.head()

Unnamed: 0,accountability_score,financial_score,overall_score,administrative_expenses,fundraising_expenses,net_assets,other_revenue,payments_to_affiliates,compensation_leader_compensation,compensation_leader_expense_percent,...,financial_score.1,overall_score.1,administrative_expenses.1,fundraising_expenses.1,net_assets.1,other_revenue.1,payments_to_affiliates.1,compensation_leader_compensation.1,compensation_leader_expense_percent.1,excess_or_deficit_for_year
0,0.523977,0.794084,0.977527,1.416871,0.885268,0.290553,-0.089992,-0.075257,0.007558,-1.39452,...,0.15817,-1.279133,-0.166188,-0.13545,-0.162193,-0.163133,-0.075257,-0.495042,-0.930083,-0.06258
1,0.971347,0.136413,0.490523,-0.150319,0.024736,-0.116007,-0.299356,-0.075257,0.614274,0.028344,...,-1.221456,-0.764301,-0.168045,-0.108975,-0.155332,-0.14455,-0.075257,-0.639228,0.138119,-0.033338
2,0.971347,-0.35511,0.046528,-0.101158,-0.10647,-0.118849,-0.136323,-0.075257,-0.021811,0.197229,...,-0.606311,-0.317776,-0.178184,-0.101399,-0.129523,0.026565,-0.075257,-1.227741,-1.449407,-0.096739
3,-3.166823,-2.782065,-3.394119,-0.120767,-0.074416,-0.14777,-0.153193,-0.075257,-0.230468,1.856533,...,0.899905,0.547446,-0.181454,-0.118079,-0.155555,-0.155218,-0.075257,-0.257832,0.526557,-0.0676
4,0.18845,0.008835,0.227415,0.377612,0.032822,0.547113,0.20907,-0.075257,2.738364,-0.41076,...,-0.676528,-0.675755,-0.060284,-0.05953,0.011125,0.080959,-0.075257,0.308987,0.21834,0.098471


In [18]:
print(metrics.classification_report(y_test, y_pred))
print(metrics.confusion_matrix(y_test, y_pred))

             precision    recall  f1-score   support

        1.0       0.00      0.00      0.00       112
        2.0       0.00      0.00      0.00       225
        3.0       0.50      0.00      0.01       203
        4.0       0.38      0.22      0.28       133
        5.0       0.61      0.11      0.19       288
        6.0       0.00      0.00      0.00       137
        7.0       0.19      0.03      0.06       289
        8.0       0.37      0.33      0.35       443
        9.0       0.32      0.90      0.47       826
       10.0       0.00      0.00      0.00       175
       11.0       0.00      0.00      0.00        85

avg / total       0.28      0.33      0.23      2916

[[  0   0   0   1   0   0   0  16  95   0   0]
 [  0   0   1   2   0   0   7  40 175   0   0]
 [  0   0   1   4   1   0   8  23 166   0   0]
 [  0   0   0  29   0   0   2  13  89   0   0]
 [  0   0   0   4  33   0   3  19 229   0   0]
 [  0   0   0   2   0   0   0  18 117   0   0]
 [  0   0   0   2   7   0 

  'precision', 'predicted', average, warn_for)


In [19]:
param_grid = {'C': [0.1, 1, 10, 100, 1000], 'gamma': [1, 0.1, 0.01, 0.001]}
grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 2)
grid.fit(X_train_scaled, y_train)
grid_predictions = grid.predict(X_test_scaled)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] C=0.1, gamma=1 ..................................................
[CV] ................................... C=0.1, gamma=1, total=   0.7s
[CV] C=0.1, gamma=1 ..................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.9s remaining:    0.0s


[CV] ................................... C=0.1, gamma=1, total=   0.6s
[CV] C=0.1, gamma=1 ..................................................
[CV] ................................... C=0.1, gamma=1, total=   0.6s
[CV] C=0.1, gamma=0.1 ................................................
[CV] ................................. C=0.1, gamma=0.1, total=   0.6s
[CV] C=0.1, gamma=0.1 ................................................
[CV] ................................. C=0.1, gamma=0.1, total=   0.6s
[CV] C=0.1, gamma=0.1 ................................................
[CV] ................................. C=0.1, gamma=0.1, total=   0.6s
[CV] C=0.1, gamma=0.01 ...............................................
[CV] ................................ C=0.1, gamma=0.01, total=   0.5s
[CV] C=0.1, gamma=0.01 ...............................................
[CV] ................................ C=0.1, gamma=0.01, total=   0.6s
[CV] C=0.1, gamma=0.01 ...............................................
[CV] .

[CV] .............................. C=1000, gamma=0.001, total=   3.0s


[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:  2.0min finished


In [20]:
print(metrics.classification_report(y_test, grid_predictions))
print(metrics.confusion_matrix(y_test, grid_predictions))

             precision    recall  f1-score   support

        1.0       0.00      0.00      0.00       112
        2.0       0.11      0.04      0.05       225
        3.0       0.19      0.08      0.11       203
        4.0       0.29      0.14      0.18       133
        5.0       0.62      0.31      0.42       288
        6.0       0.04      0.01      0.01       137
        7.0       0.36      0.17      0.23       289
        8.0       0.42      0.40      0.41       443
        9.0       0.36      0.83      0.50       826
       10.0       0.14      0.01      0.02       175
       11.0       0.05      0.01      0.02        85

avg / total       0.31      0.36      0.29      2916

[[  0   4   1   3   1   2   7  16  76   0   2]
 [  4   8   6   4   2   3   8  53 133   2   2]
 [  2   7  16   5   3   3  13  15 138   0   1]
 [  2   3  10  18   3   1   1   8  85   1   1]
 [  1   8  13   2  90   1   9  15 148   0   1]
 [  1   1   1   1   3   1   7  21  99   2   0]
 [  3   8   9   1  14   1 