In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import clear_output

import datetime
from datetime import timedelta, date #for time duration calculations
from dateutil.parser import parse #for fuzzy finding year

# Feature Scaling
from sklearn.preprocessing import StandardScaler




import pickle #for saving output files, pickles
from sys import stdout
import time #for time.sleep function to delay calls
from tqdm import tqdm #for updating loop
#from os import listdir
#from os.path import isfile, join
import glob #pattern matching and expansion.

## Python packages - you may have to pip install sqlalchemy, sqlalchemy_utils, and psycopg2.
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
from sqlalchemy.sql import table, column, select, update, insert
import psycopg2
from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT

import pandas as pd

#In Python: Define your username and password used above. I've defined the database name (we're 
#using a dataset on births, so I call it birth_db). 
dbname = 'donors_db'
username = 'russell'
pswd = 'bradypodion'

In [2]:
intervals = (
    ('weeks', 604800),  # 60 * 60 * 24 * 7
    ('days', 86400),    # 60 * 60 * 24
    ('hours', 3600),    # 60 * 60
    ('minutes', 60),
    ('seconds', 1),
    )

def display_time(seconds, granularity=2):
    result = []

    for name, count in intervals:
        value = seconds // count
        if value:
            seconds -= value * count
            if value == 1:
                name = name.rstrip('s')
            result.append("{} {}".format(value, name))
    return ', '.join(result[:granularity])

# Function convert seconds into day.decimal  
def ConvertSectoDay(n): 
    day = n // (24 * 3600) 
    #print(day) #keep day
    n = n % (24 * 3600) 
    daydec=(n/86400) # add this to day
    addem=day+daydec
    #https://stackoverflow.com/a/48812729/1602288
    holder='{:g}'.format(float('{:.{p}g}'.format(addem, p=5)))
    return(float(holder))

def elapsedseconds(posted, completed, expiration):
    formatuse = '%Y-%m-%d %H:%M:%S' # The format: see down this page:https://docs.python.org/3/library/datetime.html
    otherformat = '%Y-%m-%d'
    
    #failed projects were never completed, so in those cases, use the expiration date
    # if variable is None:
    if completed is None:
        try:
            clock = datetime.datetime.strptime(expiration,formatuse) 
        except:
            try:
                clock = datetime.datetime.strptime(expiration,otherformat)
            except:
                clock = 'stop'
    else:
        try:
            clock = datetime.datetime.strptime(completed,formatuse) 
        except:
            try:
                clock = datetime.datetime.strptime(completed,otherformat) 
            except:
                clock = 'stop'
            
    if clock != 'stop': 
        try:
            startclock = datetime.datetime.strptime(posted,formatuse)
        except:
            startclock = datetime.datetime.strptime(posted,otherformat)

        elapsed = (clock-startclock).total_seconds()
        
    else:
        elapsed = 123456789
    return(elapsed)

In [3]:
pd.set_option('display.max_columns', None)

In [4]:
## 'engine' is a connection to a database
## Here, we're using postgres, but sqlalchemy can connect to other things too.
engine = create_engine('postgresql://%s:%s@localhost/%s'%(username,pswd,dbname))
print('postgresql://%s:%s@localhost/%s'%(username,pswd,dbname))
print(engine.url)
# Replace localhost with IP address if accessing a remote server

postgresql://russell:bradypodion@localhost/donors_db
postgresql://russell:bradypodion@localhost/donors_db


In [5]:
## create a database (if it doesn't exist)
if not database_exists(engine.url):
    create_database(engine.url)
print(database_exists(engine.url))
print(engine.url)

True
postgresql://russell:bradypodion@localhost/donors_db


In [6]:
# connect:
con = None
con = psycopg2.connect(database = dbname, user = username, host='localhost', password=pswd)

In [7]:
###########################################################
### query: from historical data
hist_query = """
SELECT * FROM hist_projects;\n"""


hist_state = pd.read_sql_query(hist_query,con)
orig_hist_rows = len(hist_state.index)
hist_state = hist_state.drop_duplicates(keep='first')
dedup_hist_rows = len(hist_state.index)

print("Historical Raw Obs = "+str(orig_hist_rows)+"\nDeDup Obs = "+str(dedup_hist_rows))
   

Historical Raw Obs = 1425169
DeDup Obs = 1425169


In [8]:
### Close communication with the database
con.close()

In [None]:
############# previous EDA suggests these are all abberant classes with less than 100 values
hist_state=hist_state[hist_state.school_state != 'La']
hist_state=hist_state[hist_state.teacher_prefix != 'Mr. & Mrs.']
hist_state=hist_state[hist_state.teacher_prefix != 'Mr. & Mrs. ']

In [None]:
hist_state['latency_to_funded'] = hist_state.apply(lambda row: elapsedseconds(row['date_posted'],row['date_completed'],row['date_expiration']),axis=1)
# if the latency was non-addressable, the returned value = 123456789, so now we drop those
hist_state = hist_state[hist_state.latency_to_funded != 123456789]


In [None]:
hist_state['days_to_funding'] = hist_state.apply(lambda row: ConvertSectoDay(row.latency_to_funded),axis=1)

In [None]:
hist_state['succeed']= np.where(hist_state['funding_status']=='completed', 1, 0)

In [None]:
#replace 'f' and 't' with 'true' and 'false' for these columns
#https://stackoverflow.com/a/34697070/1602288

hist_state[['school_charter','school_magnet','school_year_round','school_nlns','school_kipp',
            'school_charter_ready_promise','teacher_teach_for_america','eligible_double_your_impact_match','eligible_almost_home_match']]= hist_state[['school_charter','school_magnet','school_year_round','school_nlns','school_kipp',
                             'school_charter_ready_promise','teacher_teach_for_america','eligible_double_your_impact_match','eligible_almost_home_match']].replace(['f','t'], ['false', 'true'])

#make new columns for posting time info, from splitting posting date = date_posted
hist_state[['posting_year','posting_month','posting_day']]=hist_state['date_posted'].str.split("-",expand=True)


In [None]:
# trimmed=hist_state[['school_state','school_metro','school_charter', 'school_magnet', 
#                     'school_year_round','teacher_prefix','teacher_teach_for_america', 
#                     'primary_focus_subject','resource_type', 'poverty_level', 'grade_level',
#                     'total_price_excluding_optional_support','students_reached',
#                     'posting_month','days_to_funding']]

trimmed=hist_state[['total_price_excluding_optional_support','students_reached',
                    'posting_month','days_to_funding','succeed']]

trimmed = trimmed[trimmed.days_to_funding < 150]
trimmed = trimmed.dropna()
trimmed.shape

In [None]:
trimmed.astype({'posting_month':'int32'}).dtypes #cast posting month as integer

In [None]:
funded_x=trimmed[trimmed['succeed']==1]
notfund_y=trimmed[trimmed['succeed']==0]

In [None]:
funded_x=funded_x['days_to_funding']
notfund_y=notfund_y['days_to_funding']

In [None]:
sns.set_context("poster", font_scale=.6)

bins = np.linspace(0, 150, 100)
plt.hist(funded_x, bins, alpha=0.25, label='Funded')
plt.hist(notfund_y, bins, alpha=0.5, label='NOT funded')
plt.legend(loc='upper right')


plt.xlabel("Project duration");
plt.ylabel("Count");

plt.show()

In [None]:
sns.set_context("poster", font_scale=1.3)

fig, ax = plt.subplots(figsize=(12, 8))
sns.distplot(trimmed["days_to_funding"].dropna())
ax.set_xlim(1,150)
fig.tight_layout()

In [None]:
from sklearn import metrics 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing

In [None]:
#binary output = is a project funded or not?
y = trimmed.succeed

#
x = trimmed.drop(['succeed'], axis=1)
x_scaled = preprocessing.scale(x)
# create training and testing vars
x_train, x_test, y_train, y_test = train_test_split(x_scaled, y, test_size=0.25)
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

In [None]:
# fit a model
logistic_regression = LogisticRegression()
model = logistic_regression.fit(x_train, y_train)
predictions = logistic_regression.predict(x_test)

print("Score:", model.score(x_test, y_test))

In [None]:
y_pred = logistic_regression.predict(x_test)

accuracy = metrics.accuracy_score(y_test, y_pred)
accuracy_percentage = 100 * accuracy
accuracy_percentage

In [None]:
cm = metrics.confusion_matrix(y_test, predictions)
print(cm)

In [None]:
# Use score method to get accuracy of model
score = model.score(x_test, y_test)
print(score)

matplotlib.rcParams.update({'font.size': 22})

plt.figure(figsize=(9,9))
sns.heatmap(cm, annot=True, fmt=".3f", linewidths=.5, square = True, cmap = 'magma');
plt.ylabel('Actual label');
plt.xlabel('Predicted label');
all_sample_title = 'Accuracy Score: {0}'.format(score)
plt.title(all_sample_title, size = 15);

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

import matplotlib

# font = {'family' : 'normal',
#         'weight' : 'bold',
#         'size'   : 30}

# matplotlib.rc('font', **font)
matplotlib.rcParams.update({'font.size': 42})

logit_roc_auc = roc_auc_score(y_test, logistic_regression.predict(x_test))
fpr, tpr, thresholds = roc_curve(y_test, logistic_regression.predict_proba(x_test)[:,1])
plt.figure(figsize=(9, 9))
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc,linewidth=4)
plt.plot([0, 1], [0, 1],'r--',linewidth=4)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()


In [None]:
print(x.columns)
print(logistic_regression.coef_)


In [None]:
clf=logistic_regression

feature_importance = abs(clf.coef_[0])
feature_importance = 100.0 * (feature_importance / feature_importance.max())
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + .5

matplotlib.rcParams.update({'font.size': 42})
plt.figure(figsize=(24, 16))
featfig = plt.figure()
featax = featfig.add_subplot(1, 1, 1)
featax.barh(pos, feature_importance[sorted_idx], align='center')
featax.set_yticks(pos)
featax.set_yticklabels(np.array(x.columns)[sorted_idx], fontsize=18)
featax.set_xlabel('Relative Feature Importance');

In [None]:
# Function convert months to school-year-months
def ConvertGregorian_to_School(m): 
    m=int(m)
    if m>=7:
        sm=m-6
    else:
        sm=m+6

    return(int(sm))

In [None]:
trimmed['posting_month'].value_counts()

In [None]:
trimmed['posting_month'] = trimmed.apply(lambda row: ConvertGregorian_to_School(row.posting_month),axis=1)

In [None]:
#binary output = is a project funded or not?
y = trimmed.succeed

#
x = trimmed.drop(['succeed'], axis=1)
x_scaled = preprocessing.scale(x)
# create training and testing vars
x_train, x_test, y_train, y_test = train_test_split(x_scaled, y, test_size=0.25)
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

In [None]:
# fit a model
logistic_regression = LogisticRegression()
model = logistic_regression.fit(x_train, y_train)
predictions = logistic_regression.predict(x_test)

print("Score:", model.score(x_test, y_test))

In [None]:
# Use score method to get accuracy of model
score = model.score(x_test, y_test)
print(score)

matplotlib.rcParams.update({'font.size': 22})

plt.figure(figsize=(9,9))
sns.heatmap(cm, annot=True, fmt=".3f", linewidths=.5, square = True, cmap = 'magma');
plt.ylabel('Actual label');
plt.xlabel('Predicted label');
all_sample_title = 'Accuracy Score: {0}'.format(score)
plt.title(all_sample_title, size = 15);

In [None]:
logit_roc_auc = roc_auc_score(y_test, logistic_regression.predict(x_test))
fpr, tpr, thresholds = roc_curve(y_test, logistic_regression.predict_proba(x_test)[:,1])
plt.figure(figsize=(9, 9))
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc,linewidth=4)
plt.plot([0, 1], [0, 1],'r--',linewidth=4)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()


In [None]:
clf=logistic_regression

feature_importance = abs(clf.coef_[0])
feature_importance = 100.0 * (feature_importance / feature_importance.max())
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + .5

matplotlib.rcParams.update({'font.size': 42})
plt.figure(figsize=(24, 16))
featfig = plt.figure()
featax = featfig.add_subplot(1, 1, 1)
featax.barh(pos, feature_importance[sorted_idx], align='center')
featax.set_yticks(pos)
featax.set_yticklabels(np.array(x.columns)[sorted_idx], fontsize=18)
featax.set_xlabel('Relative Feature Importance');

In [None]:
# One-hot encode categorical features
features = pd.get_dummies(trimmed)
print(features.shape)
features = features.dropna()
print(features.shape)

features.head(5)

In [None]:
#binary output = is a project funded or not?
y = features.succeed

#
x = features.drop(['succeed'], axis=1)
x_scaled = preprocessing.scale(x)
# create training and testing vars
x_train, x_test, y_train, y_test = train_test_split(x_scaled, y, test_size=0.25)
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

In [None]:
# fit a model
logistic_regression = LogisticRegression()
model = logistic_regression.fit(x_train, y_train)
predictions = logistic_regression.predict(x_test)

print("Score:", model.score(x_test, y_test))

In [None]:
count_failed= len(features[features['succeed']==0])
count_funded = len(features[features['succeed']==1])
pct_of_fail = count_failed/(count_failed+count_funded)
print("percentage of failed projects is ", pct_of_fail*100)
pct_of_fund = count_funded/(count_failed+count_funded)
print("percentage of funded projects is ", pct_of_fund*100)

In [None]:
trimmed.groupby('grade_level').mean()

In [None]:
trimmed.groupby('poverty_level').mean()

In [None]:
trimmed.groupby('resource_type').mean()#resource_type

In [None]:
# Use numpy to convert to arrays
import numpy as np

# Labels are the values we want to predict
labels = np.array(features['succeed'])

# Remove the labels from the features
# axis 1 refers to the columns
features= features.drop('succeed', axis = 1)

# Saving feature names for later use
feature_list = list(features.columns)

# Convert to numpy array
features = np.array(features)

In [None]:
print(features.shape)


In [None]:
# Using Skicit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.25,
                                                                           random_state = 42)

In [None]:
print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)

In [None]:
# Import the model we are using
from sklearn.ensemble import RandomForestRegressor

# Instantiate model 
rf = RandomForestRegressor(n_estimators= 1000, random_state=42)

# Train the model on training data
rf.fit(train_features, train_labels);