In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import clear_output

import datetime
from datetime import timedelta, date #for time duration calculations
from dateutil.parser import parse #for fuzzy finding year

# Feature Scaling
from sklearn.preprocessing import StandardScaler




import pickle #for saving output files, pickles
from sys import stdout
import time #for time.sleep function to delay calls
from tqdm import tqdm #for updating loop
#from os import listdir
#from os.path import isfile, join
import glob #pattern matching and expansion.

## Python packages - you may have to pip install sqlalchemy, sqlalchemy_utils, and psycopg2.
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
from sqlalchemy.sql import table, column, select, update, insert
import psycopg2
from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT

import pandas as pd

#In Python: Define your username and password used above. I've defined the database name (we're 
#using a dataset on births, so I call it birth_db). 
dbname = 'donors_db'
username = 'russell'
pswd = 'bradypodion'

In [39]:
intervals = (
    ('weeks', 604800),  # 60 * 60 * 24 * 7
    ('days', 86400),    # 60 * 60 * 24
    ('hours', 3600),    # 60 * 60
    ('minutes', 60),
    ('seconds', 1),
    )

def display_time(seconds, granularity=2):
    result = []

    for name, count in intervals:
        value = seconds // count
        if value:
            seconds -= value * count
            if value == 1:
                name = name.rstrip('s')
            result.append("{} {}".format(value, name))
    return ', '.join(result[:granularity])

# Function convert seconds into day.decimal  
def ConvertSectoDay(n): 
    day = n // (24 * 3600) 
    #print(day) #keep day
    n = n % (24 * 3600) 
    daydec=(n/86400) # add this to day
    addem=day+daydec
    #https://stackoverflow.com/a/48812729/1602288
    holder='{:g}'.format(float('{:.{p}g}'.format(addem, p=5)))
    return(float(holder))

def elapsedseconds(posted, completed, expiration):
    formatuse = '%Y-%m-%d %H:%M:%S' # The format: see down this page:https://docs.python.org/3/library/datetime.html
    otherformat = '%Y-%m-%d'
    
    #failed projects were never completed, so in those cases, use the expiration date
    # if variable is None:
    if completed is None:
        try:
            clock = datetime.datetime.strptime(expiration,formatuse) 
        except:
            try:
                clock = datetime.datetime.strptime(expiration,otherformat)
            except:
                clock = 'stop'
    else:
        try:
            clock = datetime.datetime.strptime(completed,formatuse) 
        except:
            try:
                clock = datetime.datetime.strptime(completed,otherformat) 
            except:
                clock = 'stop'
            
    if clock != 'stop': 
        try:
            startclock = datetime.datetime.strptime(posted,formatuse)
        except:
            startclock = datetime.datetime.strptime(posted,otherformat)

        elapsed = (clock-startclock).total_seconds()
        
    else:
        elapsed = 123456789
    return(elapsed)

In [36]:
pd.set_option('display.max_columns', None)

In [37]:
## 'engine' is a connection to a database
## Here, we're using postgres, but sqlalchemy can connect to other things too.
engine = create_engine('postgresql://%s:%s@localhost/%s'%(username,pswd,dbname))
print('postgresql://%s:%s@localhost/%s'%(username,pswd,dbname))
print(engine.url)
# Replace localhost with IP address if accessing a remote server

postgresql://russell:bradypodion@localhost/donors_db
postgresql://russell:bradypodion@localhost/donors_db


In [5]:
## create a database (if it doesn't exist)
if not database_exists(engine.url):
    create_database(engine.url)
print(database_exists(engine.url))
print(engine.url)

True
postgresql://russell:bradypodion@localhost/donors_db


In [6]:
# connect:
con = None
con = psycopg2.connect(database = dbname, user = username, host='localhost', password=pswd)

In [7]:
###########################################################
### query: from historical data
hist_query = """
SELECT * FROM hist_projects;\n"""


hist_state = pd.read_sql_query(hist_query,con)
orig_hist_rows = len(hist_state.index)
hist_state = hist_state.drop_duplicates(keep='first')
dedup_hist_rows = len(hist_state.index)

print("Historical Raw Obs = "+str(orig_hist_rows)+"\nDeDup Obs = "+str(dedup_hist_rows))
   

Historical Raw Obs = 1425169
DeDup Obs = 1425169


In [8]:
### Close communication with the database
con.close()

In [9]:
############# previous EDA suggests these are all abberant classes with less than 100 values
hist_state=hist_state[hist_state.school_state != 'La']
hist_state=hist_state[hist_state.teacher_prefix != 'Mr. & Mrs.']
hist_state=hist_state[hist_state.teacher_prefix != 'Mr. & Mrs. ']

In [40]:
hist_state['latency_to_funded'] = hist_state.apply(lambda row: elapsedseconds(row['date_posted'],row['date_completed'],row['date_expiration']),axis=1)
# if the latency was non-addressable, the returned value = 123456789, so now we drop those
hist_state = hist_state[hist_state.latency_to_funded != 123456789]


In [42]:
hist_state['days_to_funding'] = hist_state.apply(lambda row: ConvertSectoDay(row.latency_to_funded),axis=1)

In [43]:
hist_state['succeed']= np.where(hist_state['funding_status']=='completed', 1, 0)

In [44]:
#replace 'f' and 't' with 'true' and 'false' for these columns
#https://stackoverflow.com/a/34697070/1602288

hist_state[['school_charter','school_magnet','school_year_round','school_nlns','school_kipp',
            'school_charter_ready_promise','teacher_teach_for_america','eligible_double_your_impact_match','eligible_almost_home_match']]= hist_state[['school_charter','school_magnet','school_year_round','school_nlns','school_kipp',
                             'school_charter_ready_promise','teacher_teach_for_america','eligible_double_your_impact_match','eligible_almost_home_match']].replace(['f','t'], ['false', 'true'])

#make new columns for posting time info, from splitting posting date = date_posted
hist_state[['posting_year','posting_month','posting_day']]=hist_state['date_posted'].str.split("-",expand=True)


In [51]:
trimmed=hist_state[['school_state','school_metro','school_charter', 'school_magnet', 
                    'school_year_round','teacher_prefix','teacher_teach_for_america', 
                    'primary_focus_subject','resource_type', 'poverty_level', 'grade_level',
                    'total_price_excluding_optional_support','students_reached',
                    'posting_month','days_to_funding','succeed']]

In [52]:
trimmed.astype({'posting_month':'int32'}).dtypes

school_state                               object
school_metro                               object
school_charter                             object
school_magnet                              object
school_year_round                          object
teacher_prefix                             object
teacher_teach_for_america                  object
primary_focus_subject                      object
resource_type                              object
poverty_level                              object
grade_level                                object
total_price_excluding_optional_support    float64
students_reached                          float64
posting_month                               int32
days_to_funding                           float64
succeed                                     int64
dtype: object

In [14]:
# One-hot encode categorical features
features = pd.get_dummies(trimmed)
print(features.shape)
features = features.dropna()
print(features.shape)

features.head(5)

(1425159, 118)
(1424961, 118)


Unnamed: 0,total_price_excluding_optional_support,students_reached,posting_month,succeed,school_state_AK,school_state_AL,school_state_AR,school_state_AZ,school_state_CA,school_state_CO,school_state_CT,school_state_DC,school_state_DE,school_state_FL,school_state_GA,school_state_HI,school_state_IA,school_state_ID,school_state_IL,school_state_IN,school_state_KS,school_state_KY,school_state_LA,school_state_MA,school_state_MD,school_state_ME,school_state_MI,school_state_MN,school_state_MO,school_state_MS,school_state_MT,school_state_NC,school_state_ND,school_state_NE,school_state_NH,school_state_NJ,school_state_NM,school_state_NV,school_state_NY,school_state_OH,school_state_OK,school_state_OR,school_state_PA,school_state_RI,school_state_SC,school_state_SD,school_state_TN,school_state_TX,school_state_UT,school_state_VA,school_state_VT,school_state_WA,school_state_WI,school_state_WV,school_state_WY,school_metro_rural,school_metro_suburban,school_metro_urban,school_charter_false,school_charter_true,school_magnet_false,school_magnet_true,school_year_round_false,school_year_round_true,teacher_prefix_Dr.,teacher_prefix_Mr.,teacher_prefix_Mrs.,teacher_prefix_Ms.,teacher_prefix_Teacher,teacher_teach_for_america_false,teacher_teach_for_america_true,primary_focus_subject_Applied Sciences,primary_focus_subject_Character Education,primary_focus_subject_Civics & Government,primary_focus_subject_College & Career Prep,primary_focus_subject_Community Service,primary_focus_subject_ESL,primary_focus_subject_Early Development,primary_focus_subject_Economics,primary_focus_subject_Environmental Science,primary_focus_subject_Extracurricular,primary_focus_subject_Financial Literacy,primary_focus_subject_Foreign Languages,primary_focus_subject_Gym & Fitness,primary_focus_subject_Health & Life Science,primary_focus_subject_Health & Wellness,primary_focus_subject_History & Geography,primary_focus_subject_Literacy,primary_focus_subject_Literature & Writing,primary_focus_subject_Mathematics,primary_focus_subject_Music,primary_focus_subject_Nutrition,primary_focus_subject_Other,primary_focus_subject_Parent Involvement,primary_focus_subject_Performing Arts,primary_focus_subject_Social Sciences,primary_focus_subject_Special Needs,primary_focus_subject_Sports,primary_focus_subject_Team Sports,primary_focus_subject_Visual Arts,resource_type_Books,resource_type_Other,resource_type_Supplies,resource_type_Technology,resource_type_Trips,resource_type_Visitors,poverty_level_high,poverty_level_high poverty,poverty_level_highest poverty,poverty_level_low,poverty_level_low poverty,poverty_level_minimal,poverty_level_moderate poverty,poverty_level_unknown,grade_level_Grades 3-5,grade_level_Grades 6-8,grade_level_Grades 9-12,grade_level_Grades PreK-2
0,813.5,24.0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1
1,463.78,23.0,4,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1
2,168.47,300.0,4,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,1,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0
3,266.19,20.0,4,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1
4,255.56,30.0,4,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0


In [15]:
count_failed= len(features[features['succeed']==0])
count_funded = len(features[features['succeed']==1])
pct_of_fail = count_failed/(count_failed+count_funded)
print("percentage of failed projects is ", pct_of_fail*100)
pct_of_fund = count_funded/(count_failed+count_funded)
print("percentage of funded projects is ", pct_of_fund*100)

percentage of failed projects is  30.126368370783478
percentage of funded projects is  69.87363162921652


In [None]:
trimmed.groupby('grade_level').mean()

In [None]:
trimmed.groupby('poverty_level').mean()

In [None]:
trimmed.groupby('resource_type').mean()#resource_type

In [16]:
# Use numpy to convert to arrays
import numpy as np

# Labels are the values we want to predict
labels = np.array(features['succeed'])

# Remove the labels from the features
# axis 1 refers to the columns
features= features.drop('succeed', axis = 1)

# Saving feature names for later use
feature_list = list(features.columns)

# Convert to numpy array
features = np.array(features)

In [17]:
print(features.shape)


(1424961, 117)


In [18]:
# Using Skicit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.25,
                                                                           random_state = 42)

In [None]:
print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)

In [None]:
# Import the model we are using
from sklearn.ensemble import RandomForestRegressor

# Instantiate model 
rf = RandomForestRegressor(n_estimators= 1000, random_state=42)

# Train the model on training data
rf.fit(train_features, train_labels);