In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import clear_output

import datetime
from datetime import timedelta, date #for time duration calculations
from dateutil.parser import parse #for fuzzy finding year

# Feature Scaling
from sklearn.preprocessing import StandardScaler




import pickle #for saving output files, pickles
from sys import stdout
import time #for time.sleep function to delay calls
from tqdm import tqdm #for updating loop
#from os import listdir
#from os.path import isfile, join
import glob #pattern matching and expansion.

## Python packages - you may have to pip install sqlalchemy, sqlalchemy_utils, and psycopg2.
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
from sqlalchemy.sql import table, column, select, update, insert
import psycopg2
from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT

import pandas as pd

# Connection settings for the local Postgres database that holds the donor data.
# Nothing secret is stored in the notebook: the password comes from the
# DONORS_DB_PASSWORD environment variable, or is typed at a prompt when that
# variable is unset. DONORS_DB_NAME / DONORS_DB_USER optionally override the defaults.
import getpass
import os

dbname = os.environ.get('DONORS_DB_NAME', 'donors_db')
username = os.environ.get('DONORS_DB_USER', 'russell')
pswd = os.environ.get('DONORS_DB_PASSWORD') or getpass.getpass('Postgres password for %s: ' % username)

In [16]:
# (unit name, unit length in seconds), ordered from the largest unit to the smallest.
intervals = (
    ('weeks', 604800),  # 60 * 60 * 24 * 7
    ('days', 86400),    # 60 * 60 * 24
    ('hours', 3600),    # 60 * 60
    ('minutes', 60),
    ('seconds', 1),
    )

def display_time(seconds, granularity=2):
    """Render a duration in seconds as text such as '1 week, 2 days'.

    Parameters
    ----------
    seconds : int or float
        Duration to describe. Fractional seconds are truncated so that float
        input (e.g. the result of ``timedelta.total_seconds()``) prints as
        '1 day' rather than '1.0 day'.
    granularity : int, default 2
        Maximum number of units to include, counted from the largest
        non-zero unit.

    Returns
    -------
    str
        Comma-separated '<count> <unit>' parts; an empty string when the
        duration is shorter than one second.
    """
    remaining = int(seconds)
    parts = []

    for name, count in intervals:
        value, remaining = divmod(remaining, count)
        if value:
            # Singular unit name for a count of exactly one ('1 day', not '1 days').
            label = name.rstrip('s') if value == 1 else name
            parts.append("{} {}".format(value, label))
    return ', '.join(parts[:granularity])

# Express a duration given in seconds as a decimal number of days.
def ConvertSectoDay(n):
    """Convert ``n`` seconds to days, rounded to 5 significant digits.

    The value is split into whole days plus the fractional remainder of a
    day, summed, and then rounded by formatting it with 5 significant digits.
    Returns a float.
    """
    whole_days, leftover_seconds = divmod(n, 24 * 3600)
    total_days = whole_days + (leftover_seconds / 86400)
    # Rounding to significant digits via string formatting:
    # https://stackoverflow.com/a/48812729/1602288
    rounded = float(format(total_days, '.5g'))
    return float(format(rounded, 'g'))


# Layouts accepted for the project date columns, tried in this order.
# See https://docs.python.org/3/library/datetime.html for the format codes.
_PROJECT_DATE_FORMATS = ('%Y-%m-%d %H:%M:%S', '%Y-%m-%d')


def _parse_project_date(text):
    """Parse ``text`` with the first layout that fits; return None when none does."""
    for layout in _PROJECT_DATE_FORMATS:
        try:
            return datetime.datetime.strptime(text, layout)
        except (TypeError, ValueError):
            # TypeError: ``text`` is not a string (e.g. None or NaN);
            # ValueError: the string does not match this layout.
            continue
    return None


def _project_end(completed, expiration):
    """Parse the completion date, or the expiration date when ``completed`` is None."""
    # Failed projects were never completed, so their end is the expiration date.
    return _parse_project_date(expiration if completed is None else completed)


def projectover(posted, completed, expiration):
    """Return the datetime at which a project stopped being open.

    Parameters
    ----------
    posted : str
        Posting date. Unused; kept so the signature matches ``elapsedseconds``.
    completed : str or None
        Completion date, or None for a project that was never completed.
    expiration : str
        Expiration date, used only when ``completed`` is None.

    Returns
    -------
    datetime.datetime
        The parsed end date, or 1900-01-01 00:00:00 as a placeholder when the
        relevant date cannot be parsed.
    """
    end = _project_end(completed, expiration)
    if end is None:
        return datetime.datetime(1900, 1, 1)
    return end


def elapsedseconds(posted, completed, expiration):
    """Return the seconds between posting and the end of a project.

    The end of the project is the completion date, or the expiration date
    when ``completed`` is None.

    Returns
    -------
    float or int
        The elapsed time in seconds, or the sentinel 123456789 when either the
        start or the end date cannot be parsed. An unparseable ``posted`` now
        also yields the sentinel rather than raising, so one bad row no longer
        aborts a whole ``DataFrame.apply``.
    """
    end = _project_end(completed, expiration)
    start = _parse_project_date(posted)
    if end is None or start is None:
        return 123456789
    return (end - start).total_seconds()

In [3]:
# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None

In [4]:
## 'engine' is a connection to a database
## Here, we're using postgres, but sqlalchemy can connect to other things too.
# Replace localhost with the server's IP address if accessing a remote server.
engine = create_engine('postgresql://%s:%s@localhost/%s'%(username,pswd,dbname))
# Never print the raw connection string: it contains the password and the cell
# output is saved with the notebook. repr() of a SQLAlchemy URL masks the password.
print(repr(engine.url))

postgresql://russell:***@localhost/donors_db
postgresql://russell:***@localhost/donors_db


In [5]:
## create a database (if it doesn't exist)
if not database_exists(engine.url):
    create_database(engine.url)
print(database_exists(engine.url))
# repr() masks the password, so the saved cell output does not leak it.
print(repr(engine.url))

True
postgresql://russell:***@localhost/donors_db


In [6]:
# connect: open a raw psycopg2 connection to the local database.
# The connection is handed to pd.read_sql_query and then closed once the table is loaded.
con = None
con = psycopg2.connect(database = dbname, user = username, host='localhost', password=pswd)

In [7]:
###########################################################
### query: pull the whole historical projects table
hist_query = """
SELECT * FROM hist_projects;\n"""

# Load the table, then count rows before and after removing exact duplicate rows.
hist_state = pd.read_sql_query(hist_query, con)
orig_hist_rows = hist_state.shape[0]

hist_state = hist_state.drop_duplicates()
dedup_hist_rows = hist_state.shape[0]

row_report = "Historical Raw Obs = "+str(orig_hist_rows)+"\nDeDup Obs = "+str(dedup_hist_rows)
print(row_report)
   

Historical Raw Obs = 1425169
DeDup Obs = 1425169


In [8]:
### Close communication with the database
# The table is already loaded into hist_state, so the connection is no longer needed.
con.close()

In [9]:
# Previous EDA suggests these are all aberrant classes with fewer than 100 rows each.
aberrant_prefixes = ['Mr. & Mrs.', 'Mr. & Mrs. ']
keep_rows = (hist_state.school_state != 'La') & ~hist_state.teacher_prefix.isin(aberrant_prefixes)
# .copy() makes the filtered frame independent of the original, so the columns
# added to it in later cells do not trigger SettingWithCopyWarning.
hist_state = hist_state[keep_rows].copy()

In [17]:
# End date of every project: completion date, or expiration date when it was never completed.
hist_state['projectover'] = [
    projectover(posted, completed, expiration)
    for posted, completed, expiration in zip(
        hist_state['date_posted'], hist_state['date_completed'], hist_state['date_expiration']
    )
]

In [10]:
# Seconds between posting and the end of each project (completion, or expiration if never completed).
hist_state['latency_to_funded'] = hist_state.apply(lambda row: elapsedseconds(row['date_posted'],row['date_completed'],row['date_expiration']),axis=1)
# if the latency was non-addressable, the returned value = 123456789, so now we drop those
# .copy() keeps the filtered frame independent, so later column assignments do not
# raise SettingWithCopyWarning.
hist_state = hist_state[hist_state.latency_to_funded != 123456789].copy()


In [11]:
# Same latency, expressed in decimal days instead of seconds.
hist_state['days_to_funding'] = hist_state['latency_to_funded'].apply(ConvertSectoDay)

In [12]:
# Binary target: 1 when funding_status is 'completed', 0 for every other status.
hist_state['succeed'] = (hist_state['funding_status'] == 'completed').astype(int)

In [13]:
# Columns that store booleans as the single letters 'f' / 't'.
flag_columns = [
    'school_charter', 'school_magnet', 'school_year_round', 'school_nlns', 'school_kipp',
    'school_charter_ready_promise', 'teacher_teach_for_america',
    'eligible_double_your_impact_match', 'eligible_almost_home_match',
]

# Spell the letters out as 'false' / 'true'
# https://stackoverflow.com/a/34697070/1602288
hist_state[flag_columns] = hist_state[flag_columns].replace(['f','t'], ['false', 'true'])

# Split date_posted on "-" into three new posting-time columns
date_parts = hist_state['date_posted'].str.split("-",expand=True)
hist_state[['posting_year','posting_month','posting_day']] = date_parts


In [22]:
# Preview date_posted parsed into pandas Timestamps. The result is only displayed;
# it is not assigned back, so the column in hist_state is left unchanged.
#hist_state.astype({'date_posted':'Timestamp'}).dtypes #cast posting month as integer
hist_state['date_posted'].apply(pd.Timestamp)

0         2011-04-29
1         2011-04-29
2         2011-04-28
3         2011-04-28
4         2011-04-28
             ...    
1425164   2016-10-10
1425165   2016-10-10
1425166   2016-10-10
1425167   2016-10-10
1425168   2016-10-11
Name: date_posted, Length: 1425071, dtype: datetime64[ns]

In [None]:
def active_at_posting(reference, posted, finished):
    """Tell whether a project was open at the moment ``reference``.

    A project counts as active when it was posted strictly before
    ``reference`` and ended strictly after it.

    Parameters
    ----------
    reference : str, datetime or pandas.Timestamp
        The moment to test, e.g. the posting date of another project.
    posted : str, datetime or pandas.Timestamp
        When the candidate project was posted.
    finished : str, datetime or pandas.Timestamp
        When the candidate project was completed or expired.

    Returns
    -------
    bool
        True when ``posted < reference < finished``. A missing date (NaT)
        makes the comparison False.
    """
    reference = pd.Timestamp(reference)
    return bool(pd.Timestamp(posted) < reference < pd.Timestamp(finished))

In [28]:
# Worked example of the "was this project active at a given posting date?" check.
# NOTE(review): [0] and [1000] look rows up by index label, not by position, so this
# fails if either label was removed by the earlier filters - confirm.
# ref: posting date of the row labelled 0, used as the reference moment
ref = pd.Timestamp(hist_state['date_posted'][0])
print(ref)
print(type(ref))
# c1: posting date of the row labelled 1000
c1 =  pd.Timestamp(hist_state['date_posted'][1000])
print(c1)
print(type(c1))
# c2: end date (projectover) of the row labelled 1000
c2 = hist_state['projectover'][1000]
print(c2)
print(type(c2))
# The row counts as active when the reference falls strictly between its posting and end dates.
if ref<c2 and ref>c1:
    print('1 active')

2011-04-29 00:00:00
<class 'pandas._libs.tslibs.timestamps.Timestamp'>
2011-04-10 00:00:00
<class 'pandas._libs.tslibs.timestamps.Timestamp'>
2011-04-15 00:00:00
<class 'pandas._libs.tslibs.timestamps.Timestamp'>


In [33]:
# Small working sample: the first 1000 rows, ordered by posting date (oldest first).
testcase = hist_state.head(1000).sort_values('date_posted')
testcase.tail()

Unnamed: 0,index,_projectid,_teacher_acctid,_schoolid,school_ncesid,school_latitude,school_longitude,school_city,school_state,school_zip,school_metro,school_district,school_county,school_charter,school_magnet,school_year_round,school_nlns,school_kipp,school_charter_ready_promise,teacher_prefix,teacher_teach_for_america,primary_focus_subject,primary_focus_area,secondary_focus_subject,secondary_focus_area,resource_type,poverty_level,grade_level,vendor_shipping_charges,sales_tax,payment_processing_charges,fulfillment_labor_materials,total_price_excluding_optional_support,total_price_including_optional_support,students_reached,total_donations,num_donors,eligible_double_your_impact_match,eligible_almost_home_match,funding_status,date_posted,date_completed,date_thank_you_packet_mailed,date_expiration,calendar_completed,year_completed,calendar_expired,latency_to_funded,days_to_funding,succeed,posting_year,posting_month,posting_day,projectover
10,418,334dbf9e1515268f71e38f348b87b1d2,9d7051e2611cebdb758f1c7bd09360ac,cb1db2e3c1b355efb4e6f7b484ebf20d,370045000000.0,35.475513,-82.580324,Arden,NC,28704.0,suburban,Buncombe Co School District,Buncombe,False,False,False,False,False,False,Mrs.,False,Literacy,Literacy & Language,Literature & Writing,Literacy & Language,Books,high,Grades PreK-2,0.0,8.42,1.62,35.0,153.04,180.05,20.0,178.28,5,True,False,completed,2011-04-28,2011-04-29,,2011-09-27,2011-04-29,2011,2011-09-27,86400.0,1.0,1,2011,4,28,2011-04-29
12,436,fc5f8f5a43317a770443f5b8a47f01d2,c59ef8deffb929c27d181ec9ebfc490b,a0271c9c25e2538e103635d0fc55e619,403024000000.0,36.136589,-95.840123,Tulsa,OK,74128.0,urban,Tulsa Independent Sch Dist,Tulsa,False,False,False,False,False,False,Mr.,True,Social Sciences,History & Civics,Literature & Writing,Literacy & Language,Trips,high,Grades 3-5,0.0,0.0,5.78,35.0,425.78,500.92,20.0,500.92,1,False,False,completed,2011-04-28,2011-04-28,,2011-05-05,2011-04-28,2011,2011-05-05,0.0,0.0,1,2011,4,28,2011-04-28
7,347,2048bb46e755111a917105c7d0ef8c79,9935941422132e4c8cda75c7544ab46e,3dc2a7740a9d2c42e5e80f5456787e2d,540105000000.0,40.04245,-80.662671,Wheeling,WV,26003.0,urban,Ohio Co School District,Ohio,False,False,False,False,False,False,Mrs.,False,Special Needs,Special Needs,Literacy,Literacy & Language,Books,high,Grades 6-8,12.0,0.0,2.79,35.0,235.78,277.39,20.0,256.58,3,True,False,completed,2011-04-28,2011-04-30,,2011-09-27,2011-04-30,2011,2011-09-27,172800.0,2.0,1,2011,4,28,2011-04-30
1,185,4ff621b7ae38d10d350c207bd454b0a9,db7e23fd42f3d6cfd2aa673e008b73f6,a7cf5b5a11e2586b6deb168d71c763ef,63393010000.0,36.700132,-121.65776,Salinas,CA,93906.0,urban,Salinas City Elem Sch District,Monterey,False,False,False,False,False,False,Ms.,False,Mathematics,Math & Science,Literacy,Literacy & Language,Supplies,high,Grades PreK-2,3.84,35.14,5.76,35.0,463.78,545.62,23.0,523.92,20,True,False,completed,2011-04-29,2011-04-30,,2011-09-28,2011-04-30,2011,2011-09-28,86400.0,1.0,1,2011,4,29,2011-04-30
0,163,06dd1b6687a1cc9c3b6d005f0e678228,ec1afc0b1a77a3603718dcd1ca484106,26e476f93a7b248da90b883b8d45f3ff,181281000000.0,39.81611,-86.283113,Indianapolis,IN,46214.0,urban,Msd Of Wayne Twp,Marion,False,False,False,False,False,False,Mrs.,False,Literacy,Literacy & Language,,,Books,high,Grades PreK-2,0.0,0.0,11.5,35.0,813.5,957.06,24.0,,0,False,False,expired,2011-04-29,,,2011-09-28,,0,2011-09-28,13132800.0,152.0,0,2011,4,29,2011-09-28


In [42]:
def deltatimecalculator(start, stop):
    """Return the number of whole days from ``start`` to ``stop``.

    Both arguments are converted with ``pd.Timestamp``; the result is the
    ``days`` component of their difference.
    """
    return (pd.Timestamp(stop) - pd.Timestamp(start)).days

In [44]:
# Whole days between the posting date and the end date of each sampled project.
testcase['delta'] = [
    deltatimecalculator(posted, ended)
    for posted, ended in zip(testcase['date_posted'], testcase['projectover'])
]
testcase.tail(10)

Unnamed: 0,index,_projectid,_teacher_acctid,_schoolid,school_ncesid,school_latitude,school_longitude,school_city,school_state,school_zip,school_metro,school_district,school_county,school_charter,school_magnet,school_year_round,school_nlns,school_kipp,school_charter_ready_promise,teacher_prefix,teacher_teach_for_america,primary_focus_subject,primary_focus_area,secondary_focus_subject,secondary_focus_area,resource_type,poverty_level,grade_level,vendor_shipping_charges,sales_tax,payment_processing_charges,fulfillment_labor_materials,total_price_excluding_optional_support,total_price_including_optional_support,students_reached,total_donations,num_donors,eligible_double_your_impact_match,eligible_almost_home_match,funding_status,date_posted,date_completed,date_thank_you_packet_mailed,date_expiration,calendar_completed,year_completed,calendar_expired,latency_to_funded,days_to_funding,succeed,posting_year,posting_month,posting_day,projectover,delta
5,286,39249b59347f25339689cb417e28864d,db7e23fd42f3d6cfd2aa673e008b73f6,a7cf5b5a11e2586b6deb168d71c763ef,63393010000.0,36.700132,-121.65776,Salinas,CA,93906.0,urban,Salinas City Elem Sch District,Monterey,False,False,False,False,False,False,Ms.,False,Mathematics,Math & Science,Literacy,Literacy & Language,Supplies,high,Grades PreK-2,8.0,73.17,11.99,35.0,927.78,1091.51,23.0,,0,False,False,expired,2011-04-28,,,2011-09-28,,0,2011-09-28,13219200.0,153.0,0,2011,4,28,2011-09-28,153
6,294,ec905c8224a8ca9f65cbe99c31512c26,c809fd540bd8ad186b6ab232f7337557,b20a75b2ff58323a02c84c1711ab3349,,41.798084,-87.705074,Chicago,IL,60632.0,urban,Chicago Psd-Area 54,Cook,False,False,False,False,False,False,Ms.,False,Literature & Writing,Literacy & Language,ESL,Literacy & Language,Books,high,Grades PreK-2,0.0,0.0,2.23,35.0,185.69,218.46,30.0,218.45,2,True,False,completed,2011-04-28,2011-04-28,2011-04-29,2011-09-28,2011-04-28,2011,2011-09-28,0.0,0.0,1,2011,4,28,2011-04-28,0
11,435,9b7c79e58ef390c487cbb91ce4ef8e9b,df748f43573462260a35d964725022d7,8fb4f73621a205729b26cd1eaab900a5,,32.990452,-92.709702,Lk Providence,LA,71254.0,rural,East Carroll Parish Sch Dist,East Carroll,False,False,False,False,False,False,Ms.,True,History & Geography,History & Civics,Social Sciences,History & Civics,Technology,high,Grades 6-8,12.0,11.2,1.92,35.0,188.09,221.28,87.0,221.28,3,False,False,reallocated,2011-04-28,2011-04-28,,2011-05-28,2011-04-28,2011,2011-05-28,0.0,0.0,0,2011,4,28,2011-04-28,0
8,374,379fa507f3c2b08e29cb723c8c40cbc1,a006826c170f91f85ff80dc5a132fade,ed047ff4a2b7d048537e32aa1312e949,,40.631962,-73.923897,Brooklyn,NY,11234.0,urban,Integrated Curriculum and Instruction Learning...,Brooklyn,False,True,False,False,False,False,Ms.,False,Health & Wellness,Health & Sports,Early Development,Applied Learning,Supplies,high,Grades PreK-2,40.02,0.0,5.0,35.0,413.54,486.52,18.0,,0,False,False,expired,2011-04-28,,,2011-09-27,,0,2011-09-27,13132800.0,152.0,0,2011,4,28,2011-09-27,152
9,398,703a5ef45ce441d8d313c3bb45f3b166,db7e23fd42f3d6cfd2aa673e008b73f6,a7cf5b5a11e2586b6deb168d71c763ef,63393010000.0,36.700132,-121.65776,Salinas,CA,93906.0,urban,Salinas City Elem Sch District,Monterey,False,False,False,False,False,False,Ms.,False,Literacy,Literacy & Language,Literature & Writing,Literacy & Language,Supplies,high,Grades PreK-2,83.06,63.33,10.38,35.0,883.91,1039.89,23.0,,0,False,False,expired,2011-04-28,,,2011-09-21,,0,2011-09-21,12614400.0,146.0,0,2011,4,28,2011-09-21,146
10,418,334dbf9e1515268f71e38f348b87b1d2,9d7051e2611cebdb758f1c7bd09360ac,cb1db2e3c1b355efb4e6f7b484ebf20d,370045000000.0,35.475513,-82.580324,Arden,NC,28704.0,suburban,Buncombe Co School District,Buncombe,False,False,False,False,False,False,Mrs.,False,Literacy,Literacy & Language,Literature & Writing,Literacy & Language,Books,high,Grades PreK-2,0.0,8.42,1.62,35.0,153.04,180.05,20.0,178.28,5,True,False,completed,2011-04-28,2011-04-29,,2011-09-27,2011-04-29,2011,2011-09-27,86400.0,1.0,1,2011,4,28,2011-04-29,1
12,436,fc5f8f5a43317a770443f5b8a47f01d2,c59ef8deffb929c27d181ec9ebfc490b,a0271c9c25e2538e103635d0fc55e619,403024000000.0,36.136589,-95.840123,Tulsa,OK,74128.0,urban,Tulsa Independent Sch Dist,Tulsa,False,False,False,False,False,False,Mr.,True,Social Sciences,History & Civics,Literature & Writing,Literacy & Language,Trips,high,Grades 3-5,0.0,0.0,5.78,35.0,425.78,500.92,20.0,500.92,1,False,False,completed,2011-04-28,2011-04-28,,2011-05-05,2011-04-28,2011,2011-05-05,0.0,0.0,1,2011,4,28,2011-04-28,0
7,347,2048bb46e755111a917105c7d0ef8c79,9935941422132e4c8cda75c7544ab46e,3dc2a7740a9d2c42e5e80f5456787e2d,540105000000.0,40.04245,-80.662671,Wheeling,WV,26003.0,urban,Ohio Co School District,Ohio,False,False,False,False,False,False,Mrs.,False,Special Needs,Special Needs,Literacy,Literacy & Language,Books,high,Grades 6-8,12.0,0.0,2.79,35.0,235.78,277.39,20.0,256.58,3,True,False,completed,2011-04-28,2011-04-30,,2011-09-27,2011-04-30,2011,2011-09-27,172800.0,2.0,1,2011,4,28,2011-04-30,2
1,185,4ff621b7ae38d10d350c207bd454b0a9,db7e23fd42f3d6cfd2aa673e008b73f6,a7cf5b5a11e2586b6deb168d71c763ef,63393010000.0,36.700132,-121.65776,Salinas,CA,93906.0,urban,Salinas City Elem Sch District,Monterey,False,False,False,False,False,False,Ms.,False,Mathematics,Math & Science,Literacy,Literacy & Language,Supplies,high,Grades PreK-2,3.84,35.14,5.76,35.0,463.78,545.62,23.0,523.92,20,True,False,completed,2011-04-29,2011-04-30,,2011-09-28,2011-04-30,2011,2011-09-28,86400.0,1.0,1,2011,4,29,2011-04-30,1
0,163,06dd1b6687a1cc9c3b6d005f0e678228,ec1afc0b1a77a3603718dcd1ca484106,26e476f93a7b248da90b883b8d45f3ff,181281000000.0,39.81611,-86.283113,Indianapolis,IN,46214.0,urban,Msd Of Wayne Twp,Marion,False,False,False,False,False,False,Mrs.,False,Literacy,Literacy & Language,,,Books,high,Grades PreK-2,0.0,0.0,11.5,35.0,813.5,957.06,24.0,,0,False,False,expired,2011-04-29,,,2011-09-28,,0,2011-09-28,13132800.0,152.0,0,2011,4,29,2011-09-28,152


In [41]:
# deltatimecalculator already returns whole days (integers), so the column can be
# listed directly; calling .days on those integers would raise AttributeError.
testcase['delta'].tolist()

[11,
 4,
 1,
 5,
 2,
 2,
 1,
 36,
 2,
 4,
 12,
 3,
 10,
 1,
 1,
 3,
 1,
 4,
 16,
 11,
 9,
 1,
 5,
 2,
 4,
 6,
 3,
 2,
 5,
 1,
 2,
 6,
 2,
 2,
 10,
 27,
 1,
 5,
 2,
 7,
 8,
 1,
 1,
 1,
 150,
 2,
 5,
 1,
 1,
 2,
 1,
 5,
 4,
 3,
 5,
 19,
 18,
 1,
 7,
 29,
 10,
 16,
 5,
 1,
 2,
 150,
 1,
 5,
 3,
 18,
 0,
 26,
 7,
 1,
 7,
 1,
 1,
 4,
 1,
 1,
 150,
 1,
 11,
 3,
 1,
 2,
 0,
 3,
 8,
 2,
 1,
 26,
 8,
 18,
 3,
 3,
 1,
 4,
 5,
 26,
 3,
 16,
 1,
 3,
 2,
 5,
 17,
 1,
 8,
 1,
 1,
 0,
 8,
 11,
 15,
 3,
 1,
 6,
 0,
 9,
 11,
 0,
 1,
 0,
 11,
 5,
 4,
 12,
 15,
 35,
 3,
 0,
 0,
 11,
 16,
 2,
 4,
 0,
 0,
 3,
 1,
 0,
 3,
 11,
 4,
 3,
 0,
 1,
 1,
 40,
 14,
 4,
 3,
 4,
 7,
 1,
 3,
 0,
 2,
 0,
 0,
 14,
 1,
 0,
 0,
 7,
 3,
 0,
 8,
 27,
 2,
 4,
 4,
 20,
 3,
 3,
 34,
 2,
 10,
 0,
 6,
 2,
 4,
 8,
 1,
 6,
 1,
 4,
 3,
 1,
 15,
 2,
 1,
 5,
 12,
 150,
 3,
 15,
 12,
 1,
 2,
 2,
 3,
 6,
 34,
 7,
 0,
 2,
 3,
 3,
 1,
 26,
 4,
 5,
 10,
 151,
 11,
 3,
 10,
 7,
 7,
 8,
 16,
 6,
 2,
 8,
 1,
 151,
 8,
 2,
 39,
 2,
 1,
 2,
 8,

In [None]:
# Print every calendar day of one project's lifetime.
# NOTE(review): sdate and delta were never defined in the notebook; the first row of
# testcase is used here as an example - substitute the project you actually want.
sdate = pd.Timestamp(testcase['date_posted'].iloc[0])
delta = pd.Timestamp(testcase['projectover'].iloc[0]) - sdate

for i in range(delta.days + 1):
    day = sdate + timedelta(days=i)
    print(day)

In [None]:
# Wider column set, kept for reference (currently unused):
# trimmed=hist_state[['school_state','school_metro','school_charter', 'school_magnet', 
#                     'school_year_round','teacher_prefix','teacher_teach_for_america', 
#                     'primary_focus_subject','resource_type', 'poverty_level', 'grade_level',
#                     'total_price_excluding_optional_support','students_reached',
#                     'posting_month','days_to_funding']]

model_columns = ['total_price_excluding_optional_support', 'students_reached',
                 'posting_month', 'days_to_funding', 'succeed']

# Keep projects that ended in under 150 days, restricted to the modelling columns,
# and drop any row with a missing value in those columns.
trimmed = hist_state.loc[hist_state['days_to_funding'] < 150, model_columns].dropna()
trimmed.shape

In [None]:
# Cast posting month to integer. astype returns a new frame, so the result has to be
# assigned back for the cast to take effect.
trimmed = trimmed.astype({'posting_month':'int32'})
trimmed.dtypes

In [None]:
# Split the modelling frame by outcome: funded projects versus unfunded ones.
funded_x = trimmed.loc[trimmed.succeed.eq(1)]
notfund_y = trimmed.loc[trimmed.succeed.eq(0)]

In [None]:
# Funding durations of each outcome group. Selecting straight from `trimmed` keeps the
# cell re-runnable: it no longer overwrites the frames it reads from.
funded_x = trimmed.loc[trimmed['succeed']==1, 'days_to_funding']
notfund_y = trimmed.loc[trimmed['succeed']==0, 'days_to_funding']

In [None]:
sns.set_context("poster", font_scale=.6)

# Overlaid histograms of days_to_funding for funded and unfunded projects,
# sharing 100 evenly spaced bin edges between 0 and 150.
bins = np.linspace(0, 150, 100)
fig, ax = plt.subplots()
ax.hist(funded_x, bins, alpha=0.25, label='Funded')
ax.hist(notfund_y, bins, alpha=0.5, label='NOT funded')
ax.legend(loc='upper right')

ax.set_xlabel("Project duration")
ax.set_ylabel("Count")

plt.show()

In [None]:
sns.set_context("poster", font_scale=1.3)

# Distribution of days_to_funding over all kept projects, drawn on an explicit axes.
fig, ax = plt.subplots(figsize=(12, 8))
sns.distplot(trimmed["days_to_funding"].dropna(), ax=ax)
ax.set_xlim(left=1, right=150)
fig.tight_layout()

In [None]:
from sklearn import metrics 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing

In [None]:
#binary output = is a project funded or not?
y = trimmed.succeed

# Predictors: every remaining column, standardised with preprocessing.scale.
# NOTE(review): days_to_funding is computed from the completion/expiration date, so it
# may leak the outcome into the predictors - confirm that it belongs in x.
x = trimmed.drop(['succeed'], axis=1)
x_scaled = preprocessing.scale(x)
# create training and testing vars; the fixed seed makes the split reproducible across runs
x_train, x_test, y_train, y_test = train_test_split(x_scaled, y, test_size=0.25, random_state=42)
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

In [None]:
# Fit a default logistic regression on the training split, then predict the test split.
logistic_regression = LogisticRegression()
logistic_regression.fit(x_train, y_train)
model = logistic_regression  # fit() returns the estimator itself, so both names are one object
predictions = model.predict(x_test)

print("Score:", model.score(x_test, y_test))

In [None]:
# Accuracy on the held-out split, expressed as a percentage.
y_pred = logistic_regression.predict(x_test)
accuracy = metrics.accuracy_score(y_test, y_pred)

accuracy_percentage = accuracy * 100
accuracy_percentage

In [None]:
# Confusion matrix of the test-split predictions (rows: actual class, columns: predicted class).
cm = metrics.confusion_matrix(y_test, predictions)
print(cm)

In [None]:
# Use score method to get accuracy of model
score = model.score(x_test, y_test)
print(score)

# plt.rcParams is the same settings object as matplotlib.rcParams. Using it here avoids
# a NameError on a fresh kernel, because `matplotlib` is only imported in a later cell.
plt.rcParams.update({'font.size': 22})

# Heatmap of the confusion matrix, titled with the accuracy score.
plt.figure(figsize=(9,9))
sns.heatmap(cm, annot=True, fmt=".3f", linewidths=.5, square = True, cmap = 'magma');
plt.ylabel('Actual label');
plt.xlabel('Predicted label');
all_sample_title = 'Accuracy Score: {0}'.format(score)
plt.title(all_sample_title, size = 15);

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

import matplotlib

# font = {'family' : 'normal',
#         'weight' : 'bold',
#         'size'   : 30}

# matplotlib.rc('font', **font)
matplotlib.rcParams.update({'font.size': 42})

# Score the positive class once and use it for both the AUC and the curve. Computing
# the AUC from hard 0/1 predictions gives the area of a single-threshold curve, which
# does not match the curve plotted below.
positive_scores = logistic_regression.predict_proba(x_test)[:,1]
logit_roc_auc = roc_auc_score(y_test, positive_scores)
fpr, tpr, thresholds = roc_curve(y_test, positive_scores)
plt.figure(figsize=(9, 9))
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc,linewidth=4)
# Dashed diagonal: the chance-level reference line.
plt.plot([0, 1], [0, 1],'r--',linewidth=4)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()


In [None]:
# Feature names followed by the fitted coefficients, for side-by-side reading.
print(x.columns)
print(logistic_regression.coef_)


In [None]:
clf=logistic_regression

# Relative importance: absolute coefficient of each feature, rescaled so the largest is 100.
feature_importance = abs(clf.coef_[0])
feature_importance = 100.0 * (feature_importance / feature_importance.max())
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + .5

matplotlib.rcParams.update({'font.size': 42})
# Create the figure once, at the intended size. A separate plt.figure(figsize=...) call
# followed by plt.figure() leaves an empty figure behind and draws on a default-sized one.
featfig = plt.figure(figsize=(24, 16))
featax = featfig.add_subplot(1, 1, 1)
featax.barh(pos, feature_importance[sorted_idx], align='center')
featax.set_yticks(pos)
featax.set_yticklabels(np.array(x.columns)[sorted_idx], fontsize=18)
featax.set_xlabel('Relative Feature Importance');

In [None]:
# Map a calendar month onto a school-year month (school year starts in July).
def ConvertGregorian_to_School(m):
    """Shift a calendar month number by half a year.

    ``m`` is converted with ``int()``. Months 7 and above move back by six
    (July -> 1, December -> 6); lower months move forward by six
    (January -> 7, June -> 12). Returns an int.
    """
    month = int(m)
    shift = -6 if month >= 7 else 6
    return int(month + shift)

In [None]:
# Number of projects per posting month, most frequent first.
trimmed['posting_month'].value_counts()

In [None]:
# Re-express posting_month as a school-year month.
trimmed['posting_month'] = trimmed['posting_month'].apply(ConvertGregorian_to_School)

In [None]:
#binary output = is a project funded or not?
y = trimmed.succeed

# Predictors: every remaining column, standardised with preprocessing.scale.
# NOTE(review): days_to_funding is computed from the completion/expiration date, so it
# may leak the outcome into the predictors - confirm that it belongs in x.
x = trimmed.drop(['succeed'], axis=1)
x_scaled = preprocessing.scale(x)
# create training and testing vars; the fixed seed makes the split reproducible across runs
x_train, x_test, y_train, y_test = train_test_split(x_scaled, y, test_size=0.25, random_state=42)
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

In [None]:
# Fit a default logistic regression on the training split, then predict the test split.
logistic_regression = LogisticRegression()
logistic_regression.fit(x_train, y_train)
model = logistic_regression  # fit() returns the estimator itself, so both names are one object
predictions = model.predict(x_test)

print("Score:", model.score(x_test, y_test))

In [None]:
# Use score method to get accuracy of model
score = model.score(x_test, y_test)
print(score)

# Recompute the confusion matrix for the model that was just refitted; otherwise the
# heatmap would still show the matrix of the earlier model.
cm = metrics.confusion_matrix(y_test, predictions)

plt.rcParams.update({'font.size': 22})

plt.figure(figsize=(9,9))
sns.heatmap(cm, annot=True, fmt=".3f", linewidths=.5, square = True, cmap = 'magma');
plt.ylabel('Actual label');
plt.xlabel('Predicted label');
all_sample_title = 'Accuracy Score: {0}'.format(score)
plt.title(all_sample_title, size = 15);

In [None]:
# Score the positive class once and use it for both the AUC and the curve. Computing
# the AUC from hard 0/1 predictions gives the area of a single-threshold curve, which
# does not match the curve plotted below.
positive_scores = logistic_regression.predict_proba(x_test)[:,1]
logit_roc_auc = roc_auc_score(y_test, positive_scores)
fpr, tpr, thresholds = roc_curve(y_test, positive_scores)
plt.figure(figsize=(9, 9))
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc,linewidth=4)
# Dashed diagonal: the chance-level reference line.
plt.plot([0, 1], [0, 1],'r--',linewidth=4)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()


In [None]:
clf=logistic_regression

# Relative importance: absolute coefficient of each feature, rescaled so the largest is 100.
feature_importance = abs(clf.coef_[0])
feature_importance = 100.0 * (feature_importance / feature_importance.max())
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + .5

matplotlib.rcParams.update({'font.size': 42})
# Create the figure once, at the intended size. A separate plt.figure(figsize=...) call
# followed by plt.figure() leaves an empty figure behind and draws on a default-sized one.
featfig = plt.figure(figsize=(24, 16))
featax = featfig.add_subplot(1, 1, 1)
featax.barh(pos, feature_importance[sorted_idx], align='center')
featax.set_yticks(pos)
featax.set_yticklabels(np.array(x.columns)[sorted_idx], fontsize=18)
featax.set_xlabel('Relative Feature Importance');

In [None]:
# One-hot encode any categorical columns, then drop rows with missing values.
# The shape is printed before and after the drop to show how many rows were lost.
features = pd.get_dummies(trimmed)
print(features.shape)

features = features.dropna()
print(features.shape)

features.head()

In [None]:
#binary output = is a project funded or not?
y = features.succeed

# Predictors: every remaining column, standardised with preprocessing.scale.
# NOTE(review): days_to_funding is computed from the completion/expiration date, so it
# may leak the outcome into the predictors - confirm that it belongs in x.
x = features.drop(['succeed'], axis=1)
x_scaled = preprocessing.scale(x)
# create training and testing vars; the fixed seed makes the split reproducible across runs
x_train, x_test, y_train, y_test = train_test_split(x_scaled, y, test_size=0.25, random_state=42)
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

In [None]:
# Fit a default logistic regression on the training split, then predict the test split.
logistic_regression = LogisticRegression()
logistic_regression.fit(x_train, y_train)
model = logistic_regression  # fit() returns the estimator itself, so both names are one object
predictions = model.predict(x_test)

print("Score:", model.score(x_test, y_test))

In [None]:
# Class balance: share of failed versus funded projects in the encoded data.
count_failed = len(features.loc[features['succeed'] == 0])
count_funded = len(features.loc[features['succeed'] == 1])
total_projects = count_failed + count_funded

pct_of_fail = count_failed / total_projects
print("percentage of failed projects is ", pct_of_fail*100)

pct_of_fund = count_funded / total_projects
print("percentage of funded projects is ", pct_of_fund*100)

In [None]:
# grade_level is not a column of trimmed, so take it from the matching hist_state rows
# (same index labels) and average trimmed's numeric columns within each grade level.
trimmed.select_dtypes('number').groupby(hist_state.loc[trimmed.index, 'grade_level']).mean()

In [None]:
# poverty_level is not a column of trimmed, so take it from the matching hist_state rows
# (same index labels) and average trimmed's numeric columns within each poverty level.
trimmed.select_dtypes('number').groupby(hist_state.loc[trimmed.index, 'poverty_level']).mean()

In [None]:
# resource_type is not a column of trimmed, so take it from the matching hist_state rows
# (same index labels) and average trimmed's numeric columns within each resource type.
trimmed.select_dtypes('number').groupby(hist_state.loc[trimmed.index, 'resource_type']).mean()

In [None]:
# Use numpy to convert to arrays
import numpy as np

# Target vector: the values we want to predict
labels = np.array(features.succeed)

# Everything except the target column becomes the predictor matrix
features = features.drop(columns=['succeed'])

# Remember the column names; the array below no longer carries them
feature_list = features.columns.tolist()

# Convert to numpy array
features = np.array(features)

In [None]:
# Shape of the predictor array: (rows, feature columns).
print(features.shape)


In [None]:
# Using scikit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the seed keeps the split reproducible
train_features, test_features, train_labels, test_labels = train_test_split(
    features,
    labels,
    test_size=0.25,
    random_state=42,
)

In [None]:
# Report the shape of each piece of the train/test split.
for split_name, split_array in (
    ('Training Features Shape:', train_features),
    ('Training Labels Shape:', train_labels),
    ('Testing Features Shape:', test_features),
    ('Testing Labels Shape:', test_labels),
):
    print(split_name, split_array.shape)

In [None]:
# Import the model we are using
from sklearn.ensemble import RandomForestRegressor

# Instantiate model: 1000 trees with a fixed seed. n_jobs=-1 builds the trees on all
# available CPU cores, which shortens the fit without changing the seeded result.
# NOTE(review): the labels are a 0/1 flag, so RandomForestClassifier may be the
# intended estimator - confirm before relying on the predictions.
rf = RandomForestRegressor(n_estimators= 1000, random_state=42, n_jobs=-1)

# Train the model on training data
rf.fit(train_features, train_labels);