In [1]:
#%reset

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import clear_output

In [3]:
pd.set_option('display.max_columns', None)

In [4]:
from datetime import datetime, timedelta, date #for time duration calculations
from dateutil.parser import parse #for fuzzy finding year

In [5]:
import pickle #for saving output files, pickles
from sys import stdout
import time #for time.sleep function to delay calls
from tqdm import tqdm #for updating loop
#from os import listdir
#from os.path import isfile, join
import glob #pattern matching and expansion.

In [6]:
## Python packages - you may have to pip install sqlalchemy, sqlalchemy_utils, and psycopg2.
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
from sqlalchemy.sql import table, column, select, update, insert
import psycopg2
from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT

import pandas as pd

# Database connection settings for the DonorsChoose project database.
# They are read from the environment so the password is not stored in the
# notebook; the database name and user fall back to the values used so far.
import os

dbname = os.environ.get('DONORS_DB_NAME', 'donors_db')
username = os.environ.get('DONORS_DB_USER', 'russell')
# Deliberately no default: export DONORS_DB_PASSWORD before starting the kernel.
# When it is unset an empty string is passed on to the connection calls.
pswd = os.environ.get('DONORS_DB_PASSWORD', '')

In [7]:
## 'engine' is a connection to a database
## Here, we're using postgres, but sqlalchemy can connect to other things too.
# Replace localhost with IP address if accessing a remote server
engine = create_engine('postgresql://%s:%s@localhost/%s'%(username,pswd,dbname))
# Show where we are connected WITHOUT the password: repr() of a SQLAlchemy URL
# masks the password, so the credential does not end up in the saved output.
print(repr(engine.url))

postgresql://russell:***@localhost/donors_db
postgresql://russell:***@localhost/donors_db


In [8]:
## create a database (if it doesn't exist)
if not database_exists(engine.url):
    create_database(engine.url)
# Confirm the database is now present.
print(database_exists(engine.url))
# repr() of the URL masks the password, keeping it out of the saved output.
print(repr(engine.url))


True
postgresql://russell:***@localhost/donors_db


## This block reads in archival data (from web.archive.org) and older data (from an Insight Fellow project hosted on AWS)

def load_finished_projects(csv_path, label, **read_csv_kwargs):
    """Read one project export and drop the projects that are still live.

    Parameters
    ----------
    csv_path : str
        Location of the CSV file to read.
    label : str
        Heading printed before the row counts.
    **read_csv_kwargs
        Passed straight through to ``pd.read_csv`` (e.g. ``thousands=','``).

    Returns
    -------
    tuple
        ``(frame, rows_before, rows_after)`` where ``frame`` holds only the rows
        whose ``funding_status`` is not ``'live'``.
    """
    raw = pd.read_csv(csv_path, **read_csv_kwargs)
    rows_before = len(raw.index)

    ######drop 'active' projects
    finished = raw[raw.funding_status != 'live']
    rows_after = len(finished.index)

    print(label)
    print("original rows ="+str(rows_before)+", new rows ="+str(rows_after))
    return finished, rows_before, rows_after


#### data from: https://web.archive.org/web/20121019202946/http://developer.donorschoose.org/the-data
arch_data, orig_rowlen, new_rowlen = load_finished_projects(
    "/home/russell/Documents/DataScience/DonorsChoose/Data/donorschoose-org-1may2011-v1-projects.csv",
    "***Web Archive Data",
)

#### data from https://github.com/adilmoujahid/DonorsChoose_Visualization/issues/10
old_df, old_rowlen, new_oldrowlen = load_finished_projects(
    "/home/russell/Downloads/opendata_projects.csv",
    "****Github Data",
    thousands=',',
)

# Columns present in both frames, listed in the order they appear in old_df.
A = list(old_df.columns)
shared_names = set(A) & set(arch_data.columns)
keepcolumns = [name for name in A if name in shared_names]

# columns not shared (evaluated for inspection only; the result is not stored)
set(A) - set(arch_data.columns)

#############################
## keep only shared columns, then combine

arch_data = arch_data[keepcolumns]
old_df = old_df[keepcolumns]

# DataFrame.append was removed in pandas 2.0. pd.concat stacks the two frames
# the same way and, like append, keeps each frame's original row labels.
bigold = pd.concat([arch_data, old_df]) # combine

# Text before the first space of 'date_completed', and the part of that
# before the first '-'.
bigold['calendar_completed']=bigold.date_completed.str.split(' ').str[0]
bigold['year_completed']=bigold.calendar_completed.str.split('-').str[0]

# Same first-space split for the expiration timestamp.
bigold['calendar_expired']=bigold.date_expiration.str.split(' ').str[0]

# The two source frames are no longer needed once combined.
del old_df;del arch_data

print(bigold.shape)

bigold.head(2)

bigold['funding_status'].value_counts()

### insert data into database from Python (proof of concept - this won't be useful for big data, of course)

# Writes bigold to the 'hist_projects' table through the SQLAlchemy engine,
# 100000 rows per batch. With if_exists='append' the rows are added to an
# existing table, so running this more than once inserts the same rows again.
bigold.to_sql('hist_projects', engine, if_exists='append',chunksize=100000)

### connect:
con = psycopg2.connect(database = dbname, user = username, host='localhost', password=pswd)

### query:
sql_query = """
SELECT * FROM hist_projects WHERE school_state='NY';
"""

try:
    old_NYdata_from_sql = pd.read_sql_query(sql_query,con)
finally:
    ### Close communication with the database, even when the query fails
    con.close()

old_NYdata_from_sql.head(2)

In [17]:
# connect:
# Opens a psycopg2 connection with the notebook's credentials and closes it
# again without running any query.
# NOTE(review): presumably a connectivity check — confirm it is still needed.
con = None
con = psycopg2.connect(database = dbname, user = username, host='localhost', password=pswd)

# Close communication with the database
con.close()


(149420, 47)

#get all pickle files from folder into a list, 'picks'
picks = sorted(glob.glob("/home/russell/Documents/DataScience/DonorsChoose/Data/Metrics/*pickle"))

print(len(picks))


def load_metric_frames(paths):
    """Unpickle every file in ``paths`` and keep the non-empty DataFrames.

    Parameters
    ----------
    paths : iterable of str
        Pickle files to read, in the order they should be collected.

    Returns
    -------
    list of pandas.DataFrame
        One entry per file whose content is a DataFrame with at least one row;
        anything else (empty frames, other object types) is skipped.
    """
    frames = []
    for pf in paths:
        # Overwrite a single status line with the file currently being read.
        #https://stackoverflow.com/a/3249684/1602288
        stdout.write("\r%s" % pf)
        stdout.flush()
        # The context manager closes each file; the bare open() used before
        # left every handle open.
        # NOTE: pickle.load can execute arbitrary code — only point this at
        # files this project produced itself.
        with open(pf, "rb") as handle:
            check = pickle.load(handle)
        if isinstance(check, pd.DataFrame) and len(check.index) > 0:
            frames.append(check)
    stdout.write("\n")
    return frames


#this will create a master list with the same # of elements as projects
basic_list = load_metric_frames(picks)

In [10]:
# Stack the per-file frames into one table and discard 'schoolTypes' and
# 'teacherTypes' (noted by the author as dictionary-valued columns that still
# need dedicated handling).
bigframe = pd.concat(basic_list).drop(columns=['schoolTypes', 'teacherTypes'])
bigframe.shape

(1642201, 37)

## insert data into database from Python

#### bigframe.to_sql('scraped_project_metrics', engine, if_exists='append',chunksize=100000)

In [None]:
#basic_list =pickle.load(open("/home/russell/Documents/DataScience/DonorsChoose/Data/BigFrame.pickle","rb"))

In [None]:
# Save the list of per-file frames so it can be reloaded without re-reading
# every individual pickle.
check="/home/russell/Documents/DataScience/DonorsChoose/Data/Biglist"
# The context manager closes the file even if pickling fails part-way.
with open(check+'.pickle',"wb") as pickle_out:
    pickle.dump(basic_list, pickle_out)

In [None]:
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', 999)

In [None]:
# Rows whose 'state' is 'NY'. The explicit .copy() makes nybig an independent
# frame: later cells add and overwrite columns on it, which on a bare boolean
# slice raises SettingWithCopyWarning and may not write where intended.
nybig = bigframe[bigframe.state.eq('NY')].copy()
nybig.shape

In [None]:
#https://stackoverflow.com/a/40121869/1602288
# Pull the year out of each 'fullyFundedDate' value with dateutil's fuzzy parser.
ffd = nybig['fullyFundedDate'].values.tolist()

years=[]

# The loop variable is not named `date`: that would overwrite the
# datetime.date class imported at the top of the notebook.
for funded_on in ffd:
    try:
        years.append(parse(str(funded_on), fuzzy=True).year)
    except (ValueError, OverflowError):
        # dateutil signals "no usable date in this text" with ParserError (a
        # ValueError subclass) and out-of-range values with OverflowError.
        # Those rows get NaN; anything else, including Ctrl-C, propagates.
        years.append(np.nan)



In [None]:
nybig['ffyear']=years

In [None]:
# Close the connection
# NOTE(review): no `connection` name is assigned in any visible cell (the
# psycopg2 handles above are named `con` and are already closed), so this
# presumably depends on state from a deleted cell — confirm or remove.
connection.close()

In [None]:
# Print the (rows, columns) of the two NY tables.
# NOTE(review): `nydf` is not assigned in any visible cell — presumably the NY
# rows read back from SQL (`old_NYdata_from_sql`); confirm before re-running.
print(nybig.shape)
print(nydf.shape)

In [None]:
#nybig: 'latitude','longitude','ffyear'
#nydf: 'school_latitude', 'school_longitude','year_completed'

# Same three steps for each table: coordinates become floats, every remaining
# missing value becomes 0, then the year and donor-count columns become ints.
nybig = (
    nybig
    .astype({'latitude': float, 'longitude': float})
    .fillna(0)
    .astype({'ffyear': int, 'numDonors': int})
)

nydf = (
    nydf
    .astype({'school_latitude': float, 'school_longitude': float})
    .fillna(0)
    .astype({'year_completed': int, 'num_donors': int})
)

In [None]:
nybig['expirationDate']

In [None]:
nydf['calendar_expired']

In [None]:
#https://stackoverflow.com/a/41815118/1602288
# Join the two NY tables on five columns; the two lists are positionally
# paired (first with first, second with second, ...).
scraped_keys = ['latitude', 'longitude', 'ffyear', 'numDonors', 'expirationDate']
historical_keys = ['school_latitude', 'school_longitude', 'year_completed', 'num_donors', 'calendar_expired']
new_ny = nybig.merge(nydf, left_on=scraped_keys, right_on=historical_keys)

In [None]:
new_ny.shape

In [None]:
print(new_ny.loc[[0]])

In [None]:
# Back-calculate each project's start date from 'expirationDate' and the
# numeric offset stored as text in 'expirationTime'.
# NOTE(review): 'expirationTime' is overwritten with its trimmed form, so
# running this cell twice trims the text twice — rebuild bigframe first.
bigframe['expirationTime'] = bigframe['expirationTime'].apply(str)
# Drop the first character and the last five before converting to a number.
bigframe['expirationTime']=bigframe['expirationTime'].str[1:-5]
bigframe['timeback']=pd.to_numeric(bigframe['expirationTime'])
bigframe['right_date']=pd.to_datetime(bigframe['expirationDate'],format='%Y-%m-%d')
# Vectorised replacement for the row-by-row apply(): 'timeback' is still
# interpreted as seconds, but the whole column is subtracted at once and a
# missing offset gives NaT instead of raising inside timedelta().
bigframe['start_date'] = bigframe['right_date'] - pd.to_timedelta(bigframe['timeback'], unit='s')

bigframe.head(3)

In [None]:
bigframe['right_date']=pd.to_datetime(bigframe['expirationDate'],format='%Y-%m-%d')

In [None]:
bigframe['right_date']=pd.to_datetime(bigframe['expirationDate'].astype(str))

In [None]:
exp_date = bigframe.expirationDate.values[0]
type(exp_date)
#right_date = date.fromisoformat(exp_date) #gets date into a datetime.date format


## need to back calculate 'start time' using *expirationDate* & *expirationTime*

In [None]:

fullyFundedDate

In [None]:
pd.set_option('display.max_colwidth', None)

In [None]:
(bigframe['proposalURL'][0])

In [None]:
(bigframe['expirationTime'][0])

In [None]:
for colu in bigframe.columns:
    print(bigframe[colu])

In [None]:
pd.set_option('display.max_columns', 999)
bigframe.loc[0]

The `to_sql` call is doing a lot of heavy lifting: it reads a dataframe, creates a table, and adds the data to that table. So **SQLAlchemy is quite useful!**

In [None]:
print(bigframe.shape)
print(type(bigframe))
print(bigframe.columns)

In [None]:
# One row per project: for each repeated 'id' only the first occurrence stays.
ok = bigframe.drop_duplicates(subset=['id'])
print(type(ok), ok.shape, sep="\n")

In [None]:
ok = ok.drop_duplicates(subset="id", keep="first") #for some reason, some duplicate projs were kept, this drops'em

In [None]:
# Show every column when a frame is displayed.
pd.set_option('display.max_columns', None)
#pd.set_option("display.max_rows", None) #undo by resetting --- pd.reset_option("display.max_rows")

In [None]:
trailerdict = dict(zip(ok['id'],ok['fulfillmentTrailer']))

In [None]:
# Persist the id -> fulfillmentTrailer lookup. The context manager closes the
# file even if pickling raises.
with open('/home/russell/Documents/DataScience/DonorsChoose/Data/trailers.pickle',"wb") as pickle_out:
    pickle.dump(trailerdict, pickle_out)

In [None]:
# Build an id -> numDonors lookup from the de-duplicated frame and persist it.
donordict = dict(zip(ok['id'],ok['numDonors']))
# The context manager closes the file even if pickling raises.
with open('/home/russell/Documents/DataScience/DonorsChoose/Data/donor_num.pickle',"wb") as pickle_out:
    pickle.dump(donordict, pickle_out)

In [None]:

trailers =ok['fulfillmentTrailer']

In [None]:
sns.set_context("poster", font_scale=1.3)


In [None]:

# Distribution of 'numDonors' across the de-duplicated projects, drawn
# explicitly onto the axes created here.
fig, ax = plt.subplots(figsize=(12, 8))
donor_counts = ok["numDonors"].dropna()
sns.distplot(donor_counts, ax=ax)
fig.tight_layout()

In [None]:
bigframe['fulfillmentTrailer'].value_counts()

In [None]:
bigframe['id'].value_counts()

In [None]:
f_trailers = bigframe['fulfillmentTrailer'].array
print(f_trailers)

In [None]:
type(what)

In [None]:
len(what)

In [None]:
w_a = what.array

In [None]:
len(w_a)

In [None]:
w_a[0]

In [None]:
for b in range(10):
    #print(b)
    print(bigframe['fulfillmentTrailer'].array[b])
    


In [None]:
bigframe['proposalURL'].value_counts()

In [None]:
# One-column table of project ids for inspection.
# NOTE(review): `project_IDs` is not assigned in any visible cell — confirm
# where it comes from before re-running.
checkdf = pd.DataFrame(project_IDs,columns =['proj_id'])
# Use the fully-qualified option name. The short pattern "max_rows" can match
# more than one pandas option (e.g. "display.max_rows" and
# "styler.render.max_rows"), in which case set_option raises.
pd.set_option("display.max_rows", None) #undo with pd.reset_option("display.max_rows")
checkdf

In [None]:
checkdf['proj_id'].value_counts()

In [None]:
basic_list[0]

In [None]:
bewild = pd.DataFrame.from_dict(lookat)

In [None]:
import json
from bs4 import BeautifulSoup

In [None]:
import scrapy
#modified scrapy settings here:
#/home/russell/anaconda3/envs/insight/lib/python3.8/site-packages/scrapy/settings
#to include the user agents described here: https://www.scrapehero.com/how-to-fake-and-rotate-user-agents-using-python-3/

In [None]:
df = pd.read_csv("/home/russell/Downloads/opendata_projects.csv", thousands = ',')
pd.set_option('display.max_columns', None)
#pd.set_option("max_rows", None) #undo by resetting --- pd.reset_option(“max_rows”)bb

In [None]:
import os
import warnings
from urllib.parse import urlencode

FEED_ENDPOINT = "https://www.donorschoose.org/common/json_feed.html"


def feed_url(api_key, max_results, include_historical=False, index=0):
    """Build a DonorsChoose JSON-feed URL.

    Parameters
    ----------
    api_key : str
        Value sent as the ``APIKey`` query parameter.
    max_results : int
        Value sent as the ``max`` query parameter.
    include_historical : bool, default False
        When true, ``historical=true`` is added to the query.
    index : int, default 0
        Value sent as the ``index`` query parameter.

    Returns
    -------
    str
        The endpoint followed by the URL-encoded query string.
    """
    params = [("showFacetCounts", "true"), ("APIKey", api_key), ("max", max_results)]
    if include_historical:
        params.append(("historical", "true"))
    params.append(("index", index))
    return FEED_ENDPOINT + "?" + urlencode(params)


# The API key is read from the environment instead of being written into the
# notebook; export DONORSCHOOSE_API_KEY before starting the kernel.
DONORSCHOOSE_API_KEY = os.environ.get("DONORSCHOOSE_API_KEY", "")
if not DONORSCHOOSE_API_KEY:
    warnings.warn("DONORSCHOOSE_API_KEY is not set; feed URLs are built with an empty APIKey")

current = feed_url(DONORSCHOOSE_API_KEY, 100)
historical = feed_url(DONORSCHOOSE_API_KEY, 40, include_historical=True)


In [None]:
# `requests` is used by this cell but is not imported in any visible cell, so
# it is imported here to keep the cell runnable on a fresh kernel.
import requests

# A timeout keeps a stalled connection from hanging the kernel indefinitely.
r = requests.get(historical, timeout=30)
# Fail with the HTTP error itself instead of a JSON decoding error on an
# error page.
r.raise_for_status()
data_dict = json.loads(r.text)

In [None]:
print(data_dict)

In [None]:
# Each iteration yields a (key, value) tuple from the response; print them
# one per line.
for pair in data_dict.items():
    print(pair)

In [None]:
# Find where the 'proposals' key sits among the response keys, then take the
# value at that same position from the list of response values.
proposal_ind = list(data_dict).index('proposals')
dicts_on_this_page = [*data_dict.values()]
proposal_list = dicts_on_this_page[proposal_ind]

In [None]:
proposal_list[0]

In [None]:
first_proposal = proposal_list[0] #returns dictionary of first proposal items

In [None]:
type(first_proposal)

In [None]:
for key in first_proposal.items():
    print (key)

In [None]:
print(soup)

# _DRAW OUT SCREENS and FUNCTIONALITY GOALS FOR END OF WEEK_



## MONDAY
    Well scoped, clearly-defined problem + some data
## 	TUESDAY
    SQL[organized data] 
## 	WED 
    analytics/working algo/some results ----> (python linked to sql data)
## 	THUR – SQL mapped to PYTHON connected to FLASK (or something)


# GOOD Qs to ASK SELF AND OTHERS
#### 	-What’s actionable about your product?
#### 	-Did you try other models?
#### 	-Is this better than random?
#### 	-Is this better than the simplest model?
#### 	-Why did you choose these inputs?
#### 	-How did you validate this?
#### 	-What are your metrics for success?
#### 	-What are the assumptions of your model?
#### 	-How would you improve this project with more time?

In [None]:
df.shape

In [None]:
df.head()

In [None]:
df.columns

In [None]:
df['primary_focus_area'].value_counts()

In [None]:
expired = df[(df['funding_status']=='expired')]

In [None]:
df['_teacher_acctid'].value_counts()