In [None]:
#%reset

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import clear_output

In [None]:
# Lift the column limit so wide DataFrames are displayed without truncation.
pd.set_option("display.max_columns", None)

In [None]:
from datetime import datetime, timedelta, date #for time duration calculations
from dateutil.parser import parse #for fuzzy finding year

In [None]:
import pickle #for saving output files, pickles
from sys import stdout
import time #for time.sleep function to delay calls
from tqdm import tqdm #for updating loop
#from os import listdir
#from os.path import isfile, join
import glob #pattern matching and expansion.

In [None]:
## Python packages - you may have to pip install sqlalchemy, sqlalchemy_utils, and psycopg2.
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
from sqlalchemy.sql import table, column, select, update, insert
import psycopg2
from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT

import pandas as pd

import os  # stdlib; used to read optional credential overrides from the environment

# Connection settings for the PostgreSQL database used by this notebook.
# Each value is read from an environment variable when it is set, so real
# credentials never have to be typed into (and saved with) the notebook:
#   DONORS_DB_NAME, DONORS_DB_USER, DONORS_DB_PASSWORD
# When a variable is not set, the notebook's original placeholder is used.
dbname = os.environ.get('DONORS_DB_NAME', 'donors_db')
username = os.environ.get('DONORS_DB_USER', 'xxxx')  # or enter username here
pswd = os.environ.get('DONORS_DB_PASSWORD', 'xxxx')  # or enter system password here

In [None]:
## 'engine' is a connection to a database
## Here, we're using postgres, but sqlalchemy can connect to other things too.
from urllib.parse import quote  # stdlib; percent-encodes URL-special characters

# Percent-encode the credentials so that characters such as '@', ':' or '/'
# in the username or password cannot corrupt the connection URL.
engine = create_engine(
    'postgresql://%s:%s@localhost/%s'
    % (quote(username, safe=''), quote(pswd, safe=''), dbname)
)
# Show the target without leaking the secret: repr() of a SQLAlchemy URL masks
# the password, whereas printing the raw string (or str(url) on older
# SQLAlchemy releases) would store it in plaintext in the notebook output.
print(repr(engine.url))
# Replace localhost with IP address if accessing a remote server

In [None]:
## create a database (if it doesn't exist)
if not database_exists(engine.url):
    create_database(engine.url)

# Confirm the database is now reachable (expected to print True).
print(database_exists(engine.url))
# repr() masks the password; print(engine.url) could write it in plaintext
# into the saved notebook output.
print(repr(engine.url))


# This block reads in archival data (from web.archive.org) and older data (from an Insight Fellow project on AWS)

In [None]:
#### Archive data from: https://web.archive.org/web/20121019202946/http://developer.donorschoose.org/the-data
ARCHIVE_CSV = "/home/russell/Documents/DataScience/DonorsChoose/Data/donorschoose-org-1may2011-v1-projects.csv"
#### Older data from https://github.com/adilmoujahid/DonorsChoose_Visualization/issues/10
GITHUB_CSV = "/home/russell/Downloads/opendata_projects.csv"


def _load_non_live_projects(csv_path, header, **read_csv_kwargs):
    """Read a projects CSV, drop the rows whose funding_status is 'live', and report row counts.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read.
    header : str
        Line printed just before the row-count summary.
    **read_csv_kwargs
        Extra keyword arguments forwarded unchanged to ``pd.read_csv``.

    Returns
    -------
    tuple
        ``(projects, rows_before, rows_after)`` where ``projects`` is the
        filtered DataFrame and the two integers are the row counts before and
        after the 'live' rows were removed.
    """
    projects = pd.read_csv(csv_path, **read_csv_kwargs)
    rows_before = len(projects.index)

    ######drop 'active' projects (funding_status == 'live')
    projects = projects[projects.funding_status != 'live']
    rows_after = len(projects.index)

    print(header)
    print("original rows =" + str(rows_before) + ", new rows =" + str(rows_after))
    return projects, rows_before, rows_after


arch_data, orig_rowlen, new_rowlen = _load_non_live_projects(
    ARCHIVE_CSV, "***Web Archive Data")

# thousands=',' makes read_csv treat ',' as a thousands separator in numeric fields
old_df, old_rowlen, new_oldrowlen = _load_non_live_projects(
    GITHUB_CSV, "****Github Data", thousands=',')

#get shared column names, keeping the column order of old_df
keepcolumns = [name for name in old_df.columns if name in arch_data.columns]

In [None]:
# Columns of old_df that are absent from arch_data (i.e. not shared).
set(old_df.columns) - set(arch_data.columns)

In [None]:
## Restrict both frames to the shared columns (in the same order) before
## they are combined.
old_df = old_df.loc[:, keepcolumns]
arch_data = arch_data.loc[:, keepcolumns]

In [None]:
# Stack the two sources row-wise. pd.concat is used instead of
# DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0;
# the original row labels are kept (no ignore_index), exactly as append did.
bigold = pd.concat([arch_data, old_df])  # combine

# Derive date-only / year strings by splitting the raw text columns:
# the part before the first ' ' is kept as the calendar date, and the part
# before the first '-' of that date is kept as the year.
bigold['calendar_completed'] = bigold.date_completed.str.split(' ').str[0]
bigold['year_completed'] = bigold.calendar_completed.str.split('-').str[0]
bigold['calendar_expired'] = bigold.date_expiration.str.split(' ').str[0]

In [None]:
# Fix the column dtypes in one pass:
#   1. school coordinates -> float
#   2. missing year_completed / num_donors -> 0 (required for conversion to int)
#   3. year_completed / num_donors -> int
values = {'year_completed': 0, 'num_donors': 0}

bigold = (
    bigold
    .astype({'school_latitude': float, 'school_longitude': float})
    .fillna(value=values)
    .astype({'year_completed': int, 'num_donors': int})
)

In [None]:
from IPython.display import display  # explicit import so display() is always in scope

#have composite df now (bigold), so clean up old_df and arch_data
del old_df, arch_data

print(bigold.shape)

# Only the LAST expression of a cell is rendered automatically, so the
# preview has to go through display() — a bare `bigold.head(2)` here is
# evaluated and silently discarded.
display(bigold.head(2))

# Number of projects per funding status (rendered as the cell's output).
bigold['funding_status'].value_counts()

# Insert archival data into the database from Python

In [None]:
##### Write the combined frame to the 'hist_projects' table, replacing the
##### table if it already exists; rows are sent in batches of 100000.
bigold.to_sql(
    name='hist_projects',
    con=engine,
    if_exists='replace',
    chunksize=100000,
)

# Load data scraped from DonorsChoose (DC) and saved in pickle format

In [None]:
#get all pickle files from folder into a (sorted) list, 'picks'

picks = sorted(glob.glob("/home/russell/Documents/DataScience/DonorsChoose/Data/Metrics/*pickle"))
# NOTE(review): only the first three files are loaded — confirm this limit is
# intentional (e.g. batch loading) and not a leftover from debugging.
picks = picks[0:3]
print(len(picks))


#collect every non-empty DataFrame found in the pickle files
basic_list = []

for pf in picks:
    #single-line progress display: https://stackoverflow.com/a/3249684/1602288
    stdout.write("\r%s" % pf)
    stdout.flush()
    # Use a context manager so each file handle is closed right after reading
    # (a bare open() inside pickle.load() leaves the handle open).
    # NOTE: unpickling can execute arbitrary code — only load trusted files.
    with open(pf, "rb") as handle:
        check = pickle.load(handle)
    # Skip anything that is not a DataFrame, and DataFrames without rows.
    if isinstance(check, pd.DataFrame) and len(check.index) > 0:
        basic_list.append(check)
stdout.write("\n")

In [None]:
# Stack the per-file frames into one DataFrame and discard the two columns
# that hold dictionaries ('schoolTypes', 'teacherTypes'), which would need
# separate handling.
bigframe = (
    pd.concat(basic_list)
    .drop(labels=['schoolTypes', 'teacherTypes'], axis='columns')
)

bigframe.shape

In [None]:
#fuzzy year extraction: https://stackoverflow.com/a/40121869/1602288
ffd = bigframe['fullyFundedDate'].values.tolist()

years = []

# The loop variable is deliberately NOT called `date`: that name would
# overwrite the `date` class imported from datetime earlier in the notebook.
for funded_on in ffd:
    try:
        years.append(parse(str(funded_on), fuzzy=True).year)
    except (ValueError, OverflowError):
        # The value could not be parsed as a date -> record a missing year.
        # Only parsing failures are caught, so e.g. a KeyboardInterrupt still
        # stops the loop instead of being silently turned into NaN.
        years.append(np.nan)

bigframe['ffyear'] = years

In [None]:
# Preview the first five rows of the scraped data.
bigframe.head(5)

In [None]:
# Fix the dtypes of 'latitude', 'longitude', 'ffyear' and 'numDonors' in one pass:
#   1. coordinates -> float
#   2. missing ffyear / numDonors -> 0 (required for conversion to int)
#   3. ffyear / numDonors -> int
values = {'ffyear': 0, 'numDonors': 0}

bigframe = (
    bigframe
    .astype({'latitude': float, 'longitude': float})
    .fillna(value=values)
    .astype({'ffyear': int, 'numDonors': int})
)

In [None]:
# Reduce 'expirationTime' to its numeric part: stringify each value, then drop
# the first character and the last five characters.
# NOTE(review): the raw format of expirationTime is not visible here — confirm
# that the [1:-5] slice matches it. This cell is also not idempotent: running
# it twice trims the already-trimmed strings again.
bigframe['expirationTime'] = bigframe['expirationTime'].apply(str)
bigframe['expirationTime'] = bigframe['expirationTime'].str[1:-5]
bigframe['timeback'] = pd.to_numeric(bigframe['expirationTime'])
bigframe['right_date'] = pd.to_datetime(bigframe['expirationDate'], format='%Y-%m-%d')

# start_date = right_date minus 'timeback' seconds, computed for the whole
# column at once. This replaces a row-by-row DataFrame.apply with a Python
# timedelta: same result, far faster, and a missing 'timeback' produces NaT
# instead of raising.
bigframe['start_date'] = bigframe['right_date'] - pd.to_timedelta(bigframe['timeback'], unit='s')



bigframe.head(3)

# Insert scraped data into the database from Python

In [None]:
# Append the scraped metrics to the 'scraped_project_metrics' table (rows are
# added to an existing table rather than replacing it), in batches of 100000.
bigframe.to_sql(
    name='scraped_project_metrics',
    con=engine,
    if_exists='append',
    chunksize=100000,
)