In [1]:
%matplotlib inline
import MySQLdb as mdb
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib

# Helper Function 

In [25]:
def fetch(cols,table,where=None,group_by=None,order_by=None,Desc=False,limit=None):
    """
    Purpose: Build a SELECT statement from its clauses, run it against the
    donorsChoose MySQL database and return the result as a pandas DataFrame.

    Parameters:
        cols     -- column list placed after SELECT (string, e.g. '*')
        table    -- table name placed after FROM (string)
        where    -- optional WHERE condition, without the keyword (string)
        group_by -- optional GROUP BY expression (string)
        order_by -- optional ORDER BY expression (string)
        Desc     -- if True, sort descending; only applied when order_by is given
        limit    -- optional maximum number of rows (int); 0 is a valid limit

    Returns: a DataFrame whose columns are named after the cursor description.
    The rows pass through np.array, so numpy picks one common dtype for the
    whole table (mixed numeric/text results come back as strings). An empty
    result yields an empty DataFrame that still carries the column names.

    Note: All inputs have to be in STRING format except the 'limit' which is an int.
    The clauses are concatenated into the SQL text as-is, so only pass trusted,
    hand-written fragments -- never user-supplied input.
    """

    # Creating the query, clause by clause
    query = 'SELECT ' + cols + ' FROM ' + table + ' '
    if where: query += ' WHERE ' + where + ' '
    if group_by: query += 'GROUP BY ' + group_by + ' '
    if order_by:
        query += 'ORDER BY ' + order_by + ' '
        # DESC is only valid SQL directly after an ORDER BY expression
        if Desc: query += 'DESC '
    # 'is not None' so that limit=0 is honoured; int() rejects non-numeric limits
    if limit is not None: query += 'LIMIT ' + str(int(limit))

    # Opening a Connection to donorsChoose Database
    # NOTE(review): credentials are hardcoded; consider reading them from the
    # environment or a config file that is kept out of version control.
    con = mdb.connect('localhost', 'idx', 'donorsChoose', 'donors')
    try:
        cur = con.cursor()
        cur.execute(query)
        rows = cur.fetchall()
        colm = [f[0] for f in cur.description]
    finally:
        # Closing the connection even when the query fails
        con.close()

    # np.array(()) is 1-D and cannot take the column names, so handle the
    # empty result explicitly.
    if len(rows) == 0:
        return pd.DataFrame(columns=colm)

    # turn the fetched rows into a pandas dataframe
    output = pd.DataFrame(np.array(rows))
    output.columns = colm

    return output

In [32]:
def sampleProportions_chanceOF_success(colName):
    """
    Purpose: Determine whether or not a binary variable contributes to the success probability

    Example: Is being a charter school or not correlated with the chance of getting funded?

    Parameters:
        colName -- name of a binary column of the global la_donations frame;
                   rows whose value is 1 / '1' are treated as "flag set"

    Returns: (low, high), the 95% normal-approximation (Wald) confidence
    interval of the success rate among flagged rows, minus the overall success
    rate. An interval entirely above (below) zero means flagged rows are funded
    more (less) often than average. Returns (nan, nan) when no row has the flag
    set, since the conditional rate is then undefined.

    Note: This function is specifically designed for LA projects and the
    independence test is performed based on the funding status (funded or not).
    It reads the module-level la_donations DataFrame.
    """

    # Compare as strings so both '1' (as loaded from MySQL) and integer 1
    # (after an astype(int) conversion) count as "flag set".
    flagged = la_donations[colName].astype(str) == '1'
    funded = la_donations['funding_status'] == 'completed'

    # prevalence of the binary variable: e.g. how many rows are charter schools?
    prevalence = float(flagged.sum())
    if prevalence == 0:
        # No flagged rows: avoid dividing by zero below.
        return (float('nan'), float('nan'))

    # Number of flagged rows that got funded
    success = (flagged & funded).sum()

    # Overall success rate: independent of whether or not the flag is set
    overal_success = funded.sum()/float(la_donations.shape[0])

    # Conditional probability of being funded given the flag, with its
    # standard error; 1.96 is the two-sided 95% normal quantile.
    p = success/prevalence
    sigma = np.sqrt(p*(1-p)/prevalence)
    percLow = p - 1.96*sigma
    percHigh = p + 1.96*sigma

    # Confidence interval of the success rate compared to the overall success
    return (percLow - overal_success, percHigh - overal_success)

# Finding The Donation Made to LA Projects 

Project ID is a common key between donation and project tables. So, let's find the projects in LA. Then, we can find all the donations made to the LA projects.

This can be accomplished with a simple inner join between funding status and la_donation.

This part is done in MySQL. Here is the code:

In [16]:
# Load every column of the donation_LA table into a DataFrame via fetch().
la_donations = fetch('*','donation_LA')

In [60]:
# Peek at the first rows to sanity-check the columns that were loaded.
la_donations.head()

Unnamed: 0,_projectid,_donationid,is_teacher_acct,payment_method,payment_included_acct_credit,payment_included_campaign_gift_card,payment_included_web_purchased_gift_card,donation_total,dollar_amount,payment_was_promo_matched,via_giving_page,for_honoree,donor_city,donation_message,funding_status,funding_status_binary
0,"""""""0434af307dc45294c2eb748ffd935767""""""","""""""83f29a83b88d0c7a1c96cf9bd9f3b2ab""""""",0,no_cash_received,1,0,0,75.0,10_to_100,0,0,0,,,completed,1
1,"""""""59f17c3f118cef16b2fee23ed74753fe""""""","""""""db4df4b7ed7f3e2145fc4550243304d9""""""",0,creditcard,0,0,0,100.0,100_and_up,0,0,0,LOS ANGELES,"""""""I gave to this project because I studied ab...",completed,1
2,"""""""464e9eb3a3918a2d3f2006513bd42949""""""","""""""36a2ca9827e7d6ea6eb6235205b55309""""""",0,creditcard,0,0,0,40.0,10_to_100,0,0,0,"""""","""""""readers are like super heroes",completed,1
3,"""""""183af384da17ccbdd72140d8daabeb8a""""""","""""""c71d10db81e2c2e455d5a62dcf87550b""""""",0,no_cash_received,0,1,0,10.0,10_to_100,0,0,0,"""""","""""""GOD BLESS YOU! KEEP UP THE GOOD WORK!""""""",completed,1
4,"""""""350299a9765c77228f2bcb2c450a095d""""""","""""""8a8b40c32e06554570241cec6295e80c""""""",0,no_cash_received,0,1,0,25.0,10_to_100,0,0,0,,,completed,1


The `payment_was_promo_matched` column is stored as 't'/'f'; it needs to be converted into '1' and '0' so that the sample-proportion independence test can work on it.

In [61]:
# Recode the 't'/'f' flags as the strings '1'/'0', the coding that
# sampleProportions_chanceOF_success compares against (== '1').
# replace() leaves already-converted values untouched and astype(str) keeps the
# column as strings, so this cell can be re-run safely (map() would turn
# already-converted values into NaN and astype(int) would then fail).
la_donations.payment_was_promo_matched = la_donations.payment_was_promo_matched.replace({'t':'1','f':'0'}).astype(str)

# Exploratory Analysis

Most of the variables are leaky variables because they are related to payment and some are already included in the projects table; 

For_honoree (donation included an honoree) is not an option for payment anymore; will drop it.

The only variables that are potentially informative, or at least interesting, are 

- is_teacher_acct: donation was made by a teacher
- via_giving_page: true if the donation was made through a Giving Page
- payment_was_promo_matched: a phrase that anyone can enter to activate a match (like Disney)

##  When Teachers Have Donated

In [47]:
# Interval = (success rate among donations from teacher accounts) - (overall
# success rate); an interval that excludes 0 indicates a difference at ~95%.
print 'confidence interval of enchanced success chance: ', sampleProportions_chanceOF_success('is_teacher_acct')

confidence interval of enchanced success chance:  (-0.031795211794963363, -0.023391887943065681)


Interesting! If a teacher has donated to a project, that project is a little less likely to get funded. Maybe this is because the teachers themselves tried to fund the project? Anyway, it is not going to be part of the predictive model.

## Posting on a Giving Page

In [48]:
# Same comparison for donations made through a Giving Page: conditional
# success rate minus the overall success rate.
print 'confidence interval of enchanced success chance: ', sampleProportions_chanceOF_success('via_giving_page')

confidence interval of enchanced success chance:  (0.013100546642633937, 0.01663640537536204)


This is potentially an important feature. Projects which are posted on a giving page, and therefore have more exposure, have a higher chance of success. This feature will be included in my model.

## Donations Included a Promo Code Partnership

In [63]:
# Same comparison for donations that were matched through a promo code.
print 'confidence interval of enchanced success chance: ', sampleProportions_chanceOF_success('payment_was_promo_matched')

confidence interval of enchanced success chance:  (0.0041577225661824091, 0.011119772379366566)


OK, similar to the giving page, this variable potentially has predictive value.

# Writing The Potentially Important Variable

In [64]:
# List the available columns before choosing which ones to export.
la_donations.columns

Index([u'_projectid', u'_donationid', u'is_teacher_acct', u'payment_method',
       u'payment_included_acct_credit', u'payment_included_campaign_gift_card',
       u'payment_included_web_purchased_gift_card', u'donation_total',
       u'dollar_amount', u'payment_was_promo_matched', u'via_giving_page',
       u'for_honoree', u'donor_city', u'donation_message', u'funding_status',
       u'funding_status_binary'],
      dtype='object')

In [68]:
# Keep the project key plus the two flags whose intervals above were positive.
donations_impt_features = la_donations[['_projectid','payment_was_promo_matched','via_giving_page']]

In [71]:
# Write the selected features to CSV; index=False leaves the row index out of the file.
donations_impt_features.to_csv('donations_preProcessed.csv',index=False)