In [234]:
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import operator
import matplotlib.cm as cm
import matplotlib.patches as patches
import math
import time
import json
import requests
import re

base_url = "https://api.genderize.io"
index_ref  = "/?name="

def grab_link_like_person(url, interval=2):
    '''
    Clicks a url address and sets of timer
    to ensure a specified rate limit for url requests
    to the target web server.

    Returns an opened url page to scrape  content.
    '''
    response = requests.get(url)
    time.sleep(interval)
    return response


def grab_gender(row, interval=2, column='First_name', reference_column='Gender', guess_column='Guess_gender'):
    '''
    Requests genderize.io for each row where
    gender is missing (np.nan). 
    Converts answer into a response dictionary
    and returns the 'gender' value to the cell.
    '''
    if isinstance(row[reference_column], float):
        requesturl = base_url + index_ref + row[column]
        responses = json.loads(grab_link_like_person(requesturl, interval).text)
        print(responses['name'], row[guess_column], responses['gender'].title())
        if responses['gender'] in ['female', 'male']:
            return responses['gender'].title()
    else:
        return row[guess_column]

    
def grab_gender_test(row, interval=2, column='First_name', reference_column='Gender', guess_column='Guess_gender'):
    '''
    Requests genderize.io for each row where
    gender is missing (np.nan). 
    Converts answer into a response dictionary
    and returns the 'gender' value to the cell.
    '''
    if isinstance(row[reference_column], float):
        x = 'Guess'
        row[guess_column] =x
        return x
    else:
        return row[guess_column]

def impute_by_class(with_null_data, student_data):
    for feature in with_null_data:
        unique_classes = student_data[reference_column].unique().tolist()
        feature_by_class_mean = student_data.groupby(reference_column)[feature].agg('mean').copy()
        feature_by_class_mean[0]    
        print('\n{};\nMeans for {}: Graduates {} vs. Non-graduates {}:'\
              .format(feature.upper(),feature, feature_by_class_mean[1],\
                      feature_by_class_mean[0]))
        student_data_byclass_mean[feature] = \
            student_data_byclass_mean.apply(lambda row: 
            convert_na_by_class_mean(row, feature, 
            unique_classes=unique_classes, 
            feature_by_class_mean=feature_by_class_mean,
            student_data=student_data), axis=1)
    return student_data_byclass_mean.copy()


def impute_var_mean(with_null_data, student_data_na_byfeaturemean):
    for feature in with_null_data:
        f_mean = student_data_na_byfeaturemean[feature].mean()
        print('\n{}; Mean for {}: {}'.format(feature.upper(),feature, f_mean))
        student_data_na_byfeaturemean[feature] = \
        student_data_na_byfeaturemean[feature].replace(np.nan, f_mean)
    return student_data_na_byfeaturemean.copy()


def convert_na_by_class_mean(row, column, unique_classes=None,
                             student_data=None, feature_by_class_mean=None,
                             reference_column='Grad', INDEX='ID'):
    '''
    Converts missing values with respect to class means.
    '''
    if not unique_classes:
        unique_classes = student_data[reference_column].unique().tolist()
    feature_by_class_mean = student_data.groupby(reference_column)[column].agg('mean').copy()
    if str(row[column])=="nan":
        for level in unique_classes:
            f_mean = feature_by_class_mean[level]
            if row[reference_column] == level:
                row[column] = f_mean
                return row[column]
    else:
        return row[column]


In [235]:
# Data Prep- Read in the data and impute convert string
# categorical variables to numeric codes.
student_csv = 'mock_student_data.csv'
student_data = pd.read_csv(student_csv)
student_data['Male']=student_data.Gender.map({'Female' : 0, 'Male' : 1})
student_data['Male_orig']=student_data.Gender.map({'Female' : 0, 'Male' : 1})
student_data['Grad']=student_data.Graduated.map({'No' : 0, 'Yes' : 1})
print('\nStudent Data has the following dimensions:', student_data.shape)
student_data.head(10)


Student Data has the following dimensions: (1000, 12)


Unnamed: 0,ID,First_name,Last_name,State,Gender,Age,GPA,Days_missed,Graduated,Male,Male_orig,Grad
0,1,Wayne,Boyd,Florida,Male,19.0,,9.0,Yes,1.0,1.0,1
1,2,Ann,Matthews,Pennsylvania,Female,18.0,3.0,,Yes,0.0,0.0,1
2,3,George,Matthews,Texas,,17.0,,10.0,Yes,,,1
3,4,Jerry,Ramos,California,Male,15.0,2.0,28.0,No,1.0,1.0,0
4,5,Andrea,Carroll,North Carolina,Female,,2.0,29.0,No,0.0,0.0,0
5,6,Annie,Fisher,Virginia,Female,19.0,,5.0,Yes,0.0,0.0,1
6,7,Stephanie,Barnes,Oklahoma,Female,,3.0,20.0,No,0.0,0.0,0
7,8,Janet,Jenkins,Louisiana,Female,15.0,4.0,20.0,Yes,0.0,0.0,1
8,9,Betty,Miller,,,17.0,3.0,23.0,No,,,0
9,10,Henry,Woods,Colorado,Male,19.0,4.0,14.0,Yes,1.0,1.0,1


In [236]:
# Take a look at some summary statistics for each feature:
number = 0
print('\nSummary Statistics:\n')
for feature in student_data.columns.tolist():
    number += 1
    x = 1000 - student_data[feature].count()
    print('\n{}. {}: {} Missing values\n'.format(number, feature, x), student_data[feature].describe())
 


Summary Statistics:


1. ID: 0 Missing values
 count    1000.000000
mean      500.500000
std       288.819436
min         1.000000
25%       250.750000
50%       500.500000
75%       750.250000
max      1000.000000
Name: ID, dtype: float64

2. First_name: 0 Missing values
 count     1000
unique     200
top        Amy
freq        12
Name: First_name, dtype: object

3. Last_name: 0 Missing values
 count     1000
unique     244
top       Ross
freq        13
Name: Last_name, dtype: object

4. State: 116 Missing values
 count       884
unique       49
top       Texas
freq         97
Name: State, dtype: object

5. Gender: 226 Missing values
 count        774
unique         2
top       Female
freq         398
Name: Gender, dtype: object

6. Age: 229 Missing values
 count    771.000000
mean      16.996109
std        1.458067
min       15.000000
25%       16.000000
50%       17.000000
75%       18.000000
max       19.000000
Name: Age, dtype: float64

7. GPA: 221 Missing values
 count    779.00

In [237]:
# Retrieve Genderized Gender for all missing Gender values:
student_data['Guess_gender'] = student_data['Gender']
Gender_nonmiss = student_data.copy()
student_data['Male'] = pd.DataFrame(Gender_nonmiss.apply(lambda row: grab_gender(row,2), axis=1))[0].map({'Female' : 0, 'Male' : 1, 'Guess': 100})
student_data['Guess_gender'] = pd.DataFrame(student_data['Guess_gender'].apply(lambda row: str(row)=='nan'))

George nan Male
Betty nan Female
Todd nan Male
Anthony nan Male
Kathleen nan Female
Ruth nan Female
Frank nan Male
Mark nan Male
Aaron nan Male
Stephen nan Male
Patrick nan Male
Lois nan Female
Emily nan Female
Melissa nan Female
Jack nan Male
Diana nan Female
Juan nan Male
William nan Male
Andrew nan Male
Bruce nan Male
James nan Male
Evelyn nan Female
Juan nan Male
Anne nan Female
Raymond nan Male
Evelyn nan Female
Brandon nan Male
Maria nan Female
Angela nan Female
Marilyn nan Female
Brandon nan Male
John nan Male
Patricia nan Female
Mary nan Female
Carol nan Female
Gloria nan Female
William nan Male
Victor nan Male
Amanda nan Female
Donald nan Male
Patricia nan Female
Melissa nan Female
Amy nan Female
Steve nan Male
Sean nan Male
Joseph nan Male
Pamela nan Female
Ernest nan Male
Wayne nan Male
Lawrence nan Male
Virginia nan Female
Jacqueline nan Female
Gloria nan Female
Catherine nan Female
Victor nan Male
Gary nan Male
Doris nan Female
Patricia nan Female
Jeffrey nan Male
Deborah 

In [246]:
student_data[['Gender', 'Male', 'Guess_gender', 'First_name']].head(10)

Unnamed: 0,Gender,Male,Guess_gender,First_name
0,Male,1,False,Wayne
1,Female,0,False,Ann
2,,1,True,George
3,Male,1,False,Jerry
4,Female,0,False,Andrea
5,Female,0,False,Annie
6,Female,0,False,Stephanie
7,Female,0,False,Janet
8,,0,True,Betty
9,Male,1,False,Henry


In [239]:
# Fill in missing values with the 
# mean of the values for that attribute.
print("\nOption 1: Fill in missing values with VARIABLE MEAN:")
student_data_na_byfeaturemean = student_data.copy()
with_null_data = ['Age', 'GPA', 'Days_missed']
print('\nAfter imputation:\n')
student_data_na_byfeaturemean = impute_var_mean(with_null_data, student_data_na_byfeaturemean)
student_data_na_byfeaturemean.describe()


Option 1: Fill in missing values with VARIABLE MEAN:

After imputation:


AGE; Mean for Age: 16.996108949416342

GPA; Mean for GPA: 2.988446726572529

DAYS_MISSED; Mean for Days_missed: 18.011138613861387


Unnamed: 0,ID,Age,GPA,Days_missed,Male,Male_orig,Grad
count,1000.0,1000.0,1000.0,1000.0,1000.0,774.0,1000.0
mean,500.5,16.996109,2.988447,18.011139,0.479,0.485788,0.593
std,288.819436,1.280089,0.722092,8.654698,0.499809,0.500121,0.491521
min,1.0,15.0,2.0,2.0,0.0,0.0,0.0
25%,250.75,16.0,2.0,12.0,0.0,0.0,0.0
50%,500.5,16.996109,3.0,18.011139,0.0,0.0,1.0
75%,750.25,18.0,4.0,25.0,1.0,1.0,1.0
max,1000.0,19.0,4.0,34.0,1.0,1.0,1.0


In [240]:
# Fill in missing values with a class-conditional mean 
# (where the class is whether they graduated or not).
print('\nOption 2: Replace missings with means conditional on graduation:\n')
reference_column = 'Grad'
student_data_byclass_mean = student_data.copy() 
student_data_byclass_mean = impute_by_class(with_null_data, student_data_byclass_mean)
print('\nAfter imputation:\n')
student_data_byclass_mean.describe()


Option 2: Replace missings with means conditional on graduation:


AGE;
Means for Age: Graduates 16.958874458874458 vs. Non-graduates 17.051779935275082:

GPA;
Means for GPA: Graduates 3.5053763440860215 vs. Non-graduates 2.515970515970516:

DAYS_MISSED;
Means for Days_missed: Graduates 16.77556109725686 vs. Non-graduates 19.22850122850123:

After imputation:



Unnamed: 0,ID,Age,GPA,Days_missed,Male,Male_orig,Grad
count,1000.0,1000.0,1000.0,1000.0,1000.0,774.0,1000.0
mean,500.5,16.996687,3.102688,17.773908,0.479,0.485788,0.593
std,288.819436,1.280278,0.753304,8.668384,0.499809,0.500121,0.491521
min,1.0,15.0,2.0,2.0,0.0,0.0,0.0
25%,250.75,16.0,2.0,12.0,0.0,0.0,0.0
50%,500.5,17.0,3.0,16.775561,0.0,0.0,1.0
75%,750.25,18.0,4.0,25.0,1.0,1.0,1.0
max,1000.0,19.0,4.0,34.0,1.0,1.0,1.0


In [241]:
print("ALTERNATIVE MISSINGS STRATEGY:\nWe should instead determine whether the \n\
presence of missings on a feature is correlated with the outcome \n\
or any other predictors.  If not, we may just want to use means, \n\
drop these cases entirely (less ideal, though perhaps negligible depending),\n\
or assign values at random. As long as they're 'representative' (missings are random),\n\
they shouldn't pose a problem and the means needn't be given special attention.\n\n\
We may also choose to stratify further \
and use sample weights and means.")
student_data_miss = student_data.copy()
student_data_missing = student_data.copy()
def is_missing(row):
    if str(row)=='nan':
        return 1
    return 0
numeric_variables = ['Age', 'GPA', 'Days_missed', 'Male']
describe_vars = []
all_vars = ['First_name']
for feature in numeric_variables:
    describe_vars.append(feature)
    bin_column = feature + '_miss'
    describe_vars.append(bin_column)
    student_data_missing[bin_column] = \
    student_data_missing[feature].apply(lambda row: is_missing(row))
missing_vars = [i for i in describe_vars if i not in numeric_variables]
missing_vars
all_vars += describe_vars
student_data_missing[all_vars].head(10)

ALTERNATIVE MISSINGS STRATEGY:
We should instead determine whether the 
presence of missings on a feature is correlated with the outcome 
or any other predictors.  If not, we may just want to use means, 
drop these cases entirely (less ideal, though perhaps negligible depending),
or assign values at random. As long as they're 'representative' (missings are random),
they shouldn't pose a problem and the means needn't be given special attention.

We may also choose to stratify further and use sample weights and means.


Unnamed: 0,First_name,Age,Age_miss,GPA,GPA_miss,Days_missed,Days_missed_miss,Male,Male_miss
0,Wayne,19.0,0,,1,9.0,0,1,0
1,Ann,18.0,0,3.0,0,,1,0,0
2,George,17.0,0,,1,10.0,0,1,0
3,Jerry,15.0,0,2.0,0,28.0,0,1,0
4,Andrea,,1,2.0,0,29.0,0,0,0
5,Annie,19.0,0,,1,5.0,0,0,0
6,Stephanie,,1,3.0,0,20.0,0,0,0
7,Janet,15.0,0,4.0,0,20.0,0,0,0
8,Betty,17.0,0,3.0,0,23.0,0,0,0
9,Henry,19.0,0,4.0,0,14.0,0,1,0


In [242]:
student_data_missing.describe()

Unnamed: 0,ID,Age,GPA,Days_missed,Male,Male_orig,Grad,Age_miss,GPA_miss,Days_missed_miss,Male_miss
count,1000.0,771.0,779.0,808.0,1000.0,774.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,500.5,16.996109,2.988447,18.011139,0.479,0.485788,0.593,0.229,0.221,0.192,0.0
std,288.819436,1.458067,0.818249,9.629371,0.499809,0.500121,0.491521,0.420399,0.415128,0.39407,0.0
min,1.0,15.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,250.75,16.0,2.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,500.5,17.0,3.0,18.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
75%,750.25,18.0,4.0,27.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
max,1000.0,19.0,4.0,34.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0


In [243]:
threshold = .2
select_by_class = []
select_randomly = []
for var in missing_vars:
    score = student_data_missing[var].corr(student_data_missing['Grad'])
    print('\n{} corr {}: {}'.format(var, 'Grad', score))
    if score >= threshold or  score < (0 - threshold):
        print('\t{} missing are very CORRELATED with Graduation ({})'.format(var, score))
        print('\tDECISION:Stratify imputations to match the class means'.format(var, score))
        var_orig = var.replace('_miss','')
        if var=='Days_missed_miss':
            var_orig = 'Days_missed'
        select_by_class += [var_orig]
    else:
        print('\t{} NOT CORRELATED,\n\tDECISION:Use variable median/mean imputations, \n\trandomize or drop if negligible:\n\t{}'.format(var, score))
        var_orig = var.replace('_miss','')
        if var=='Days_missed_miss':
            var_orig = 'Days_missed'
        select_randomly += [var_orig]
student_data_missing_imputed = student_data.copy()
student_data_missing_imputed = impute_by_class(select_by_class, student_data_missing_imputed)
student_data_missing_imputed = impute_by_class(select_randomly, student_data_missing_imputed)
student_data_missing_imputed.describe()


Age_miss corr Grad: -0.023238079705189228
	Age_miss NOT CORRELATED,
	DECISION:Use variable median/mean imputations, 
	randomize or drop if negligible:
	-0.023238079705189228

GPA_miss corr Grad: 0.441262748682337
	GPA_miss missing are very CORRELATED with Graduation (0.441262748682337)
	DECISION:Stratify imputations to match the class means

Days_missed_miss corr Grad: 0.4038450759150802
	Days_missed_miss missing are very CORRELATED with Graduation (0.4038450759150802)
	DECISION:Stratify imputations to match the class means

Male_miss corr Grad: nan
	Male_miss NOT CORRELATED,
	DECISION:Use variable median/mean imputations, 
	randomize or drop if negligible:
	nan

GPA;
Means for GPA: Graduates 3.5053763440860215 vs. Non-graduates 2.515970515970516:

DAYS_MISSED;
Means for Days_missed: Graduates 16.77556109725686 vs. Non-graduates 19.22850122850123:

AGE;
Means for Age: Graduates 16.95887445887448 vs. Non-graduates 17.051779935275054:

MALE;
Means for Male: Graduates 0.47217537942664417

Unnamed: 0,ID,Age,GPA,Days_missed,Male,Male_orig,Grad
count,1000.0,1000.0,1000.0,1000.0,1000.0,774.0,1000.0
mean,500.5,16.996687,3.102688,17.773908,0.479,0.485788,0.593
std,288.819436,1.280278,0.753304,8.668384,0.499809,0.500121,0.491521
min,1.0,15.0,2.0,2.0,0.0,0.0,0.0
25%,250.75,16.0,2.0,12.0,0.0,0.0,0.0
50%,500.5,17.0,3.0,16.775561,0.0,0.0,1.0
75%,750.25,18.0,4.0,25.0,1.0,1.0,1.0
max,1000.0,19.0,4.0,34.0,1.0,1.0,1.0


In [244]:
print('Problem B1:\n\nThe 2 unknown students, Chris and David, have equal probability\n' \
'of graduating.  This is known from respective higher income counterparts, Adam and Bob, \n' \
'who, despite their difference in income (delta $150,000) have the same probability\n\
of graduating.\n' \
'\nSince Chris and David also have a difference in income of $150,000, in the same direction\n'\
'their relative probablities must also be the same, although not 50%.')

Problem B1:

The 2 unknown students, Chris and David, have equal probability
of graduating.  This is known from respective higher income counterparts, Adam and Bob, 
who, despite their difference in income (delta $150,000) have the same probability
of graduating.

Since Chris and David also have a difference in income of $150,000, in the same direction
their relative probablities must also be the same, although not 50%.


In [245]:
print("Problem B2:\n\nA.i) Yes. The AfAm_Male interaction term tells us\n\
that the odds ratio of graduation of AfAm_Males to AfAm_Females is significantly smaller.\n\
\nA.ii) Likewise, AfAm_Male interaction term tells us\n\
that the odds ratio of graduation of AfAm_Males to NonAfam_Males is \n\
significantly smaller.\n\
\nB) Age isn't a good predictor of graduation.\n\
The coefficients are saying that an increase in age (either by years or years^2)\n\
won't increase the likelihood of graduation and this holds true for ages approaching 130.\n\
\nC) I'd likely drop the Age^2 and one of the gender values so that there aren't\n\
any closely correlated predicors that seem duplicitous.")

Problem B2:

A.i) Yes. The AfAm_Male interaction term tells us
that the odds ratio of graduation of AfAm_Males to AfAm_Females is significantly smaller.

A.ii) Likewise, AfAm_Male interaction term tells us
that the odds ratio of graduation of AfAm_Males to NonAfam_Males is 
significantly smaller.

B) Age isn't a good predictor of graduation.
The coefficients are saying that an increase in age (either by years or years^2)
won't increase the likelihood of graduation and this holds true for ages approaching 130.

C) I'd likely drop the Age^2 and one of the gender values so that there aren't
any closely correlated predicors that seem duplicitous.
