In [28]:
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import operator
import matplotlib.cm as cm
import matplotlib.patches as patches
import math
import time
import json
import requests

base_url = "https://api.genderize.io"
index_ref  = "/?name="

def grab_link_like_person(url, interval=2):
    '''
    Clicks a url address and sets of timer
    to ensure a specified rate limit for url requests
    to the target web server.

    Returns an opened url page to scrape  content.
    '''
    response = requests.get(url)
    time.sleep(interval)
    print(response)
    return response


def grab_gender(row, interval=2, column='First_name', reference_column='Gender', guess_column='Guess'):
    '''
    Requests genderize.io for each row where
    gender is missing (np.nan). 
    Converts answer into a response dictionary
    and returns the 'gender' value to the cell.
    '''
    if isinstance(row[reference_column], float):
        requesturl = base_url + index_ref + row[column]
        responses = json.loads(grab_link_like_person(requesturl, interval).text)
        print(responses['name'], row[guess_column], responses['gender'].title())
        row[guess_column] = responses['gender'].title()
        print(responses['name'], row[guess_column])
        return row[guess_column]
    else:
        row[guess_column] = row[guess_column]
        return row[guess_column]


def convert_na_by_class_mean(row, column, unique_classes=None,
                             student_data=None, feature_by_class_mean=None,
                             reference_column='Grad', INDEX='ID'):
    '''
    Converts missing values with respect to class means.
    '''
    if not unique_classes:
        unique_classes = student_data[reference_column].unique().tolist()
    feature_by_class_mean = student_data.groupby(reference_column)[column].agg('mean').copy()
    if str(row[column])=="nan":
        for level in unique_classes:
            f_mean = feature_by_class_mean[level]
            if row[reference_column] == level:
                row[column] = f_mean
                return row[column]
    else:
        return row[column]


In [4]:
# Data Prep- Read in the data and impute convert string
# categorical variables to numeric codes.
student_csv = 'mock_student_data.csv'
student_data = pd.read_csv(student_csv)
student_data['Male']=student_data.Gender.map({'Female' : 0, 'Male' : 1})
student_data['Grad']=student_data.Graduated.map({'No' : 0, 'Yes' : 1})
print('\nStudent Data has the following dimensions:', student_data.shape)
student_data.head(10)


Student Data has the following dimensions: (1000, 11)


Unnamed: 0,ID,First_name,Last_name,State,Gender,Age,GPA,Days_missed,Graduated,Male,Grad
0,1,Wayne,Boyd,Florida,Male,19.0,,9.0,Yes,1.0,1
1,2,Ann,Matthews,Pennsylvania,Female,18.0,3.0,,Yes,0.0,1
2,3,George,Matthews,Texas,,17.0,,10.0,Yes,,1
3,4,Jerry,Ramos,California,Male,15.0,2.0,28.0,No,1.0,0
4,5,Andrea,Carroll,North Carolina,Female,,2.0,29.0,No,0.0,0
5,6,Annie,Fisher,Virginia,Female,19.0,,5.0,Yes,0.0,1
6,7,Stephanie,Barnes,Oklahoma,Female,,3.0,20.0,No,0.0,0
7,8,Janet,Jenkins,Louisiana,Female,15.0,4.0,20.0,Yes,0.0,1
8,9,Betty,Miller,,,17.0,3.0,23.0,No,,0
9,10,Henry,Woods,Colorado,Male,19.0,4.0,14.0,Yes,1.0,1


In [21]:
# Take a look at some summary statistics for each feature:
number = 0
for feature in student_data.columns.tolist():
    number += 1
    print('{}. Inspect {}:\n'.format(number, feature), student_data[feature].describe())
    x = 1000 - student_data[feature].count()
    print('{} Missing values\n'.format(x))

1. Inspect ID:
 count    1000.000000
mean      500.500000
std       288.819436
min         1.000000
25%       250.750000
50%       500.500000
75%       750.250000
max      1000.000000
Name: ID, dtype: float64
0 Missing values

2. Inspect First_name:
 count     1000
unique     200
top        Amy
freq        12
Name: First_name, dtype: object
0 Missing values

3. Inspect Last_name:
 count     1000
unique     244
top       Ross
freq        13
Name: Last_name, dtype: object
0 Missing values

4. Inspect State:
 count       884
unique       49
top       Texas
freq         97
Name: State, dtype: object
116 Missing values

5. Inspect Gender:
 count        774
unique         2
top       Female
freq         398
Name: Gender, dtype: object
226 Missing values

6. Inspect Age:
 count    771.000000
mean      16.996109
std        1.458067
min       15.000000
25%       16.000000
50%       17.000000
75%       18.000000
max       19.000000
Name: Age, dtype: float64
229 Missing values

7. Inspect GPA:
 c

In [30]:
# Retrieve Genderized Gender for all missing Gender values:
student_data['Guess'] = student_data['Gender']
student_data2 = student_data.copy()
student_data2 = student_data2.apply(lambda row: grab_gender(row,2), axis=1)

<Response [429]>
<Response [429]>


KeyError: ('name', 'occurred at index 2')

In [31]:
student_data['Gender_guess'].head(10)

KeyError: 'Gender_guess'

In [32]:
# Fill in missing values with the 
# mean of the values for that attribute.
student_data_na_byfeaturemean = student_data.copy()
with_null_data = ['Age', 'GPA', 'Days_missed']
for feature in with_null_data:
    print(student_data_na_byfeaturemean[feature].describe())
    f_mean = student_data_na_byfeaturemean[feature].mean()
    print('Mean for {}: {}\n'.format(feature, f_mean))
    student_data_na_byfeaturemean[feature] = student_data_na_byfeaturemean[feature].replace(np.nan, f_mean)
    print(student_data_na_byfeaturemean[feature].describe())

count    771.000000
mean      16.996109
std        1.458067
min       15.000000
25%       16.000000
50%       17.000000
75%       18.000000
max       19.000000
Name: Age, dtype: float64
Mean for Age: 16.996108949416342

count    1000.000000
mean       16.996109
std         1.280089
min        15.000000
25%        16.000000
50%        16.996109
75%        18.000000
max        19.000000
Name: Age, dtype: float64
count    779.000000
mean       2.988447
std        0.818249
min        2.000000
25%        2.000000
50%        3.000000
75%        4.000000
max        4.000000
Name: GPA, dtype: float64
Mean for GPA: 2.988446726572529

count    1000.000000
mean        2.988447
std         0.722092
min         2.000000
25%         2.000000
50%         3.000000
75%         4.000000
max         4.000000
Name: GPA, dtype: float64
count    808.000000
mean      18.011139
std        9.629371
min        2.000000
25%        9.000000
50%       18.000000
75%       27.000000
max       34.000000
Name: Days_mi

In [34]:
# Fill in missing values with a class-conditional mean 
# (where the class is whether they graduated or not).
print('\n\nReplace missings by class means:\n')
reference_column = 'Grad'
student_data_byclass_mean = student_data.copy() 
for feature in with_null_data:
    print('\n{}:\n'.format(feature))
    unique_classes = student_data[reference_column].unique().tolist()
    feature_by_class_mean = student_data.groupby(reference_column)[feature].agg('mean').copy()
    student_data_byclass_mean[feature] = \
        student_data_byclass_mean.apply(lambda row: 
        convert_na_by_class_mean(row, feature, 
        unique_classes=unique_classes, 
        feature_by_class_mean=feature_by_class_mean,
        student_data=student_data), axis=1)
    print(student_data_byclass_mean[feature].describe())



Replace missings by class means:


Age:

count    1000.000000
mean       16.996687
std         1.280278
min        15.000000
25%        16.000000
50%        17.000000
75%        18.000000
max        19.000000
Name: Age, dtype: float64

GPA:

count    1000.000000
mean        3.102688
std         0.753304
min         2.000000
25%         2.000000
50%         3.000000
75%         4.000000
max         4.000000
Name: GPA, dtype: float64

Days_missed:

count    1000.000000
mean       17.773908
std         8.668384
min         2.000000
25%        12.000000
50%        16.775561
75%        25.000000
max        34.000000
Name: Days_missed, dtype: float64


In [52]:
print("ALTERNATE MISSINGS STRATEGY:\nWe should instead try to determine whether the \n\
presence of missings on a feature is correlated with the outcome \n\
or any other predictors.  If not, we may just want to use means, \n\
drop these cases entirely (less ideal, though perhaps negligible depending),\n\
or assign values at random. As long as they're 'representative' (missings are random),\n\
they shouldn't pose a problem. We may also choose to stratify further. \n\
Then we may want to use sample weights and means.")

ALTERNATE MISSINGS STRATEGY:
We should instead try to determine whether the 
presence of missings on a feature is correlated with the outcome 
or any other predictors.  If not, we may just want to use means, 
drop these cases entirely (less ideal, though perhaps negligible depending),
or assign values at random. As long as they're 'representative' (missings are random),
they shouldn't pose a problem. We may also choose to stratify further. 
Then we may want to use sample weights and means.


In [43]:
print('Problem B1:\n\nThe 2 unknown students, Chris and David, have equal probability\n' \
'of graduating.  This is known from respective higher income counterparts, Adam and Bob, \n' \
'who, despite their difference in income (delta $150,000) have the same probability\n\
of graduating.\n' \
'\nSince Chris and David also have a difference in income of $150,000, in the same direction\n'\
'their relative probablities must also be the same, although not 50%.')

Problem B1:

The 2 unknown students, Chris and David, have equal probability
of graduating.  This is known from respective higher income counterparts, Adam and Bob, 
who, despite their difference in income (delta $150,000) have the same probability
of graduating.

Since Chris and David also have a difference in income of $150,000, in the same direction
their relative probablities must also be the same, although not 50%.


In [49]:
print("Problem B2:\n\nA.i) Yes. The AfAm_Male interaction term tells us\n\
that the odds ratio of graduation of AfAm_Males to AfAm_Females is significantly smaller.\n\
\nA.ii) Likewise, AfAm_Male interaction term tells us\n\
that the odds ratio of graduation of AfAm_Males to NonAfam_Males is \n\
significantly smaller.\n\
\nB) Age isn't a good predictor of graduation.\n\
The coefficients are saying that an increase in age (either by years or years^2)\n\
won't increase the likelihood of graduation and this holds true for ages approaching 130.\n\
\nC) I'd likely drop the Age^2 and one of the gender values so that there aren't\n\
any closely correlated predicors that seem duplicitous.")

Problem B2:

A.i) Yes. The AfAm_Male interaction term tells us
that the odds ratio of graduation of AfAm_Males to AfAm_Females is significantly smaller.

A.ii) Likewise, AfAm_Male interaction term tells us
that the odds ratio of graduation of AfAm_Males to NonAfam_Males is 
significantly smaller.

B) Age isn't a good predictor of graduation.
The coefficients are saying that an increase in age (either by years or years^2)
won't increase the likelihood of graduation and this holds true for ages approaching 130.

C) I'd likely drop the Age^2 and one of the gender values so that there aren't
any closely correlated predicors that seem duplicitous.
