In [1]:
import pandas as pd
from collections import defaultdict
import pickle
import re
# see https://ipython.readthedocs.io/en/stable/interactive/magics.html
%pylab inline
# sets backend to render higher res images
%config InlineBackend.figure_formats = ['retina']
import seaborn as sns
sns.set_style("whitegrid")

Populating the interactive namespace from numpy and matplotlib


# Load the Page Ahead Book Up program information

In [2]:
with open('./data/interim/book_up_dict.pickle', 'rb') as handle:
    bookup_dct = pickle.load(handle)
type(bookup_dct)

collections.defaultdict

# Load the Map Assessment student information
Creates map_df

In [3]:
map_df = pd.read_pickle("./data/interim/map_df.pkl")
print('shape=',map_df.shape)
map_df.columns

shape= (82049, 30)


Index(['StudentID', 'CurrentEnrollmentSchoolID', 'CurrentEnrollmentSchoolName',
       'CurrentGrade', 'TestSchoolYear', 'TestSeason', 'TestSchoolID',
       'TestSchoolName', 'TestGrade', 'SubjectArea', 'TestName', 'RITScore',
       'PercentileRank', 'MetGrowthLastFallToThisFall',
       'MetGrowthLastSpringToThisSpring', 'MetGrowthLastFallToThisSpring',
       'BirthDate', 'Gender', 'RacialEthnicGroup', 'ELLStatus', 'IEPStatus',
       'Student504Status', 'GiftedStatus', 'PrimaryLanguage', 'HomeLanguage',
       'LivingWith', 'USAEntryDate', 'BirthCountry', 'ProjectedGradYear',
       'ExtractSchoolYear'],
      dtype='object')

# Load the Smarter Balance Assessment Score Information
Creates targets_df

In [4]:
targets_df = pd.read_pickle("./data/interim/targets.pkl")
print('shape=',targets_df.shape)
targets_df.columns

shape= (4438, 30)


Index(['StudentID', 'CurrentEnrollmentSchoolID', 'CurrentEnrollmentSchoolName',
       'CurrentGrade', 'TestSchoolYear', 'TestSeason', 'TestSchoolID',
       'TestSchoolName', 'TestGrade', 'SubjectArea', 'TestName', 'AttemptCode',
       'Attempt', 'Score', 'LevelCode', 'MetStandard', 'BirthDate', 'Gender',
       'RacialEthnicGroup', 'ELLStatus', 'IEPStatus', 'Student504Status',
       'GiftedStatus', 'PrimaryLanguage', 'HomeLanguage', 'LivingWith',
       'USAEntryDate', 'BirthCountry', 'ProjectedGradYear',
       'ExtractSchoolYear'],
      dtype='object')

# A function that returns all the data for a given student

In [5]:
def get_student_data(studentID):
    """Gets all records for studentID from the map_df
    -----
    input: a string containing the studentID
    returns: a pandas dataframe of all columns of the map_df for studentID
    """
    df = map_df[map_df.StudentID==studentID]
    return df

# Functions used to build the independent variables

In [6]:
def get_RITs(stu_data,school_year):
    """Gets the mean any available MAP RIT scores for each test season
    (Fall, Winter, Spring) of the 2015-16 school year (1st Grade).
    -----
    Inputs: A dataframe containing all the students data from the map_df
            and the school year in the form of '2015-16'
    Returns: G1_Fall_RIT, G1_Winter_RIT, G1_Spring_RIT for the given year if available.
    """
    G1_Fall_RIT, G1_Winter_RIT, G1_Spring_RIT = None, None, None
    yr_data = stu_data[stu_data.TestSchoolYear==school_year]
    RIT_by_season = yr_data.groupby(['TestSeason']).RITScore.mean()
    for season in RIT_by_season.index:
        if season=='Fall':
            G1_Fall_RIT=RIT_by_season[season]
        if season=='Spring':
            G1_Spring_RIT=RIT_by_season[season]
        if season=='Winter':
            G1_Winter_RIT=RIT_by_season[season]
    return G1_Fall_RIT, G1_Winter_RIT, G1_Spring_RIT

def get_last_RIT(Fall_RIT,Winter_RIT,Spring_RIT):
    """Returns the RIT score from latest in the school year.
    ---
    input: RIT scores for Fall, Winter and Spring
    returns: the season name and the score of the score from latest in the school year.
    """
    if Spring_RIT is not None:
        return 'Spring',Spring_RIT
    elif Winter_RIT is not None:
        return 'Winter',Winter_RIT
    elif Fall_RIT is not None:
        return 'Fall', Fall_RIT
    return None, None

def get_treatments(stu_data):
    """Determines a list of the year and school students would have received 
    Book Up program treatments from Page Ahead.
    -----
    input: A dataframe containing all rows of map_df for a given student
    returns: A list of tuples of the year and school ID a student should have
             received Book Up program books in the summer.
    """
    treatments = []
    # For each year of the student's data...
    for yr in stu_data.TestSchoolYear.unique():
        yr_data = stu_data[stu_data.TestSchoolYear==yr]
        # Get a collection of the schools they tested at...
        gb=yr_data.groupby(['TestSchoolID','TestGrade','TestSeason']).RITScore.mean()
        # Determine which testing record was the last in the school year to estimate
        # which school they were attending...
        last = 0
        last_ind = 0
        for i in range(0,len(gb)):
            position = ['Fall','Winter','Spring'].index(gb.index[i][2])
            if position > last:
                last_ind = i
                last = position
        SchoolID = gb.index[last_ind][0]
        TestGrade = gb.index[last_ind][1]
        # If the school they were attending was being served by the Book Up program
        # of Page Ahead then record the year and school in the treatments list.
        if SchoolID in bookup_dct[yr,TestGrade]:
            treatments.append((int(yr[:4])+1,SchoolID))
    return treatments

# Building the X dataframe:

In [7]:
example = get_student_data(3099442)
example.keys()

Index(['StudentID', 'CurrentEnrollmentSchoolID', 'CurrentEnrollmentSchoolName',
       'CurrentGrade', 'TestSchoolYear', 'TestSeason', 'TestSchoolID',
       'TestSchoolName', 'TestGrade', 'SubjectArea', 'TestName', 'RITScore',
       'PercentileRank', 'MetGrowthLastFallToThisFall',
       'MetGrowthLastSpringToThisSpring', 'MetGrowthLastFallToThisSpring',
       'BirthDate', 'Gender', 'RacialEthnicGroup', 'ELLStatus', 'IEPStatus',
       'Student504Status', 'GiftedStatus', 'PrimaryLanguage', 'HomeLanguage',
       'LivingWith', 'USAEntryDate', 'BirthCountry', 'ProjectedGradYear',
       'ExtractSchoolYear'],
      dtype='object')

The MVP involved gathering the minimum required columns:
* Last_G1_RIT: Last 1st Grade RIT score
* nTreatments: The number of treatments the student should have received.

In [8]:
X_df = pd.DataFrame(columns=['StudentID','G1_Fall_RIT','G1_Winter_RIT','G1_Spring_RIT',\
                             'Last_G1_RIT','Last_G1_RIT_Season',\
                             'Treatments','nTreatments'])
for studentID in targets_df.StudentID:
    Stu_data = get_student_data(studentID)
    G1_Fall_RIT, G1_Winter_RIT, G1_Spring_RIT = get_RITs(Stu_data,'2015-16')
    Last_RIT_Season, Last_RIT = get_last_RIT(G1_Fall_RIT, G1_Winter_RIT, G1_Spring_RIT)
    treatments = get_treatments(Stu_data)
    X_df = X_df.append({'StudentID':studentID,
                        'G1_Fall_RIT':G1_Fall_RIT,
                        'G1_Winter_RIT':G1_Winter_RIT,
                        'G1_Spring_RIT':G1_Spring_RIT,
                        'Last_G1_RIT':Last_RIT,
                        'Last_G1_RIT_Season':Last_RIT_Season,
                        'Treatments':treatments,
                        'nTreatments':int(len(treatments))
                       },ignore_index=True)

X_df.nTreatments = pd.to_numeric(X_df.nTreatments)
X_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4438 entries, 0 to 4437
Data columns (total 8 columns):
StudentID             4438 non-null object
G1_Fall_RIT           1619 non-null object
G1_Winter_RIT         1254 non-null object
G1_Spring_RIT         3571 non-null object
Last_G1_RIT           3625 non-null object
Last_G1_RIT_Season    3625 non-null object
Treatments            4438 non-null object
nTreatments           4438 non-null int64
dtypes: int64(1), object(7)
memory usage: 277.5+ KB


## Adding Gender=Female Feature

In [9]:
example = get_student_data(3099442)
#example = pd.DataFrame(data={'Gender': [None, None, None]})
example['Gender'].value_counts().empty
Gender = example['Gender'].value_counts()
Gender = None if Gender.empty else Gender.idxmax()
Gender

'Female'

In [10]:
if 'Female' in X_df.columns:
    X_df.drop(['Female'], axis=1,inplace=True)
Female = pd.DataFrame(columns=['Female'])
for studentID in targets_df.StudentID:
    Stu_data = get_student_data(studentID)
    Gender = Stu_data['Gender'].value_counts()
    Gender = None if Gender.empty else Gender.idxmax()
    Female = Female.append({'Female':Gender},ignore_index=True)
Female = pd.get_dummies(Female.Female)
X_df['Female'] = Female.Female.copy()
X_df.Female.value_counts(dropna=False)

0    2543
1    1895
Name: Female, dtype: int64

NOTE: 509 rows of the resulting Female dataframe are Female=0 and Male=0 (Gender was None).  So by keeping only Female.Female in the X_df I'm essentially recategorizing the None's as males!

In [11]:
Female[(Female.Female==0) & (Female.Male==0)].shape

(509, 2)

## Add Home Language Is English Feature

In [12]:
example = get_student_data(3099442)
#example = pd.DataFrame(data={'HomeLanguage': [None, None, None]})
example['HomeLanguage'].value_counts().empty
HomeLanguage = example['HomeLanguage'].value_counts()
HomeLanguage = None if HomeLanguage.empty else HomeLanguage.idxmax()
HomeLanguage

'Spanish'

In [13]:
if 'HomeLanIsEng' in X_df.columns:
    X_df.drop(['HomeLanIsEng'], axis=1,inplace=True)
HomeLanIsEng = pd.DataFrame(columns=['HomeLanIsEng'])
for studentID in targets_df.StudentID:
    Stu_data = get_student_data(studentID)
    HomeLanguage = Stu_data['HomeLanguage'].value_counts()
    HomeLanguage = None if HomeLanguage.empty else HomeLanguage.idxmax()
    HomeLanIsEng = HomeLanIsEng.append({'HomeLanIsEng':HomeLanguage},ignore_index=True)
HomeLanIsEng = pd.get_dummies(HomeLanIsEng.HomeLanIsEng)
X_df['HomeLanIsEng'] = HomeLanIsEng.English.copy()
X_df.HomeLanIsEng.value_counts()

1    3053
0    1385
Name: HomeLanIsEng, dtype: int64

In [14]:
HomeLanIsEng.shape

(4438, 59)

## Add A Primary Language Is English Feature

In [15]:
example = get_student_data(3099442)
#example = pd.DataFrame(data={'PrimaryLanguage': [None, None, None]})
example['PrimaryLanguage'].value_counts().empty
PrimaryLanguage = example['PrimaryLanguage'].value_counts()
PrimaryLanguage = None if PrimaryLanguage.empty else PrimaryLanguage.idxmax()
PrimaryLanguage

'Spanish'

In [16]:
if 'PrimaryLanIsEng' in X_df.columns:
    X_df.drop(['PrimaryLanIsEng'], axis=1,inplace=True)
PrimaryLanIsEng = pd.DataFrame(columns=['PrimaryLanIsEng'])
for studentID in targets_df.StudentID:
    Stu_data = get_student_data(studentID)
    PrimaryLanguage = Stu_data['PrimaryLanguage'].value_counts()
    PrimaryLanguage = None if PrimaryLanguage.empty else PrimaryLanguage.idxmax()
    PrimaryLanIsEng = PrimaryLanIsEng.append({'PrimaryLanIsEng':PrimaryLanguage},ignore_index=True)
PrimaryLanIsEng = pd.get_dummies(PrimaryLanIsEng.PrimaryLanIsEng)
X_df['PrimaryLanIsEng'] = PrimaryLanIsEng.English.copy()
X_df.PrimaryLanIsEng.value_counts()

1    3078
0    1360
Name: PrimaryLanIsEng, dtype: int64

In [17]:
print('Number of non-English languages:',len(PrimaryLanIsEng.columns)-1)
PrimaryLanIsEng.columns

Number of non-English languages: 54


Index(['Afrikaans', 'Amharic', 'Arabic', 'Bikol', 'Bilen', 'Bosnian',
       'Bulgarian', 'Burmese', 'Cakchiquel', 'Cambodian', 'Cham',
       'Chinese-Cantonese', 'Chinese-Mandarin', 'Chinese-Taiwanese',
       'Chinese-Unspecified', 'Creole', 'Czech', 'English', 'Estonian',
       'Ethiopic', 'Farsi', 'French', 'Fula', 'German', 'Hebrew, Modern',
       'Ilokano', 'Italian', 'Japanese', 'Karen', 'Khmer', 'Kikuya', 'Korean',
       'Lao', 'Maay', 'Mandingo', 'Maya-Quiche', 'Mien', 'Mongolian', 'Oromo',
       'Portuguese', 'Punjabi', 'Russian', 'Samoan', 'Somali', 'Soninke',
       'Spanish', 'Swahili', 'Tagalog', 'Thai', 'Tigrinya', 'Toishanese',
       'Ukrainian', 'Urdu', 'Vietnamese', 'Visayan'],
      dtype='object')

In [18]:
X_df[(X_df.PrimaryLanIsEng==1)&(X_df.HomeLanIsEng==0)].shape

(89, 11)

In [19]:
X_df[(X_df.PrimaryLanIsEng==0)&(X_df.HomeLanIsEng==1)].shape

(64, 11)

## Add Living With Feature
To keep this simple I'm only going to see if living with Both Parent gives a boost.

In [20]:
map_df.LivingWith.value_counts()

Both Parents       63494
Mother             15353
Father              1362
(Unknown)            698
Grandparent(s)       603
Guardian(s)          223
Foster Parent(s      182
Other Relative(      104
Agency/Social S       15
Alone                  8
Spouse/Partner         7
Name: LivingWith, dtype: int64

In [21]:
example = get_student_data(3099442)
#example = pd.DataFrame(data={'LivingWith': [None, None, None]})
example['LivingWith'].value_counts().empty
LivingWith = example['LivingWith'].value_counts()
LivingWith = None if LivingWith.empty else LivingWith.idxmax()
LivingWith

'Both Parents'

In [22]:
if 'LivingWithBothParents' in X_df.columns:
    X_df.drop(['LivingWithBothParents'], axis=1,inplace=True)
LivingWithBothParents = pd.DataFrame(columns=['LivingWithBothParents'])
for studentID in targets_df.StudentID:
    Stu_data = get_student_data(studentID)
    LivingWith = Stu_data['LivingWith'].value_counts()
    LivingWith = None if LivingWith.empty else LivingWith.idxmax()
    if LivingWith is not None:
        LivingWith = re.sub('[/() ]','',LivingWith)
    LivingWithBothParents = LivingWithBothParents.append({'LivingWithBothParents':LivingWith},ignore_index=True)
LivingWithBothParents = pd.get_dummies(LivingWithBothParents.LivingWithBothParents)
X_df['LivingWithBothParents'] = LivingWithBothParents.BothParents.copy()
X_df.LivingWithBothParents.value_counts()

1    3150
0    1288
Name: LivingWithBothParents, dtype: int64

## Racial/Ethnic Group

In [23]:
map_df.RacialEthnicGroup.value_counts()

White               35163
Black               14904
Multiracial         10877
Asian               10500
Hispanic             9968
Pacific Islander      379
American Indian       258
Name: RacialEthnicGroup, dtype: int64

In [24]:
example = get_student_data(3099442)
#example = pd.DataFrame(data={'Gender': [None, None, None]})
example['RacialEthnicGroup'].value_counts().empty
RacialEthnicGroup = example['RacialEthnicGroup'].value_counts()
RacialEthnicGroup = None if RacialEthnicGroup.empty else RacialEthnicGroup.idxmax()
RacialEthnicGroup

'Hispanic'

In [25]:
if len(list(X_df.filter(regex='RacialEthnicGroup_')))>0:
    X_df.drop(labels=list(X_df.filter(regex='RacialEthnicGroup_')),\
             axis=1,inplace=True)
RacialEthnicGroup = pd.DataFrame(columns=['RacialEthnicGroup'])
for studentID in targets_df.StudentID:
    Stu_data = get_student_data(studentID)
    Race = Stu_data['RacialEthnicGroup'].value_counts()
    Race = None if Race.empty else Race.idxmax()
    RacialEthnicGroup = RacialEthnicGroup.append({'RacialEthnicGroup':Race},ignore_index=True)
RacialEthnicGroup = pd.get_dummies(RacialEthnicGroup.RacialEthnicGroup,\
                                     prefix='RacialEthnicGroup_')
X_df = pd.concat([X_df,RacialEthnicGroup],axis=1)
print("# of new RacialEthnicGroup columns:",\
      len(list(X_df.filter(regex='RacialEthnicGroup_'))))

# of new RacialEthnicGroup columns: 7


Set if condition to True to drop all RacialEthnicGroup columns from X_df

In [26]:
if False:
    X_df.drop(labels=list(X_df.filter(regex='RacialEthnicGroup')),\
              axis=1,inplace=True)
    print(len(list(X_df.filter(regex='RacialEthnicGroup'))))

## School Most Attended
TestSchoolID

In [27]:
#example = get_student_data(3099442)
example = pd.DataFrame(data={'TestSchoolID': [211, 267, 123]})
example['TestSchoolID'].value_counts().empty
TestSchoolID = example['TestSchoolID'].value_counts()
TestSchoolID = None if TestSchoolID.empty else TestSchoolID.idxmax()
TestSchoolID

211

In [28]:
if len(list(X_df.filter(regex='MostAttendedSchool_')))>0:
    X_df.drop(labels=list(X_df.filter(regex='MostAttendedSchool_')),\
             axis=1,inplace=True)
MostAttendedSchoolID = pd.DataFrame(columns=['MostAttendedSchoolID'])
for studentID in targets_df.StudentID:
    Stu_data = get_student_data(studentID)
    TestSchoolID = Stu_data['TestSchoolID'].value_counts()
    TestSchoolID = None if TestSchoolID.empty else TestSchoolID.idxmax()
    MostAttendedSchoolID = MostAttendedSchoolID.append({'MostAttendedSchoolID':TestSchoolID},\
                                                       ignore_index=True)
MostAttendedSchoolID = pd.get_dummies(MostAttendedSchoolID.MostAttendedSchoolID,\
                                     prefix='MostAttendedSchool_')
X_df = pd.concat([X_df,MostAttendedSchoolID],axis=1)
print("# of new MostAttendedSchool columns:",\
      len(list(X_df.filter(regex='MostAttendedSchool_'))))

# of new MostAttendedSchool columns: 70


Set if condition to True to drop all MostAttendedSchool columns from X_df

In [29]:
if False:
    X_df.drop(labels=list(X_df.filter(regex='MostAttendedSchool_')),\
              axis=1,inplace=True)
    print(len(list(X_df.filter(regex='MostAttendedSchool_'))))

# Building the Y dataframe:

In [30]:
Y_df = targets_df[targets_df.StudentID.isin(X_df.StudentID)][['StudentID',
                                                              'Score',
                                                              'LevelCode',
                                                              'MetStandard']]
print(len(Y_df))
Y_df.MetStandard.value_counts()

4438


Y    2879
N    1486
Name: MetStandard, dtype: int64

# Pickle X_df and Y_df

In [31]:
X_df.to_pickle('./data/interim/X_df.pkl')
Y_df.to_pickle('./data/interim/Y_df.pkl')

In [34]:
X_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4438 entries, 0 to 4437
Data columns (total 89 columns):
StudentID                              4438 non-null object
G1_Fall_RIT                            1619 non-null object
G1_Winter_RIT                          1254 non-null object
G1_Spring_RIT                          3571 non-null object
Last_G1_RIT                            3625 non-null object
Last_G1_RIT_Season                     3625 non-null object
Treatments                             4438 non-null object
nTreatments                            4438 non-null int64
Female                                 4438 non-null uint8
HomeLanIsEng                           4438 non-null uint8
PrimaryLanIsEng                        4438 non-null uint8
LivingWithBothParents                  4438 non-null uint8
RacialEthnicGroup__American Indian     4438 non-null uint8
RacialEthnicGroup__Asian               4438 non-null uint8
RacialEthnicGroup__Black               4438 non-null uint8
R