In [1]:
import pandas as pd
from collections import defaultdict
import pickle

Load the previously pickled setup objects: the Book Up lookup dictionary, the MAP test records (map_df), and the target records (targets_df).

In [39]:
# Load the pickled lookup that is later indexed by (TestSchoolYear, TestGrade)
# and yields a list of school IDs.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('./data/interim/book_up_dict.pickle', 'rb') as pickle_file:
    bookup_dct = pickle.load(pickle_file)
type(bookup_dct)

collections.defaultdict

In [14]:
# MAP test records, one row per student test event (includes RITScore).
map_df = pd.read_pickle("./data/interim/map_df.pkl")
print('shape=', map_df.shape)
map_df.columns

shape= (82049, 30)


Index(['StudentID', 'CurrentEnrollmentSchoolID', 'CurrentEnrollmentSchoolName',
       'CurrentGrade', 'TestSchoolYear', 'TestSeason', 'TestSchoolID',
       'TestSchoolName', 'TestGrade', 'SubjectArea', 'TestName', 'RITScore',
       'PercentileRank', 'MetGrowthLastFallToThisFall',
       'MetGrowthLastSpringToThisSpring', 'MetGrowthLastFallToThisSpring',
       'BirthDate', 'Gender', 'RacialEthnicGroup', 'ELLStatus', 'IEPStatus',
       'Student504Status', 'GiftedStatus', 'PrimaryLanguage', 'HomeLanguage',
       'LivingWith', 'USAEntryDate', 'BirthCountry', 'ProjectedGradYear',
       'ExtractSchoolYear'],
      dtype='object')

In [15]:
# Target records: same student/test columns as map_df, but with
# Score / LevelCode / MetStandard in place of the RIT columns.
targets_df = pd.read_pickle("./data/interim/targets.pkl")
print('shape=', targets_df.shape)
targets_df.columns

shape= (4438, 30)


Index(['StudentID', 'CurrentEnrollmentSchoolID', 'CurrentEnrollmentSchoolName',
       'CurrentGrade', 'TestSchoolYear', 'TestSeason', 'TestSchoolID',
       'TestSchoolName', 'TestGrade', 'SubjectArea', 'TestName', 'AttemptCode',
       'Attempt', 'Score', 'LevelCode', 'MetStandard', 'BirthDate', 'Gender',
       'RacialEthnicGroup', 'ELLStatus', 'IEPStatus', 'Student504Status',
       'GiftedStatus', 'PrimaryLanguage', 'HomeLanguage', 'LivingWith',
       'USAEntryDate', 'BirthCountry', 'ProjectedGradYear',
       'ExtractSchoolYear'],
      dtype='object')

In [185]:
def get_student_data(studentID, source_df=None):
    """Gets all records for studentID from a table of test records.
    -----
    input: studentID - the identifier to look up; it is compared with ==
                       against the StudentID column, so it must be of the
                       same type as the values stored there
           source_df - optional dataframe to search; when omitted the
                       notebook-level map_df is used
    returns: a pandas dataframe holding every column of the searched frame
             for the rows whose StudentID equals studentID (empty when
             nothing matches)
    """
    # Fall back to the notebook-level table only when no frame is supplied,
    # so the helper can also be used on other frames.
    if source_df is None:
        source_df = map_df
    return source_df[source_df['StudentID'] == studentID]

Example use of get_student_data with a StudentID drawn at random from targets_df:
* StudentID 3099442 was a helpful case while developing the get_treatments function.

In [247]:
# Draw one StudentID at random from targets_df and pull that student's rows.
# No random_state is given, so a different student may come up on each run.
random_id = targets_df.StudentID.sample().values[0]
example = get_student_data(random_id)
example

Unnamed: 0,StudentID,CurrentEnrollmentSchoolID,CurrentEnrollmentSchoolName,CurrentGrade,TestSchoolYear,TestSeason,TestSchoolID,TestSchoolName,TestGrade,SubjectArea,...,IEPStatus,Student504Status,GiftedStatus,PrimaryLanguage,HomeLanguage,LivingWith,USAEntryDate,BirthCountry,ProjectedGradYear,ExtractSchoolYear
36521,3099442,211,Frantz Coe Elementary,1,2015-16,Spring,211,Frantz Coe Elementary,1,Reading,...,N,N,Not Eligible,Spanish,Spanish,Both Parents,NaT,USA,(n/a),2015-16
91366,3099442,267,Roxhill Elementary,2,2016-17,Fall,211,Frantz Coe Elementary,2,Reading,...,N,N,Not Eligible,Spanish,Spanish,Both Parents,NaT,USA,(n/a),2016-17
91368,3099442,267,Roxhill Elementary,2,2016-17,Spring,267,Roxhill Elementary,2,Reading,...,N,N,Not Eligible,Spanish,Spanish,Both Parents,NaT,USA,(n/a),2016-17


In [41]:
example.iloc[0].TestGrade

'1'

In [42]:
bookup_dct[example.iloc[0].TestSchoolYear,example.iloc[0].TestGrade]

[215, 251, 219, 221, 220, 233, 275, 267]

In [270]:
# Mean RIT score per (school year, test school, season) for the example student.
# NOTE(review): the name `sum` shadows Python's built-in sum() from here on.
season_groups = example.groupby(['TestSchoolYear','TestSchoolID','TestSeason'])
sum = season_groups.RITScore.mean()

In [271]:
sum

TestSchoolYear  TestSchoolID  TestSeason
2015-16         211           Spring        176
2016-17         211           Fall          157
                267           Spring        184
Name: RITScore, dtype: int32

In [272]:
# Same aggregation restricted to a single school year, keyed by
# (test school, test grade, season).
in_2016_17 = example.TestSchoolYear=='2016-17'
singleYear = example[in_2016_17]
gb = singleYear.groupby(['TestSchoolID','TestGrade','TestSeason']).RITScore.mean()
gb.index.levels

FrozenList([[211, 267], ['2'], ['Fall', 'Spring']])

In [273]:
gb

TestSchoolID  TestGrade  TestSeason
211           2          Fall          157
267           2          Spring        184
Name: RITScore, dtype: int32

In [274]:
sum.index.levels

FrozenList([['2015-16', '2016-17'], [211, 267], ['Fall', 'Spring']])

In [275]:
# NOTE(review): gb holds only the 2016-17 groups while `sum` spans every
# school year, so sum[i] is not the score belonging to gb.index[i]; the
# printed label/value pairs are mismatched. Presumably gb.iloc[i] was
# intended here — confirm before relying on this output.
for i in range(0,len(gb)):
    print(gb.index[i],sum[i])

(211, '2', 'Fall') 176
(267, '2', 'Spring') 157


In [146]:
example.TestSchoolYear.unique()

array(['2015-16', '2016-17'], dtype=object)

In [302]:
# Prototype of the per-year treatment lookup, run on the `example` student.
season_order = ['Fall','Winter','Spring']
Treatments = []
for yr in example.TestSchoolYear.unique():
    yr_data = example[example.TestSchoolYear==yr]
    gb = yr_data.groupby(['TestSchoolID','TestGrade','TestSeason']).RITScore.mean()
    # Locate the group tested latest in the school year; on ties the first
    # group encountered is kept.
    last, last_ind = 0, 0
    for i, group_key in enumerate(gb.index):
        position = season_order.index(group_key[2])
        if position > last:
            last, last_ind = position, i
    SchoolID, TestGrade = gb.index[last_ind][:2]
    # Record (following calendar year, school) when that school appears in
    # the lookup for this year and grade.
    if SchoolID in bookup_dct[yr,TestGrade]:
        Treatments.append((int(yr[:4])+1,SchoolID))
print(Treatments,len(Treatments))

[(2017, 267)] 1


In [200]:
def get_RITs(stu_data,school_year):
    """Gets the mean RIT score for each test season (Fall, Winter, Spring)
    of one school year.
    -----
    Inputs: A dataframe of a student's records with TestSchoolYear,
            TestSeason and RITScore columns, and the school year to
            summarise in the form of '2015-16'
    Returns: the Fall, Winter and Spring mean RIT scores for that year, in
             that order; a season with no records in the year is None.
    """
    in_year = stu_data.TestSchoolYear==school_year
    season_means = stu_data[in_year].groupby(['TestSeason']).RITScore.mean()
    seasons_present = set(season_means.index)
    # Look each season up only when it actually occurs in the grouped result.
    fall_rit, winter_rit, spring_rit = (
        season_means[name] if name in seasons_present else None
        for name in ('Fall', 'Winter', 'Spring')
    )
    return fall_rit, winter_rit, spring_rit

In [208]:
get_RITs(example,'2015-16')

(158, 170, 175)

In [199]:
def get_last_RIT(Fall_RIT,Winter_RIT,Spring_RIT):
    """Picks the RIT score taken latest in the school year.
    ---
    input: RIT scores for Fall, Winter and Spring (None where unavailable)
    returns: the season name and score of the latest season that has a
             score, or (None, None) when all three are None.
    """
    # Ordered from latest to earliest so the first available score wins.
    latest_first = (
        ('Spring', Spring_RIT),
        ('Winter', Winter_RIT),
        ('Fall', Fall_RIT),
    )
    for season_name, score in latest_first:
        if score is not None:
            return season_name, score
    return None, None

In [310]:
def get_treatments(stu_data, bookup=None):
    """Determines a list of the year and school where a student would have
    received Book Up program treatments from Page Ahead.
    -----
    input: stu_data - a dataframe containing all rows of map_df for a given
                      student (TestSchoolYear, TestSchoolID, TestGrade and
                      TestSeason columns are used)
           bookup   - optional mapping from (school year, test grade) to the
                      collection of school IDs served that year; when
                      omitted the notebook-level bookup_dct is used
    returns: A list of (year, school ID) tuples, one per school year in
             which the student's last-tested school appears in the lookup.
             The year is the calendar year following the first year named
             in the school-year string (e.g. '2016-17' -> 2017).
    raises: ValueError if a TestSeason other than Fall/Winter/Spring occurs.
    """
    if bookup is None:
        bookup = bookup_dct
    # Seasons in school-year order; a later position means later in the year.
    season_order = ('Fall', 'Winter', 'Spring')
    treatments = []
    # For each year of the student's data...
    for yr in stu_data.TestSchoolYear.unique():
        yr_data = stu_data[stu_data.TestSchoolYear==yr]
        # Get the distinct (school, grade, season) combinations they tested in.
        # Only the group keys are needed, so size() replaces the score mean.
        group_keys = yr_data.groupby(['TestSchoolID','TestGrade','TestSeason']).size().index
        # A missing year or missing key values leave nothing to inspect;
        # skip instead of failing on an empty index.
        if len(group_keys) == 0:
            continue
        # Use the record from latest in the year to estimate which school
        # they were attending; max() keeps the first group on ties.
        SchoolID, TestGrade, _ = max(
            group_keys, key=lambda group_key: season_order.index(group_key[2])
        )
        # If the school they were attending was being served by the Book Up
        # program then record the year and school in the treatments list.
        # .get() avoids inserting a new key into a defaultdict on every miss.
        if SchoolID in bookup.get((yr, TestGrade), ()):
            treatments.append((int(yr[:4])+1,SchoolID))
    return treatments

In [311]:
treat=get_treatments(example)
treat

[(2017, 267)]

In [307]:
# Build one feature row per target student: first-grade (2015-16) RIT scores
# by season, the latest available score, and the Book Up treatments.
# Rows are collected in a list and turned into a dataframe once;
# DataFrame.append was removed in pandas 2.0 and grew the frame quadratically.
feature_columns = ['StudentID','G1_Fall_RIT','G1_Winter_RIT','G1_Spring_RIT',
                   'Last_G1_RIT','Last_G1_RIT_Season','Treatments','nTreatments']
feature_rows = []
# NOTE(review): only the first 10 target students are processed here —
# presumably a development subset; confirm before using X_df downstream.
for studentID in targets_df.StudentID[:10]:
    Stu_data = get_student_data(studentID)
    G1_Fall_RIT, G1_Winter_RIT, G1_Spring_RIT = get_RITs(Stu_data,'2015-16')
    Last_RIT_Season, Last_RIT = get_last_RIT(G1_Fall_RIT, G1_Winter_RIT, G1_Spring_RIT)
    treatments = get_treatments(Stu_data)
    feature_rows.append({'StudentID':studentID,
                         'G1_Fall_RIT':G1_Fall_RIT,
                         'G1_Winter_RIT':G1_Winter_RIT,
                         'G1_Spring_RIT':G1_Spring_RIT,
                         'Last_G1_RIT':Last_RIT,
                         'Last_G1_RIT_Season':Last_RIT_Season,
                         'Treatments':treatments,
                         'nTreatments':len(treatments)
                        })
X_df = pd.DataFrame(feature_rows, columns=feature_columns)

In [308]:
X_df

Unnamed: 0,StudentID,G1_Fall_RIT,G1_Winter_RIT,G1_Spring_RIT,Last_G1_RIT,Last_G1_RIT_Season,Treatments,nTreatments
0,3967736,,166.0,178.0,178.0,Spring,[],0
1,3381735,,162.0,173.0,173.0,Spring,[],0
2,3567417,,,196.0,196.0,Spring,[],0
3,3130417,,,211.0,211.0,Spring,[],0
4,3670417,179.0,187.0,192.0,192.0,Spring,[],0
5,3623417,,,,,,[],0
6,3560413,164.0,,172.0,172.0,Spring,[],0
7,3540413,163.0,,189.0,189.0,Spring,[],0
8,3163413,186.0,198.0,199.0,199.0,Spring,[],0
9,3158413,,,187.0,187.0,Spring,[],0
