In [1]:
import os

import pandas as pd

# Load
Function to import the MAP assessments from csv files which contain the RIT Scores for K-12.  
Function to import the SBAC assessments from its csv file.

In [2]:
def load_map_df_from_csv(start_year,end_year,data_folder='./data/raw/'):
    """Load the yearly MAP assessment CSV extracts into a single DataFrame.

    One file per year is read, named
    '<year>_ADR-301-MAP-Assessment-Scores.csv', for every year from
    start_year to end_year inclusive. The yearly frames are stacked row-wise
    and given a fresh 0..n-1 index.

    Parameters
    ----------
    start_year, end_year : int
        First and last year (inclusive) to load.
    data_folder : str, optional
        Directory holding the CSV files. Defaults to './data/raw/'.

    Returns
    -------
    pandas.DataFrame or None
        The combined data, or None when the year range is empty.
    """
    filename_common = '_ADR-301-MAP-Assessment-Scores.csv'
    yearly_frames = [
        pd.read_csv(os.path.join(data_folder, str(yr)+filename_common),
                    encoding = "ISO-8859-1")
        for yr in range(start_year,end_year+1)
    ]
    if not yearly_frames:
        return None
    # DataFrame.append was removed in pandas 2.0; one concat over the list also
    # avoids re-copying the accumulated frame on every iteration.
    return pd.concat(yearly_frames, ignore_index=True, sort=False)

def load_sbac_df_from_csv(start_year,end_year,data_folder='./data/raw/'):
    """Load the yearly SBAC state-assessment CSV extracts into a single DataFrame.

    One file per year is read, named
    '<year>_ADR-311-State-Assessment-Scores.csv', for every year from
    start_year to end_year inclusive. The yearly frames are stacked row-wise
    and given a fresh 0..n-1 index.

    Parameters
    ----------
    start_year, end_year : int
        First and last year (inclusive) to load.
    data_folder : str, optional
        Directory holding the CSV files. Defaults to './data/raw/'.

    Returns
    -------
    pandas.DataFrame or None
        The combined data, or None when the year range is empty.
    """
    filename_common = '_ADR-311-State-Assessment-Scores.csv'
    yearly_frames = [
        pd.read_csv(os.path.join(data_folder, str(yr)+filename_common),
                    encoding = "ISO-8859-1")
        for yr in range(start_year,end_year+1)
    ]
    if not yearly_frames:
        return None
    # DataFrame.append was removed in pandas 2.0; one concat over the list also
    # avoids re-copying the accumulated frame on every iteration.
    return pd.concat(yearly_frames, ignore_index=True, sort=False)

Load the testing data:

In [3]:
# MAP extracts for 2015 through 2018; SBAC extract for 2018 only.
map_df = load_map_df_from_csv(2015,2018)
map_student_count = map_df.StudentID.nunique()
print(format(map_student_count, ","),"K-12 students of MAP data loaded.")
sbac_df = load_sbac_df_from_csv(2018,2018)
sbac_student_count = sbac_df.StudentID.nunique()
print(format(sbac_student_count, ","),"K-12 students of SBAC data loaded.")

41,211 K-12 students of MAP data loaded.
29,917 K-12 students of SBAC data loaded.


# Clean
Function to clean the K-2 data:
* Drop columns originally related to personally identifiable information: LastName, FirstName, PhoneNumber
* Drop discontinued data column: OnTrackToGraduate
* Remove all rows that contain SubjectArea='Mathematics'
* Remove all rows for kids in grades 3-12.
* Correct column data types.

In [4]:
def clean_map_df(map_df):
    """Return a cleaned K-2, non-Mathematics subset of the raw MAP data.

    The frame passed in is left untouched; a new frame is returned, so the
    raw data stays available to the caller.

    Steps:
    * drop the LastName, FirstName, PhoneNumber and OnTrackToGraduate columns
    * drop rows whose SubjectArea is 'Mathematics'
    * keep only rows whose CurrentGrade is 'K', '1' or '2'
    * cast the ID and RIT score columns to int32, and the percentile and
      growth columns to nullable Int32
    * replace the '\\x96' character in TestSchoolYear / ExtractSchoolYear
      with '-' (presumably a Windows-1252 en dash read as ISO-8859-1)
    * parse BirthDate and USAEntryDate as datetimes; unparseable
      USAEntryDate values become NaT

    Raises
    ------
    KeyError
        If any of the four dropped columns is missing from map_df.
    """
    unwanted_columns = ['LastName','FirstName','PhoneNumber','OnTrackToGraduate']
    not_math = map_df.SubjectArea != 'Mathematics'
    is_k2 = map_df.CurrentGrade.isin(['K','1','2'])
    # Non-inplace drop: the caller's frame keeps all of its columns.
    typed = (
        map_df.loc[not_math & is_k2]
        .drop(columns=unwanted_columns)
        .astype({'StudentID':'int32',
                 'CurrentEnrollmentSchoolID':'int32',
                 'TestSchoolID':'int32',
                 'RITScore':'int32',
                 'PercentileRank':pd.Int32Dtype(),
                 'MetGrowthLastFallToThisFall':pd.Int32Dtype(),
                 'MetGrowthLastSpringToThisSpring':pd.Int32Dtype(),
                 'MetGrowthLastFallToThisSpring':pd.Int32Dtype()})
    )
    # assign() replaces the existing columns in place (same column order) on a
    # new frame, so no write ever reaches a slice of the input.
    return typed.assign(
        TestSchoolYear=typed.TestSchoolYear.str.replace('\x96','-',regex=False),
        ExtractSchoolYear=typed.ExtractSchoolYear.str.replace('\x96','-',regex=False),
        BirthDate=pd.to_datetime(typed.BirthDate),
        USAEntryDate=pd.to_datetime(typed.USAEntryDate,errors='coerce'),
    )

# I've commented this out for now while I was troubleshooting why I can't find the
# 2015 kindergarteners in the SBAC file.
# def clean_sbac_df(SBAC_df):
#     SBAC_df.drop(columns=['LastName','FirstName','PhoneNumber','OnTrackToGraduate'],\
#                 inplace=True)
#     SBAC_df = SBAC_df[SBAC_df.SubjectArea == 'Reading/ELA'].copy()
#     SBAC_df = SBAC_df[SBAC_df.TestGrade == 3]
#     SBAC_df.BirthDate = pd.to_datetime(SBAC_df.BirthDate)
#     SBAC_df.USAEntryDate = pd.to_datetime(SBAC_df.USAEntryDate,errors='coerce')
#     SBAC_df.TestSchoolYear = SBAC_df.TestSchoolYear.str.replace('\x96','-')
#     SBAC_df.ExtractSchoolYear = SBAC_df.ExtractSchoolYear.str.replace('\x96','-')
#     return SBAC_df

Clean the MAP (K-2) data:

In [5]:
# Rebinds map_df to the cleaned frame. Re-running this cell without reloading
# the raw data first raises a KeyError, because clean_map_df drops columns
# that are already gone from the cleaned frame.
map_df = clean_map_df(map_df)
map_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 82049 entries, 3 to 224858
Data columns (total 30 columns):
StudentID                          82049 non-null int32
CurrentEnrollmentSchoolID          82049 non-null int32
CurrentEnrollmentSchoolName        82049 non-null object
CurrentGrade                       82049 non-null object
TestSchoolYear                     82049 non-null object
TestSeason                         82049 non-null object
TestSchoolID                       82049 non-null int32
TestSchoolName                     82049 non-null object
TestGrade                          82049 non-null object
SubjectArea                        82049 non-null object
TestName                           82049 non-null object
RITScore                           82049 non-null int32
PercentileRank                     82048 non-null Int32
MetGrowthLastFallToThisFall        8673 non-null Int32
MetGrowthLastSpringToThisSpring    19367 non-null Int32
MetGrowthLastFallToThisSpring      14522 no

I may want to open this dataframe in Numbers:

In [6]:
#map_df.to_csv('./data/interim/map_df.csv')

In [75]:
# Kindergarten RIT score counts per test season and school year.
kinder_rows = map_df[map_df.CurrentGrade=='K']
kinder_rows.groupby(['CurrentGrade','TestSeason','TestSchoolYear']).RITScore.count()

CurrentGrade  TestSeason  TestSchoolYear
K             Fall        2015-16            860
                          2016-17            832
                          2017-18            485
                          2018-19            516
              Spring      2015-16           2556
                          2016-17           2463
                          2017-18           2419
              Winter      2015-16           4346
                          2016-17           4326
                          2017-18           4211
                          2018-19           4259
Name: RITScore, dtype: int64

Interesting that there are a lot more Kindergarteners tested in Winter and few in the Fall.  What is that all about?

In [56]:
# 2015-16 kindergarten RIT score counts per school and season (first 25 groups).
is_kinder_2015 = (map_df.CurrentGrade=='K') & (map_df.TestSchoolYear=='2015-16')
scores_per_school_season = map_df[is_kinder_2015].groupby(['TestSchoolID','TestSeason']).RITScore.count()
scores_per_school_season.iloc[:25]

TestSchoolID  TestSeason
201           Winter        103
202           Winter         45
203           Winter         69
204           Fall           88
              Spring         83
              Winter         87
205           Spring         72
              Winter         73
207           Fall           40
              Spring         41
              Winter         38
208           Fall           70
              Spring         72
              Winter         70
209           Winter        121
211           Winter         97
212           Spring         40
              Winter         43
215           Spring         61
              Winter         55
218           Spring          1
              Winter         46
219           Fall           39
              Spring         42
              Winter         42
Name: RITScore, dtype: int64

Schools definitely test kids with different frequencies!  Are they spreading the testing out over multiple testing seasons or are they retesting kids at each testing season?

In [58]:
# First 15 rows and first 12 columns of the 2015-16 kindergarten records
# taken at test school 204.
at_school_204 = (
    (map_df.CurrentGrade=='K')
    & (map_df.TestSchoolYear=='2015-16')
    & (map_df.TestSchoolID==204)
)
map_df[at_school_204].iloc[:15,:12]

Unnamed: 0,StudentID,CurrentEnrollmentSchoolID,CurrentEnrollmentSchoolName,CurrentGrade,TestSchoolYear,TestSeason,TestSchoolID,TestSchoolName,TestGrade,SubjectArea,TestName,RITScore
628,3787446,204,Daniel Bagley Elementary,K,2015-16,Fall,204,Daniel Bagley Elementary,K,Reading,Reading K-2 - Common Core 2010,150
630,3787446,204,Daniel Bagley Elementary,K,2015-16,Spring,204,Daniel Bagley Elementary,K,Reading,Reading K-2 - Common Core 2010,178
632,3787446,204,Daniel Bagley Elementary,K,2015-16,Winter,204,Daniel Bagley Elementary,K,Reading,Reading K-2 - Common Core 2010,166
693,3391445,204,Daniel Bagley Elementary,K,2015-16,Fall,204,Daniel Bagley Elementary,K,Reading,Reading K-2 - Common Core 2010,156
695,3391445,204,Daniel Bagley Elementary,K,2015-16,Spring,204,Daniel Bagley Elementary,K,Reading,Reading K-2 - Common Core 2010,179
697,3391445,204,Daniel Bagley Elementary,K,2015-16,Winter,204,Daniel Bagley Elementary,K,Reading,Reading K-2 - Common Core 2010,164
700,3322445,204,Daniel Bagley Elementary,K,2015-16,Fall,204,Daniel Bagley Elementary,K,Reading,Reading K-2 - Common Core 2010,146
702,3322445,204,Daniel Bagley Elementary,K,2015-16,Spring,204,Daniel Bagley Elementary,K,Reading,Reading K-2 - Common Core 2010,162
704,3322445,204,Daniel Bagley Elementary,K,2015-16,Winter,204,Daniel Bagley Elementary,K,Reading,Reading K-2 - Common Core 2010,146
1471,3204445,204,Daniel Bagley Elementary,K,2015-16,Fall,204,Daniel Bagley Elementary,K,Reading,Reading K-2 - Common Core 2010,149


Well at least Daniel Bagley tests each kid in each season.

In [76]:
# Distinct students tested per grade, season and school year.
season_groups = map_df.groupby(['CurrentGrade','TestSeason','TestSchoolYear'])
season_groups.StudentID.nunique()

CurrentGrade  TestSeason  TestSchoolYear
1             Fall        2015-16           2132
                          2016-17           2299
                          2017-18           2104
                          2018-19           2197
              Spring      2015-16           4447
                          2016-17           4469
                          2017-18           4183
              Winter      2015-16           1615
                          2016-17           1717
                          2017-18           1376
                          2018-19           1245
2             Fall        2015-16           2326
                          2016-17           2425
                          2017-18           2098
                          2018-19           2260
              Spring      2015-16           4154
                          2016-17           4346
                          2017-18           4111
              Winter      2015-16           1506
                          20

In [36]:
# Mean RIT score per grade, season and school year.
grade_season_year = map_df.groupby(['CurrentGrade','TestSeason','TestSchoolYear'])
grade_season_year.RITScore.mean()

CurrentGrade  TestSeason  TestSchoolYear
1             Fall        2015-16           164.819418
                          2016-17           164.668552
                          2017-18           162.963878
                          2018-19           164.008648
              Spring      2015-16           182.655948
                          2016-17           181.134034
                          2017-18           181.757112
              Winter      2015-16           172.505263
                          2016-17           169.976704
                          2017-18           171.947674
                          2018-19           170.859438
2             Fall        2015-16           179.739467
                          2016-17           181.244948
                          2017-18           178.886136
                          2018-19           181.165487
              Spring      2015-16           194.033702
                          2016-17           193.702485
                        

From this it is highly likely that in the MAP score data fall comes before winter which comes before spring.

In [59]:
# Mean RIT score per student and season for 2015-16 kindergarten rows
# (first 50 groups).
kinder_2015_rows = map_df[(map_df.TestSchoolYear=='2015-16') & (map_df.CurrentGrade=='K')]
per_student_season = kinder_2015_rows.groupby(['StudentID','TestSeason']).RITScore.mean()
per_student_season.iloc[:50]

StudentID  TestSeason
3000448    Winter        166
3000457    Winter        160
3001440    Spring        162
           Winter        161
3001441    Spring        161
3001452    Fall          139
           Spring        160
           Winter        155
3001459    Winter        167
3002441    Spring        149
           Winter        143
3002444    Winter        170
3002445    Winter        147
3003440    Spring        160
           Winter        165
3003444    Spring        163
           Winter        155
3003446    Winter        162
3003448    Winter        161
3004413    Spring        151
           Winter        144
3004440    Winter        167
3004441    Winter        161
3004452    Spring        120
           Winter        120
3005440    Winter        172
3005444    Fall          165
           Spring        188
           Winter        170
3005446    Winter        132
3006441    Winter        161
3006444    Winter        155
3006445    Fall          159
           Spring    

Individual kids definitely get tested in multiple seasons in some schools!

For the MAP data files:

In [70]:
# Count MAP rows whose grade at test time differs from / equals the current grade.
grade_differs = map_df.TestGrade != map_df.CurrentGrade
grade_matches = map_df.TestGrade == map_df.CurrentGrade
print('#Rows were TestGrade!=CurrentGrade:',len(map_df[grade_differs]))
print('#Rows were TestGrade==CurrentGrade:',len(map_df[grade_matches]))

#Rows were TestGrade!=CurrentGrade: 65
#Rows were TestGrade==CurrentGrade: 81984


For the SBAC data file:

In [73]:
# Count SBAC rows whose test grade is / is not one below the current grade.
prior_grade = sbac_df.CurrentGrade-1
off_by_other = sbac_df.TestGrade != prior_grade
one_grade_back = sbac_df.TestGrade == prior_grade
print('#Rows where TestGrade!=CurrentGrade-1:',len(sbac_df[off_by_other]))
print('#Rows where TestGrade==CurrentGrade-1:',len(sbac_df[one_grade_back]))

#Rows where TestGrade!=CurrentGrade-1: 325
#Rows where TestGrade==CurrentGrade-1: 65115


In [77]:
def get_student_data(studentID, df=None):
    """Return every row recorded for one student.

    Parameters
    ----------
    studentID : int
        Value to match against the StudentID column.
    df : pandas.DataFrame, optional
        Frame to search; it must have a StudentID column. When omitted, the
        notebook-level map_df is used, so existing one-argument calls behave
        as before.

    Returns
    -------
    pandas.DataFrame
        The matching rows with their original index; empty if there are none.
    """
    if df is None:
        # Looked up at call time, so it follows whatever map_df is currently bound to.
        df = map_df
    return df[df.StudentID==studentID]

Let's look at some samples of students' data:

In [102]:
# Take the StudentID of one randomly sampled row and show all of that student's rows.
random_student_id = map_df.StudentID.sample().iloc[0]
get_student_data(random_student_id)

Unnamed: 0,StudentID,CurrentEnrollmentSchoolID,CurrentEnrollmentSchoolName,CurrentGrade,TestSchoolYear,TestSeason,TestSchoolID,TestSchoolName,TestGrade,SubjectArea,...,IEPStatus,Student504Status,GiftedStatus,PrimaryLanguage,HomeLanguage,LivingWith,USAEntryDate,BirthCountry,ProjectedGradYear,ExtractSchoolYear
18526,3588445,292,Hazel Wolf K-8 School,K,2015-16,Spring,292,Hazel Wolf K-8 School,K,Reading,...,N,N,Not Eligible,English,English,Both Parents,NaT,USA,(n/a),2015-16
18528,3588445,292,Hazel Wolf K-8 School,K,2015-16,Winter,292,Hazel Wolf K-8 School,K,Reading,...,N,N,Not Eligible,English,English,Both Parents,NaT,USA,(n/a),2015-16
79234,3588445,292,Hazel Wolf K-8 School,1,2016-17,Spring,292,Hazel Wolf K-8 School,1,Reading,...,N,N,Not Eligible,English,English,Both Parents,NaT,USA,(n/a),2016-17
79236,3588445,292,Hazel Wolf K-8 School,1,2016-17,Winter,292,Hazel Wolf K-8 School,1,Reading,...,N,N,Not Eligible,English,English,Both Parents,NaT,USA,(n/a),2016-17
146429,3588445,292,Hazel Wolf K-8 School,2,2017-18,Spring,292,Hazel Wolf K-8 School,2,Reading,...,N,N,Not Eligible,English,English,Both Parents,NaT,USA,(n/a),2017-18
146431,3588445,292,Hazel Wolf K-8 School,2,2017-18,Winter,292,Hazel Wolf K-8 School,2,Reading,...,N,N,Not Eligible,English,English,Both Parents,NaT,USA,(n/a),2017-18


In [150]:
# Number of distinct LivingWith values recorded per student, then the ID of
# one randomly chosen student with more than one.
living_with_situations = map_df.groupby(['StudentID'])['LivingWith'].nunique()
multiple_situations = living_with_situations[living_with_situations>1]
multiple_situations.sample().index[0]

3506410

In [143]:
# LivingWith history of one randomly chosen student with more than one recorded value.
sampled_student = living_with_situations[living_with_situations>1].sample()
get_student_data(sampled_student.index[0]).LivingWith

49845     Agency/Social S
49846     Agency/Social S
122858             Mother
122860             Mother
122894             Mother
Name: LivingWith, dtype: object

In [173]:
# One row per distinct (school ID, school name) pair found in the MAP data,
# plus two lookup views: by ID and by name.
school_key_columns = ['CurrentEnrollmentSchoolID','CurrentEnrollmentSchoolName']
pair_counts = map_df.groupby(school_key_columns).size().reset_index(name='Freq')
schools_df = pair_counts.drop(columns=['Freq'])
school_names = schools_df.set_index('CurrentEnrollmentSchoolID')
school_IDs = schools_df.set_index('CurrentEnrollmentSchoolName')

Series([], Name: CurrentEnrollmentSchoolID, dtype: int64)

Confirming that there are no duplicate school numbers in the collection:

In [186]:
# schools_df has one row per distinct (ID, name) pair, so an ID that sits on
# more than one row is shared by several school names. Count rows per ID:
# counting distinct IDs inside a group keyed by that same ID is always 1 and
# could never flag a duplicate.
uniqueIDs = schools_df.groupby(['CurrentEnrollmentSchoolID']).CurrentEnrollmentSchoolID.count()
uniqueIDs[uniqueIDs>1]

Series([], Name: CurrentEnrollmentSchoolID, dtype: int64)

This is how to refer to a school name from its ID #:

In [185]:
# Look the name up by column label; positional [0] on a label-indexed Series
# is deprecated in recent pandas.
school_names.loc[201, 'CurrentEnrollmentSchoolName']

'Adams Elementary'

Taking the list of school names from the Seattle Onboarding History.xlsx workbook, I'll figure out which school IDs these schools have:

In [187]:
# School names to look up in the MAP data.
# NOTE(review): 'Graham HIll Elementary' is spelled with a capital I —
# presumably copied verbatim from the source list; confirm before correcting.
schoolNames = ['Bailey Gatzert Elementary',
'Concord International',
'Dearborn Park International School',
'Dunlap Elementary School',
'Emerson Elementary',
'Graham HIll Elementary',
'Hawthorne Elementary School',
'Highland Park Elementary',
'John Muir Elementary',
'Maple Elementary',
'Martin Luther King Jr. Elementary',
'Northgate Elementary School',
'Rainier View Elementary',
'Sanislo Elementary',
'Van Asselt Elementary',
'West Seattle Elementary',
'Wing Luke Elementary',
'Roxhill Elementary School']

In [261]:
# For each listed school, try its words left to right and print every MAP
# school name containing the first word that matches anything.
for name in schoolNames:
    for part in name.split():
        matches = school_names.CurrentEnrollmentSchoolName.str.contains(part)
        if not matches.any():
            continue
        print(name,'\n', school_names[matches].CurrentEnrollmentSchoolName)
        break

Bailey Gatzert Elementary 
 CurrentEnrollmentSchoolID
226    Bailey Gatzert Elementary
Name: CurrentEnrollmentSchoolName, dtype: object
Concord International 
 CurrentEnrollmentSchoolID
215    Concord International School
Name: CurrentEnrollmentSchoolName, dtype: object
Dearborn Park International School 
 CurrentEnrollmentSchoolID
251    Dearborn Park International School
Name: CurrentEnrollmentSchoolName, dtype: object
Dunlap Elementary School 
 CurrentEnrollmentSchoolID
219    Dunlap Elementary
Name: CurrentEnrollmentSchoolName, dtype: object
Emerson Elementary 
 CurrentEnrollmentSchoolID
221    Emerson Elementary
Name: CurrentEnrollmentSchoolName, dtype: object
Graham HIll Elementary 
 CurrentEnrollmentSchoolID
220    Graham Hill Elementary
Name: CurrentEnrollmentSchoolName, dtype: object
Hawthorne Elementary School 
 CurrentEnrollmentSchoolID
233    Hawthorne Elementary
Name: CurrentEnrollmentSchoolName, dtype: object
Highland Park Elementary 
 CurrentEnrollmentSchoolID
235    Hig

Clean the SBAC data:

In [None]:
# Convenience cell: reload the raw SBAC extract (e.g. before re-running the cleaning step).
sbac_df = load_sbac_df_from_csv(2018,2018)
sbac_student_count = sbac_df.StudentID.nunique()
print(sbac_student_count," K-12 students of SBAC data loaded.")

Notice that I'm __*not*__ filtering to only include 3rd graders:

In [None]:
def clean_sbac_df(SBAC_df):
    """Return a cleaned Reading/ELA subset of the raw SBAC data.

    The frame passed in is left untouched; a new frame is returned, so the
    raw data stays available to the caller.

    Steps:
    * drop the LastName, FirstName, PhoneNumber and OnTrackToGraduate columns
    * keep only rows whose SubjectArea is 'Reading/ELA'
    * parse BirthDate and USAEntryDate as datetimes; unparseable
      USAEntryDate values become NaT
    * replace the '\\x96' character in TestSchoolYear / ExtractSchoolYear
      with '-'

    Rows are deliberately NOT restricted to TestGrade == 3.

    Raises
    ------
    KeyError
        If any of the four dropped columns is missing from SBAC_df.
    """
    unwanted_columns = ['LastName','FirstName','PhoneNumber','OnTrackToGraduate']
    # Non-inplace drop: the caller's frame keeps all of its columns.
    reading = (
        SBAC_df.loc[SBAC_df.SubjectArea == 'Reading/ELA']
        .drop(columns=unwanted_columns)
    )
    # assign() replaces the existing columns in place (same column order) on a
    # new frame, so no write ever reaches a slice of the input.
    return reading.assign(
        BirthDate=pd.to_datetime(reading.BirthDate),
        USAEntryDate=pd.to_datetime(reading.USAEntryDate,errors='coerce'),
        TestSchoolYear=reading.TestSchoolYear.str.replace('\x96','-',regex=False),
        ExtractSchoolYear=reading.ExtractSchoolYear.str.replace('\x96','-',regex=False),
    )

In [None]:
# Rebinds sbac_df to the cleaned frame.
sbac_df = clean_sbac_df(sbac_df)
# Schema dump, currently disabled.
#sbac_df.info()

In [None]:
# Show the first TestSchoolYear value (presumably to check the exact string
# format used in the filters that follow).
map_df.TestSchoolYear.iloc[0]

In [None]:
# Distinct students with a 2015-16 kindergarten MAP row.
is_kinder_2015 = (map_df.TestSchoolYear=='2015-16') & (map_df.CurrentGrade=='K')
map_df.loc[is_kinder_2015, 'StudentID'].nunique()

In [None]:
# Distinct students in the SBAC data.
sbac_df['StudentID'].nunique()

In [None]:
# NOTE(review): the name says "kinder", but the filter selects rows with
# CurrentGrade=='1' in 2015-16 — confirm which cohort is intended.
kinder2015 = map_df[(map_df.TestSchoolYear=='2015-16') & (map_df.CurrentGrade=='1')]
# SBAC rows belonging to those students, counted by SBAC TestGrade.
sbac_df[sbac_df.StudentID.isin(kinder2015.StudentID)].TestGrade.value_counts()

In [None]:
# SBAC row counts per CurrentGrade, ordered by grade.
grade_counts = sbac_df['CurrentGrade'].value_counts()
grade_counts.sort_index()

In [None]:
# Distinct students whose CurrentGrade is 4 in the SBAC data.
current_fourth_graders = sbac_df[sbac_df.CurrentGrade==4]
current_fourth_graders.StudentID.nunique()

In [None]:
# MAP row counts per (school ID, school name) pair.
school_pair_columns = ['CurrentEnrollmentSchoolID','CurrentEnrollmentSchoolName']
schools = map_df[school_pair_columns]
schools.groupby(school_pair_columns).size()

In [None]:
# SBAC row counts per TestSchoolYear.
sbac_df['TestSchoolYear'].value_counts()

In [None]:
def has_bookup_for_2015_kinder(school):
    """
    input
    """
    return 1 if school in []