# Sample import of KS2 data into a pandas dataframe

In [1]:
# Import the required libraries

import pandas as pd
import scipy.stats

## Import the LEA data

In [2]:
leas_df = pd.read_csv('data/2015-2016/la_and_region_codes_meta.csv')
leas_df.head()

Unnamed: 0,LEA,LA Name,REGION,REGION NAME
0,841,Darlington,1,North East A
1,840,County Durham,1,North East A
2,805,Hartlepool,1,North East A
3,806,Middlesbrough,1,North East A
4,807,Redcar and Cleveland,1,North East A


# Import the KS2 data

Most of the field names are given in the `ks2_meta` file, so we'll use that to keep track of the types of various columns. 

In [3]:
ks2cols = pd.read_csv('data/2015-2016/ks2_meta.csv')
ks2cols['Field Name'] = ks2cols['Field Name'].apply(lambda r: r.strip(),)
ks2cols

Unnamed: 0,Column,Field Name,Label/Description
0,1,RECTYPE,Record type (1=mainstream school; 2=special sc...
1,2,ALPHAIND,Alphabetic index
2,3,LEA,Local authority number
3,4,ESTAB,Establishment number
4,5,URN,School unique reference number
5,6,SCHNAME,School/Local authority name
6,7,ADDRESS1,School address (1)
7,8,ADDRESS2,School address (2)
8,9,ADDRESS3,School address (3)
9,10,TOWN,School town


Some columns contain integers, but _**pandas**_ will treat any numeric column with `na` values as `float64`, due to NumPy's number type hierarchy. 

In [4]:
int_cols = [c for c in ks2cols['Field Name'] 
            if c.startswith('T')
            if c not in ['TOWN', 'TELNUM', 'TKS1AVERAGE']]
int_cols += ['RECTYPE', 'ALPHAIND', 'LEA', 'ESTAB', 'URN', 'URN_AC', 'ICLOSE']
int_cols += ['READ_AVERAGE', 'GPS_AVERAGE', 'MAT_AVERAGE']

Some columns contain percentages. We'll convert these to floating point numbers on import.

Note that we also need to handle the case of `SUPP` and `NEW` in the data.

In [5]:
def p2f(x):
    if x.strip('%').isnumeric():
        return float(x.strip('%'))/100
    elif x in ['SUPP', 'NEW', 'LOWCOV', 'NA', '']:
        return 0.0
    else:
        return x

These are the columns to try to convert from percentages. Note that we can be generous here, as columns like PCODE (postcode) will return the original value if the conversion fails.

In [6]:
percent_cols = [f for f in ks2cols['Field Name'] if f.startswith('P')]
percent_cols += ['WRITCOV', 'MATCOV', 'READCOV'] 
percent_cols += ['PTMAT_HIGH', 'PTREAD_HIGH', 'PSENELSAPK', 'PSENELK', 'PTGPS_HIGH']
percent_converters = {c: p2f for c in percent_cols}

In [7]:
ks2_df = pd.read_csv('data/2015-2016/england_ks2final.csv', 
                   na_values=['SUPP', 'NEW', 'LOWCOV', 'NA', ''],
                   converters=percent_converters)

Drop the summary rows, keeping just the rows for mainstream and special schools.

In [8]:
ks2_df = ks2_df[(ks2_df['RECTYPE'] == 1) | (ks2_df['RECTYPE'] == 2)]

Convert everything to numbers, if possible.

In [9]:
ks2_df = ks2_df.apply(pd.to_numeric, errors='ignore')

Merge the LEA data into the school data

In [10]:
ks2_df = pd.merge(ks2_df, leas_df, on=['LEA'])
ks2_df.head().T

Unnamed: 0,0,1,2,3,4
RECTYPE,1,1,1,1,1
ALPHAIND,53372,11156,11160,11256,16366
LEA,201,202,202,202,202
ESTAB,3614,3323,3327,2842,2184
URN,100000,100028,100029,130342,100013
SCHNAME,Sir John Cass's Foundation Primary School,"Christ Church Primary School, Hampstead",Christ Church School,Christopher Hatton Primary School,Edith Neville Primary School
ADDRESS1,St James's Passage,Christ Church Hill,Redhill Street,38 Laystall Street,174 Ossulston Street
ADDRESS2,Duke's Place,,Camden,,
ADDRESS3,,,,,
TOWN,London,London,London,London,London


# Getting a feel for the data


In [11]:
!head -5 'data/2015-2016/england_ks2final.csv'

RECTYPE,ALPHAIND,LEA,ESTAB,URN,SCHNAME,ADDRESS1,ADDRESS2,ADDRESS3,TOWN,PCODE,TELNUM,URN_AC,SCHNAME_AC,OPEN_AC,NFTYPE,ICLOSE,RELDENOM,AGERANGE,CONFEXAM,TAB15,TAB1618,TOTPUPS,TPUPYEAR,TELIG,BELIG,GELIG,PBELIG,PGELIG,TKS1AVERAGE,TKS1GROUP_L,PTKS1GROUP_L,TKS1GROUP_M,PTKS1GROUP_M,TKS1GROUP_H,PTKS1GROUP_H,TKS1GROUP_NA,PTKS1GROUP_NA,TFSM6CLA1A,PTFSM6CLA1A,TNotFSM6CLA1A,PTNotFSM6CLA1A,TEALGRP2,PTEALGRP2,TMOBN,PTMOBN,TSENELSE,PSENELSE,PTRWM_EXP,PTRWM_HIGH,READPROG,READPROG_LOWER,READPROG_UPPER,READCOV,WRITPROG,WRITPROG_LOWER,WRITPROG_UPPER,WRITCOV,MATPROG,MATPROG_LOWER,MATPROG_UPPER,MATCOV,PTREAD_EXP,PTREAD_HIGH,PTREAD_AT,READ_AVERAGE,PTGPS_EXP,PTGPS_HIGH,PTGPS_AT,GPS_AVERAGE,PTMAT_EXP,PTMAT_HIGH,PTMAT_AT,MAT_AVERAGE,PTWRITTA_EXP,PTWRITTA_HIGH,PTWRITTA_WTS,PTWRITTA_AD,PTSCITA_EXP,PTSCITA_AD,PTREADTA_EXP,PTREADTA_AD,PTMATTA_EXP,PTMATTA_AD,PTRWM_EXP_B,PTRWM_EXP_G,PTRWM_EXP_L,PTRWM_EXP_M,PTRWM_EXP_H,PTRWM_EXP_FSM6CLA1A,PTRWM_EXP_NotFSM6CLA1A,DIFFN_RWM_EXP,PTRWM_EXP_EAL,PTRWM_EXP_MOBN,PTRWM_HIGH_B,

Decide it would help the analysis to use mongodb

In [12]:
!/usr/bin/mongoimport --port 27351 --drop --db ks2 --collection ks2_data \
    --type csv --headerline --ignoreBlanks \
    --file data/2015-2016/england_ks2final.csv

2018-03-22T13:05:01.533+0000	connected to: localhost:27351
2018-03-22T13:05:01.533+0000	dropping: ks2.ks2_data
2018-03-22T13:05:04.534+0000	[##############..........] ks2.ks2_data	10.1MB/16.5MB (61.0%)
2018-03-22T13:05:06.338+0000	[########################] ks2.ks2_data	16.5MB/16.5MB (100.0%)
2018-03-22T13:05:06.338+0000	imported 16316 documents


In [13]:
# import the required libraries
import pymongo
import bson

In [14]:
# Open a connection to the Mongo server
client = pymongo.MongoClient('mongodb://localhost:27351/')

In [15]:
# open the imported databse and collection
db = client.ks2
ks2_data = db.ks2_data

In [16]:
# check the number of documents matches those given above

In [17]:
ks2_data.find().count()

16316

In [18]:
# look at one document
ks2_data.find_one()

{'BELIG': 302432,
 'DIFFN_MATPROG': 0,
 'DIFFN_READPROG': 0,
 'DIFFN_RWM_EXP': 0,
 'DIFFN_RWM_HIGH': 0,
 'DIFFN_WRITPROG': 0,
 'GELIG': 289836,
 'GPS_AVERAGE': 104,
 'GPS_AVERAGE_FSM6CLA1A': 102,
 'GPS_AVERAGE_H': 110,
 'GPS_AVERAGE_L': 94,
 'GPS_AVERAGE_M': 103,
 'GPS_AVERAGE_NotFSM6CLA1A': 105,
 'MATPROG_B': 0,
 'MATPROG_B_LOWER': 0,
 'MATPROG_B_UPPER': 0,
 'MATPROG_EAL': 0,
 'MATPROG_EAL_LOWER': 0,
 'MATPROG_EAL_UPPER': 0,
 'MATPROG_FSM6CLA1A': 0,
 'MATPROG_FSM6CLA1A_LOWER': 0,
 'MATPROG_FSM6CLA1A_UPPER': 0,
 'MATPROG_G': 0,
 'MATPROG_G_LOWER': 0,
 'MATPROG_G_UPPER': 0,
 'MATPROG_H': 0,
 'MATPROG_H_LOWER': 0,
 'MATPROG_H_UPPER': 0,
 'MATPROG_L': 0,
 'MATPROG_L_LOWER': 0,
 'MATPROG_L_UPPER': 0,
 'MATPROG_M': 0,
 'MATPROG_MOBN': 0,
 'MATPROG_MOBN_LOWER': 0,
 'MATPROG_MOBN_UPPER': 0,
 'MATPROG_M_LOWER': 0,
 'MATPROG_M_UPPER': 0,
 'MATPROG_NotFSM6CLA1A': 0,
 'MATPROG_NotFSM6CLA1A_LOWER': 0,
 'MATPROG_NotFSM6CLA1A_UPPER': 0,
 'MAT_AVERAGE': 103,
 'MAT_AVERAGE_FSM6CLA1A': 101,
 'MAT_AVERA

In [21]:
# lets quickly look at these as a dataframe
results_df = pd.DataFrame(list(ks2_data.find())).head()

Unnamed: 0,ADDRESS1,ADDRESS2,ADDRESS3,AGERANGE,ALPHAIND,BELIG,CONFEXAM,DIFFN_MATPROG,DIFFN_READPROG,DIFFN_RWM_EXP,...,WRITPROG_MOBN,WRITPROG_MOBN_LOWER,WRITPROG_MOBN_UPPER,WRITPROG_M_LOWER,WRITPROG_M_UPPER,WRITPROG_NotFSM6CLA1A,WRITPROG_NotFSM6CLA1A_LOWER,WRITPROG_NotFSM6CLA1A_UPPER,WRITPROG_UPPER,_id
0,,,,,,302432,,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0,0,0,,5ab3a9fd20d68895d0684be1
1,,,,,,299318,,0,0,0,...,0.1,0.0,0.0,0.0,0.0,0.1,0,0,,5ab3a9fd20d68895d0684be2
2,St James's Passage,Duke's Place,,3-11,53372.0,16,,2.7,0.6,23,...,2.2,-0.1,4.5,-0.8,4.8,3.6,0.5,6.7,4.5,5ab3a9fd20d68895d0684be3
3,,,,,,16,,2.7,0.6,23,...,2.2,-0.1,4.5,-0.8,4.8,3.6,0.5,6.7,4.5,5ab3a9fd20d68895d0684be4
4,Christ Church Hill,,,4-11,11156.0,16,,SUPP,SUPP,SUPP,...,4.0,1.1,6.9,0.1,7.3,SUPP,SUPP,SUPP,6.9,5ab3a9fd20d68895d0684be5


There quite a few NaNs in there but before cleaning I will decide on which column information I am interested in.

With there being so many different columns it is quite hard to read them all

In [35]:
# get a subset of data to look at
# DataFrame containing School Type, Perceentage of math expected standars and Reading expecting standard
type_math_read_df = pd.DataFrame(list(ks2_data.find({},{'_id':0, 'NFTYPE': 1, 'PTMAT_EXP':1, 'PTMAT_HIGH':1, 'PTREAD_EXP':1, 'PTREAD_HIGH':1})))
type_math_read_df.head()

Unnamed: 0,NFTYPE,PTMAT_EXP,PTMAT_HIGH,PTREAD_EXP,PTREAD_HIGH
0,,70%,17%,66%,19%
1,,70%,17%,66%,19%
2,VA,100%,18%,93%,18%
3,,100%,18%,93%,18%
4,VA,90%,43%,86%,48%


In [37]:
# Container containing the mean values of each measure for each school type
means_df = type_math_read_df.groupby('NFTYPE')['PTMAT_EXP','PTMAT_HIGH', 'PTREAD_EXP', 'PTREAD_HIGH'].mean()
means_df

DataError: No numeric types to aggregate

In [39]:
type_math_read_df['PTMAT_EXP'].describe()

count     15769
unique       98
top        SUPP
freq        483
Name: PTMAT_EXP, dtype: object