In [37]:
#import packages for EDA
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

In [38]:
#get session & role info
import sagemaker
from sagemaker import utils
from sagemaker.serializers import CSVSerializer
import boto3
import os

from sagemaker.session import Session
from sagemaker import get_execution_role

# Open a SageMaker session and echo the bucket, region and execution role it resolves to.
session = sagemaker.Session()

bucket = session.default_bucket()
print(f"Default Bucket: {bucket}")

# S3 location under the default bucket reserved for this project's outputs.
prefix = 'CapstoneProject'
output_path = f"s3://{bucket}/{prefix}/output/"

region = session.boto_region_name
print(f"AWS Region: {region}")

# IAM role attached to this notebook; handed to the training estimator later on.
role = get_execution_role()
print(f"RoleArn: {role}")

Default Bucket: sagemaker-us-east-1-513238734400
AWS Region: us-east-1
RoleArn: arn:aws:iam::513238734400:role/service-role/AmazonSageMaker-ExecutionRole-20220121T133455


In [40]:
#AutoGluon Training
from ag_model import (
    AutoGluonTraining,
    AutoGluonInferenceModel,
    AutoGluonTabularPredictor,
)

In [13]:
# Read the survey CSV from S3 into a DataFrame and show its (rows, columns) shape.
raw_csv_uri = 's3://sagemaker-studio-rqk547knaq/CapstoneProject/nhes_16_pfi_pu.csv'
data = pd.read_csv(raw_csv_uri)
data.shape

(14075, 822)

In [14]:
# Drop every column from 'MODECOMP' through the last one: weights, imputation flags,
# info on how the survey was completed, and age/sex/grade of non-sampled children
# in the household.
# The column labels are passed explicitly (rather than handing `drop` a DataFrame)
# and the result is reassigned instead of mutated in place.
trailing_columns = data.loc[:, 'MODECOMP':].columns
data = data.drop(columns=trailing_columns)

In [15]:
# Largest value recorded in the days-absent column; displayed as the cell output.
data.loc[:, 'SEABSNT'].max()

364

In [17]:
# Keep only rows that have an entry for days absent from school (-1 marks no entry),
# then show the resulting shape.
has_absence_entry = data['SEABSNT'] != -1
data = data.loc[has_absence_entry]
data.shape

(13523, 372)

In [19]:
# Keep only children who are not homeschooled (HSWHOX == -1) and remove the
# homeschooling-related columns (HSWHOX through HSNATL).
# Done as one non-mutating expression: dropping in place on a frame that was just
# produced by boolean indexing is the pattern that can raise SettingWithCopyWarning.
homeschool_columns = data.loc[:, 'HSWHOX':'HSNATL'].columns
data = data.loc[data['HSWHOX'] == -1].drop(columns=homeschool_columns)

In [24]:
# Drop the ID number column (BASMID) together with QTYPE, then summarise what is left.
# The result is reassigned rather than dropped in place so the filtered frame from the
# previous steps is never mutated.
data = data.drop(columns=['BASMID', 'QTYPE'])
data.describe()

Unnamed: 0,GRADE,SCPUBPRI,DISTASSI,SCHRTSCHL,SNEIGHBRX,SPUBCHOIX,SCONSIDR,SPERFORM,S1STCHOI,SSAMSC,...,ZIP18PO2,ZIPBLHI2,ZIPLOCL,S16CHART,S16NUMST,S16PBPV,S16TYPE,SCHLGRAD,ENGLSPANX,AGE2015
count,13523.0,13523.0,13523.0,13523.0,13523.0,13523.0,13523.0,13523.0,13523.0,13523.0,...,13523.0,13523.0,13523.0,13523.0,13523.0,13523.0,13523.0,13523.0,13523.0,13523.0
mean,9.65873,3.75257,0.921763,1.597796,1.815869,1.942542,1.694225,-0.325889,1.180581,1.028248,...,2.036604,2.658877,22.68964,1.625305,2.786512,1.11351,3.709606,3.036161,1.049841,12.05169
std,3.838806,0.744754,0.771633,0.959289,0.387605,0.772644,0.460752,1.039979,0.384685,0.165687,...,0.967253,1.062233,10.26508,0.975685,1.339795,0.317227,1.056901,1.249384,0.217624,3.787829
min,2.0,1.0,-1.0,-1.0,1.0,1.0,1.0,-1.0,1.0,1.0,...,1.0,1.0,11.0,-9.0,-9.0,1.0,-9.0,-9.0,1.0,3.0
25%,7.0,4.0,1.0,2.0,2.0,1.0,1.0,-1.0,1.0,1.0,...,1.0,2.0,13.0,2.0,2.0,1.0,4.0,2.0,1.0,9.0
50%,10.0,4.0,1.0,2.0,2.0,2.0,2.0,-1.0,1.0,1.0,...,2.0,3.0,21.0,2.0,3.0,1.0,4.0,3.0,1.0,12.0
75%,13.0,4.0,1.0,2.0,2.0,3.0,2.0,1.0,1.0,1.0,...,3.0,4.0,23.0,2.0,4.0,1.0,4.0,4.0,1.0,15.0
max,15.0,4.0,2.0,2.0,2.0,3.0,2.0,2.0,2.0,2.0,...,4.0,4.0,43.0,3.0,5.0,2.0,4.0,5.0,2.0,20.0


In [36]:
# Still far too many features, so only a hand-picked subset is kept: current grade,
# type of school, whether the student stayed in the same school all year, grades,
# AP enrollment, grades repeated, suspensions, expulsions, month & year of birth,
# country of birth, race/ethnicity, sex, ESL enrollment, disability, zip code
# poverty / minority percentage / community type, school enrollment, and age (AGE2015).
feature_columns = [
    'GRADE', 'SCPUBPRI', 'DISTASSI', 'SCHRTSCHL', 'SSAMSC',
    'SEGRADES', 'SEADPLCXX', 'SEREPEAT', 'SESUSOUT', 'SESUSPIN',
    'SEEXPEL', 'CDOBMM', 'CDOBYY', 'CPLCBRTH', 'CMOVEAGE',
    'CHISPAN', 'CAMIND', 'CASIAN', 'CBLACK', 'CPACI',
    'CWHITE', 'CHISPRM', 'CSEX', 'CENGLPRG', 'DISABLTYX',
    'DISBLTY2X', 'RACEETHN', 'RACEETH2', 'ZIP18PO2', 'ZIPBLHI2',
    'ZIPLOCL', 'S16NUMST', 'S16PBPV', 'S16TYPE', 'AGE2015',
]
X = data.loc[:, feature_columns]

# Target is True when total absences over the school year are 15 or more.
absence_counts = data['SEABSNT']
y = absence_counts >= 15

In [47]:
# Split into training and testing sets, stratified by the target variable.
# random_state is fixed so the same rows land in each split on every run; without it
# the exported CSVs (and any model trained on them) change with each execution.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, stratify=y, random_state=42
)

In [22]:
# Build the train/test tables by attaching the label as a 'target' column.
# `assign` returns a new frame, so X_train / X_test stay feature-only instead of being
# silently given a 'target' column through an alias.
train_dataset = X_train.assign(target=y_train)
test_dataset = X_test.assign(target=y_test)

# index=False keeps the row index out of the files; otherwise it is written as an
# unnamed first column and comes back as an extra feature when the CSV is re-read
# with default settings.
# NOTE(review): assumes tabular_train.py reads these files without index_col — confirm.
train_dataset.to_csv('train.csv', index=False)
test_dataset.to_csv('test.csv', index=False)

NameError: name 'X_train' is not defined

In [42]:
# Configure the AutoGluon training estimator: one ml.m5.xlarge instance running
# tabular_train.py with the execution role and region resolved earlier.
training_settings = dict(
    role=role,
    entry_point="tabular_train.py",
    region=region,
    instance_count=1,
    instance_type="ml.m5.xlarge",
    framework_version="0.3.1",
    base_job_name="autogluon-tabular-train",
)
ag = AutoGluonTraining(**training_settings)

In [44]:
# Push the two datasets and the config file to S3 under one timestamped key prefix;
# each upload call hands back the value used as that file's training input.
s3_prefix = f"autogluon_sm/{utils.sagemaker_timestamp()}"
train_input, eval_input, config_input = (
    ag.sagemaker_session.upload_data(path=local_file, key_prefix=s3_prefix)
    for local_file in ("train.csv", "test.csv", "config.yaml")
)

In [None]:
# Start the training job under a unique name, mapping each uploaded input to its channel.
job_name = utils.unique_name_from_base("test-autogluon-image")
input_channels = {"config": config_input, "train": train_input, "test": eval_input}
ag.fit(input_channels, job_name=job_name)

2022-01-24 21:10:42 Starting - Starting the training job...

291