In [191]:
import pandas as pd

# Read the raw UFO sightings CSV into a DataFrame and display it for a first look.
ufo_df = pd.read_csv("ufo_fullset.csv")
ufo_df

Unnamed: 0,reportedTimestamp,eventDate,eventTime,shape,duration,witnesses,weather,firstName,lastName,latitude,longitude,sighting,physicalEvidence,contact,researchOutcome
0,1977-04-04T04:02:23.340Z,1977-03-31,23:46,circle,4,1,rain,Ila,Bashirian,47.329444,-122.578889,Y,N,N,explained
1,1982-11-22T02:06:32.019Z,1982-11-15,22:04,disk,4,1,partly cloudy,Eriberto,Runolfsson,52.664913,-1.034894,Y,Y,N,explained
2,1992-12-07T19:06:52.482Z,1992-12-07,19:01,circle,49,1,clear,Miller,Watsica,38.951667,-92.333889,Y,N,N,explained
3,2011-02-24T21:06:34.898Z,2011-02-21,20:56,disk,13,1,partly cloudy,Clifton,Bechtelar,41.496944,-71.367778,Y,N,N,explained
4,1991-03-09T16:18:45.501Z,1991-03-09,11:42,circle,17,1,mostly cloudy,Jayda,Ebert,47.606389,-122.330833,Y,N,N,explained
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17995,2019-01-18T14:41:45.346Z,2019-01-14,07:25,square,95,10,snow,Taryn,Osinski,42.033333,-87.733333,Y,N,N,unexplained
17996,2016-09-20T23:24:30.488Z,2016-09-14,13:27,circle,55,10,stormy,Derick,Pouros,43.004444,-71.348889,Y,N,N,explained
17997,1977-03-10T13:48:27.305Z,1977-03-06,21:59,circle,39,10,partly cloudy,Judah,Purdy,36.866389,-83.888889,Y,N,N,explained
17998,1971-07-23T13:22:36.190Z,1971-07-18,13:59,square,28,10,snow,Imelda,Botsford,35.385833,-94.398333,Y,N,N,unexplained


In [192]:
# List the distinct shape labels (NaN shows up here too) to plan the encoding.
ufo_df['shape'].unique()

array(['circle', 'disk', 'sphere', 'triangle', 'light', 'pyramid',
       'square', 'box', 'oval', nan], dtype=object)

##  Feature Analysis

EventDate - Important, but only for derived metadata, such as whether the sighting fell on a weekend or a weekday (people may fool around more on weekends). People can make more mistakes at night. Older sightings may be less reliable. 

Shape - Important 

Duration - Important if there is a correlation between how long the object was visible and the research outcome

Witnesses - Important if there is a correlation between the number of witnesses and the research outcome

Names - Remove

Latitude/Long - Important, some places are alien hotspots

Sighting - remove, only one value 

Physical Evidence - important if unexplained sightings have a large number of physical evidence

Contact - important if unexplained sightings have a large number of contact

Weather - make it ordinal since visibility goes down as the weather gets worse


## Feature Engineering

Shape - Can bin into circular/squarish/triangular/other, reduce dimensions from 10 to 4 for one-hot encoding

Duration - Keep
Witnesses - Keep
Lat/Long - Keep 

Weather - Ordinal (assign integers) 
PhysicalEvidence - Binarize 
Contact - Binarize 

Night - Transform EventTime

Drop - ReportedTimestamp, EventDate, EventTime, FirstName, LastName, Sighting.



In [193]:
# Remove the columns that are not used as model features.
unused_columns = ['reportedTimestamp', 'eventDate', 'firstName', 'lastName', 'sighting']
ufo_df = ufo_df.drop(columns=unused_columns)

In [194]:
# Parse the HH:MM event time and keep only the hour of day; the raw
# eventTime column is no longer needed afterwards.
event_times = pd.to_datetime(ufo_df['eventTime'], format='%H:%M')
ufo_df = ufo_df.assign(hour=event_times.dt.hour).drop(columns=['eventTime'])
ufo_df

Unnamed: 0,shape,duration,witnesses,weather,latitude,longitude,physicalEvidence,contact,researchOutcome,hour
0,circle,4,1,rain,47.329444,-122.578889,N,N,explained,23
1,disk,4,1,partly cloudy,52.664913,-1.034894,Y,N,explained,22
2,circle,49,1,clear,38.951667,-92.333889,N,N,explained,19
3,disk,13,1,partly cloudy,41.496944,-71.367778,N,N,explained,20
4,circle,17,1,mostly cloudy,47.606389,-122.330833,N,N,explained,11
...,...,...,...,...,...,...,...,...,...,...
17995,square,95,10,snow,42.033333,-87.733333,N,N,unexplained,7
17996,circle,55,10,stormy,43.004444,-71.348889,N,N,explained,13
17997,circle,39,10,partly cloudy,36.866389,-83.888889,N,N,explained,21
17998,square,28,10,snow,35.385833,-94.398333,N,N,unexplained,13


In [195]:
# night = 0 when the hour is within 6..18 (inclusive), otherwise 1.
# The intermediate hour column is dropped once the flag exists.
is_daytime = ufo_df['hour'].between(6, 18)
ufo_df = ufo_df.assign(night=(~is_daytime).astype('int64')).drop(columns=['hour'])
ufo_df

Unnamed: 0,shape,duration,witnesses,weather,latitude,longitude,physicalEvidence,contact,researchOutcome,night
0,circle,4,1,rain,47.329444,-122.578889,N,N,explained,1
1,disk,4,1,partly cloudy,52.664913,-1.034894,Y,N,explained,1
2,circle,49,1,clear,38.951667,-92.333889,N,N,explained,1
3,disk,13,1,partly cloudy,41.496944,-71.367778,N,N,explained,1
4,circle,17,1,mostly cloudy,47.606389,-122.330833,N,N,explained,0
...,...,...,...,...,...,...,...,...,...,...
17995,square,95,10,snow,42.033333,-87.733333,N,N,unexplained,0
17996,circle,55,10,stormy,43.004444,-71.348889,N,N,explained,0
17997,circle,39,10,partly cloudy,36.866389,-83.888889,N,N,explained,1
17998,square,28,10,snow,35.385833,-94.398333,N,N,unexplained,0


In [196]:
# Re-display the frame; nothing has changed since the previous cell.
ufo_df

Unnamed: 0,shape,duration,witnesses,weather,latitude,longitude,physicalEvidence,contact,researchOutcome,night
0,circle,4,1,rain,47.329444,-122.578889,N,N,explained,1
1,disk,4,1,partly cloudy,52.664913,-1.034894,Y,N,explained,1
2,circle,49,1,clear,38.951667,-92.333889,N,N,explained,1
3,disk,13,1,partly cloudy,41.496944,-71.367778,N,N,explained,1
4,circle,17,1,mostly cloudy,47.606389,-122.330833,N,N,explained,0
...,...,...,...,...,...,...,...,...,...,...
17995,square,95,10,snow,42.033333,-87.733333,N,N,unexplained,0
17996,circle,55,10,stormy,43.004444,-71.348889,N,N,explained,0
17997,circle,39,10,partly cloudy,36.866389,-83.888889,N,N,explained,1
17998,square,28,10,snow,35.385833,-94.398333,N,N,unexplained,0


In [197]:
# One-hot encode the weather category into weather_<value> indicator columns.
# dtype=int pins the indicators to 0/1: pandas >= 2.0 defaults to bool, which
# would be written to the training CSV as True/False instead of numbers.
weather_dummies = pd.get_dummies(ufo_df['weather'], prefix='weather', dtype=int)
weather_dummies

Unnamed: 0,weather_clear,weather_fog,weather_mostly cloudy,weather_partly cloudy,weather_rain,weather_snow,weather_stormy
0,0,0,0,0,1,0,0
1,0,0,0,1,0,0,0
2,1,0,0,0,0,0,0
3,0,0,0,1,0,0,0
4,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...
17995,0,0,0,0,0,1,0
17996,0,0,0,0,0,0,1
17997,0,0,0,1,0,0,0
17998,0,0,0,0,0,1,0


In [198]:
# Append the weather indicator columns; both frames share the same row index,
# so an index-aligned join places each indicator next to its source row.
ufo_df = ufo_df.join(weather_dummies)
ufo_df

Unnamed: 0,shape,duration,witnesses,weather,latitude,longitude,physicalEvidence,contact,researchOutcome,night,weather_clear,weather_fog,weather_mostly cloudy,weather_partly cloudy,weather_rain,weather_snow,weather_stormy
0,circle,4,1,rain,47.329444,-122.578889,N,N,explained,1,0,0,0,0,1,0,0
1,disk,4,1,partly cloudy,52.664913,-1.034894,Y,N,explained,1,0,0,0,1,0,0,0
2,circle,49,1,clear,38.951667,-92.333889,N,N,explained,1,1,0,0,0,0,0,0
3,disk,13,1,partly cloudy,41.496944,-71.367778,N,N,explained,1,0,0,0,1,0,0,0
4,circle,17,1,mostly cloudy,47.606389,-122.330833,N,N,explained,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17995,square,95,10,snow,42.033333,-87.733333,N,N,unexplained,0,0,0,0,0,0,1,0
17996,circle,55,10,stormy,43.004444,-71.348889,N,N,explained,0,0,0,0,0,0,0,1
17997,circle,39,10,partly cloudy,36.866389,-83.888889,N,N,explained,1,0,0,0,1,0,0,0
17998,square,28,10,snow,35.385833,-94.398333,N,N,unexplained,0,0,0,0,0,0,1,0


In [199]:
# The raw weather text is redundant now that the indicator columns exist.
ufo_df = ufo_df.drop(columns=['weather'])

In [200]:
# Show the frame after the weather column was replaced by its indicator columns.
ufo_df

Unnamed: 0,shape,duration,witnesses,latitude,longitude,physicalEvidence,contact,researchOutcome,night,weather_clear,weather_fog,weather_mostly cloudy,weather_partly cloudy,weather_rain,weather_snow,weather_stormy
0,circle,4,1,47.329444,-122.578889,N,N,explained,1,0,0,0,0,1,0,0
1,disk,4,1,52.664913,-1.034894,Y,N,explained,1,0,0,0,1,0,0,0
2,circle,49,1,38.951667,-92.333889,N,N,explained,1,1,0,0,0,0,0,0
3,disk,13,1,41.496944,-71.367778,N,N,explained,1,0,0,0,1,0,0,0
4,circle,17,1,47.606389,-122.330833,N,N,explained,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17995,square,95,10,42.033333,-87.733333,N,N,unexplained,0,0,0,0,0,0,1,0
17996,circle,55,10,43.004444,-71.348889,N,N,explained,0,0,0,0,0,0,0,1
17997,circle,39,10,36.866389,-83.888889,N,N,explained,1,0,0,0,1,0,0,0
17998,square,28,10,35.385833,-94.398333,N,N,unexplained,0,0,0,0,0,0,1,0


In [201]:
# Convert the Y/N flags to 1/0; any value other than 'Y' becomes 0.
for flag_column in ('physicalEvidence', 'contact'):
    ufo_df[flag_column] = ufo_df[flag_column].eq('Y').astype('int64')

ufo_df

Unnamed: 0,shape,duration,witnesses,latitude,longitude,physicalEvidence,contact,researchOutcome,night,weather_clear,weather_fog,weather_mostly cloudy,weather_partly cloudy,weather_rain,weather_snow,weather_stormy
0,circle,4,1,47.329444,-122.578889,0,0,explained,1,0,0,0,0,1,0,0
1,disk,4,1,52.664913,-1.034894,1,0,explained,1,0,0,0,1,0,0,0
2,circle,49,1,38.951667,-92.333889,0,0,explained,1,1,0,0,0,0,0,0
3,disk,13,1,41.496944,-71.367778,0,0,explained,1,0,0,0,1,0,0,0
4,circle,17,1,47.606389,-122.330833,0,0,explained,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17995,square,95,10,42.033333,-87.733333,0,0,unexplained,0,0,0,0,0,0,1,0
17996,circle,55,10,43.004444,-71.348889,0,0,explained,0,0,0,0,0,0,0,1
17997,circle,39,10,36.866389,-83.888889,0,0,explained,1,0,0,0,1,0,0,0
17998,square,28,10,35.385833,-94.398333,0,0,unexplained,0,0,0,0,0,0,1,0


In [202]:
from sklearn import preprocessing

# Encode the target labels as integers. LabelEncoder numbers the sorted
# class names starting from 0; `le` is kept so the mapping can be inspected.
le = preprocessing.LabelEncoder()
le.fit(ufo_df['researchOutcome'])
ufo_df['researchOutcome'] = le.transform(ufo_df['researchOutcome'])
ufo_df

Unnamed: 0,shape,duration,witnesses,latitude,longitude,physicalEvidence,contact,researchOutcome,night,weather_clear,weather_fog,weather_mostly cloudy,weather_partly cloudy,weather_rain,weather_snow,weather_stormy
0,circle,4,1,47.329444,-122.578889,0,0,0,1,0,0,0,0,1,0,0
1,disk,4,1,52.664913,-1.034894,1,0,0,1,0,0,0,1,0,0,0
2,circle,49,1,38.951667,-92.333889,0,0,0,1,1,0,0,0,0,0,0
3,disk,13,1,41.496944,-71.367778,0,0,0,1,0,0,0,1,0,0,0
4,circle,17,1,47.606389,-122.330833,0,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17995,square,95,10,42.033333,-87.733333,0,0,2,0,0,0,0,0,0,1,0
17996,circle,55,10,43.004444,-71.348889,0,0,0,0,0,0,0,0,0,0,1
17997,circle,39,10,36.866389,-83.888889,0,0,0,1,0,0,0,1,0,0,0
17998,square,28,10,35.385833,-94.398333,0,0,2,0,0,0,0,0,0,1,0


In [203]:
# LabelEncoder mapping for researchOutcome (classes are numbered in sorted order):
# explained - 0
# probably - 1
# unexplained - 2

In [204]:
# Drop rows with missing values; only about 2 of ~18,000 samples are affected.
# NOTE(review): this runs after the weather one-hot encoding and after the raw
# weather column was dropped, so a row with a missing weather value would keep
# all-zero weather_* columns instead of being removed here. Confirm weather has no NaNs.

ufo_df = ufo_df.dropna()
ufo_df

Unnamed: 0,shape,duration,witnesses,latitude,longitude,physicalEvidence,contact,researchOutcome,night,weather_clear,weather_fog,weather_mostly cloudy,weather_partly cloudy,weather_rain,weather_snow,weather_stormy
0,circle,4,1,47.329444,-122.578889,0,0,0,1,0,0,0,0,1,0,0
1,disk,4,1,52.664913,-1.034894,1,0,0,1,0,0,0,1,0,0,0
2,circle,49,1,38.951667,-92.333889,0,0,0,1,1,0,0,0,0,0,0
3,disk,13,1,41.496944,-71.367778,0,0,0,1,0,0,0,1,0,0,0
4,circle,17,1,47.606389,-122.330833,0,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17995,square,95,10,42.033333,-87.733333,0,0,2,0,0,0,0,0,0,1,0
17996,circle,55,10,43.004444,-71.348889,0,0,0,0,0,0,0,0,0,0,1
17997,circle,39,10,36.866389,-83.888889,0,0,0,1,0,0,0,1,0,0,0
17998,square,28,10,35.385833,-94.398333,0,0,2,0,0,0,0,0,0,1,0


In [205]:
# Disabled experiment: binning shapes into circular/squarish/triangular groups.
# The code is wrapped in a string literal, so none of it executes; the cell
# only echoes the string back as its output.
'''
cicular_list = [ 'circle', 'disk', 'sphere', 'oval', 'light']
square_list =  ['square', 'box']
triangular_list = ['triangle', 'pyramid',]


ufo_df['shape'] = ufo_df['shape'].transform(lambda x: 'circular' if x in cicular_list else x)
ufo_df['shape'] = ufo_df['shape'].transform(lambda x: 'squarish' if x in square_list else x)
ufo_df['shape'] = ufo_df['shape'].transform(lambda x: 'triangular' if x in triangular_list else x)
ufo_df
'''

"\ncicular_list = [ 'circle', 'disk', 'sphere', 'oval', 'light']\nsquare_list =  ['square', 'box']\ntriangular_list = ['triangle', 'pyramid',]\n\n\nufo_df['shape'] = ufo_df['shape'].transform(lambda x: 'circular' if x in cicular_list else x)\nufo_df['shape'] = ufo_df['shape'].transform(lambda x: 'squarish' if x in square_list else x)\nufo_df['shape'] = ufo_df['shape'].transform(lambda x: 'triangular' if x in triangular_list else x)\nufo_df\n"

In [206]:
# Check the remaining shape labels now that rows with missing values were dropped.
ufo_df['shape'].unique()

array(['circle', 'disk', 'sphere', 'triangle', 'light', 'pyramid',
       'square', 'box', 'oval'], dtype=object)

In [207]:
# Same check as the previous cell; the shape column has not been modified in between.
ufo_df['shape'].unique()

array(['circle', 'disk', 'sphere', 'triangle', 'light', 'pyramid',
       'square', 'box', 'oval'], dtype=object)

In [208]:
# One-hot encode the shape category into shape_<value> indicator columns.
# dtype=int pins the indicators to 0/1: pandas >= 2.0 defaults to bool, which
# would be written to the training CSV as True/False instead of numbers.
shape_dummies = pd.get_dummies(ufo_df['shape'], prefix='shape', dtype=int)
shape_dummies


Unnamed: 0,shape_box,shape_circle,shape_disk,shape_light,shape_oval,shape_pyramid,shape_sphere,shape_square,shape_triangle
0,0,1,0,0,0,0,0,0,0
1,0,0,1,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0
3,0,0,1,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
17995,0,0,0,0,0,0,0,1,0
17996,0,1,0,0,0,0,0,0,0
17997,0,1,0,0,0,0,0,0,0
17998,0,0,0,0,0,0,0,1,0


In [209]:
# Append the shape indicator columns; both frames share the same row index,
# so an index-aligned join places each indicator next to its source row.
ufo_df = ufo_df.join(shape_dummies)
ufo_df

Unnamed: 0,shape,duration,witnesses,latitude,longitude,physicalEvidence,contact,researchOutcome,night,weather_clear,...,weather_stormy,shape_box,shape_circle,shape_disk,shape_light,shape_oval,shape_pyramid,shape_sphere,shape_square,shape_triangle
0,circle,4,1,47.329444,-122.578889,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
1,disk,4,1,52.664913,-1.034894,1,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
2,circle,49,1,38.951667,-92.333889,0,0,0,1,1,...,0,0,1,0,0,0,0,0,0,0
3,disk,13,1,41.496944,-71.367778,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
4,circle,17,1,47.606389,-122.330833,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17995,square,95,10,42.033333,-87.733333,0,0,2,0,0,...,0,0,0,0,0,0,0,0,1,0
17996,circle,55,10,43.004444,-71.348889,0,0,0,0,0,...,1,0,1,0,0,0,0,0,0,0
17997,circle,39,10,36.866389,-83.888889,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
17998,square,28,10,35.385833,-94.398333,0,0,2,0,0,...,0,0,0,0,0,0,0,0,1,0


In [210]:
# The raw shape text is redundant now that the indicator columns exist.
ufo_df = ufo_df.drop(columns=['shape'])

In [211]:
# Show the fully numeric feature frame after the shape column was replaced by indicators.
ufo_df

Unnamed: 0,duration,witnesses,latitude,longitude,physicalEvidence,contact,researchOutcome,night,weather_clear,weather_fog,...,weather_stormy,shape_box,shape_circle,shape_disk,shape_light,shape_oval,shape_pyramid,shape_sphere,shape_square,shape_triangle
0,4,1,47.329444,-122.578889,0,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,0
1,4,1,52.664913,-1.034894,1,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
2,49,1,38.951667,-92.333889,0,0,0,1,1,0,...,0,0,1,0,0,0,0,0,0,0
3,13,1,41.496944,-71.367778,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
4,17,1,47.606389,-122.330833,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17995,95,10,42.033333,-87.733333,0,0,2,0,0,0,...,0,0,0,0,0,0,0,0,1,0
17996,55,10,43.004444,-71.348889,0,0,0,0,0,0,...,1,0,1,0,0,0,0,0,0,0
17997,39,10,36.866389,-83.888889,0,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,0
17998,28,10,35.385833,-94.398333,0,0,2,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [212]:
# Prepare for SageMaker XGBoost CSV input: target in the first column, no headers.
# Swap the target with whichever column currently sits first, so the result
# does not depend on 'duration' (or any other specific column) being at position 0.

cols = list(ufo_df.columns)
target_pos = cols.index('researchOutcome')
cols[0], cols[target_pos] = cols[target_pos], cols[0]
ufo_df = ufo_df[cols]

# Fail loudly if the layout is not what the training container expects.
assert ufo_df.columns[0] == 'researchOutcome'
ufo_df

Unnamed: 0,researchOutcome,witnesses,latitude,longitude,physicalEvidence,contact,duration,night,weather_clear,weather_fog,...,weather_stormy,shape_box,shape_circle,shape_disk,shape_light,shape_oval,shape_pyramid,shape_sphere,shape_square,shape_triangle
0,0,1,47.329444,-122.578889,0,0,4,1,0,0,...,0,0,1,0,0,0,0,0,0,0
1,0,1,52.664913,-1.034894,1,0,4,1,0,0,...,0,0,0,1,0,0,0,0,0,0
2,0,1,38.951667,-92.333889,0,0,49,1,1,0,...,0,0,1,0,0,0,0,0,0,0
3,0,1,41.496944,-71.367778,0,0,13,1,0,0,...,0,0,0,1,0,0,0,0,0,0
4,0,1,47.606389,-122.330833,0,0,17,0,0,0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17995,2,10,42.033333,-87.733333,0,0,95,0,0,0,...,0,0,0,0,0,0,0,0,1,0
17996,0,10,43.004444,-71.348889,0,0,55,0,0,0,...,1,0,1,0,0,0,0,0,0,0
17997,0,10,36.866389,-83.888889,0,0,39,1,0,0,...,0,0,1,0,0,0,0,0,0,0
17998,2,10,35.385833,-94.398333,0,0,28,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [213]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the shuffled rows; the fixed seed keeps the split reproducible.
# test_df is the frame written out as the validation CSV.
train_df, test_df = train_test_split(
    ufo_df,
    shuffle=True,
    random_state=42,
    test_size=0.1,
)

In [214]:
# Write both splits as bare CSV (no index column, no header row).
csv_outputs = (
    (train_df, 'train_preprocessed_ufo_data.csv'),
    (test_df, 'validation_preprocessed_ufo_data.csv'),
)
for split_frame, csv_name in csv_outputs:
    split_frame.to_csv(csv_name, index=False, header=False)

In [215]:
# Upload the two CSVs to the train/ and validation/ prefixes under algorithm-lab in S3.
!aws s3 cp train_preprocessed_ufo_data.csv  s3://preshen-sagemaker-sme-project/algorithm-lab/train/
!aws s3 cp validation_preprocessed_ufo_data.csv s3://preshen-sagemaker-sme-project/algorithm-lab/validation/

upload: ./train_preprocessed_ufo_data.csv to s3://preshen-sagemaker-sme-project/algorithm-lab/train/train_preprocessed_ufo_data.csv
upload: ./validation_preprocessed_ufo_data.csv to s3://preshen-sagemaker-sme-project/algorithm-lab/validation/validation_preprocessed_ufo_data.csv


In [216]:
import sagemaker
import boto3
from sagemaker.amazon.amazon_estimator import get_image_uri 
from sagemaker.session import s3_input, Session
            
    
# Must be the bucket the preprocessed CSVs were uploaded to with `aws s3 cp`;
# otherwise the training channels point at data this notebook never wrote.
bucket = "preshen-sagemaker-sme-project"
prefix = "algorithm-lab"
    
# initialize hyperparameters: 3-class soft-probability objective,
# 50 boosting rounds, 70% row subsampling per round
hyperparameters = {
        "objective":"multi:softprob",
        "num_class": "3",
        "num_round": "50",
        "subsample":"0.7"
        }

# set an output path where the trained model will be saved

output_path = 's3://{}/{}/{}/output'.format(bucket, prefix, 'xgb-built-in')

# this line automatically looks for the XGBoost image URI and builds an XGBoost container.
# specify the repo_version depending on your preference.
xgboost_container = get_image_uri(boto3.Session().region_name,
                          'xgboost', 
                          repo_version='1.0-1')

# construct a SageMaker estimator that calls the xgboost-container
estimator = sagemaker.estimator.Estimator(image_name=xgboost_container, 
                                          hyperparameters=hyperparameters,
                                          role=sagemaker.get_execution_role(),
                                          train_instance_count=1, 
                                          train_instance_type='ml.m5.2xlarge', 
                                          train_volume_size=5, # 5 GB 
                                          output_path=output_path)

# define the data type and paths to the training and validation datasets
content_type = "csv"
train_input = s3_input("s3://{}/{}/{}/".format(bucket, prefix, 'train'), content_type=content_type)
validation_input = s3_input("s3://{}/{}/{}/".format(bucket, prefix, 'validation'), content_type=content_type)


# execute the XGBoost training job
estimator.fit({'train': train_input, 'validation': validation_input})


'get_image_uri' method will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.
Parameter image_name will be renamed to image_uri in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.


2020-11-07 14:08:27 Starting - Starting the training job...
2020-11-07 14:08:28 Starting - Launching requested ML instances......
2020-11-07 14:09:53 Starting - Preparing the instances for training...
2020-11-07 14:10:26 Downloading - Downloading input data...
2020-11-07 14:10:48 Training - Downloading the training image..[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value multi:softprob to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34m[14:11:03] 16198x23 matrix with 372554 entries loaded from /opt/ml/input/data/tra

In [189]:
import sagemaker
import boto3
from sagemaker.amazon.amazon_estimator import get_image_uri 
from sagemaker.session import s3_input, Session
            
    
# Must be the bucket the preprocessed CSVs were uploaded to with `aws s3 cp`;
# otherwise the training channels point at data this notebook never wrote.
bucket = "preshen-sagemaker-sme-project"
prefix = "algorithm-lab"
    
# initialize hyperparameters: 3-class soft-probability objective,
# DART booster, 100 boosting rounds, trees up to depth 10
hyperparameters = {
        "objective":"multi:softprob",
        "num_class": "3",
        "booster": "dart",
        "num_round": "100",
        "max_depth": 10
        }

# set an output path where the trained model will be saved

output_path = 's3://{}/{}/{}/output'.format(bucket, prefix, 'xgb-built-in')

# this line automatically looks for the XGBoost image URI and builds an XGBoost container.
# specify the repo_version depending on your preference.
xgboost_container = get_image_uri(boto3.Session().region_name,
                          'xgboost', 
                          repo_version='1.0-1')

# construct a SageMaker estimator that calls the xgboost-container
estimator = sagemaker.estimator.Estimator(image_name=xgboost_container, 
                                          hyperparameters=hyperparameters,
                                          role=sagemaker.get_execution_role(),
                                          train_instance_count=1, 
                                          train_instance_type='ml.m5.2xlarge', 
                                          train_volume_size=5, # 5 GB 
                                          output_path=output_path)

# define the data type and paths to the training and validation datasets
content_type = "csv"
train_input = s3_input("s3://{}/{}/{}/".format(bucket, prefix, 'train'), content_type=content_type)
validation_input = s3_input("s3://{}/{}/{}/".format(bucket, prefix, 'validation'), content_type=content_type)


# execute the XGBoost training job
estimator.fit({'train': train_input, 'validation': validation_input})


'get_image_uri' method will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.
Parameter image_name will be renamed to image_uri in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.


2020-11-07 13:54:30 Starting - Starting the training job...
2020-11-07 13:54:32 Starting - Launching requested ML instances......
2020-11-07 13:55:37 Starting - Preparing the instances for training...
2020-11-07 13:56:19 Downloading - Downloading input data
2020-11-07 13:56:19 Training - Downloading the training image...
2020-11-07 13:56:46 Training - Training image download completed. Training in progress.[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter booster value dart to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value multi:softprob to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input i


2020-11-07 13:57:09 Uploading - Uploading generated training model
2020-11-07 13:57:09 Completed - Training job completed
Training seconds: 56
Billable seconds: 56


In [190]:
import sagemaker
import boto3
from sagemaker.amazon.amazon_estimator import get_image_uri 
from sagemaker.session import s3_input, Session
            
    
bucket = "preshen-sagemaker-sme-project"
prefix = "algorithm-lab"

# XGBoost settings: 3-class soft-probability objective, tree booster,
# 100 boosting rounds, trees up to depth 10.
hyperparameters = dict(
    objective="multi:softprob",
    num_class="3",
    booster="gbtree",
    num_round="100",
    max_depth=10,
)

# S3 location where the trained model artifact is written.
output_path = 's3://{}/{}/{}/output'.format(bucket, prefix, 'xgb-built-in')

# Look up the built-in XGBoost image (repo version 1.0-1) for the current region.
current_region = boto3.Session().region_name
xgboost_container = get_image_uri(current_region, 'xgboost', repo_version='1.0-1')

# Estimator that runs the XGBoost container on a single ml.m5.2xlarge
# instance with a 5 GB training volume.
estimator_settings = dict(
    image_name=xgboost_container,
    hyperparameters=hyperparameters,
    role=sagemaker.get_execution_role(),
    train_instance_count=1,
    train_instance_type='ml.m5.2xlarge',
    train_volume_size=5,
    output_path=output_path,
)
estimator = sagemaker.estimator.Estimator(**estimator_settings)

# CSV input channels, one S3 prefix per channel name.
content_type = "csv"
train_input, validation_input = (
    s3_input("s3://{}/{}/{}/".format(bucket, prefix, channel), content_type=content_type)
    for channel in ('train', 'validation')
)

# Launch the XGBoost training job.
estimator.fit({'train': train_input, 'validation': validation_input})


'get_image_uri' method will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.
Parameter image_name will be renamed to image_uri in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.


2020-11-07 13:59:51 Starting - Starting the training job...
2020-11-07 13:59:53 Starting - Launching requested ML instances......
2020-11-07 14:01:03 Starting - Preparing the instances for training...
2020-11-07 14:01:47 Downloading - Downloading input data
2020-11-07 14:01:47 Training - Downloading the training image...
2020-11-07 14:02:21 Uploading - Uploading generated training model.[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter booster value gbtree to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value multi:softprob to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINF


2020-11-07 14:02:29 Completed - Training job completed
Training seconds: 48
Billable seconds: 48


# Linear Learner

In [185]:
# Built-in Linear Learner container image for the current region.
ll_container = get_image_uri(boto3.Session().region_name, 'linear-learner')

# Linear Learner settings: multiclass classification over 3 classes.
hyperparameters = dict(
    predictor_type="multiclass_classifier",
    num_classes="3",
)

# Estimator that runs the Linear Learner container on a single ml.m5.2xlarge
# instance with a 5 GB training volume. `output_path` is taken from an
# earlier cell.
ll_estimator_settings = dict(
    image_name=ll_container,
    role=sagemaker.get_execution_role(),
    hyperparameters=hyperparameters,
    train_instance_count=1,
    train_instance_type='ml.m5.2xlarge',
    train_volume_size=5,
    output_path=output_path,
)
ll_estimator = sagemaker.estimator.Estimator(**ll_estimator_settings)

'get_image_uri' method will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.
Parameter image_name will be renamed to image_uri in SageMaker Python SDK v2.


In [187]:
# define the data type and paths to the training and validation datasets
# (`bucket` and `prefix` come from an earlier cell)
content_type = "text/csv"
train_input = s3_input("s3://{}/{}/{}/".format(bucket, prefix, 'train'), content_type=content_type)
validation_input = s3_input("s3://{}/{}/{}/".format(bucket, prefix, 'validation'), content_type=content_type)


# execute the Linear Learner training job
ll_estimator.fit({'train': train_input, 'validation': validation_input})

's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.


2020-11-07 13:39:47 Starting - Starting the training job...
2020-11-07 13:39:50 Starting - Launching requested ML instances......
2020-11-07 13:40:54 Starting - Preparing the instances for training...
2020-11-07 13:41:41 Downloading - Downloading input data...
2020-11-07 13:42:12 Training - Downloading the training image..[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[11/07/2020 13:42:28 INFO 140252411643712] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-input.json: {u'loss_insensitivity': u'0.01', u'epochs': u'15', u'feature_dim': u'auto', u'init_bias': u'0.0', u'lr_scheduler_factor': u'auto', u'num_calibration_samples': u'10000000', u'accuracy_top_k': u'3', u'_num_kv_servers': u'auto', u'use_bias': u'true', u'num_point_for_scaler': u'10000', u'_log_level': u'info', u'quantile': u'0.5', u'bias_lr_mult': u'auto', u'lr_scheduler_step': u'auto', u'init_metho


2020-11-07 13:42:43 Uploading - Uploading generated training model
2020-11-07 13:42:43 Completed - Training job completed
Training seconds: 62
Billable seconds: 62


In [217]:
# Re-run Linear Learner on the 90/10 train/validation split (previously 80/20).

# CSV input channels, one S3 prefix per channel name; `bucket` and `prefix`
# come from an earlier cell.
content_type = "text/csv"
train_input, validation_input = (
    s3_input("s3://{}/{}/{}/".format(bucket, prefix, channel), content_type=content_type)
    for channel in ('train', 'validation')
)

# Launch the Linear Learner training job.
ll_estimator.fit({'train': train_input, 'validation': validation_input})

's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.


2020-11-07 18:09:37 Starting - Starting the training job...
2020-11-07 18:09:39 Starting - Launching requested ML instances......
2020-11-07 18:10:45 Starting - Preparing the instances for training......
2020-11-07 18:11:43 Downloading - Downloading input data...
2020-11-07 18:12:31 Training - Training image download completed. Training in progress..[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[11/07/2020 18:12:34 INFO 140552628516672] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-input.json: {u'loss_insensitivity': u'0.01', u'epochs': u'15', u'feature_dim': u'auto', u'init_bias': u'0.0', u'lr_scheduler_factor': u'auto', u'num_calibration_samples': u'10000000', u'accuracy_top_k': u'3', u'_num_kv_servers': u'auto', u'use_bias': u'true', u'num_point_for_scaler': u'10000', u'_log_level': u'info', u'quantile': u'0.5', u'bias_lr_mult': u'auto', u'lr_scheduler_


2020-11-07 18:12:48 Uploading - Uploading generated training model
2020-11-07 18:12:48 Completed - Training job completed
Training seconds: 65
Billable seconds: 65
