In [13]:
from __future__ import print_function
import json
import os
import pickle
import sys
import traceback
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
import re
import numpy as np

In [2]:
def preprocess(df, encoder):
    train = df[['Status', 'Levels', 'Rooms', 'Materials', 'Engagement', 'source']]
    train_df = pd.get_dummies(train)
    train_df = train_df.rename(columns = {'Status_no longer leaking, wet <1 day': 'Status_no longer leaking, wet less than 1 day',
                                     'Status_no longer leaking, wet >1 day': 'Status_no longer leaking, wet more than 1 day'})
    train_df.columns=train_df.columns.str.replace(r'[^0-9a-zA-Z ]', ' ', regex=True)

In [3]:
prefix = 'C:/ml_projects/pm_projects/crawford/bdt_digital_desk/ml-ops/triage-nb/sagemaker/opt/ml/'

input_path = prefix + 'input/data'
output_path = os.path.join(prefix, 'output')
model_path = os.path.join(prefix, 'model')
param_path = os.path.join(prefix, 'input/config/hyperparameters.json')

# This algorithm has a single channel of input data called 'training'. Since we run in
# File mode, the input files are copied to the directory specified here.
channel_name='training'
training_path = os.path.join(input_path, channel_name)

In [4]:
with open(param_path, "r") as f:
    ff = f.read()

In [5]:
def train():
    print("Starting the training.")
    try:
        with open(param_path, 'r') as tc:
            training_params = json.load(tc)
        
        # Take the set of files and read them all into a single pandas dataframe
        input_files = [os.path.join(training_path, file) for file in os.listdir(training_path)]
        if len(input_files) == 0:
            raise ValueError(("There are no files in {}.\n" + 
                                'This usually indicates that the channel ({}) was incorrectly specified,\n' +
                              'the data specification in S3 was incorrectly specified or the role specified\n' +
                              'does not have permission to access the data.').format(training_path, channel_name))
        
        # Reading the training data.
        raw_data = [ pd.read_csv(file) for file in input_files if file.endswith(".csv")]
        
        # Prepare the dataset.
        train_data = pd.concat(raw_data)
        train = train_data[['Status', 'Levels', 'Rooms', 'Materials', 'Engagement', 'source']]
        train_df = pd.get_dummies(train)
        train_df = train_df.rename(columns = {'Status_no longer leaking, wet <1 day': 'Status_no longer leaking, wet less than 1 day',
                                        'Status_no longer leaking, wet >1 day': 'Status_no longer leaking, wet more than 1 day'})
        train_df.columns=train_df.columns.str.replace(r'[^0-9a-zA-Z ]', ' ', regex=True)
        encoder = LabelEncoder()
        y_train = encoder.fit_transform(train_data[['labels']])
        dtrain = xgb.DMatrix(data=train_df, label=y_train)

        # Training the model.
        model = xgb.train(training_params, dtrain)

        # Save the model
        with open(os.path.join(model_path, "xgb-model.pkl"), "wb") as out:
            pickle.dump(model, out)
        with open(os.path.join(model_path, "xgb-encoder.pkl"), "wb") as out:
            pickle.dump(encoder, out)

        print("Training complete.")

    except Exception as e:
            # Write out an error file. This will be returned as the failureReason in the
        # DescribeTrainingJob result.
        trc = traceback.format_exc()
        with open(os.path.join(output_path, 'failure'), 'w') as s:
            s.write('Exception during training: ' + str(e) + '\n' + trc)
        # Printing this causes the exception to be in the training job logs, as well.
        print('Exception during training: ' + str(e) + '\n' + trc, file=sys.stderr)
        # A non-zero exit code causes the training job to be marked as Failed.
        sys.exit(255)

In [6]:
if __name__ == "__main__":
    train()

    # A zero exit code causes the job to be marked a Succeeded.
    sys.exit(0)

Starting the training.
Parameters: { "n_gpus" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Training complete.


  return f(*args, **kwargs)


SystemExit: 0

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [7]:
class ScoringService(object):
    model = None  # Where we keep the model when it's loaded
    encoder = None
    
    @classmethod
    def get_model(cls):
        # Load Encoder if it is not loaded
        if cls.encoder == None:
            with open(os.path.join(model_path, "xgb-encoder.pkl"), "rb") as inp:
                cls.encoder = pickle.load(inp)
        
        # Get the model object for this instance, loading it if it's not already loaded.
        if cls.model == None:
            with open(os.path.join(model_path,"xgb-model.pkl"), "rb") as inp:
                cls.model = pickle.load(inp)
        
        # Return both models
        return cls.encoder, cls.model

    @classmethod
    def predict(cls, input):
        """For the input, do the predictions and return them.

        Args:
            input (a pandas dataframe): The data on which to do the predictions. There will be
                one prediction per row in the dataframe"""
        encoder, model = cls.get_model()
        prediction = model.predict(input)
        pred_ = encoder.inverse_transform(prediction.astype(np.int32))
        return pred_

In [8]:
def preprocess_data(input_object):
    print("INPUT: ", input_object)
    temp_lst = []
    columns = ['Status active leak  currently wet', 
               'Status livable still in dwelling   building', 
               'Status livable temporarily displaced', 
               'Status no longer leaking  wet less than 1 day', 
               'Status no longer leaking  wet more than 1 day', 
               'Status no  no opening', 
               'Status unlivable displaced', 
               'Status yes  covered tarped', 
               'Status yes  uncovered not tarped', 
               'Levels 1 floor', 
               'Levels 2 floors or more', 
               'Levels no', 
               'Rooms 1 2 rooms', 
               'Rooms 3 rooms', 
               'Rooms 4 rooms', 
               'Rooms 5 or more rooms', 
               'Rooms no', 
               'Materials cabinets', 
               'Materials ceiling', 
               'Materials contents  personal belongings ', 
               'Materials exterior contents  grill  patio furniture  etc  ', 
               'Materials exterior structure', 
               'Materials fixtures', 
               'Materials floors', 
               'Materials hardscapes   fence', 
               'Materials no', 
               'Materials roof', 
               'Materials siding', 
               'Materials vinyl floor covering', 
               'Materials walls', 
               'Materials windows', 
               'Engagement attorney', 
               'Engagement contractor', 
               'Engagement no', 
               'Engagement public adjuster', 
               'source electrical', 
               'source fireplace', 
               'source lightning', 
               'source other', 
               'source stove', 
               'source unknown', 
               'source water', 
               'source wildfire', 
               'source wind']
    input_obj_lst = [k+' '+v for k, v in input_object.items()]
    for i in range(len(input_obj_lst)):
        if input_obj_lst[i] == 'Status no longer leaking, wet <1 day':
            input_obj_lst[i] = 'Status no longer leaking  wet less than 1 day'
        elif input_obj_lst[i] == 'Status no longer leaking, wet >1 day':
            input_obj_lst[i] = 'Status no longer leaking  wet more than 1 day'
        input_obj_lst[i] = re.sub('[,.()-/]+', ' ', input_obj_lst[i])

    for col in columns:
        if col in input_obj_lst:
            temp_lst.append(1)
        else:
            temp_lst.append(0)
    df_test = pd.DataFrame(columns=columns, data=np.array(temp_lst).reshape(1, 44), index=None)
    x_test = xgb.DMatrix(data=df_test)
    return x_test

In [38]:
# ['Status', 'Levels', 'Rooms', 'Materials', 'Engagement', 'source']]
x_data = {"Status": "yes, covered/tarped",
          'Levels': "no",
          "Rooms": "1-2 rooms",
          "Materials": "hardscapes / fence",
          "Engagement": "contractor",
          "source": "wind"
         }

In [39]:
data = preprocess_data(x_data)

INPUT:  {'Status': 'yes, covered/tarped', 'Levels': 'no', 'Rooms': '1-2 rooms', 'Materials': 'hardscapes / fence', 'Engagement': 'contractor', 'source': 'wind'}


In [52]:
data

<xgboost.core.DMatrix at 0x1d72147e248>

In [40]:
result = ScoringService.predict(data)

In [41]:
result

array(['LA'], dtype=object)

In [30]:
!pip install xgboost==1.5.2


Collecting xgboost==1.5.2
  Downloading xgboost-1.5.2-py3-none-win_amd64.whl (106.6 MB)
     -------------------------------------- 106.6/106.6 MB 1.6 MB/s eta 0:00:00
Installing collected packages: xgboost
Successfully installed xgboost-1.5.2


In [50]:
import tarfile
import subprocess

In [51]:
bash_command = "tar -cvpzf model.tar.gz xgb-model.pkl xgb-encoder.pkl"
process = subprocess.Popen(bash_command.split(), stdout=subprocess.PIPE)
output, error = process.communicate()

In [48]:
error

In [49]:
output

b''