In [1]:
# load in encoder & model
from joblib import load
import xgboost as xgb
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

def load_data():
    ''' Load in the pretrained model & label encoders.

    Reads four joblib files, addressed by relative path (so they are looked
    up in the current working directory):

    * ``label_encoder.joblib.dat``      -> ``d``, a mapping of name -> encoder
    * ``encoder_classes.joblib.dat``    -> ``d_classes``, a mapping of name -> classes
    * ``dialect_classifier.joblib.dat`` -> the trained classifier
    * ``test_case.joblib.dat``          -> a sample input

    The ``"class_target"`` entry is removed from the sample input, and each
    encoder in ``d`` gets its ``classes_`` attribute set from ``d_classes``.

    Returns
    -------
    tuple
        ``(d, d_classes, dialect_classifier, test_case)``

    Raises
    ------
    KeyError
        If the sample input has no ``"class_target"`` entry, or ``d_classes``
        lacks a key that is present in ``d``.
    '''
    # NOTE: joblib.load unpickles its input, which can execute arbitrary code.
    # Only point these paths at files you produced or otherwise trust.
    d = load("label_encoder.joblib.dat")
    d_classes = load("encoder_classes.joblib.dat")
    dialect_classifier = load("dialect_classifier.joblib.dat")
    test_case = load("test_case.joblib.dat")

    # remove target class from test data
    del test_case["class_target"]

    # update the classes for each of our label encoders
    # LabelEncoder keeps its fitted labels in `classes_`; assigning to
    # `_classes` (as before) only created an attribute nothing reads.
    # assumes d_classes[key] holds the fitted labels for encoder `key` -- TODO confirm
    for key, encoder in d.items():
        # asarray: transform/inverse_transform index into classes_ as an array
        encoder.classes_ = np.asarray(d_classes[key])

    return d, d_classes, dialect_classifier, test_case

def encode_data(input_data, encoders=None):
    ''' Encode our input data with pre-trained label encoders.

    Parameters
    ----------
    input_data : pandas.Series
        One raw value per feature, labelled by feature name. It is only
        read, never modified.
    encoders : mapping of label -> encoder, optional
        Each encoder must provide ``transform(list)`` returning an indexable
        result. When omitted, the notebook-level ``d`` produced by
        ``load_data()`` is used.

    Returns
    -------
    pandas.Series
        A new Series with the same labels, in the same order, holding the
        encoded value for each feature.

    Raises
    ------
    KeyError
        If a label in ``input_data`` has no entry in ``encoders``.
    '''
    if encoders is None:
        # fall back to the notebook-level encoder dict
        encoders = d

    labels = []
    codes = []
    # Iterate over the argument itself (not a notebook global) so the function
    # encodes whatever it is given and can be re-run safely.
    for label, value in input_data.items():
        labels.append(label)
        # transform() takes a sequence; wrap the single value and unwrap the result
        codes.append(encoders[label].transform([value])[0])

    # Build a fresh Series rather than writing into the input: the caller's
    # data stays intact and pandas infers a numeric dtype from the codes.
    return pd.Series(codes, index=labels, name=getattr(input_data, "name", None))

def predict_cities(test_case_encoded):
    ''' Take in encoded data & return top three predicted cities.

    The encoded input is wrapped in an ``xgb.DMatrix`` whose feature names
    are taken from the input's index, then passed to the notebook-level
    ``dialect_classifier``. The indices of the three largest values in the
    first row of the model output are decoded with the notebook-level
    ``d["class_target"]`` encoder.

    The three results are ordered from the lowest to the highest of those
    three values, so the model's top pick is the last element.
    '''
    # wrap the input for xgboost and label its columns
    feature_matrix = xgb.DMatrix(test_case_encoded)
    feature_matrix.feature_names = test_case_encoded.index.tolist()

    # run the pre-trained model
    # presumably one row of per-class scores per input row -- TODO confirm
    class_scores = dialect_classifier.predict(feature_matrix)

    # argsort is ascending, so the last three columns are the three largest
    ranked_columns = np.argsort(class_scores, axis=1)
    best_three = ranked_columns[:, -3:]

    # decode the first (only expected) row back into city labels
    first_row = best_three[0].tolist()
    return d["class_target"].inverse_transform(first_row)

In [2]:
# Load the persisted artifacts, encode the sample input, then predict.
# These names are bound at notebook level on purpose: encode_data and
# predict_cities look up `d` (and predict_cities also `dialect_classifier`)
# as globals, so this cell must run before either function is called.
d, d_classes, dialect_classifier, test_case = load_data()
test_case_encoded = encode_data(test_case)
predict_cities(test_case_encoded)
# expected output = 'new york NY', 'houston TX', 'miami FL'

array(['new york NY', 'houston TX', 'miami FL'], dtype=object)

In [5]:
# predict_cities already returns at most three labels, so [0:3] leaves the result unchanged.
predict_cities(test_case_encoded)[0:3]

array(['new york NY', 'houston TX', 'miami FL'], dtype=object)