In [121]:
import pandas as pd
import requests

data = {'age': 19,
         'workclass': 'Private',
         'fnlgt': 149184,
         'education': 'HS-grad',
         'marital_status': 'Never-married',
         'occupation': 'Prof-specialty',
         'relationship': 'Not-in-family',
         'race': 'White',
         'sex': 'Male',
         'hoursPerWeek': 60,
         'nativeCountry': 'United-States'
}

local = "http://127.0.0.1:8000"
aws = "http://mlops-loadb-103hyin3qcqrm-38620e611b8de66e.elb.us-east-1.amazonaws.com:8000"

In [122]:
r = requests.post(f'{local}/', json=data)

assert r.status_code == 200

print("Response code: %s" % r.status_code)
print("Response body: %s" % r.json())

Response code: 200
Response body: {'prediction': '<=50K'}


In [22]:
import pandas as pd
data_dict = {'age': 19,
                 'workclass': 'Private',
                 'fnlgt': 77516,
                 'education': 'HS-grad',
                 'marital-status': 'Never-married',
                 'occupation': 'Own-child',
                 'relationship': 'Husband',
                 'race': 'Black',
                 'sex': 'Male',
                 'hours-per-week': 40,
                 'native-country': 'United-States'
                 }
df = pd.DataFrame(data=data_dict.values(),
                      index=data_dict.keys()).T

In [23]:
df

Unnamed: 0,age,workclass,fnlgt,education,marital-status,occupation,relationship,race,sex,hours-per-week,native-country
0,19,Private,77516,HS-grad,Never-married,Own-child,Husband,Black,Male,40,United-States


In [24]:
 df = pd.DataFrame.from_dict([data_dict], orient='columns')

In [25]:
df

Unnamed: 0,age,workclass,fnlgt,education,marital-status,occupation,relationship,race,sex,hours-per-week,native-country
0,19,Private,77516,HS-grad,Never-married,Own-child,Husband,Black,Male,40,United-States


In [7]:

def process_data(X,
                 root_path=None,
                 categorical_features=[],
                 label=None,
                 training=True,
                 encoder=None,
                 lb=None,
                 ):
    """ Process the data used in the machine learning pipeline.

    Processes the data using one hot encoding for the categorical features and a
    label binarizer for the labels. This can be used in either training or
    inference/validation.

    Note: depending on the type of model used, you may want to add in functionality that
    scales the continuous data.

    Inputs
    ------
    X : pd.DataFrame
        Dataframe containing the features and label. Columns in `categorical_features`
    root_path: str:
        String path to the root folder
    categorical_features: list[str]
        List containing the names of the categorical features (default=[])
    label : str
        Name of the label column in `X`. If None, then an empty array will be returned
        for y (default=None)
    training : bool
        Indicator if training mode or inference/validation mode.
    encoder : sklearn.preprocessing._encoders.OneHotEncoder
        Trained sklearn OneHotEncoder, only used if training=False.
    lb : sklearn.preprocessing._label.LabelBinarizer
        Trained sklearn LabelBinarizer, only used if training=False.

    Returns
    -------
    X : np.array
        Processed data.
    y : np.array
        Processed labels if labeled=True, otherwise empty np.array.
    encoder : sklearn.preprocessing._encoders.OneHotEncoder
        Trained OneHotEncoder if training is True, otherwise returns the encoder passed
        in.
    lb : sklearn.preprocessing._label.LabelBinarizer
        Trained LabelBinarizer if training is True, otherwise returns the binarizer
        passed in.
    """

    if label is not None:
        y = X[label]
        X = X.drop([label], axis=1)
    else:
        y = np.array([])

    X_categorical = X[categorical_features].values
    X_continuous = X.drop(*[categorical_features], axis=1)

    if training is True:
        encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
        lb = LabelBinarizer()
        X_categorical = encoder.fit_transform(X_categorical)
        y = lb.fit_transform(y.values).ravel()
    else:
        X_categorical = encoder.transform(X_categorical)
        try:
            y = lb.transform(y.values).ravel()
        # Catch the case where y is None because we're doing inference.
        except AttributeError:
            pass

    X = np.concatenate([X_continuous, X_categorical], axis=1)

    if training is True:
        dump(encoder, f"{root_path}/model_output/encoder.joblib")
        dump(lb, f"{root_path}/model_output/lb.joblib")
        dff = pd.DataFrame(X)
        dff.to_csv(f"{root_path}/model_output/tt.csv")

    return X, y, encoder, lb

In [37]:
def run_inference(data, cat_features):
    """Load model and run inference
    Parameters
    ----------
    root_path
    data
    cat_features

    Returns
    -------
    prediction
    """
    model = joblib.load("/Users/srs/Projects/udacity/ml_devops/mlops-census/src/model_output/model.joblib")
    encoder = joblib.load("/Users/srs/Projects/udacity/ml_devops/mlops-census/src/model_output/encoder.joblib")
    lb = joblib.load("/Users/srs/Projects/udacity/ml_devops/mlops-census/src/model_output/lb.joblib")

    X, _, _, _ = process_data(X=data,
                              categorical_features=cat_features,
                              encoder=encoder, 
                              lb=lb,
                              training=False)
    preds = model.predict(X)
    prediction = lb.inverse_transform(preds)[0]

    return prediction.strip()

In [108]:
data_dict

{'age': 42,
 'workclass': ' Private',
 'fnlgt': 159449,
 'education': ' Bachelors',
 'marital_status': ' Married-civ-spouse',
 'occupation': ' Exec-managerial',
 'relationship': ' Husband',
 'race': ' White',
 'sex': ' Male',
 'hours_per_week': 40,
 'native_country': ' United-States'}

In [103]:
data_dict.pop('salary')

KeyError: 'salary'

In [113]:
{key.strip(): values.strip() for key, values in data_dict.items()}

AttributeError: 'int' object has no attribute 'strip'

In [100]:
data_dict ={'age': 60,
                 'workclass': 'Self-emp-not-inc',
                 'fnlgt': 209642,
                 'education': 'Assoc-acdm',
                 'marital_status': ' Married-civ-spouse',
                 'occupation': 'Exec-managerial',
                 'relationship': 'Husband',
                 'race': 'White',
                 'sex': 'Male',
                 'hours-per-week': 80,
                 'native_country': 'United-States'
                 }
df = pd.DataFrame.from_dict([data_dict], orient='columns')
df

Unnamed: 0,age,workclass,fnlgt,education,marital_status,occupation,relationship,race,sex,hours-per-week,native_country
0,60,Self-emp-not-inc,209642,Assoc-acdm,Married-civ-spouse,Exec-managerial,Husband,White,Male,80,United-States


In [101]:
run_inference(df,cat_features)

'<=50K'

In [125]:
ddd = dd[dd.salary == ' >50K']
for i in range(100):
    data_dict = ddd.iloc[i].to_dict()
    data_dict.pop('salary')
    df = pd.DataFrame.from_dict([data_dict], orient='columns')
    print(run_inference(df,cat_features))
    if '>50K' == run_inference(df,cat_features):
        break
    print('----')

<=50K
----
<=50K
----
>50K


In [126]:
data_dict

{'age': 42,
 'workclass': 'Private',
 'fnlgt': 159449,
 'education': 'Bachelors',
 'marital_status': 'Married-civ-spouse',
 'occupation': 'Exec-managerial',
 'relationship': 'Husband',
 'race': 'White',
 'sex': 'Male',
 'hours_per_week': 40,
 'native_country': 'United-States'}

In [86]:
run_inference(df,cat_features)

'<=50K'

In [61]:
cat_features  = [
    "workclass",
      "education",
      "marital_status",
      "occupation",
      "relationship",
      "race",
      "sex",
      "native_country",
  ]

In [62]:
cat_features

['workclass',
 'education',
 'marital_status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'native_country']

In [63]:
import joblib

In [64]:
run_inference(df,cat_features)

'<=50K'

In [65]:
import numpy as np

In [36]:
!pwd

/Users/srs/Projects/udacity/ml_devops/mlops-census/notebooks


In [123]:
dd= pd.read_csv('/Users/srs/Projects/udacity/ml_devops/mlops-census/src/data/clean/census.csv')

In [124]:
dd

Unnamed: 0,age,workclass,fnlgt,education,marital_status,occupation,relationship,race,sex,hours_per_week,native_country,salary
0,39,State-gov,77516,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,13,United-States,<=50K
2,38,Private,215646,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,40,United-States,<=50K
3,53,Private,234721,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,40,United-States,<=50K
4,28,Private,338409,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...
32532,27,Private,257302,Assoc-acdm,Married-civ-spouse,Tech-support,Wife,White,Female,38,United-States,<=50K
32533,40,Private,154374,HS-grad,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,40,United-States,>50K
32534,58,Private,151910,HS-grad,Widowed,Adm-clerical,Unmarried,White,Female,40,United-States,<=50K
32535,22,Private,201490,HS-grad,Never-married,Adm-clerical,Own-child,White,Male,20,United-States,<=50K


In [90]:
dd.iloc[0].to_dict()

{'age': 39,
 'workclass': ' State-gov',
 'fnlgt': 77516,
 'education': ' Bachelors',
 'marital_status': ' Never-married',
 'occupation': ' Adm-clerical',
 'relationship': ' Not-in-family',
 'race': ' White',
 'sex': ' Male',
 'hours_per_week': 40,
 'native_country': ' United-States',
 'salary': ' <=50K'}

In [None]:
df

In [80]:
add.salary.value_counts()

 <=50K    24698
 >50K      7839
Name: salary, dtype: int64

In [83]:
dd.salary.iloc[0]

' <=50K'

In [84]:
dd[dd.salary == ' >50K']

Unnamed: 0,age,workclass,fnlgt,education,marital_status,occupation,relationship,race,sex,hours_per_week,native_country,salary
7,52,Self-emp-not-inc,209642,HS-grad,Married-civ-spouse,Exec-managerial,Husband,White,Male,45,United-States,>50K
8,31,Private,45781,Masters,Never-married,Prof-specialty,Not-in-family,White,Female,50,United-States,>50K
9,42,Private,159449,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,40,United-States,>50K
10,37,Private,280464,Some-college,Married-civ-spouse,Exec-managerial,Husband,Black,Male,80,United-States,>50K
11,30,State-gov,141297,Bachelors,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,40,India,>50K
...,...,...,...,...,...,...,...,...,...,...,...,...
32515,71,?,287372,Doctorate,Married-civ-spouse,?,Husband,White,Male,10,United-States,>50K
32521,39,Local-gov,111499,Assoc-acdm,Married-civ-spouse,Adm-clerical,Wife,White,Female,20,United-States,>50K
32530,53,Private,321865,Masters,Married-civ-spouse,Exec-managerial,Husband,White,Male,40,United-States,>50K
32533,40,Private,154374,HS-grad,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,40,United-States,>50K


In [115]:
dd.workclass.value_counts()

 Private             22673
 Self-emp-not-inc     2540
 Local-gov            2093
 ?                    1836
 State-gov            1298
 Self-emp-inc         1116
 Federal-gov           960
 Without-pay            14
 Never-worked            7
Name: workclass, dtype: int64

In [117]:
col_input_clean = ["workclass", "education", "marital_status", "occupation", "relationship","race", "sex","nativeCountry"]

In [120]:
i = 0
dd[col_input_clean[i]] = dd[col_input_clean[i]].apply(lambda x: x.strip())

In [None]:
['age', 'workclass', 'fnlgt', 'education', 'marital_status',
       'occupation', 'relationship', 'race', 'sex', 'hours_per_week',
       'native_country', 'salary'