In [1]:
import pandas as pd
import numpy as np
from numpy import std, mean
import logging
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import LabelBinarizer, OneHotEncoder, LabelEncoder
from sklearn.metrics import fbeta_score, precision_score, recall_score
# from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
import os


In [2]:
# df = pd.read_csv("data/census.csv", skipinitialspace = True)

In [3]:
!ls data/

[1m[36mclean[m[m [1m[36mmodel[m[m [1m[36mraw[m[m


In [4]:
df = pd.read_csv("data/raw/census.csv")

In [5]:
df.columns = df.columns.str.strip()

In [6]:
df.head(10)

Unnamed: 0,age,workclass,fnlgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K


In [7]:
# lb = LabelEncoder() 
# df.salary = lb.fit_transform(df.salary)

In [8]:
df.salary.value_counts()

 <=50K    24720
 >50K      7841
Name: salary, dtype: int64

In [9]:
# !pip3 install pandas_profiling

In [10]:
# import pandas_profiling

# profile = pandas_profiling.ProfileReport(df)
# profile.to_widgets()

In [11]:
df = df.drop_duplicates()

In [12]:
train_set, test_set = train_test_split(df, test_size=0.20, random_state=42, stratify=df.salary)

In [13]:
train_set.head(4)

Unnamed: 0,age,workclass,fnlgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
24764,33,Private,279173,HS-grad,9,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,60,United-States,<=50K
9890,59,Private,271571,Some-college,10,Married-civ-spouse,Craft-repair,Husband,White,Male,15024,0,50,United-States,>50K
2754,80,Self-emp-not-inc,184335,7th-8th,4,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,30,United-States,<=50K
5298,33,Private,100135,HS-grad,9,Married-civ-spouse,Adm-clerical,Wife,White,Female,0,1740,25,United-States,<=50K


In [14]:
train_set.head(4)

Unnamed: 0,age,workclass,fnlgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
24764,33,Private,279173,HS-grad,9,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,60,United-States,<=50K
9890,59,Private,271571,Some-college,10,Married-civ-spouse,Craft-repair,Husband,White,Male,15024,0,50,United-States,>50K
2754,80,Self-emp-not-inc,184335,7th-8th,4,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,30,United-States,<=50K
5298,33,Private,100135,HS-grad,9,Married-civ-spouse,Adm-clerical,Wife,White,Female,0,1740,25,United-States,<=50K


In [15]:
train_set.salary.value_counts()/len(train_set)

 <=50K    0.759076
 >50K     0.240924
Name: salary, dtype: float64

In [16]:
test_set.salary.value_counts()/len(test_set)

 <=50K    0.759066
 >50K     0.240934
Name: salary, dtype: float64

In [17]:
cat_features = [
    "workclass",
    "education",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "native-country",
    ]

In [59]:
def process_data(
    X, categorical_features=[], label=None, training=True, encoder=None, lb=None
):
    """ Process the data used in the machine learning pipeline.

    Processes the data using one hot encoding for the categorical features and a
    label binarizer for the labels. This can be used in either training or
    inference/validation.

    Note: depending on the type of model used, you may want to add in functionality that
    scales the continuous data.

    Inputs
    ------
    X : pd.DataFrame
        Dataframe containing the features and label. Columns in `categorical_features`
    categorical_features: list[str]
        List containing the names of the categorical features (default=[])
    label : str
        Name of the label column in `X`. If None, then an empty array will be returned
        for y (default=None)
    training : bool
        Indicator if training mode or inference/validation mode.
    encoder : sklearn.preprocessing._encoders.OneHotEncoder
        Trained sklearn OneHotEncoder, only used if training=False.
    lb : sklearn.preprocessing._label.LabelBinarizer
        Trained sklearn LabelBinarizer, only used if training=False.

    Returns
    -------
    X : np.array
        Processed data.
    y : np.array
        Processed labels if labeled=True, otherwise empty np.array.
    encoder : sklearn.preprocessing._encoders.OneHotEncoder
        Trained OneHotEncoder if training is True, otherwise returns the encoder passed
        in.
    lb : sklearn.preprocessing._label.LabelBinarizer
        Trained LabelBinarizer if training is True, otherwise returns the binarizer
        passed in.
    """

    if label is not None:
        y = X[label]
        X = X.drop([label], axis=1)
    else:
        label ='salary'
        X = X.drop([label], axis=1)
        y = np.array([])

    X_categorical = X[categorical_features].values
    X_continuous = X.drop(*[categorical_features], axis=1)

    if training is True:
        encoder = OneHotEncoder(sparse=False, handle_unknown="ignore")
        lb = LabelBinarizer()
        X_categorical = encoder.fit_transform(X_categorical)
        y = lb.fit_transform(y.values).ravel()
    else:
#         X_categorical = encoder.fit_transform(X_categorical)
        X_categorical = encoder.transform(X_categorical)
        try:
            y = lb.transform(y.values).ravel()
        # Catch the case where y is None because we're doing inference.
        except AttributeError:
            pass

    X = np.concatenate([X_continuous, X_categorical], axis=1)
    return X, y, encoder, lb


In [60]:
X_train, y_train, encoder, lb = process_data(
    train_set, categorical_features=cat_features, label="salary", training=True
)

In [61]:
X_train[0:1,:]

array([[3.30000e+01, 2.79173e+05, 9.00000e+00, 0.00000e+00, 0.00000e+00,
        6.00000e+01, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
        1.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00,
        1.00000e+00, 0.00000e+00, 0.00000e+00, 0.00

In [62]:
y_train[0:4]

array([0, 1, 0, 0])

In [63]:
encoder

In [64]:
lb

In [65]:
def train_model(X_train, y_train):
    """
    Trains a machine learning model and returns it.
    Inputs
    ------
    X_train : np.array
        Training data.
    y_train : np.array
        Labels.
    Returns
    -------
    model
        Trained machine learning model.
    """
    model = RandomForestClassifier(min_samples_split=25)
    model.fit(X_train,y_train)
    return model

In [66]:
model = train_model(X_train, y_train)

In [67]:
model

In [68]:
encoder

In [69]:
pd.to_pickle(model, "model.pkl")

#Saving the encoder and the LabelBinarizer for being used in the API later
pd.to_pickle(encoder, "encoder.pkl")
pd.to_pickle(lb, "lb.pkl")

In [70]:
!ls

EDA.ipynb                     model.pkl
README.md                     model_card_template.md
[1m[36m__pycache__[m[m                   requirements.txt
[1m[36mdata[m[m                          sanitycheck.py
dvc_on_heroku_instructions.md [1m[36mscreenshots[m[m
encoder.pkl                   setup.py
lb.pkl                        slice.py
main.py                       [1m[36mstarter[m[m
[1m[36mmodel[m[m                         train_model.py


In [71]:
# Encoder = pd.read_pickle(r"encoder.pkl")

In [72]:
# X_processed, y_processed, encoder, lb = process_data(test_set, categorical_features=cat_features, training=False,encoder=encoder)


In [73]:
X_test, y_test, encoder, lb = process_data(test_set, categorical_features=cat_features, training=False,encoder=encoder)


In [74]:
# X_processed

In [75]:
X_test[0:1,:]

array([[2.70000e+01, 1.78709e+05, 1.40000e+01, 0.00000e+00, 0.00000e+00,
        4.00000e+01, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
        1.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
        1.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
        1.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00,
        0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00, 1.00000e+00,
        0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00

In [76]:
# y_processed

In [77]:
y_test

array([], dtype=float64)

In [78]:
# X_processed, y_processed, encoder, lb = process_data(test_set, categorical_features=cat_features, label="salary", training=False,encoder=Encoder)


In [79]:
# y_processed

In [80]:
# X_processed, y_processed, encoder, lb = process_data(test_set, categorical_features=cat_features, training=False,encoder=Encoder, lb=LabelBinarizer())


In [81]:
# y_processed

In [82]:
# X_processed

In [83]:
# np.set_printoptions(threshold=np.inf)

In [84]:
# lb = LabelBinarizer()

In [85]:
# lb.fit_transform(y_processed)

In [86]:
len(X_test)

6508

In [87]:
# X_processed = X_processed[:,:-1]

In [88]:
#Calling the inference function to make a prediction  
# prediction_outcome = inference(model, X_processed)

In [89]:
# prediction_outcome

In [90]:
# process_data(
#     train_set, categorical_features=cat_features, label="salary", training=True
#     )

In [91]:
# df_processed = process_data(
#     df, categorical_features=cat_features, label="salary", training=True
#     )

In [92]:
# def train_test_model():
#     # Add the necessary imports for the starter code.       
#     # Add code to load in the data.
#     df = pd.read_csv("data/raw/census.csv")
#     df.columns = df.columns.str.strip()
#     df = df.drop_duplicates()

# # Optional enhancement, use K-fold cross validation instead of a train-test split.
#     train_set, test_set = train_test_split(df, test_size=0.20, random_state=42)

#     cat_features = [
#     "workclass",
#     "education",
#     "marital-status",
#     "occupation",
#     "relationship",
#     "race",
#     "sex",
#     "native-country",
#     ]
#     X_train, y_train, encoder, lb = process_data(
#     train_set, categorical_features=cat_features, label="salary", training=True
#     )

#     trained_model = train_model(X_train, y_train)

In [93]:
# model=train_test_model()

In [94]:
# model.fit?

In [95]:
def compute_model_metrics(y, preds):
    """
    Validates the trained machine learning model using precision, recall, and F1.

    Inputs
    ------
    y : np.array
        Known labels, binarized.
    preds : np.array
        Predicted labels, binarized.
    Returns
    -------
    precision : float
    recall : float
    fbeta : float
    """
    fbeta = fbeta_score(y, preds, beta=1, zero_division=1)
    precision = precision_score(y, preds, zero_division=1)
    recall = recall_score(y, preds, zero_division=1)
    return precision, recall, fbeta

In [96]:
def inference(model, X):
    """ Run model inferences and return the predictions.

    Inputs
    ------
    model : ???
        Trained machine learning model.
    X : np.array
        Data used for prediction.
    Returns
    -------
    preds : np.array
        Predictions from the model.
    """
    y_preds = model.predict(X)
    return y_preds

In [97]:
# inference(model,X_processed)

In [98]:
y_pred=inference(model, X_test)

In [99]:
y_pred

array([0, 1, 0, ..., 1, 0, 0])

In [100]:
y =test_set.iloc[:,-1:]

In [101]:
y[0:3]

Unnamed: 0,salary
20334,<=50K
23878,>50K
19996,<=50K


In [102]:
lb = LabelEncoder() 
y = lb.fit_transform(np.ravel(y))

In [103]:
y[0:3]

array([0, 1, 0])

In [104]:
# compute_model_metrics(y, prediction_outcome)

In [105]:
compute_model_metrics(y, y_pred)

(0.7771739130434783, 0.6383928571428571, 0.7009803921568628)

In [106]:
f"Precision:{round(compute_model_metrics(y, y_pred)[0],2)} \
Recall:{round(compute_model_metrics(y, y_pred)[1],2)} \
fbeta:{round(compute_model_metrics(y, y_pred)[2],2)}" 

'Precision:0.78 Recall:0.64 fbeta:0.7'

In [107]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y, y_pred)

array([[4653,  287],
       [ 567, 1001]])

In [108]:
cat_features

['workclass',
 'education',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'native-country']

In [109]:
test_set.head()

Unnamed: 0,age,workclass,fnlgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
20334,27,Private,178709,Masters,14,Never-married,Machine-op-inspct,Not-in-family,White,Female,0,0,40,United-States,<=50K
23878,51,Local-gov,387250,Bachelors,13,Married-civ-spouse,Adm-clerical,Husband,White,Male,0,0,50,United-States,>50K
19996,34,Private,202498,12th,8,Married-civ-spouse,Other-service,Husband,White,Male,0,0,40,Dominican-Republic,<=50K
21440,31,Private,133861,Assoc-acdm,12,Divorced,Exec-managerial,Not-in-family,White,Male,0,0,40,United-States,<=50K
10000,34,Private,120461,Some-college,10,Divorced,Adm-clerical,Not-in-family,White,Female,0,0,50,United-States,<=50K


In [110]:
slice_values = []
cat_features = [
    "workclass",
    "education",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "native-country",
]


In [111]:
test_set[cat_features]

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex,native-country
20334,Private,Masters,Never-married,Machine-op-inspct,Not-in-family,White,Female,United-States
23878,Local-gov,Bachelors,Married-civ-spouse,Adm-clerical,Husband,White,Male,United-States
19996,Private,12th,Married-civ-spouse,Other-service,Husband,White,Male,Dominican-Republic
21440,Private,Assoc-acdm,Divorced,Exec-managerial,Not-in-family,White,Male,United-States
10000,Private,Some-college,Divorced,Adm-clerical,Not-in-family,White,Female,United-States
...,...,...,...,...,...,...,...,...
12569,Private,HS-grad,Never-married,Craft-repair,Not-in-family,White,Male,United-States
16643,Private,Assoc-voc,Married-civ-spouse,Craft-repair,Husband,White,Male,United-States
26695,Self-emp-inc,HS-grad,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States
702,Private,Some-college,Never-married,Handlers-cleaners,Own-child,White,Male,United-States


In [112]:
X_test

array([[2.70000e+01, 1.78709e+05, 1.40000e+01, ..., 1.00000e+00,
        0.00000e+00, 0.00000e+00],
       [5.10000e+01, 3.87250e+05, 1.30000e+01, ..., 1.00000e+00,
        0.00000e+00, 0.00000e+00],
       [3.40000e+01, 2.02498e+05, 8.00000e+00, ..., 0.00000e+00,
        0.00000e+00, 0.00000e+00],
       ...,
       [3.40000e+01, 1.98613e+05, 9.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 0.00000e+00],
       [2.10000e+01, 1.63870e+05, 1.00000e+01, ..., 1.00000e+00,
        0.00000e+00, 0.00000e+00],
       [2.20000e+01, 1.84813e+05, 1.00000e+01, ..., 1.00000e+00,
        0.00000e+00, 0.00000e+00]])

In [113]:
len(X_test)

6508

In [120]:
encoder

In [116]:
encoder.categories

'auto'

In [123]:
encoder = pd.read_pickle(r"encoder.pkl") 

In [124]:
encoder

In [126]:
path="nd0821-c3-starter-code/starter"
# df = pd.read_csv(os.path.join(path, "data/raw/census.csv"))
df = pd.read_csv("data/raw/census.csv")

df.columns = df.columns.str.strip()
df = df.drop_duplicates()
# model = pd.read_pickle(r"model/model.pkl")
# encoder = pd.read_pickle(r"model/encoder.pkl") 
# lb = pd.read_pickle(r"model/lb.pkl")
# model = pd.read_pickle(r"nd0821-c3-starter-code/starter/model/model.pkl")
# encoder = pd.read_pickle(r"nd0821-c3-starter-code/starter/model/encoder.pkl") 
# lb = pd.read_pickle(r"nd0821-c3-starter-code/starter/model/lb.pkl")

# df = pd.read_csv("data/prepared/census.csv")
_, test_set = train_test_split(df, test_size=0.20, random_state=42, stratify=df.salary)
# _, test = train_test_split(df, test_size=0.20)


for cat in cat_features:
        for cls in test_set[cat].unique():
            df_temp = test_set[test_set[cat] == cls]
            
#             lb = LabelEncoder() 
#             y = lb.fit_transform(np.ravel(y))
            slice_metrics = []
            encoder = pd.read_pickle(r"encoder.pkl") 
            X_test, y_test, _, _ = process_data(
                df_temp,
                cat_features,
                label= None, encoder=encoder, lb=lb, training=False)

#             y_preds = model.predict(X_test)
            y_preds=inference(model, X_test)
            y =df_temp.iloc[:,-1:]
            lb = LabelEncoder() 
            y = lb.fit_transform(np.ravel(y))
            prc, rcl, fb = compute_model_metrics(y, y_preds)
            line = "[%s->%s] Precision: %s " \
                   "Recall: %s FBeta: %s" % (cat, cls, prc, rcl, fb)
#             logging.info(line)
            slice_metrics.append(line)
            print(slice_metrics)
        print()
#             print(f"Cat: {cat} Cls: {cls}")
#             #print(f"y_preds: {y_preds}")
#             print(f"prc: {prc}, rcl: {rcl}, fb: {fb}")
#         print()
#             y =test_set.iloc[:,-1:]
            
#             lb = LabelEncoder() 
#             y = lb.fit_transform(np.ravel(y))

#             prc, rcl, fb = compute_model_metrics(y_test,
#                                                                       y_preds)

#             line = "[%s->%s] Precision: %s " \
#                    "Recall: %s FBeta: %s" % (cat, cls, prc, rcl, fb)
#             logging.info(line)
#             slice_values.append(line)

['[workclass-> Private] Precision: 0.7730138713745272 Recall: 0.6142284569138277 FBeta: 0.6845337800111669']
['[workclass-> Local-gov] Precision: 0.7410714285714286 Recall: 0.680327868852459 FBeta: 0.7094017094017093']
['[workclass-> Self-emp-not-inc] Precision: 0.8055555555555556 Recall: 0.5918367346938775 FBeta: 0.6823529411764706']
['[workclass-> State-gov] Precision: 0.7384615384615385 Recall: 0.7058823529411765 FBeta: 0.7218045112781954']
['[workclass-> ?] Precision: 0.8148148148148148 Recall: 0.6111111111111112 FBeta: 0.6984126984126984']
['[workclass-> Federal-gov] Precision: 0.75 Recall: 0.6575342465753424 FBeta: 0.7007299270072993']
['[workclass-> Self-emp-inc] Precision: 0.8403361344537815 Recall: 0.8064516129032258 FBeta: 0.8230452674897119']
['[workclass-> Never-worked] Precision: 1.0 Recall: 1.0 FBeta: 1.0']
['[workclass-> Without-pay] Precision: 1.0 Recall: 1.0 FBeta: 1.0']

['[education-> Masters] Precision: 0.8502673796791443 Recall: 0.888268156424581 FBeta: 0.868852459

['[native-country-> Columbia] Precision: 0.0 Recall: 1.0 FBeta: 0.0']
['[native-country-> Honduras] Precision: 1.0 Recall: 1.0 FBeta: 1.0']
['[native-country-> Cambodia] Precision: 0.0 Recall: 0.0 FBeta: 0.0']
['[native-country-> Trinadad&Tobago] Precision: 1.0 Recall: 1.0 FBeta: 1.0']
['[native-country-> Iran] Precision: 1.0 Recall: 0.0 FBeta: 0.0']
['[native-country-> Outlying-US(Guam-USVI-etc)] Precision: 1.0 Recall: 1.0 FBeta: 1.0']
['[native-country-> Laos] Precision: 1.0 Recall: 1.0 FBeta: 1.0']
['[native-country-> Hong] Precision: 0.5 Recall: 1.0 FBeta: 0.6666666666666666']
['[native-country-> Thailand] Precision: 1.0 Recall: 1.0 FBeta: 1.0']
['[native-country-> Greece] Precision: 1.0 Recall: 1.0 FBeta: 1.0']
['[native-country-> Ireland] Precision: 1.0 Recall: 1.0 FBeta: 1.0']
['[native-country-> Scotland] Precision: 1.0 Recall: 1.0 FBeta: 1.0']



In [None]:
df[df['native-country'].str.contains("Dominican-Republic")]

In [None]:
test_set[test_set['native-country'].str.contains("Dominican-Republic")]

In [None]:
df.head()

In [None]:
test_set["native-country"]

In [None]:
for cat in cat_features:
        for cls in test_set[cat].unique():
            df_temp = test_set[test_set[cat] == cls]

#             X_test, y_test, _, _ = process_data(
#                 df_temp,
#                 cat_features,
#                 label="salary", encoder=encoder, lb=lb, training=False)

#             y_preds = model.predict(X_test)
            y_preds=inference(model, X_test)
            
            y =test_set.iloc[:,-1:]
            
            lb = LabelEncoder() 
            y = lb.fit_transform(np.ravel(y))

            prc, rcl, fb = compute_model_metrics(y_test,
                                                                      y_preds)

            line = "[%s->%s] Precision: %s " \
                   "Recall: %s FBeta: %s" % (cat, cls, prc, rcl, fb)
            logging.info(line)
            slice_values.append(line)

In [None]:
def slice():
    cat_features = [
    "workclass",
    "education",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "native-country",
    ]
    
    for cat in cat_features:
        for cls in test_set[cat].unique():
            df_temp = test_set[test_set[cat] == cls]

            X_test, y_test, _, _ = process_data(
                df_temp,
                cat_features,
                label="salary", encoder=encoder, lb=lb, training=False)

            y_preds = model.predict(X_test)

            prc, rcl, fb = compute_model_metrics(y_test,
                                                                      y_preds)

            line = "[%s->%s] Precision: %s " \
                   "Recall: %s FBeta: %s" % (cat, cls, prc, rcl, fb)
            logging.info(line)
            return slice_values.append(line)

In [None]:
def check_score():
    """
    Execute score checking
    """
    df = pd.read_csv("data/prepared/census.csv")
    _, test = train_test_split(df, test_size=0.20)

    trained_model = load("data/model/model.joblib")
    encoder = load("data/model/encoder.joblib")
    lb = load("data/model/lb.joblib")

    slice_values = []

    for cat in src.common_functions.get_cat_features():
        for cls in test[cat].unique():
            df_temp = test[test[cat] == cls]

            X_test, y_test, _, _ = src.common_functions.process_data(
                df_temp,
                categorical_features=src.common_functions.get_cat_features(),
                label="salary", encoder=encoder, lb=lb, training=False)

            y_preds = trained_model.predict(X_test)

            prc, rcl, fb = src.common_functions.compute_model_metrics(y_test,
                                                                      y_preds)


In [None]:
cat_features

In [None]:
test_set.head()

In [None]:
test_set["workclass"].unique()

In [None]:
test_set["education"].unique()

In [None]:
test_set["native-country"].unique()

In [None]:
df["native-country"].unique()

In [None]:
# df[df["native-country"] == 'Cuba'] 
df1 = df[df['native-country'].str.contains("Cuba")]

df1

In [None]:
df.head()

In [None]:
test_set[[cat_features]].unique()

In [None]:
_ , test_set = train_test_split(df, test_size=0.20, random_state=42, stratify=df.salary)

In [None]:
test_set

In [None]:
slice_values

In [None]:
for cls in test_set[cat_features].unique():
    df_temp = test_set[test_set[cat_features] == cls]

    X_test, y_test, _, _ = process_data(
        df_temp,
        categorical_features=src.common_functions.get_cat_features(),
        label="salary", encoder=encoder, lb=lb, training=False)
    y_preds = model.predict(X_test)

    precision, recall, fb = compute_model_metrics(y_test, y_preds)

    line = "[%s->%s] Precision: %s " \
                   "Recall: %s FBeta: %s" % (cat_features, cls, precision, recall, fb)
    logging.info(line)
    slice_values.append(line)


In [None]:
_ , test_set = train_test_split(df, test_size=0.20, random_state=42, stratify=df.salary)


slice_values = []
cat_features = [
    "workclass",
    "education",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "native-country",
]

for cls in test_set[cat_features].unique():
    df_temp = test_set[test_set[cat_features] == cls]

    X_test, y_test, _, _ = process_data(
        df_temp,
        categorical_features=src.common_functions.get_cat_features(),
        label="salary", encoder=encoder, lb=lb, training=False)

        y_preds = model.predict(X_test)

        precision, recall, fb = compute_model_metrics(y_test,
                                                                      y_preds)

        line = "[%s->%s] Precision: %s " \
                   "Recall: %s FBeta: %s" % (cat_features, cls, precision, recall, fb)
        logging.info(line)
        slice_values.append(line)

with open('data/model/slice_output.txt', 'w') as out:
    for slice_value in slice_values:
        out.write(slice_value + '\n')
