In [26]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import fbeta_score,precision_score,recall_score

In [2]:
def data(path):
    """Read the CSV file located at `path` and return it as a DataFrame."""
    return pd.read_csv(path)


In [3]:
def process_data(
    X, categorical_features=[], label=None, training=True, encoder=None, lb=None
):
    """ Process the data used in the machine learning pipeline.

    One-hot encodes the categorical columns and binarizes the label column.
    Usable for training (encoder/binarizer are created and fitted here) and
    for inference/validation (the fitted objects passed in are reused).

    Note: depending on the type of model used, you may want to add in
    functionality that scales the continuous data.

    Inputs
    ------
    X : pd.DataFrame
        Dataframe containing the features and, optionally, the label column.
        Every name in `categorical_features` must be a column of `X`.
    categorical_features: list[str]
        Names of the categorical feature columns (default=[]). The list is
        only read, never mutated.
    label : str
        Name of the label column in `X`. If None, an empty array is returned
        for y (default=None)
    training : bool
        True to fit a new encoder/binarizer, anything else to reuse the ones
        passed in.
    encoder : sklearn.preprocessing._encoders.OneHotEncoder
        Trained sklearn OneHotEncoder, only used if training is not True.
    lb : sklearn.preprocessing._label.LabelBinarizer
        Trained sklearn LabelBinarizer, only used if training is not True.

    Returns
    -------
    X : np.array
        Continuous columns followed by the encoded categorical columns.
    y : np.array
        Binarized labels when `label` is given, otherwise an empty np.array.
        In inference mode with a label but `lb=None`, the raw label column is
        returned unchanged.
    encoder : sklearn.preprocessing._encoders.OneHotEncoder
        Newly fitted OneHotEncoder if training is True, otherwise the encoder
        passed in.
    lb : sklearn.preprocessing._label.LabelBinarizer
        New LabelBinarizer if training is True (fitted only when `label` is
        given), otherwise the binarizer passed in.
    """
    has_label = label is not None

    if has_label:
        y = X[label]
        X = X.drop([label], axis=1)
    else:
        y = np.array([])

    X_categorical = X[categorical_features].values
    X_continuous = X.drop(columns=categorical_features)

    if training is True:
        # The dense-output keyword was renamed from `sparse` to
        # `sparse_output`; try the new spelling first and fall back so the
        # function works on either side of the rename.
        try:
            encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
        except TypeError:
            encoder = OneHotEncoder(sparse=False, handle_unknown="ignore")
        lb = LabelBinarizer()
        X_categorical = encoder.fit_transform(X_categorical)
        # Without a label column there is nothing to fit the binarizer on;
        # y stays the empty array created above.
        if has_label:
            y = lb.fit_transform(y.values).ravel()
    else:
        X_categorical = encoder.transform(X_categorical)
        # Binarize only when there are labels and a binarizer to apply.
        # An explicit check replaces the former blanket `except AttributeError`,
        # which also hid genuine errors raised inside the binarizer.
        if has_label and lb is not None:
            y = lb.transform(y.values).ravel()

    X = np.concatenate([X_continuous, X_categorical], axis=1)
    return X, y, encoder, lb

In [4]:
# Columns that process_data one-hot encodes.
cat_features = [
    "workclass", "education", "marital-status", "occupation",
    "relationship", "race", "sex", "native-country",
]

# Load the census data and hold out 20% of the rows for evaluation.
df = data('../data/census.csv')
train, test = train_test_split(df, test_size=0.20)

# Fit the encoder and label binarizer on the training split.
X_train, y_train, encoder, lb = process_data(
    train,
    categorical_features=cat_features,
    label='salary',
    training=True,
    encoder=None,
    lb=None,
)

In [9]:
# Transform the held-out split with the encoder and binarizer fitted on the
# training split; the returned encoder/binarizer are discarded.
X_test, y_test, _, _ = process_data(
    test, categorical_features=cat_features, label='salary',
    training=False, encoder=encoder, lb=lb,
)

In [17]:
def train_model(X_train, y_train):
    """
    Fit a random forest classifier on the training data and return it.

    Inputs
    ------
    X_train : np.array
        Training features.
    y_train : np.array
        Training labels.
    Returns
    -------
    model
        Trained machine learning model (the value returned by `fit`).
    """
    forest = RandomForestClassifier(
        n_estimators=100,
        max_depth=5,
        criterion="gini",
        random_state=101,  # fixed seed for the forest
    )
    return forest.fit(X_train, y_train)

# Fit the classifier on the processed training split.
model = train_model(X_train, y_train)

In [18]:
# Bare expression: the notebook displays the fitted estimator's repr.
model

In [23]:
def inference(model, X):
    """ Predict with a trained model.

    Inputs
    ------
    model : object exposing a `predict` method
        Trained machine learning model.
    X : np.array
        Data to predict on.
    Returns
    -------
    preds : np.array
        Whatever `model.predict(X)` returns.
    """
    return model.predict(X)

In [24]:
# Predictions for the held-out split.
predict = inference(model, X=X_test)

In [27]:

def compute_model_metrics(y, preds):
    """
    Score predictions against known labels with precision, recall, and F1.

    Inputs
    ------
    y : np.array
        Known labels, binarized.
    preds : np.array
        Predicted labels, binarized.
    Returns
    -------
    precision : float
    recall : float
    fbeta : float
        F-beta score computed with beta=1.
    """
    # zero_division=1 is passed to all three scorers.
    precision = precision_score(y, preds, zero_division=1)
    recall = recall_score(y, preds, zero_division=1)
    f_score = fbeta_score(y, preds, beta=1, zero_division=1)
    return precision, recall, f_score

# Displays the (precision, recall, fbeta) tuple for the held-out split.
compute_model_metrics(y=y_test, preds=predict)

(0.8479776847977685, 0.38676844783715014, 0.5312363477501093)

In [3]:
# Script to train machine learning model.
# Add the necessary imports for the starter code.
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from ml.data import process_data
from ml.model import train_model 

# Add code to load in the data.
path = '../'
# NOTE: rebinding `data` to a DataFrame shadows the `data()` loader defined in
# an earlier cell; re-running that earlier cell afterwards will fail until the
# loader is redefined.
data = pd.read_csv('../data/census.csv')
data = data.drop_duplicates()

# Optional enhancement, use K-fold cross validation instead of a train-test split.
train, test = train_test_split(data, test_size=0.20)

cat_features = [
    "workclass",
    "education",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "native-country",
]

# Process the train data with the process_data function.
X_train, y_train, encoder, lb = process_data(
    train,
    categorical_features=cat_features,
    label='salary',
    training=True,
    encoder=None,
    lb=None
)

# Process the test data with the process_data function.
X_test, y_test, _, _ = process_data(
    test,
    categorical_features=cat_features,
    label='salary',
    training=False,
    encoder=encoder,
    lb=lb
)

# Train the model, then save it together with the fitted encoder and label
# binarizer. Each artifact file must hold its own object: previously all
# three files were written with `model`, so encoder.pkl and lb.pkl did not
# contain the preprocessing objects.
model = train_model(X_train, y_train)
pd.to_pickle(model, os.path.join(path, "model.pkl"))
pd.to_pickle(encoder, os.path.join(path, "encoder.pkl"))
pd.to_pickle(lb, os.path.join(path, "lb.pkl"))

In [11]:
# Print the trained model's hyperparameters as returned by get_params().
print(model.get_params())

{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': 5, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': 101, 'verbose': 0, 'warm_start': False}


In [15]:
# Compare the model's hyperparameters with the expected configuration;
# the notebook displays the boolean result.
model.get_params() == dict(
    bootstrap=True,
    ccp_alpha=0.0,
    class_weight=None,
    criterion='gini',
    max_depth=5,
    max_features='sqrt',
    max_leaf_nodes=None,
    max_samples=None,
    min_impurity_decrease=0.0,
    min_samples_leaf=1,
    min_samples_split=2,
    min_weight_fraction_leaf=0.0,
    n_estimators=100,
    n_jobs=None,
    oob_score=False,
    random_state=101,
    verbose=0,
    warm_start=False,
)

True

In [16]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from ml.model import train_model,inference,compute_model_metrics 
from ml.data import process_data

# Load the census data with duplicate rows removed.
data = pd.read_csv('../data/census.csv').drop_duplicates()

# Hold out 20% of the rows for evaluation.
train, test = train_test_split(data, test_size=0.20)

# Columns that process_data one-hot encodes.
cat_features = [
    "workclass", "education", "marital-status", "occupation",
    "relationship", "race", "sex", "native-country",
]

# Process the train data with the process_data function.
X_train, y_train, encoder, lb = process_data(
    train, categorical_features=cat_features, label='salary',
    training=True, encoder=None, lb=None,
)

# Process the test data with the process_data function.
X_test, y_test, _, _ = process_data(
    test, categorical_features=cat_features, label='salary',
    training=False, encoder=encoder, lb=lb,
)
# Building test functions
def test_train_model(X_train=None, y_train=None):
    '''

    Function to test the train_model function in model.py file

    Trains a model and checks that it is a RandomForestClassifier configured
    with the expected hyperparameters.

    X_train, y_train : optional training arrays. When omitted (as the
    __main__ guard and pytest both do), the notebook-level X_train / y_train
    are used.

    '''
    # The parameter names shadow the notebook-level arrays, so those have to
    # be fetched through globals() when the caller supplies nothing.
    if X_train is None:
        X_train = globals()["X_train"]
    if y_train is None:
        y_train = globals()["y_train"]

    # The previous version called train_model() with no arguments, which
    # raised TypeError before any assertion could run.
    model = train_model(X_train, y_train)
    assert isinstance(model, RandomForestClassifier)

    expected_params = {
        'bootstrap': True,
        'ccp_alpha': 0.0,
        'class_weight': None,
        'criterion': 'gini',
        'max_depth': 5,
        'max_features': 'sqrt',
        'max_leaf_nodes': None,
        'max_samples': None,
        'min_impurity_decrease': 0.0,
        'min_samples_leaf': 1,
        'min_samples_split': 2,
        'min_weight_fraction_leaf': 0.0,
        'n_estimators': 100,
        'n_jobs': None,
        'oob_score': False,
        'random_state': 101,
        'verbose': 0,
        'warm_start': False,
    }
    actual_params = model.get_params()
    # Check each expected entry instead of whole-dict equality, so an extra
    # parameter reported by a newer scikit-learn release does not fail the test.
    for name, wanted in expected_params.items():
        assert name in actual_params, f"missing hyperparameter: {name}"
        assert actual_params[name] == wanted, (
            f"{name}: expected {wanted!r}, got {actual_params[name]!r}"
        )

def test_inference():
    '''

    Check the inference function from the model.py file: it must return one
    prediction per input row, and every prediction must be 0 or 1.

    '''
    fitted = train_model(X_train, y_train)
    predictions = inference(fitted, X_train)
    # One prediction per training row.
    assert len(predictions) == len(X_train)
    # Only the two binarized label values may appear.
    is_binary = (predictions == 0) | (predictions == 1)
    assert np.all(is_binary)

def test_compute_model_metrics():
    '''

    Check the compute_model_metrics function from the model.py file: it must
    return a 3-tuple whose values all lie between 0 and 1.

    '''
    fitted = train_model(X_train, y_train)
    predictions = inference(fitted, X_test)
    scores = compute_model_metrics(y_test, predictions)
    assert len(scores) == 3  # three metrics are returned
    assert type(scores) == tuple  # packed as a plain tuple
    # Every metric lies in the closed interval [0, 1].
    for score in scores:
        assert 0 <= score <= 1

if __name__ == "__main__":
    # Run the checks in order; the first failing one stops the run.
    for run_check in (test_train_model, test_inference, test_compute_model_metrics):
        run_check()

NameError: name 'df' is not defined