### Homework week 7

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer

from sklearn.ensemble import RandomForestClassifier

import xgboost as xgb
import bentoml


In [2]:
# Shell escape: print the version of the installed BentoML command-line tool.
!bentoml --version

bentoml, version 1.0.5


### Question 1

What's the version of BentoML you installed?

#### Answer: 1.0.5

In [3]:
# Location of the raw credit-scoring CSV, resolved relative to the working directory.
data = 'CreditScoring.csv'

# Read the whole file into a DataFrame; cleaning happens in the next cell.
df = pd.read_csv(filepath_or_buffer=data)

In [4]:
# Lower-case every column name so the rest of the notebook can use lowercase keys.
df.columns = df.columns.str.lower()

# Integer code -> label lookup tables, one per encoded categorical column.
# In every table the code 0 is labelled 'unk'.
status_values = {1: 'ok', 2: 'default', 0: 'unk'}

home_values = {
    1: 'rent', 2: 'owner', 3: 'private', 4: 'ignore',
    5: 'parents', 6: 'other', 0: 'unk',
}

marital_values = {
    1: 'single', 2: 'married', 3: 'widow',
    4: 'separated', 5: 'divorced', 0: 'unk',
}

records_values = {1: 'no', 2: 'yes', 0: 'unk'}

job_values = {1: 'fixed', 2: 'partime', 3: 'freelance', 4: 'others', 0: 'unk'}

# Decode each categorical column with its own lookup table.
# NOTE: Series.map turns any value missing from the table into NaN, so this
# cell must run exactly once on freshly loaded data.
code_tables = {
    'status': status_values,
    'home': home_values,
    'marital': marital_values,
    'records': records_values,
    'job': job_values,
}
for column_name, labels in code_tables.items():
    df[column_name] = df[column_name].map(labels)

# 99999999 is treated as a missing-value marker in these columns: swap it for NaN.
for money_column in ('income', 'assets', 'debt'):
    df[money_column] = df[money_column].replace(to_replace=99999999, value=np.nan)

# Drop rows whose status decoded to 'unk', then renumber the index from 0.
df = df.loc[df['status'] != 'unk'].reset_index(drop=True)

In [5]:
# Hold out 20% of the rows as a test set; random_state pins the shuffle.
df_train, df_test = train_test_split(df, test_size=0.2, random_state=11)

# Renumber both partitions from 0 after the shuffle.
df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

# Binary target: 1 where status is 'default', 0 otherwise.
# pop() returns the 'status' column and removes it from the frame in one step,
# so the label is no longer present among the features.
y_train = (df_train.pop('status') == 'default').astype('int').values
y_test = (df_test.pop('status') == 'default').astype('int').values

In [6]:
# One dict per row, with missing values replaced by 0 beforehand.
train_dicts = df_train.fillna(0).to_dict(orient='records')
test_dicts = df_test.fillna(0).to_dict(orient='records')

# Dense output (sparse=False). The vectorizer is fitted on the training rows
# only; the test rows are encoded with that already-fitted mapping.
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dicts)
X_test = dv.transform(test_dicts)

In [7]:
# Wrap the training features and labels in XGBoost's DMatrix container.
dtrain = xgb.DMatrix(data=X_train, label=y_train)

# Booster configuration: tree-shape settings first, then the learning task,
# then runtime settings (threads, seed, log verbosity).
xgb_params = dict(
    eta=0.1,
    max_depth=3,
    min_child_weight=1,
    objective='binary:logistic',
    eval_metric='auc',
    nthread=8,
    seed=1,
    verbosity=1,
)

# Train for 175 boosting rounds on the training matrix.
model = xgb.train(params=xgb_params, dtrain=dtrain, num_boost_round=175)

In [8]:
# Save the trained booster through BentoML's XGBoost integration under the
# name 'credit_risk_model'. The fitted DictVectorizer is attached as a custom
# object (key 'dictVectorizer') so it is saved together with the model.
# The call is the last expression in the cell, so the returned Model
# (tag and path) is displayed as the cell output.
bentoml.xgboost.save_model(
    'credit_risk_model',
    model,
    custom_objects={
        'dictVectorizer': dv
    })

Model(tag="credit_risk_model:jab6mysszcsyw7fs", path="C:\Users\rizdi\bentoml\models\credit_risk_model\jab6mysszcsyw7fs\")

In [9]:
# Shell escape: list the saved BentoML models; the output includes a Size column.
!bentoml models list

 Tag                          Module           Size        Creation Time       
 credit_risk_model:jab6myss…  bentoml.xgboost  197.75 KiB  2022-10-23 12:46:07 
 credit_scoring_model:bp7qe…  bentoml.xgboost  270.63 KiB  2022-10-18 20:21:20 
 churn_clf:bciy52buo2who7fs   bentoml.sklearn  2.26 MiB    2022-09-14 22:41:47 
 iris_clf:zxlu5qbufkepq7fs    bentoml.sklearn  5.77 KiB    2022-09-14 13:43:16 


### Question 2

How big approximately is the saved BentoML model? Size can slightly vary depending on your local development environment. Choose the size closest to your model.

    924kb
    724kb
    114kb
    8kb

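#### Answer: 114kb (the closest option to the 197.75 KiB reported for `credit_risk_model` by `bentoml models list` above)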

In [10]:
# Example payload sent to our service: one string-valued name and country,
# an integer age and a floating-point rating.
new_input = dict(
    name="Tim",
    age=37,
    country="US",
    rating=3.14,
)

In [12]:
from pydantic import BaseModel, ValidationError, validator

# Pydantic model describing the payload sent to the service (see `new_input`):
# one annotated field per key, with the Python type each value is expected to have.
class UserProfile(BaseModel):
    name: str       # e.g. "Tim"
    age: int        # e.g. 37
    country: str    # e.g. "US"
    rating: float   # e.g. 3.14

### Question 3

What would the pydantic class look like? You can name the class UserProfile.

#### Answer: 
```python
from pydantic import BaseModel, ValidationError, validator

class UserProfile(BaseModel):
    name: str
    age: int
    country: str
    rating: float

```

### Question 4

What version of scikit-learn was this model trained with?

![Screenshot](images/homework_tree_view.png)

#### Answer: 1.1.1

### Question 5

You can use curl or the Swagger UI. What value does it return?

#### Answer: 1

### Question 6

Which model has better performance at higher volumes?

#### Answer: The first model