In [1]:
import os
import pandas as pd
import json
import joblib
import pickle
import requests
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import cross_val_score
from category_encoders import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

The police department of your city has received lots of complaints about its stop and search policy. Every time a car is stopped, the police officers have to decide whether or not to search the car for contraband. According to critics, these searches have a bias against people of certain backgrounds.

Your company has been hired to help with this situation, and you were assigned with creating a service to decide whether or not to search a car, based on objective data. This service will be used by police officers to request authorization to search, and your service will return a Yes or No answer.

These exercises will guide you on your task, step by step.

### 1. Meet the data

Start by getting familiar with the dataset, in file `connecticut_dataset_small.csv`.
Each row in the dataset is about one car search operation, and has 4 fields.

3 features:

- **StatuteReason**: what is the reason why the car was stopped
- **SubjectAge**: the driver's age
- **InterventionDateTime**: when the car search operation occured; date presented in the American way: month/day/year

And the target:
- **ContrabandIndicator**: whether contraband and/or evidence were found

We'll consider that a search was successful if contraband and/or evidence were found.

In [2]:
df = pd.read_csv(os.path.join("data", "connecticut_dataset_small.csv"))
df.head(2)

Unnamed: 0,StatuteReason,SubjectAge,InterventionDateTime,ContrabandIndicator
0,Display of Plates,42.0,05/27/2017 07:30:00 PM,False
1,Registration,40.0,11/08/2014 11:38:00 AM,False


In [3]:
# Create a DataFrame with the 3 features: StatuteReason, SubjectAge, InterventionDateTime
# Keep them in this order
X = df.drop('ContrabandIndicator',axis =1)

# YOUR CODE HERE
#raise NotImplementedError()


# Create a series with the target: ContrabandIndicator
y_ = [int(element) for element in df.ContrabandIndicator.values]
y = pd.Series(y_)


# YOUR CODE HERE
#raise NotImplementedError()

In [4]:
assert isinstance(X, pd.DataFrame)
assert X.columns.tolist() == ["StatuteReason", "SubjectAge", "InterventionDateTime"]

assert isinstance(y, pd.Series)

### 2. Build a model

Build a scikit model that predicts whether a car search was successful, based on the features that you have available. Your model should be delivered as a scikit [Pipeline](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html#sklearn.pipeline.Pipeline.predict_proba).

Don't worry too much about the model's performance, anything better than random works! We'll focus on model performance in the next BLUs.

In [5]:
X.isnull().sum()

StatuteReason           10
SubjectAge               0
InterventionDateTime     0
dtype: int64

In [6]:
X.dtypes

StatuteReason            object
SubjectAge              float64
InterventionDateTime     object
dtype: object

In [7]:
X.StatuteReason.unique()

array(['Display of Plates', 'Registration', 'Defective Lights',
       'Traffic Control Signal', 'Seatbelt', 'Window Tint',
       'Speed Related', 'Cell Phone', 'Other', 'Stop Sign', 'Other/Error',
       'Administrative Offense', 'Moving Violation',
       'Unlicensed Operation', 'Suspended License', 'Equipment Violation',
       'STC Violation', 'Stop Sign ', nan], dtype=object)

In [8]:
y

0        0
1        0
2        1
3        0
4        0
        ..
76738    0
76739    0
76740    0
76741    0
76742    0
Length: 76743, dtype: int64

X_ = X.copy()

X_['InterventionDateTime'] = pd.to_datetime(X_['InterventionDateTime'])
X_.dtypes
X_['year'] = X_['InterventionDateTime'].dt.year
X_['month'] = X_['InterventionDateTime'].dt.month
X_['day'] = X_['InterventionDateTime'].dt.day
X_['hour'] = X_['InterventionDateTime'].dt.hour
X_['minute'] = X_['InterventionDateTime'].dt.minute
X_['second'] = X_['InterventionDateTime'].dt.second
X_.drop('InterventionDateTime',axis = 1)


In [9]:
#Define a transformer for datetime feature
from sklearn.base import TransformerMixin, BaseEstimator
class DateTransformer(TransformerMixin, BaseEstimator):
    """Extracts features from datetime column
    
    Returns:
      hour: hour
      day: Between 1 and the number of days in the month
      month: Between 1 and 12 inclusive.
      year: four-digit year
      weekday: day of the week as an integer. Mon=0 and Sun=6
   """
    def fit(self, x, y=None):
        return self
    def transform(self, x, y=None):
        result = pd.DataFrame(x, columns=['date_hour'])
        #result['date_hour'] = pd.to_datetime(result['date_hour'])
        result['hour'] = [dt.hour for dt in result['date_hour']]
        result['day'] = [dt.day for dt in result['date_hour']]
        result['month'] = [dt.month for dt in result['date_hour']]
        result['year'] = [dt.year for dt in result['date_hour']]
        result['weekday'] = [dt.weekday() for dt in 
                             result['date_hour']]
        return result[['hour', 'day', 'month', 'year', 'weekday']]
    
    def get_feature_names(self):
        return ['hour','day', 'month', 'year', 'weekday']

In [10]:
X.head()

Unnamed: 0,StatuteReason,SubjectAge,InterventionDateTime
0,Display of Plates,42.0,05/27/2017 07:30:00 PM
1,Registration,40.0,11/08/2014 11:38:00 AM
2,Defective Lights,19.0,06/23/2016 09:29:50 PM
3,Traffic Control Signal,26.0,01/11/2017 07:28:00 AM
4,Seatbelt,23.0,07/14/2014 08:39:00 AM


In [11]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler

cat_pipe = Pipeline([('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
                     ('encoder', OneHotEncoder(handle_unknown='ignore'))])
num_pipe = Pipeline([('scaler',MinMaxScaler())])
datetime_pipe = Pipeline([('datetime',DateTransformer),('scaler',MinMaxScaler())])

#transformer = ColumnTransformer(transformers= [('cat',cat_pipe,['StatuteReason']),('num',num_pipe,['SubjectAge']),('datetime',datetime_pipe,['InterventionDateTime'])],remainder='passthrough')
transformer = ColumnTransformer(transformers= [('cat',cat_pipe,['StatuteReason']),('num',num_pipe,['SubjectAge'])],remainder='drop')
pipeline = Pipeline(steps = [('t',transformer),('model',LogisticRegression(max_iter=1000))])

#pipeline = make_pipeline(SimpleImputer(strategy='constant',fill_value='missing'),OneHotEncoder())
# YOUR CODE HERE
#raise NotImplementedError()

In [12]:
assert isinstance(pipeline, Pipeline)

In [13]:
# Use cross validation with 5 folds and ROC_AUC as metric, to check your model's performance
roc_aucs = cross_val_score(pipeline,X,y,cv=5,scoring = 'roc_auc')
# YOUR CODE HERE
#raise NotImplementedError()

  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):


In [14]:
assert roc_aucs.mean() > 0.5

In [15]:
# Now fit the pipeline to all the training data
pipeline.fit(X,y)
# YOUR CODE HERE
#raise NotImplementedError()

  elif pd.api.types.is_categorical(cols):


Pipeline(steps=[('t',
                 ColumnTransformer(transformers=[('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(fill_value='missing',
                                                                                 strategy='constant')),
                                                                  ('encoder',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['StatuteReason']),
                                                 ('num',
                                                  Pipeline(steps=[('scaler',
                                                                   MinMaxScaler())]),
                                                  ['SubjectAge'])])),
                ('model', LogisticRegression(max_iter=1000))])

In [16]:
assert pipeline.predict_proba(X.head(1)).shape == (1, 2)

### 3. Serialize all the things!

Now we need to serialize three things:

1. The column names in the correct order
1. The fitted pipeline
1. The dtypes of the columns of the training set

In [17]:
# This is a temporary directory where your serialized files will be saved
# You can change it while working on the exercises locally,
# but change it back to TMP_DIR = '/tmp' before submitting the exercises,
# otherwise grading will fail
TMP_DIR = '/tmp'

In [18]:
# Serialize the column names from the X DataFrame into a file named columns.json
with open(os.path.join(TMP_DIR, "columns.json"),'w') as fh:
    json.dump(X.columns.tolist(),fh)
# YOUR CODE HERE
#raise NotImplementedError()

In [19]:
with open(os.path.join(TMP_DIR, "columns.json"), 'r') as fh:
    columns = json.load(fh)
    
assert columns == X.columns.tolist()

In [20]:
# Pickle the dtypes of the columns from the X DataFrame into a file named dtypes.pickle
# YOUR CODE HERE
with open(os.path.join(TMP_DIR, "dtypes.pickle"),'wb') as fh:
    pickle.dump(X.dtypes,fh)
#raise NotImplementedError()

In [21]:
with open(os.path.join(TMP_DIR, "dtypes.pickle"), 'rb') as fh:
    dtypes = pickle.load(fh)
    
assert dtypes.equals(X.dtypes)

In [22]:
# Pickle the fitted pipeline into a file named pipeline.pickle
# YOUR CODE HERE
joblib.dump(pipeline,os.path.join(TMP_DIR, "pipeline.pickle"))
#raise NotImplementedError()

['/tmp/pipeline.pickle']

In [23]:
pipeline_recovered = joblib.load(os.path.join(TMP_DIR, "pipeline.pickle"))

assert isinstance(pipeline_recovered, Pipeline)
assert pipeline_recovered.predict_proba(X.head(1)).shape == (1, 2)

### 4. Create a new repo for your service

Now it's time to create a new repo for your service. As you learned in the README of the [heroku-model-deploy repository](https://github.com/LDSSA/heroku-model-deploy), duplicate the heroku-model-deploy repo.

From this point on, you should code on the new repo. The remaining exercises in this notebook are questions meant to check if your service is working as expected.

After you've setup your new repo, copy the following things over there:
- `columns.json` file
- `dtypes.pickle` file
- `pipeline.pickle` file
- the package containing custom code in your model (only if you've used it, of course!).

### 5. Build your flask app

#### /predict

At this point, you can either edit the `app.py` file that's in the repo, or start a new file from scratch.
My advice is that you start one from scratch, as it will probably be a better learning experience.

Start by creating a `predict` endpoint, that should receive POST requests, and a JSON payload with:
- id
- observation, which has 3 fields: StatuteReason, SubjectAge, and InterventionDateTime.

This endpoint should return the proba returned by your model for this observation.
Make sure that each field is in the correct format before passing it to the scikit model. If you receive an observation with an invalid value, return an appropriate error message.

When a request is received, you should update your local sqlite database with the following:
- id
- observation
- proba
- true_class (which is null for now)

In case your app has received an observation with an id that it has seen before, it should return an error message, the correspondent proba, and don't store anything.

Try the following commands to check that everything is working as expected.

**Command**

```bash
~ > curl -X POST http://localhost:5000/predict -d '{"id": 0, "observation": {"StatuteReason": "Registration", "SubjectAge": 22.0, "InterventionDateTime": "11/08/2014 11:38:00 AM"}}' -H "Content-Type:application/json"
```


**Expected output**

```json
{
  "proba": 0.3192823701310136
}
```

(any proba value works, it depends on your model, of course!)

**Command**

```bash
~ > curl -X POST http://localhost:5000/predict -d '{"id": 0, "observation": {"StatuteReason": "Registration", "SubjectAge": 22.0, "InterventionDateTime": "11/08/2014 11:38:00 AM"}}' -H "Content-Type:application/json"
```


**Expected output**

```json
{
  "error": "Observation ID: \"0\" already exists",
  "proba": 0.3192823701310136
}
```

**Command**

```bash
curl -X POST http://localhost:5000/predict -d '{"id": 1, "observation": {"StatuteReason": "Registration", "SubjectAge": "hello", "InterventionDateTime": "11/08/2014 11:38:00 AM"}}' -H "Content-Type:application/json"
```

**Expected output**
```json
{
  "error": "Observation is invalid!"
}
```

In [24]:
# When the predict endpoint of your flask app is working as expected,
# set variable predict_endpoint_working_fine to True
predict_endpoint_working_fine = True

# YOUR CODE HERE
#raise NotImplementedError()

In [25]:
assert predict_endpoint_working_fine

#### /update

The update endpoint should receive POST requests, and a JSON payload with:
- id
- true_class

If there is an observation with `id` in your database, you should update the `true_class` value with the value in the request. The response should be the observation, with the updated true_class value.

Otherwise, you should return an appropriate error message.

Try the following commands to check that everything is working as expected.

**Command**

```bash
~ > curl -X POST http://localhost:5000/update -d '{"id": 0, "true_class": 1}'  -H "Content-Type:application/json"
```


**Expected output**

```json
{
  "id": 1,
  "observation": "{\"id\": 0, \"observation\": {\"StatuteReason\": \"Registration\", \"SubjectAge\": 22.0}}",
  "observation_id": 0,
  "proba": 0.3192823701310136,
  "true_class": 1
}
```

**Command**

```bash
~ > curl -X POST http://localhost:5000/update -d '{"id": 3, "true_class": 1}'  -H "Content-Type:application/json"
```


**Expected output**

```json
{
  "error": "Observation ID: \"3\" does not exist"
}
```

In [32]:
# When the predict endpoint of your flask app is working as expected,
# set variable update_endpoint_working_fine to True
update_endpoint_working_fine = True

# YOUR CODE HERE
#raise NotImplementedError()

In [33]:
assert update_endpoint_working_fine

### 6. Deploy your app to heroku

Follow the instructions on the Learning part of this BLU to deploy your app to heroku.

In order to check that your app is working correctly on heroku, re-run the previous commands, but replacing the `localhost` with the URL of your heroku app (like `https://<your-app-name>.herokuapp.com`). For instance, the first command would be:


**Command**

```bash
~ > curl -X POST https://<your-app-name>.herokuapp.com/predict -d '{"id": 0, "observation": {"StatuteReason": "Registration", "SubjectAge": 22.0, "InterventionDateTime": "11/08/2014 11:38:00 AM"}}' -H "Content-Type:application/json"
```


**Expected output**

```json
{
  "proba": 0.3192823701310136
}
```

In [28]:
# In this test, we will call your app to check if it's working as expected
# Assign the variable APP_NAME to the name of your heroku app
APP_NAME = 'heroku-model-deploy-sonia-2'

# YOUR CODE HERE
#raise NotImplementedError()

In [29]:
# Testing the /predict endpoint

url = f"http://{APP_NAME}.herokuapp.com/predict"
payload = {
    "id": 0,
    "observation": {
        "StatuteReason": "Registration",
        "SubjectAge": 22.0,
        "InterventionDateTime": "11/08/2014 11:38:00 AM"
    }
}

r = requests.post(url, json=payload)

assert isinstance(r, requests.Response)
assert r.ok
assert "proba" in r.json()
assert isinstance(r.json()["proba"], float)
assert 0 <= r.json()["proba"] <= 1

In [30]:
# Testing the /update endpoint

url = f"http://{APP_NAME}.herokuapp.com/update"
payload = {
    "id": 0,
    "true_class": 1
}

r = requests.post(url, json=payload)

assert isinstance(r, requests.Response)
assert r.ok
assert "observation" in r.json()
assert "proba" in r.json()
assert "true_class" in r.json()
assert r.json()["true_class"] == 1

In [34]:
!explorer.exe .