In [None]:
!pip install creme

In [None]:
import collections
import copy
import datetime
import os
import random

import tqdm

In [None]:
from creme import compose
from creme import feature_extraction
from creme import linear_model
from creme import metrics
from creme import neighbors
from creme import optim
from creme import preprocessing
from creme import stats
from creme import stream

In [None]:
def extract_date(x):
    """Normalise ``x['date']`` to a datetime and add the weekday as ``x['wday']``.

    The dict is updated in place and the same object is returned.

    Parameters:
        x: mapping with a ``'date'`` entry that is either a
            ``datetime.datetime`` or a ``'%Y-%m-%d'`` string.

    Returns:
        ``x`` itself, with ``'date'`` as a ``datetime.datetime`` and ``'wday'``
        set to ``date.weekday()`` (0 = Monday ... 6 = Sunday).
    """
    # NOTE(review): imported locally, presumably so the function stays
    # self-contained when the pipeline is serialised with dill — confirm.
    import datetime

    when = x['date']
    if not isinstance(when, datetime.datetime):
        # Only parse when needed: the same dict may pass through here more than once.
        when = datetime.datetime.strptime(when, '%Y-%m-%d')
        x['date'] = when

    x['wday'] = when.weekday()
    return x

In [None]:
def get_metadata(x):
    """Derive ``item_id`` and ``store_id`` from the underscore-separated ``x['id']``.

    The first three fields of the id form ``item_id`` and the next two form
    ``store_id``. The dict is updated in place and returned.

    Parameters:
        x: mapping with a string ``'id'`` made of at least five
            ``'_'``-separated fields.

    Returns:
        ``x`` itself, with ``'item_id'`` and ``'store_id'`` added.

    Raises:
        IndexError: if the id has fewer than five fields.
    """
    parts = x['id'].split('_')
    x['item_id'] = '_'.join((parts[0], parts[1], parts[2]))
    x['store_id'] = '_'.join((parts[3], parts[4]))
    return x

In [None]:
# Feature union shared by both model pipelines: the raw weekday plus rolling
# means of the target grouped by item and by weekday. The window sizes are
# generated instead of being written out line by line; the resulting
# transformers and their order are the same as the hand-written list.
extract_features = compose.TransformerUnion(
    compose.Select('wday'),

    # Target rolling means grouped by item_id, window sizes 1 through 7.
    *(
        feature_extraction.TargetAgg(by=['item_id'], how=stats.RollingMean(window))
        for window in range(1, 8)
    ),

    # Target rolling means grouped by wday, window sizes 1 through 20, then 25 and 30.
    *(
        feature_extraction.TargetAgg(by=['wday'], how=stats.RollingMean(window))
        for window in (*range(1, 21), 25, 30)
    ),
)

In [None]:
# Template pipeline for the k-nearest-neighbours regressor. Steps, in order:
# derive item/store ids from the raw id, derive the weekday from the date,
# build the feature union, standardise the features, then regress.
# NOTE(review): each pipeline is spelled out in full rather than built from a
# shared prefix object — keep it that way unless the `|` operator is confirmed
# not to mutate its left operand.
# NOTE(review): p=2 is presumably the Minkowski exponent (Euclidean distance) — confirm.
knn = (
    compose.FuncTransformer(get_metadata) |
    compose.FuncTransformer(extract_date) |
    extract_features |
    preprocessing.StandardScaler() |
    neighbors.KNeighborsRegressor(window_size=30, n_neighbors=15, p=2)
)


# Template pipeline for the linear model. Same feature steps as above, but the
# features are min-max scaled and the regressor is a linear regression trained
# with SGD (learning rate 0.005; the intercept uses its own rate of 0.001).
lm = (
    compose.FuncTransformer(get_metadata) |
    compose.FuncTransformer(extract_date) |
    extract_features |
    preprocessing.MinMaxScaler() |
    linear_model.LinearRegression(optimizer=optim.SGD(0.005), intercept_lr=0.001)
)

In [None]:
# Enumerate the series to model from the submission template. Only the `id`
# column is used; the value read for the `F8` target is discarded.
X_y = stream.iter_csv('./data/sample_submission.csv', target_name='F8')

# A series key is the first five '_'-separated fields of the row id.
# dict.fromkeys de-duplicates in constant time per row while keeping first-seen
# order; the previous `not in list` test rescanned the whole list on every row.
list_model = list(dict.fromkeys(
    '_'.join(x['id'].split('_')[:5])
    for x, _ in tqdm.tqdm(X_y, position=0)
))

# Give every series its own independent copy of each template pipeline.
dict_knn = {item_id: copy.deepcopy(knn) for item_id in tqdm.tqdm(list_model, position=0)}
dict_lm = {item_id: copy.deepcopy(lm) for item_id in tqdm.tqdm(list_model, position=0)}

In [None]:
# NOTE(review): nothing in this cell calls `random` directly; the seed is
# presumably here for reproducibility of library internals — confirm.
random.seed(42)

# Options for the streaming CSV reader: `y` is the target and is cast to int,
# `id` is kept as a string, and `date` is parsed with the given format.
params = dict(
    target_name='y',
    converters={'y': int, 'id': str},
    parse_dates={'date': '%Y-%m-%d'},
)

# Stream the training set one (features, target) pair at a time.
X_y = stream.iter_csv('./data/train.csv', **params)

bar = tqdm.tqdm(X_y, position=0)

# One MAE tracker per series and per model family, created on first use.
metric_knn = collections.defaultdict(metrics.MAE)
metric_lm = collections.defaultdict(metrics.MAE)

# MAE of the linear models over all series, shown on the progress bar.
mae = metrics.MAE()

for i, (x, y) in enumerate(bar):

    # Series key: first five '_'-separated fields of the row id.
    item_id = '_'.join(x['id'].split('_')[:5])

    model_knn = dict_knn[item_id]
    model_lm = dict_lm[item_id]

    # Each model predicts before it is fitted on the row, so the metrics are
    # computed on observations the model has not yet learned from.
    y_pred_knn = model_knn.predict_one(x)
    metric_knn[item_id].update(y, y_pred_knn)
    model_knn.fit_one(x=x, y=y)

    y_pred_lm = model_lm.predict_one(x)
    metric_lm[item_id].update(y, y_pred_lm)
    mae.update(y, y_pred_lm)
    model_lm.fit_one(x=x, y=y)

    if i % 1000 == 0:
        # `.4f` prints four decimals; the earlier `4f` spec was a minimum
        # width of 4 with the default six decimals.
        bar.set_description(f'MAE, Linear Model: {mae.get():.4f}')

In [None]:
import json

# Snapshot the current value of every per-series metric so it can be written as JSON.
scores_knn = {series: tracker.get() for series, tracker in metric_knn.items()}
scores_lm = {series: tracker.get() for series, tracker in metric_lm.items()}

# Persist both score tables next to the notebook.
with open('scores_knn.json', 'w') as knn_out:
    json.dump(scores_knn, knn_out)

with open('scores_lm.json', 'w') as lm_out:
    json.dump(scores_lm, lm_out)

In [None]:
import dill
# Per item name warning
# Serialise the per-series KNN pipelines, then the per-series linear pipelines.
with open('dict_knn.dill', 'wb') as knn_out:
    dill.dump(dict_knn, knn_out)

with open('dict_lm.dill', 'wb') as lm_out:
    dill.dump(dict_lm, lm_out)

In [None]:
import json

# Reload the per-series MAE tables written earlier.
with open('scores_knn.json', 'rb') as knn_in:
    scores_knn = json.load(knn_in)

with open('scores_lm.json', 'rb') as lm_in:
    scores_lm = json.load(lm_in)

In [None]:
import dill

# Reload the per-series KNN pipelines.
# dill, like pickle, can run arbitrary code on load: only open files you produced yourself.
with open('dict_knn.dill', 'rb') as knn_in:
    dict_knn = dill.load(knn_in)

In [None]:
# Reload the per-series linear-model pipelines.
# dill, like pickle, can run arbitrary code on load: only open files you produced yourself.
with open('dict_lm.dill', 'rb') as lm_in:
    dict_lm = dill.load(lm_in)

In [None]:
# For every scored series keep whichever pipeline has the lower MAE.
# On a tie the linear model is kept.
dict_models = {}

for item_id, score_knn in tqdm.tqdm(scores_knn.items()):
    winners = dict_knn if score_knn < scores_lm[item_id] else dict_lm
    dict_models[item_id] = winners[item_id]

# Save the selected models.
with open('dict_models.dill', 'wb') as models_out:
    dill.dump(dict_models, models_out)

In [3]:
# Base URL of the model-serving API used by the cells below. Set the
# MODEL_API_URL environment variable to target another deployment; the
# default is the address this notebook was originally run against.
url = os.environ.get('MODEL_API_URL', 'http://159.89.38.125:8080')

In [4]:
import requests
import dill

# Tell the API which flavor of models it will serve. The call stays the last
# expression of the cell so the response is displayed.
init_payload = {'flavor': 'regression'}
requests.post(f'{url}/api/init', json=init_payload)

<Response [201]>

In [5]:
# Reload the selected per-series models.
# dill, like pickle, can run arbitrary code on load: only open files you produced yourself.
with open('dict_models.dill', 'rb') as models_in:
    dict_models = dill.load(models_in)

In [None]:
# Upload every selected model under its series name.
# One HTTP session is reused for all requests, and any response that is not
# OK is recorded: previously the response was assigned and never inspected,
# so a rejected upload went unnoticed.
failed_uploads = {}

with requests.Session() as session:
    for model_name, model in tqdm.tqdm(dict_models.items(), position=0):

        r = session.post(f'{url}/api/model/{model_name}', data=dill.dumps(model))

        if not r.ok:
            failed_uploads[model_name] = r.status_code

if failed_uploads:
    # Show the count and a small sample so the output stays readable.
    print(f'{len(failed_uploads)} model upload(s) failed, e.g. {list(failed_uploads.items())[:10]}')

 52%|█████▏    | 15949/30490 [3:34:29<3:49:30,  1.06it/s]

#### Make a prediction by calling the API:

In [None]:
# Request a prediction from one named model for one day.
# NOTE(review): `id` presumably tags this prediction so that the later
# learn call can refer back to it — confirm against the API.
predict_payload = {
    'id': 1,
    'model': 'HOBBIES_1_001_CA_1',
    'features': {'date': '2016-05-23', 'id': 'HOBBIES_1_001_CA_1'},
}
r = requests.post(f'{url}/api/predict', json=predict_payload)

In [None]:
# Display the JSON body of the prediction response.
r.json()

#### Update models with new data:

In [None]:
# Send the observed value back so the named model can be updated.
# NOTE(review): `id` matches the one used in the predict call above in this
# notebook, presumably to link the ground truth to that prediction — confirm.
learn_payload = {
    'id': 1,
    'model': 'HOBBIES_1_001_CA_1',
    'ground_truth': 0,
}
r = requests.post(f'{url}/api/learn', json=learn_payload)

In [None]:
# Display the response object returned by the learn request.
r