In [1]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
import pandas
import sqlite3
import seaborn as sns
from sklearn import *
from tqdm import tqdm
import joblib



In [3]:
def batch_treatment(batch_data):
    """One-hot encode the ``sales_depo`` and ``sales_channel`` columns.

    Each of the two categorical columns is expanded with
    ``pandas.get_dummies`` (prefixes ``sales`` and ``channel``) and the
    original column is removed. All other columns are kept unchanged and
    the input frame is not modified.

    Parameters
    ----------
    batch_data : pandas.DataFrame
        Frame containing at least ``sales_depo`` and ``sales_channel``.

    Returns
    -------
    pandas.DataFrame
        New frame with the same rows: the remaining original columns first,
        then the ``sales_*`` indicators, then the ``channel_*`` indicators.
    """
    depot_dummies = pandas.get_dummies(batch_data.sales_depo, prefix='sales')
    channel_dummies = pandas.get_dummies(batch_data.sales_channel, prefix='channel')
    remaining = batch_data.drop(['sales_depo', 'sales_channel'], axis=1)
    # axis=1 is essential: the default (axis=0) would append the indicator
    # frames as extra rows, doubling the row count and filling it with NaN.
    return pandas.concat([remaining, depot_dummies, channel_dummies], axis=1)

In [None]:
def train_classifier(i):
    """Fit one ``RidgeClassifier`` on the training slice selected by ``i``.

    Reads from ``/tmp/data.sqlite3`` the rows of table ``data`` that have a
    non-null ``adjusted_demand``, ``week_num < 8`` and a ``rand`` value equal
    to ``i``, ``i+25``, ``i+50`` or ``i+75``. The rows are passed through
    ``batch_treatment`` and the model is fitted with ``adjusted_demand`` as
    the target and every other column as a feature.

    Parameters
    ----------
    i : int
        Slice index used to pick the four ``rand`` values.

    Returns
    -------
    The fitted ``linear_model.RidgeClassifier`` instance.
    """
    db = sqlite3.connect('/tmp/data.sqlite3')
    try:
        model = linear_model.RidgeClassifier()
        rand_values = [i, i+25, i+50, i+75]
        frame = pandas.read_sql('''
            SELECT week_num, 
                   sales_depo,
                   sales_channel,
                   route_id,
                   client_id,
                   product_id,
                   adjusted_demand
              FROM data
             WHERE adjusted_demand is not null
                   AND week_num < 8
                   AND (rand = ? OR rand = ? OR rand = ? OR rand = ?)
            ''', params=rand_values, con=db)
        frame = batch_treatment(frame)
        print('Training...')
        features = frame.drop('adjusted_demand', axis=1)
        model.fit(features, frame.adjusted_demand)
        print('Trained')
    finally:
        # Always release the database handle, even if loading or fitting fails.
        db.close()
    return model

# Train one model per slice index (0..24) in parallel; n_jobs=-1 lets joblib
# pick the number of workers from the available CPUs.
classifiers = joblib.Parallel(n_jobs=-1)(
    joblib.delayed(train_classifier)(slice_index) for slice_index in range(25)
)

In [None]:
def classify_subset(i):
    """Predict one held-out slice by averaging every model in ``classifiers``.

    Reads from ``/tmp/data.sqlite3`` the rows of table ``data`` that have a
    non-null ``adjusted_demand``, ``week_num >= 8`` and a ``rand`` value equal
    to ``i``, ``i+25``, ``i+50`` or ``i+75``. The rows are encoded with
    ``batch_treatment`` -- the same preprocessing ``train_classifier`` applies
    before fitting -- so the feature matrix matches what the models saw.

    Parameters
    ----------
    i : int
        Slice index used to pick the four ``rand`` values.

    Returns
    -------
    tuple
        ``(pred, y_true)`` where ``pred`` is the element-wise mean of all
        models' predictions and ``y_true`` is the slice's
        ``adjusted_demand`` column.
    """
    con = sqlite3.connect('/tmp/data.sqlite3')
    try:
        batch_data = pandas.read_sql('''
            SELECT week_num, 
                   sales_depo,
                   sales_channel,
                   route_id,
                   client_id,
                   product_id,
                   adjusted_demand
              FROM data
             WHERE adjusted_demand is not null
                   AND week_num >= 8
                   AND (rand = ? OR rand = ? OR rand = ? OR rand = ?)
            ''', params=[i, i+25, i+50, i+75], con=con)
    finally:
        # The connection is only needed for the read; release it right away.
        con.close()

    # Encode exactly like the training path; predicting on the raw columns
    # would hand the models a different feature matrix than they were fit on.
    # The feature frame is loop-invariant, so build it once.
    features = batch_treatment(batch_data).drop('adjusted_demand', axis=1)

    preds = []
    for cls in classifiers:
        # Each slice can produce a different set of dummy columns. Newer
        # scikit-learn releases record the training column names on the
        # estimator; when present, align to them (unseen categories are
        # dropped, missing ones become all-zero indicators).
        expected = getattr(cls, 'feature_names_in_', None)
        if expected is None:
            aligned = features
        else:
            aligned = features.reindex(columns=expected, fill_value=0)
        preds.append(cls.predict(aligned))

    pred = np.mean(np.vstack(preds), axis=0)
    return pred, batch_data.adjusted_demand

# Score the 25 held-out slices in parallel, then split the list of
# (prediction, truth) pairs into two parallel tuples.
batch_predictions, batch_y = zip(*joblib.Parallel(n_jobs=-1)(
    joblib.delayed(classify_subset)(slice_index) for slice_index in range(25)
))

In [None]:
# Mean squared error over all held-out slices (truth first, predictions second).
metrics.mean_squared_error(
    np.concatenate(batch_y),
    np.concatenate(batch_predictions),
)

In [None]:
def RMSLE(y_true, y_pred):
    """Root mean squared logarithmic error between targets and predictions.

    Computes ``sqrt(mean((log(y_pred + 1) - log(y_true + 1)) ** 2))``.

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predicted values, same shape as ``y_true``.

    Returns
    -------
    float
        The RMSLE; ``0.0`` when predictions equal the targets. Values below
        ``-1`` in either input have no real logarithm and yield ``nan``.
    """
    log_pred = np.log1p(y_pred)
    log_true = np.log1p(y_true)
    # The error is the *difference* of the log terms; summing them would
    # report a large score even for a perfect prediction.
    squared_log_error = (log_pred - log_true) ** 2
    return np.sqrt(np.mean(squared_log_error))
# RMSLE over all held-out slices (truth first, predictions second).
RMSLE(
    np.concatenate(batch_y),
    np.concatenate(batch_predictions),
)

In [28]:
# Display the true adjusted_demand values from every slice as one flat array.
np.concatenate(batch_y)

array([ 30, 123,   3, ...,   2,   0,   2])

In [30]:
# Display the averaged predictions from every slice as one flat array.
# NOTE(review): the output saved for this cell is a single scalar, which is not
# what np.concatenate returns -- the recorded output looks stale; re-run to confirm.
np.concatenate(batch_predictions)

-389.30195358329598