In [2]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [3]:
import sqlite3
import pandas
import pandas.io.sql
import tqdm
from sklearn import *
import ml_metrics



In [4]:
class CategoricalMeanEstimator:
    """Estimate a target from the per-category statistic of a single column.

    Despite the name, the statistic used is the *median* of the target
    within each category; categories not seen during ``fit`` fall back to
    the median of the whole target.
    """

    def __init__(self, col):
        """Remember which column of ``X`` holds the category labels."""
        self.col = col
        self.cls = None          # per-category table, filled in by fit()
        self.global_mean = None  # fallback value, filled in by fit()

    def fit(self, X, y):
        """Learn the per-category and overall medians of ``y``.

        Returns ``self`` so calls can be chained.
        """
        by_category = y.groupby(X[self.col])
        self.cls = by_category.median().to_frame('estimate')
        self.global_mean = y.median()
        return self

    def predict(self, X):
        """Return the learned estimate for each row of ``X`` as a Series."""
        keys = X[self.col].to_frame('col')
        # Left join keeps one output row per input row; unknown categories
        # come back as NaN and are replaced by the global fallback.
        joined = pandas.merge(
            keys, self.cls,
            how='left', left_on='col', right_index=True)
        return joined.estimate.fillna(self.global_mean)

In [5]:
class MultiCategoricalMeanEstimator:
    """Estimate a target from the median within each combination of columns.

    Multi-column counterpart of ``CategoricalMeanEstimator``: the target is
    grouped by every distinct combination of ``cols`` and the group median
    is used as the estimate. Combinations not seen during ``fit`` fall back
    to the median of the whole target.
    """

    def __init__(self, cols):
        """Remember the list of column names that form the group key."""
        self.cols = cols
        self.cls = None          # lookup table (key columns + 'estimate'), set by fit()
        self.global_mean = None  # fallback value, set by fit()

    def fit(self, X, y):
        """Learn the per-combination and overall medians of ``y``.

        Returns ``self`` so calls can be chained.
        """
        groups = [X[col] for col in self.cols]
        self.cls = y.groupby(groups).median().to_frame('estimate').reset_index()
        self.global_mean = y.median()
        return self

    def predict(self, X):
        """Return the learned estimate for each row of ``X``.

        The result is a Series named ``estimate`` that carries ``X``'s index,
        so it aligns with other columns derived from the same frame.
        """
        keys = X[self.cols]
        merged = pandas.merge(keys, self.cls, on=self.cols, how='left')
        # Only the estimate column needs the fallback; filling the whole
        # frame would also touch the key columns.
        estimate = merged['estimate'].fillna(self.global_mean)
        # A column-on-column merge discards the left index. The lookup table
        # has one row per key combination, so the left join yields exactly
        # one row per input row, in input order: restoring the index is safe.
        estimate.index = X.index
        return estimate

In [6]:
# Load the training rows (week_num < 8) from SQLite in chunks.
con = sqlite3.connect('/tmp/data.sqlite3')
total = 53364883  # only used to size the progress bar below
data = None
chunksize = int(5e6)
chunks = []
try:
    data_iter = pandas.read_sql('''
        SELECT week_num,
               sales_depo,
               sales_channel,
               route_id,
               client_id,
               product_id,
               adjusted_demand,
               rand
          FROM data 
         WHERE adjusted_demand is not null 
               AND week_num < 8''', con=con, chunksize=chunksize)
    for f in tqdm.tqdm(data_iter, total=1+total//chunksize):
        # This halves the memory use :(
        # NOTE(review): assumes every int64 value fits in int32 — confirm.
        for col in f:
            if f[col].dtype == np.int64:
                f[col] = f[col].astype(np.int32)
        chunks.append(f)
finally:
    con.close()

# Concatenate once instead of once per chunk: re-concatenating inside the
# loop copies every previously loaded row again on each iteration.
# ignore_index gives the combined frame a single unique index rather than
# repeating each chunk's own row labels.
if chunks:
    data = pandas.concat(chunks, ignore_index=True)
del chunks



In [7]:
# Build the training matrix: the target, one per-category estimate column
# for every admissible categorical column, and the `rand` column.
series = {'adjusted_demand': data.adjusted_demand}
admissible_cols = ['week_num', 'sales_depo', 'sales_channel', 'route_id', 'client_id', 'product_id']

# The fitted estimators are kept in a dict so the same encodings can be
# applied to other rows afterwards.
# NOTE(review): each estimator is fit and applied on the same rows, so a
# row's own target contributes to its feature value — confirm this is intended.
estimators = {}
for col in tqdm.tqdm(admissible_cols):
    est = CategoricalMeanEstimator(col)
    est.fit(data, data.adjusted_demand)
    estimators[col] = est
    series[col] = est.predict(data)

# `adjusted_demand` is already a column via `series`; only `rand` is added.
train_X = pandas.DataFrame(series)
train_X['rand'] = data.rand

# Drop the raw rows and the intermediate dict to free memory.
del series, data



In [8]:
# Create (or overwrite) the `train_data` table. Only the first rows of
# train_X (`head()`) are written here, which sets up the table's columns.
con = sqlite3.connect('/tmp/train_test_data.sqlite3')
try:
    train_X.head().to_sql('train_data', con=con, if_exists='replace')
finally:
    # Always release the connection, even if the write fails.
    con.close()

In [None]:
# Load the hold-out rows (week_num >= 8) first: the feature code further
# down needs `test_data`, so loading must happen before it.
con = sqlite3.connect('/tmp/data.sqlite3')
test_data = None
total = 20815581  # only used to size the progress bar below
test_chunks = []
try:
    data_iter = pandas.read_sql('''
        SELECT * 
          FROM data 
         WHERE adjusted_demand is not null 
               AND week_num >= 8''', con=con, chunksize=chunksize)
    for f in tqdm.tqdm(data_iter, total=1+total//chunksize):
        # Downcast 64-bit integer columns to 32-bit to reduce memory use.
        for col in f:
            if f[col].dtype == np.int64:
                f[col] = f[col].astype(np.int32)
        test_chunks.append(f)
finally:
    con.close()

# Concatenate once rather than once per chunk (avoids re-copying the rows
# loaded so far on every iteration) and give the result a unique index.
if test_chunks:
    test_data = pandas.concat(test_chunks, ignore_index=True)
del test_chunks

# Apply the estimators fitted on the training rows to the hold-out rows,
# mirroring how the training matrix is assembled.
test_series = {'adjusted_demand': test_data.adjusted_demand}
for col in tqdm.tqdm(admissible_cols):
    test_series[col] = estimators[col].predict(test_data)

# `adjusted_demand` is already a column via `test_series`; only `rand` is added.
test_X = pandas.DataFrame(test_series)
test_X['rand'] = test_data.rand

del test_series

In [None]:
# Persist the full hold-out feature matrix as the `test_data` table,
# replacing any existing table of that name.
con = sqlite3.connect('/tmp/train_test_data.sqlite3')
try:
    test_X.to_sql('test_data', con=con, if_exists='replace')
finally:
    # Always release the connection, even if the write fails.
    con.close()

In [None]:
# Fit a gradient-boosted regressor on every column of train_X except the
# target. Row subsampling (subsample=0.1) and feature subsampling
# (max_features='log2') are both random, so the seed is pinned to make
# repeated runs produce the same model.
# NOTE(review): `rand` is still among the features — confirm that is intended.
reg = ensemble.GradientBoostingRegressor(
    subsample=0.1,
    max_depth=8,
    warm_start=True,
    max_features='log2',
    verbose=2,
    n_estimators=20,
    random_state=0)
reg.fit(train_X.drop('adjusted_demand', axis=1), train_X.adjusted_demand)

In [None]:
# Clamp negative predictions to zero element-wise, which is the evident
# intent of the original `max(..., 0)`. That call is not an element-wise
# clamp: the built-in `max` raises when comparing an array with a scalar,
# and numpy's `max` would read the 0 as an axis and return a single number.
y_pred = np.maximum(reg.predict(test_X.drop('adjusted_demand', axis=1)), 0)

In [None]:
# Score the predictions against the hold-out target with ml_metrics.rmsle
# (presumably root mean squared logarithmic error — confirm against the
# ml_metrics documentation). As the last expression in the cell, the score
# is displayed as the cell output.
ml_metrics.rmsle(test_X.adjusted_demand, y_pred)