# Readme

This is a faster version of the Python script that generates the mean/median
datasets. I think it produces the same results, although it writes its output
to a CSV file rather than a database. And it's much faster!

In [1]:
using SQLite
using DataArrays, DataFrames 
using DataStreams

    broadcast(Function, DataArrays.PooledDataArray...) at /Users/riri/.julia/v0.4/DataArrays/src/broadcast.jl:312
is ambiguous with: 
    broadcast(Any, NullableArrays.NullableArray...) at /Users/riri/.julia/v0.4/NullableArrays/src/broadcast.jl:100.
To fix, define 
    broadcast(Function)
before the new definition.


In [2]:
# Per-category demand statistics for a single column, together with the
# fallback values substituted for rows whose category has no entry in the
# per-category tables (see `classify_with_estimator`).
type CategoricalMeanEstimator
    col::Symbol                     # name of the column the statistics are grouped by
    cls_mean::DataFrames.DataFrame  # columns: `col` and :adjusted_demand_mean
    cls_med::DataFrames.DataFrame   # columns: `col` and :adjusted_demand_median
    global_mean::Float32            # fallback used when `cls_mean` has no matching row
    global_med::Float32             # fallback used when `cls_med` has no matching row
end

In [3]:
# Fit a CategoricalMeanEstimator for the column `name` of `frame`.
#
# Arguments:
#   frame - DataFrame containing the column `name` and :adjusted_demand
#   name  - Symbol naming the categorical column to group by
#
# Returns an estimator holding the per-category mean and median of
# :adjusted_demand, plus the overall mean and median of :adjusted_demand,
# which serve as fallbacks for categories that do not occur in `frame`.
function CategoricalMeanEstimator(frame, name)
    # Work on the `frame` argument (not on a notebook global) and aggregate
    # only the two columns that are needed, instead of every column.
    demand_by_class = frame[[name, :adjusted_demand]]
    demand = frame[:adjusted_demand]
    return CategoricalMeanEstimator(
        name,
        aggregate(demand_by_class, name, mean)[[name, :adjusted_demand_mean]],
        aggregate(demand_by_class, name, median)[[name, :adjusted_demand_median]],
        # The fallbacks replace missing adjusted_demand statistics, so they
        # must be computed from adjusted_demand, not from the category ids.
        mean(demand),
        median(demand)
    )
end

CategoricalMeanEstimator

In [4]:
# Look up the per-class statistic for every element of `labels`.
# `classes` and `stats` are parallel columns (class value, statistic);
# labels without a usable entry receive `fallback`.
function _class_stat_lookup(labels, classes, stats, fallback)
    table = Dict{Any,Float64}()
    for (cls, stat) in zip(classes, stats)
        # An NA statistic is treated like a missing class: use the fallback.
        if !isna(stat)
            table[cls] = stat
        end
    end
    return Float64[isna(label) ? fallback : get(table, label, fallback)
                   for label in labels]
end

# Compute the mean/median features of `estimator` for every row of `frame`.
#
# Arguments:
#   frame     - DataFrame containing the column `estimator.col`
#   estimator - a fitted CategoricalMeanEstimator
#
# Returns a DataFrame with two Float64 columns, `<col>_mean` and `<col>_med`,
# whose i-th row corresponds to the i-th row of `frame`. Rows whose category
# is unknown to the estimator get `global_mean` / `global_med`.
#
# A direct lookup is used instead of `join`: callers attach the result back
# to `frame` by position, and `join` does not guarantee that its result rows
# come back in the order of `frame`.
function classify_with_estimator(frame, estimator)
    col = estimator.col
    labels = frame[col]
    res = DataFrame()
    res[symbol(string(col, :_mean))] = _class_stat_lookup(
        labels,
        estimator.cls_mean[col],
        estimator.cls_mean[:adjusted_demand_mean],
        estimator.global_mean)
    res[symbol(string(col, :_med))] = _class_stat_lookup(
        labels,
        estimator.cls_med[col],
        estimator.cls_med[:adjusted_demand_median],
        estimator.global_med)
    return res
end

classify_with_estimator (generic function with 1 method)

In [5]:
# Open the SQLite database that holds the `data` table queried below.
db = SQLite.DB("/tmp/data.sqlite3")

SQLite.DB("/tmp/data.sqlite3")

In [6]:
# Load the rows used to fit the estimators: only rows with a non-null
# adjusted_demand and week_num < 8.
query = """
SELECT week_num,
               sales_depo,
               sales_channel,
               route_id,
               client_id,
               product_id,
               adjusted_demand,
               rand
          FROM data 
         WHERE adjusted_demand is not null 
               AND week_num < 8
"""
data = SQLite.query(db, query)
# Replace every column by its `.values` field.
# NOTE(review): presumably this unwraps the NullableArray columns returned by
# SQLite.query into plain arrays, dropping the null flags - confirm that no
# selected column can contain NULL.
for name in names(data)
    data[name] = data[name].values
end
# Show the first rows as a sanity check.
head(data)

Unnamed: 0,week_num,sales_depo,sales_channel,route_id,client_id,product_id,adjusted_demand,rand
1,3,1110,7,3301,198780,35651,23,0
2,3,1110,7,3301,886295,47336,3,0
3,3,1110,7,3301,1603500,1240,5,0
4,3,1110,7,3301,1914789,1240,5,0
5,3,1110,7,3302,50720,48077,10,0
6,3,1110,7,3302,664651,1216,6,0


In [None]:
# Fit one estimator for every column of `data` except the target
# (:adjusted_demand) and :rand.
frs = CategoricalMeanEstimator[]
for name in names(data)
    # Skip the columns that are not used as categories.
    (name == :adjusted_demand || name == :rand) && continue
    push!(frs, CategoricalMeanEstimator(data, name))
end

In [None]:
# Load the rows that the fitted estimators are applied to.
# NOTE(review): `limit 100` caps the result at 100 rows - this looks like a
# debugging limit; confirm before generating the full dataset.
# NOTE(review): the first column has no alias, so its name is whatever SQLite
# reports for the expression `coalesce(id, -1)` - consider adding `AS id`.
query = """
        SELECT coalesce(id, -1),
               week_num,
               sales_depo,
               sales_channel,
               route_id,
               client_id,
               product_id,
               adjusted_demand,
               rand
     FROM data limit 100
"""
test_data = SQLite.query(db, query)
# Replace every column by its `.values` field.
# NOTE(review): unlike the training query there is no NULL filter here; if
# `.values` drops null flags, NULL cells (e.g. adjusted_demand) would end up
# as arbitrary values - confirm.
for name in names(test_data)
    test_data[name] = test_data[name].values
end
# Show the first rows as a sanity check.
head(test_data)

In [None]:
# Apply every fitted estimator to `test_data` and attach the columns it
# produces to `test_data` under the same names.
for estimator in frs
    features = classify_with_estimator(test_data, estimator)
    for column in names(features)
        test_data[column] = features[column]
    end
end

In [None]:
# Write `test_data`, including the feature columns added to it, to a CSV file.
writetable("/tmp/all_data.csv", test_data)