In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sys


# helper functions
sys.path.insert(0, "../src/lib")

import dataset as dataset_funcs
import cleaning as cleaning_funcs

# Render every float in DataFrame/Series output with exactly three decimals.
pd.set_option("display.float_format", "{:.3f}".format)

NameError: name 'dataset' is not defined

**Approach and feature extraction**

The first approach is very naïve, because sometimes really simple models give good results and are easy and cheap to build (also, you can give some kind of early information to decision-makers while you are still working on a more sophisticated model).

Using **just sales data**, transform the dataset into a new dataset where each pair (X_i, y_i) is the following:

X_i = 3 (or 4, or 5) prices for 3 (or 4, or 5) sequential days (for a single product)
y_i = price for the next day

So we learn a very simple model that estimates the next price (for a single product)

> note that we have two subapproaches: in the first one we train on each product type separately and in the second one we train on all products together. This is meant to investigate whether we can increase performance when using data from all products.

**Assumptions**

- the model is stationary, i.e. the target function does not change over time, in other words, the way the previous prices affect the next one does not change depending upon what month of the year we're in.

**Shortcomings**

- we do not use data from competitors

**Evaluation**

- squared error, since we're dealing with continuous data

In [None]:
# Load the raw sales export; DATE_ORDER, REVENUE and QTY_ORDER are the columns used below.
sales_df = pd.read_csv('../data/raw/sales.csv')

# Parse the order date into a real datetime column.
# pd.to_datetime is used instead of .astype("datetime64") because pandas 2.x
# rejects the unit-less "datetime64" dtype string; bracket assignment makes it
# explicit that an existing column is being replaced.
sales_df["DATE_ORDER"] = pd.to_datetime(sales_df["DATE_ORDER"])

# Per-row unit price derived from the row totals.
sales_df["UNIT_PRICE"] = sales_df["REVENUE"] / sales_df["QTY_ORDER"]
sales_df.head()

Right now the index is just a sequence of integers, but pandas lets us declare that each index entry refers to a specific Period in time (in this case, a day).

> note that using a PeriodIndex is different from just using a DatetimeIndex because then pandas would just think you want to index the data by a particular point in time, rather than by the whole day, as is the case here.

In [None]:
# Disabled, kept for reference: builds one pd.Period per DATE_ORDER value, wraps
# them in a PeriodIndex, sets that as the frame's index and drops DATE_ORDER.
# NOTE: the later cells group by the DATE_ORDER column, so enabling this as
# written would remove a column they rely on.
# periods = list(map(lambda dt: pd.Period(dt),sales_df["DATE_ORDER"]))
# idx = pd.PeriodIndex(periods)
# sales_df=sales_df.set_index(idx).drop(["DATE_ORDER"],axis=1)

As mentioned in the **EXPLORATORY DATA ANALYSIS**, we will remove some bad data from our dataset (using helper functions) to avoid propagating these errors to the model.

In [None]:
# Replace the raw frame with the output of the project's cleaning helper,
# then show the result as the cell output.
sales_df = cleaning_funcs.clean_sales_dataframe(sales_df)
sales_df

Here we group by the product ID and the date the price was sampled.  

In [None]:
# Collapse the sales rows to one row per (product, day): total quantity,
# total revenue, and the plain (unweighted) mean of the per-row unit prices.
# String reducer names replace np.sum / np.mean: recent pandas emits a
# FutureWarning when NumPy callables are handed to agg, and the strings select
# the same built-in group reductions. Grouping by column name keeps the keys
# as regular columns in the result (as_index=False).
grouped_1 = sales_df.groupby(["PROD_ID", "DATE_ORDER"], as_index=False).agg({
    "QTY_ORDER": "sum",
    "REVENUE": "sum",
    "UNIT_PRICE": "mean",
})
# Random peek at the aggregated rows (unseeded, so the rows differ per run).
grouped_1.sample(10)


# len(sales_df[sales_df.DATE_ORDER < '2015-10-14 00:00:00'])
# len(sales_df[])


> note that the first approach treats each product individually

In [None]:
# Group the sales rows by product so each product can be handled on its own.
g1 = sales_df.groupby([sales_df["PROD_ID"]])

In [None]:
# One sub-frame per product.
g1 = sales_df.groupby([sales_df.PROD_ID])

# Iterate the groupby instead of calling get_group(key) with keys taken from
# .groups: for a length-1 list grouper the expected key shape (scalar vs.
# 1-tuple) differs between pandas versions, whereas iteration always yields the
# frames in the same sorted-key order. The unpack still requires exactly nine
# products and raises ValueError otherwise.
(p1, p2, p3, p4, p5, p6, p7, p8, p9) = [product_sales for _, product_sales in g1]

# Daily series for the seventh product in sorted PROD_ID order
# (presumably product "P7" -- verify against the PROD_ID values).
# String reducers replace np.sum / np.mean, which recent pandas warns about in agg.
dataset7 = p7.groupby("DATE_ORDER").agg({
    "QTY_ORDER": "sum",
    "UNIT_PRICE": "mean",
})
dataset7.sample(5)

## now that we have selected data only for P7, let's train a simple model on it

> remember, we'll use UNIT_PRICE and QTY_ORDER for the N=3 previous days to try to model the price on the next (4th) day.

Note that we have **aggregated**, i.e. combined all rows for a given day (in the original dataset, there were multiple entries for the same product *and* the same day so we aggregated those).

In [None]:
# Put the daily rows in chronological order and turn the DATE_ORDER index into
# a regular column.
# The guard plus non-inplace calls make the cell safe to re-run: the original
# in-place reset_index added a spurious "index" column on a second execution,
# shifting the positional columns that later cells read.
if "DATE_ORDER" not in dataset7.columns:
    dataset7 = dataset7.sort_index().reset_index()

# Show only the rows whose order date falls in May (month == 5).
dataset7[dataset7["DATE_ORDER"].dt.month == 5]

In [None]:
# Print the value at positional column 2 of every row, one per line.
column_position = 2
for record in dataset7.values:
    print(record[column_position])

In [None]:
import dataset as dataset_funcs

# Index label of the first row whose first column equals 2015-06-30.
# pd.Timestamp replaces pd.tslib.Timestamp: the pd.tslib namespace no longer
# exists in modern pandas, so the old spelling raises AttributeError.
# Raises IndexError if no row matches that date.
dataset7[dataset7.iloc[:, 0] == pd.Timestamp('2015-06-30 00:00:00')].index.tolist()[0]

# type(dataset7.iloc[0])
# type(dataset7.iloc[:,0])

In [2]:
# Scratch: small 1-D integer array holding 1..4.
arr = np.array(range(1, 5))

In [3]:
# Scratch: two-row nested list.
lst = [list(range(2, 5)), list(range(4, 7))]

In [4]:
# Scratch: the nested list becomes a 2-D array.
np.asarray(lst)

array([[2, 3, 4],
       [4, 5, 6]])

In [5]:
# Scratch: wrapping the nested list in another list adds one nesting level.
lst1 = [lst]
lst1

[[[2, 3, 4], [4, 5, 6]]]

In [6]:
# Scratch: convert the integer array back to a plain Python list.
[int(element) for element in arr]

[1, 2, 3, 4]

In [8]:
# Scratch: every element except the last one.
arr[:len(arr) - 1]

array([1, 2, 3])

In [11]:
# Scratch: integer (floor) division of 5 by 2.
divmod(5, 2)[0]

2