In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sys

# helper functions
sys.path.insert(0, "../src/lib")

import dataset as dataset_funcs
import cleaning as cleaning_funcs

# Render every float in DataFrame/Series output with exactly three decimals.
pd.set_option("display.float_format", "{:.3f}".format)

In [2]:
# Load the raw sales export (path is relative to the notebook's working directory).
sales_df = pd.read_csv('../data/raw/sales.csv')
# Parse the order dates with pd.to_datetime: casting to the unit-less
# "datetime64" dtype via astype() is rejected by pandas 2.x. Bracket
# assignment is used so the column is always written explicitly.
sales_df["DATE_ORDER"] = pd.to_datetime(sales_df["DATE_ORDER"])
# Per-row unit price derived from the row's revenue and ordered quantity.
sales_df["UNIT_PRICE"] = sales_df["REVENUE"] / sales_df["QTY_ORDER"]
# Preview the first rows as the cell output.
sales_df.head()

Unnamed: 0,PROD_ID,DATE_ORDER,QTY_ORDER,REVENUE,UNIT_PRICE
0,P6,2015-08-02,1.0,1808.99,1808.99
1,P6,2015-08-17,1.0,1674.0,1674.0
2,P6,2015-08-17,1.0,1673.95,1673.95
3,P6,2015-08-11,1.0,1674.0,1674.0
4,P6,2015-08-17,1.0,1674.0,1674.0


As mentioned in the **EXPLORATORY DATA ANALYSIS**, we will remove some bad data from our dataset (using helper functions) so that these errors do not propagate to the model.

In [3]:
# Run the project's cleaning helper and rebind the result to sales_df.
sales_df = cleaning_funcs.clean_sales_dataframe(sales_df)
# Last expression of the cell: display the cleaned frame.
sales_df

Unnamed: 0,PROD_ID,DATE_ORDER,QTY_ORDER,REVENUE,UNIT_PRICE
0,P6,2015-08-02,1.000,1808.990,1808.990
1,P6,2015-08-17,1.000,1674.000,1674.000
2,P6,2015-08-17,1.000,1673.950,1673.950
3,P6,2015-08-11,1.000,1674.000,1674.000
4,P6,2015-08-17,1.000,1674.000,1674.000
5,P6,2015-07-30,1.000,1697.340,1697.340
6,P6,2015-08-06,1.000,1676.990,1676.990
7,P6,2015-08-16,1.000,1674.000,1674.000
8,P6,2015-07-25,1.000,1608.670,1608.670
9,P6,2015-07-16,1.000,1697.680,1697.680


Let's split the data by product, because each product may have different dynamics.

In [4]:
# Group on the column label. Grouping by a one-element list and then calling
# get_group() with a scalar key is deprecated in recent pandas releases.
grouped = sales_df.groupby("PROD_ID")
# Unpack one frame per product, walking the product ids in explicit sorted
# order so the p1..p9 bindings do not depend on the ordering of the groups
# mapping. The unpacking requires exactly nine distinct PROD_ID values.
(p1, p2, p3, p4, p5, p6, p7, p8, p9) = [
    grouped.get_group(prod_id) for prod_id in sorted(grouped.groups)
]

In [5]:
# TODO: also aggregate rows that share the same DATE_ORDER timestamp.
#
# Not applied yet; the commented-out sketch below shows the intended aggregation.

# dataset7 = p7.groupby(p7.DATE_ORDER).agg({
#    "QTY_ORDER":np.sum,
#    "UNIT_PRICE": np.mean
#})

Let's use P7 for the first run because it is the product with the most available data.

In [6]:
# Order the P7 rows by their order date.
p7 = p7.sort_values(by="DATE_ORDER")

In [7]:
# Keep only the two columns this very simple model works with.
p7 = p7.loc[:, ["UNIT_PRICE", "QTY_ORDER"]]

In [8]:
# Build the model inputs from the two-column P7 frame using the project helper.
# NOTE(review): presumably X holds UNIT_PRICE and y holds QTY_ORDER; confirm
# against dataset.make_Xy_simple.
X, y = dataset_funcs.make_Xy_simple(p7)

In [9]:
# Display both shapes together to check that X and y have the same number of rows.
(X.shape, y.shape)

((2899, 1), (2899,))