In [86]:
import pandas as pd
import numpy as np
from os import listdir
import matplotlib.pyplot as plt
import matplotlib.dates as dates
from matplotlib.ticker import IndexFormatter    ## better than dates.DateFormatter for pandas 
import seaborn as sns
from pandas.plotting import scatter_matrix
import statsmodels.api as sm
from statsmodels.api import tsa
from datetime import datetime as dt
%matplotlib inline

# List the input directory. The result is not the cell's last expression,
# so it is not displayed.
listdir('../input')

# Read each Datamart export CSV into its own frame.
df_whey = pd.read_csv(
    '../input/Datamart-Export_DY_WK100-Dry Whey Prices and Sales_20170829_122601.csv'
)
df_ched_500 = pd.read_csv(
    '../input/Datamart-Export_DY_WK100-500 Pound Barrel Cheddar Cheese Prices, Sales, and Moisture Content_20170829_122601.csv'
)
df_milk = pd.read_csv(
    '../input/Datamart-Export_DY_WK100-Nonfat Dry Milk Prices and Sales_20170829_122601.csv'
)
df_butter = pd.read_csv(
    '../input/Datamart-Export_DY_WK100-Butter Prices and Sales_20170829_122601.csv'
)
df_ched_40 = pd.read_csv(
    '../input/Datamart-Export_DY_WK100-40 Pound Block Cheddar Cheese Prices and Sales_20170829_122601.csv'
)

In [87]:
def cols_to_dt(frame, cols, format='%m/%d/%Y'):
    """Parse the given columns of ``frame`` as datetimes.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame whose columns are converted. It is modified in place.
    cols : str or iterable
        Column label(s) to convert. A single label may be passed as a bare
        string instead of a one-element list.
    format : str, default '%m/%d/%Y'
        Pattern passed to ``pandas.to_datetime`` as its ``format`` argument.

    Returns
    -------
    pandas.DataFrame
        The same ``frame`` object that was passed in, so the call can be
        used in an assignment.

    Errors raised by the column lookup or by ``pandas.to_datetime`` propagate
    unchanged.
    """
    # A bare string would otherwise be iterated character by character and
    # each character looked up as a column label.
    if isinstance(cols, str):
        cols = [cols]
    for col in cols:
        frame[col] = pd.to_datetime(frame[col], format=format)
    return frame
    


# Columns of the cheddar frame to parse with cols_to_dt's default format.
columns_to_transform = [
    'Week Ending Date',
    'Report Date',
]
df_ched_500 = cols_to_dt(frame=df_ched_500, cols=columns_to_transform)
df_ched_500.head()

In [88]:
def describe_df(frame):
    """Build a per-column summary table for ``frame``.

    The result has one row per column of ``frame`` and these columns, in
    order: ``dtype``, ``isnull`` (any missing value), ``null_rate`` (fraction
    missing), ``count_dist`` (number of distinct values, counting a missing
    value as one), then ``mean``, ``median``, ``std``, ``%25``, ``%50`` and
    ``%75``. The statistics are computed only on columns whose dtype is not
    ``object``; other rows are left missing for those statistics.
    """
    # Every statistic is computed on the same non-object subset.
    non_object = frame.select_dtypes(exclude=['object'])
    missing = pd.isnull(frame)

    info = {}
    info['dtype'] = frame.dtypes
    info['isnull'] = missing.any()
    info['null_rate'] = missing.mean()
    info['count_dist'] = frame.apply(lambda column: column.nunique(dropna=False))
    info['mean'] = non_object.mean()
    info['median'] = non_object.median()
    info['std'] = non_object.std()
    # Quartiles are stored under the labels '%25', '%50' and '%75'.
    for label, q in (('%25', 0.25), ('%50', 0.50), ('%75', 0.75)):
        info[label] = non_object.quantile(q)

    return pd.DataFrame(info, columns=list(info))

# Display the summary table ordered by the 'dtype' column.
describe_df(df_ched_500).sort_values(by='dtype')

In [89]:
# Date span used to re-index the 500 lb barrel cheddar frame.
start_date = dt(year=2012, month=3, day=26)
end_date = dt(year=2017, month=8, day=19)

# pd.date_range is the supported way to build a regular DatetimeIndex; the
# DatetimeIndex(start=..., end=..., freq=...) constructor form was deprecated
# and later removed from pandas.
dt_index_d = pd.date_range(start=start_date, end=end_date, freq='D')
dt_index_B = pd.date_range(start=start_date, end=end_date, freq='B')

# Label the rows with business days. The frame must have exactly
# len(dt_index_B) rows for this to succeed.
df_ched_500 = df_ched_500.set_index(dt_index_B)
# NOTE(review): the labels are attached before the rows are reversed, so the
# index runs from end_date back to start_date afterwards. Confirm this matches
# the row order of the CSV.
df_ched_500 = df_ched_500.iloc[::-1]

# All-zero single-column frame (column label 0) on the daily index. Its length
# must equal the daily index's length, so it is derived from dt_index_d. The
# outer merge uses it to add a row for every calendar day in the span.
index_dummy = pd.DataFrame([0] * len(dt_index_d), index=dt_index_d)
df_ched_500 = pd.merge(
    df_ched_500,
    index_dummy,
    how='outer',
    left_index=True,
    right_index=True,
)

# The date columns are no longer needed once the index carries the dates.
df_ched_500 = df_ched_500.drop(['Week Ending Date', 'Report Date', 'Date'], axis=1)
df_ched_500.head()

In [90]:
def _sales_to_int(val):
    """Convert one raw 'Sales' cell to an int.

    Strings have their ',' separators removed before conversion, missing
    values become 0, and values that are already numeric are kept rather
    than being zeroed.
    """
    if isinstance(val, str):
        return int(val.replace(',', ''))
    if pd.isnull(val):
        return 0
    return int(val)


df_ched_500['Sales'] = [_sales_to_int(val) for val in df_ched_500['Sales'].values]
# Remaining gaps become 0. The helper column labelled 0 is dropped if it is
# still present, so re-running this cell neither fails nor alters 'Sales'.
df_ched_500 = df_ched_500.fillna(0).drop(0, axis=1, errors='ignore')

In [91]:
# Inspect the first 20 rows after cleaning.
df_ched_500.head(n=20)

In [92]:
# Decompose every column present at this point and store the three components
# next to it as '<col>_Seasonal', '<col>_Trend' and '<col>_Resid'. The column
# list is snapshotted first because the loop adds columns to the frame.
for col in list(df_ched_500.columns):
    print(col)
    decompose = tsa.seasonal_decompose(df_ched_500[col])
    components = (
        ('Seasonal', decompose.seasonal),
        ('Trend', decompose.trend),
        ('Resid', decompose.resid),
    )
    for suffix, component in components:
        df_ched_500['{}_{}'.format(col, suffix)] = component

df_ched_500.head()

In [93]:
# Draw each column of the frame on its own 16x4 figure.
columns_to_draw = df_ched_500.columns

for col in columns_to_draw:
    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(16, 4))
    df_ched_500[[col]].plot(ax=ax)
    