In [None]:
import pandas as pd
import numpy as np
import datetime as dt

import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error
%matplotlib inline
import seaborn as sb

from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV



from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly import __version__
from plotly import graph_objs as go

## Формируем выборку временного ряда

In [None]:
# Load the raw sales table from a gzip-compressed pickle file.
# NOTE(review): unpickling can execute arbitrary code - only load this file
# if it comes from a trusted source.
df=pd.read_pickle('MG_Sales.pickle',compression='gzip')
# Disabled experiment: cast the date column to month resolution.
#df['Дата']=np.array(df['Дата'], dtype='datetime64[M]')

## Формируем выборку

In [None]:
# Boundaries of the sample: history starts at begin_period, the forecast
# horizon starts at prediction_period.
begin_period = dt.datetime(2014, 1, 1)
prediction_period = dt.datetime(2017, 1, 1)

# Earlier, now disabled, variants of the filter below also restricted the rows
# to one shop, to one price group ('3200-5000'), or to dates before
# prediction_period.

# Daily total of 'Количество' from begin_period onward, as a one-column frame
# with the index named 'ds' and the column named 'y'.
time_series = (
    df.loc[df['Дата'] >= begin_period]
    .groupby('Дата')['Количество']
    .sum()
    .to_frame()
)
time_series.index.name = 'ds'
time_series.columns = ['y']

# Observed values inside the forecast horizon.
y_test = time_series.loc[prediction_period:, 'y']

# Empty frame indexed by 141 consecutive days starting at prediction_period.
date_list = np.array(
    [prediction_period + dt.timedelta(days=offset) for offset in range(141)]
).astype('datetime64[D]')
time_series_forecast = pd.DataFrame(index=date_list)
time_series_forecast.index.name = 'ds'

## Формируем характеристики модели

In [None]:
# Flag abnormally low and abnormally high sales days.
ul = 5   # upper-tail width in percent (the original note also mentions 2)
ll = 5   # lower-tail width in percent (the original note also mentions 7 and 10)
md = 10  # half-width of the band around the median, in percentile points

ulim = np.percentile(time_series['y'], 100. - ul)
llim = np.percentile(time_series['y'], ll)
med = np.percentile(time_series['y'], [50 - md, 50 + md])

# 0: inside the median band, -1 / 1: below / above the band,
# -2 / 2: inside the lower / upper tail. Later assignments override earlier
# ones, so the tail flags win over the band flags.
time_series['Квантили'] = 0
time_series.loc[time_series['y'].lt(med[0]), 'Квантили'] = -1
time_series.loc[time_series['y'].gt(med[1]), 'Квантили'] = 1
time_series.loc[time_series['y'].lt(llim), 'Квантили'] = -2
time_series.loc[time_series['y'].gt(ulim), 'Квантили'] = 2

# Same calendar day a given number of years earlier.
def yearsago(years, from_date):
    """Return ``from_date`` moved back by ``years`` calendar years.

    When the plain year replacement is rejected with ``ValueError`` (as
    happens for 29 February when the target year has no such day), the
    result falls back to 28 February of the target year.
    """
    target_year = from_date.year - years
    try:
        moved = from_date.replace(year=target_year)
    except ValueError:
        moved = from_date.replace(month=2, day=28, year=target_year)
    return moved

# Calendar features
def setNewValues(time_series):
    """Add calendar feature columns to ``time_series`` in place and return it.

    The frame must be indexed by a ``DatetimeIndex``. Columns written:
    'День недели' (weekday, Monday=0), 'Неделя' (ISO week number), 'Год',
    'День месяца', 'День года', and 'Праздник', which is the per-date maximum
    of the 'Праздник' column of the module-level ``df`` aligned on the index
    (dates missing from ``df`` become NaN).
    """
    idx = time_series.index
    time_series['День недели'] = idx.weekday
    if hasattr(idx, 'isocalendar'):
        # DatetimeIndex.week was removed in pandas 2.0. isocalendar().week
        # carries the same ISO week numbers; convert it to a plain int64
        # array so the column keeps an ordinary integer dtype.
        time_series['Неделя'] = idx.isocalendar().week.astype('int64').to_numpy()
    else:
        # Old pandas without isocalendar(): keep the legacy attribute.
        time_series['Неделя'] = idx.week
    time_series['Год'] = idx.year
    time_series['День месяца'] = idx.day
    time_series['День года'] = idx.dayofyear
    time_series['Праздник'] = df.groupby('Дата')['Праздник'].max()
    return time_series

def weekseason(time_series):
    """Add the 'Недельная сезонность' column and return the same frame.

    Each value of 'День недели' is looked up in the module-level ``week_d``
    table and replaced by that weekday's 'Недельная сезонность' entry.
    A weekday missing from ``week_d`` raises ``KeyError``.
    """
    rank_by_weekday = week_d['Недельная сезонность']
    time_series['Недельная сезонность'] = time_series['День недели'].map(
        lambda weekday: rank_by_weekday.loc[weekday]
    )
    return time_series

# Add the calendar columns to the historical frame.
time_series=setNewValues(time_series)

# Weekly seasonality: total y per weekday over the rows whose 'Квантили' flag
# is 0 (anomalous days excluded), sorted ascending. The inserted column holds
# each weekday's position in that ordering.
week_d=pd.DataFrame(data=time_series[time_series['Квантили']==0].groupby('День недели')['y'].sum().sort_values())
week_d.insert(0,'Недельная сезонность',list(range(week_d.shape[0])))
# Weekdays that did not appear in the table above get the rank -1.
for i in list(set(range(7))-set(week_d.index.values)):
    week_d.loc[i,'Недельная сезонность']=-1
    
time_series=weekseason(time_series)

# Mean of y per (year, week), looked up row by row.
mean_dict=dict(time_series.groupby(['Год','Неделя'])['y'].mean())
time_series['Среднее по неделе']=time_series.apply(lambda row: mean_dict[row['Год'],row['Неделя']] , axis=1)
# Mean of y per day-of-year, looked up row by row (mean_dict is reused).
mean_dict=dict(time_series.groupby(['День года'])['y'].mean())
time_series['Среднее по дню года']=time_series.apply(lambda row: mean_dict[row['День года']] , axis=1)


# Split off the target variable; time_series keeps only the feature columns.
y=time_series.y
time_series.drop(['y'], axis=1, inplace=True)


# Prepare the forecast sample: calendar columns plus weekly seasonality.
time_series_forecast=weekseason(setNewValues(time_series_forecast))
st_day=time_series_forecast.iloc[0].name
first_day_past_year=yearsago(1, st_day)
# Period to take from the previous year: from the shifted first forecast day
# to 31 December of that year.
last_day_past_year=dt.datetime(first_day_past_year.year,12,31)
# NOTE(review): bare expression in the middle of the cell - its value is
# neither stored nor displayed.
time_series[first_day_past_year:last_day_past_year]

# Shift last year's values one year forward
def setTimebasedValues(time_series_forecast,time_series,cols,first_day_past_year):
    """Copy ``cols`` from last year's history onto the forecast frame.

    Rows of ``time_series`` from ``first_day_past_year`` to 31 December of
    the same year are moved to the same calendar day one year later and
    written into ``time_series_forecast`` (aligned on the index). The forecast
    frame is modified in place and returned.

    Leap-year handling:
    * source year is a leap year - 29 February lands on 28 February of the
      target year and the two colliding rows are averaged;
    * target year is a leap year - 29 February has no source day, so it
      reuses the values of 28 February.

    Parameters
    ----------
    time_series_forecast : DataFrame indexed by the forecast dates.
    time_series : DataFrame with a DatetimeIndex holding the history.
    cols : list of column names to transfer.
    first_day_past_year : first history day to use (datetime-like with ``.year``).
    """
    source_year = first_day_past_year.year
    # Derive the end of the source window from the argument rather than from
    # a notebook-level variable, so the function works on its own.
    source_end = dt.datetime(source_year, 12, 31)
    shifted = time_series.loc[first_day_past_year:source_end, cols].copy()

    # Exact one-year shift. DateOffset(years=1) clips 29 Feb to 28 Feb, which
    # creates a duplicate date only when the source year is a leap year.
    shifted.index = shifted.index + pd.DateOffset(years=1)
    if shifted.index.has_duplicates:
        shifted = shifted.groupby(level=0).mean()

    # A leap target year has a 29 Feb without a counterpart in the source.
    feb28 = pd.Timestamp(year=source_year + 1, month=2, day=28)
    if feb28.is_leap_year:
        feb29 = feb28 + pd.Timedelta(days=1)
        if feb28 in shifted.index and feb29 not in shifted.index:
            shifted.loc[feb29] = shifted.loc[feb28]
            shifted = shifted.sort_index()

    for col in cols:
        time_series_forecast[col] = shifted[col]
    return time_series_forecast
        
# Fill the forecast frame with last year's values of the history-based
# features, then order its columns exactly like the training frame.
time_series_forecast = setTimebasedValues(
    time_series_forecast,
    time_series,
    ['Квантили', 'Среднее по неделе', 'Среднее по дню года'],
    first_day_past_year,
)
time_series_forecast = time_series_forecast.loc[:, time_series.columns]

# The weekday ranking table is not used after this point.
del week_d

## Обучение и валидация

In [None]:
# Training window: everything strictly before next_d.
next_d = dt.datetime(2017, 1, 1)
last_d = next_d - dt.timedelta(seconds=1)
time_series_train = time_series.loc[:last_d]
y_train = y.loc[:last_d]

# Random forest; random_state makes the fit repeatable across runs.
rf = RandomForestRegressor(n_jobs=-1, random_state=42)
# Cross-validation folds that respect time order.
tss = TimeSeriesSplit(n_splits=3)
# Parameter distributions for the randomized search.
# 'min_impurity_split' was removed in scikit-learn 1.0; 'min_impurity_decrease'
# is its replacement. With thresholds this close to zero both settings leave
# the trees effectively unconstrained.
tuned_parameters = {
    'n_estimators': np.arange(30, 45, 1),
    'min_impurity_decrease': np.logspace(-30, -19, num=20, endpoint=False),
    'max_depth': np.arange(20, 30, 1),
}
clf_grid = RandomizedSearchCV(
    rf,
    tuned_parameters,
    cv=tss,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    verbose=1,
    random_state=42,  # fixes which parameter combinations are sampled
)
clf_grid.fit(time_series_train, y_train)
print(clf_grid.best_params_)
print(clf_grid.best_score_)

## Валидация на кросс-обучении

In [None]:
# Build `folds` evaluation periods walking backwards from the last date in
# time_series. The first period runs from the first day of the month before
# the last date up to the last date; each following one is the whole month
# before the previous period.
folds=4
last=time_series.iloc[-1].name
startDt=dt.datetime(last.year,last.month,last.day)
lastDay=dt.datetime(last.year,last.month,1)-dt.timedelta(seconds=1)
startmonth=dt.datetime(lastDay.year,lastDay.month,1)

pediods=[]
for i in range(folds):
    pediods.append([startmonth,startDt])
    startDt=startmonth-dt.timedelta(seconds=1)
    startmonth=dt.datetime(startDt.year,startDt.month,1)

mae=[]
mape=[]
# Oldest period first: train on everything before it, predict it from the
# forecast-style features, score against the observed target.
for begin,end in reversed(pediods):
    date_div_past=begin-dt.timedelta(days=1)

    time_series_train=time_series.loc[:date_div_past]
    time_series_test=time_series_forecast.loc[begin:end]

    y_train=y.loc[:date_div_past]
    y_test=y.loc[begin:end]

    # RandomForestRegressor has no fit_transform() in scikit-learn >= 0.19;
    # only the refit is needed here.
    clf_grid.best_estimator_.fit(time_series_train,y_train)
    prediction_test = pd.DataFrame(data=clf_grid.best_estimator_.predict(time_series_test),index=time_series_test.index)

    # Disabled experiment: scale the predictions by 0.75.
    #prediction_test*=0.75

    mae.append(np.mean(abs(y_test-prediction_test[0])))
    mape.append(np.mean(100-abs(100*(y_test-prediction_test[0])/y_test)))
    # NOTE(review): only the oldest period is evaluated. The plotting cell
    # reads prediction_test / y_test left by this iteration, so removing the
    # break changes what it shows.
    break


print("RandomForrest MAE: {} 100%-MAPE: {}%".format(round(np.mean(mae),2),round(np.mean(mape),2)))


In [None]:
init_notebook_mode(connected = True)


def _spline_trace(xs, ys, label, dashed=False):
    """Build a spline-smoothed line trace, optionally drawn dashed."""
    line_style = dict(shape='spline')
    if dashed:
        line_style['dash'] = 'dash'
    return go.Scatter(x=xs, y=ys, mode='lines', name=label, line=line_style)


def _box_trace(values, label):
    """Build a box trace showing all points plus mean and standard deviation."""
    return go.Box(y=values, name=label, boxmean='sd', boxpoints='all')


# Forecast for the evaluated period.
trace1 = _spline_trace(prediction_test.index, prediction_test[0], 'Прогноз')

# January 2016 moved forward by 366 days so it overlays the forecast dates.
ty = y[dt.datetime(2016,1,1):dt.datetime(2016,1,31)].shift(366,'D')
trace0 = _spline_trace(ty.index, ty, '2016 год', dashed=True)

# January 2015 moved forward by 366+365 days.
ty = y[dt.datetime(2015,1,1):dt.datetime(2015,1,31)].shift(366+365,'D')
trace5 = _spline_trace(ty.index, ty, '2015 год', dashed=True)

# Observed values for the evaluated period.
trace2 = _spline_trace(y_test.index, y_test, 'Этот год')

# Distribution of forecast vs. observed values.
trace3 = _box_trace(prediction_test[0], 'Mean & SD Прогноз')
trace4 = _box_trace(y_test, 'Mean & SD Данные')

fig = dict(data=[trace2, trace0, trace1, trace5])
iplot(fig, show_link=False)

fig = dict(data=[trace4, trace3])
iplot(fig, show_link=False)

In [None]:
# Feature importances of the fitted random forest, labelled by feature name
# and sorted from most to least important. The notebook trains no XGBoost
# model (neither `xgb` nor `bst` is defined anywhere), so the former
# xgb.plot_importance(bst) call could only raise NameError.
pd.Series(
    clf_grid.best_estimator_.feature_importances_,
    index=time_series.columns,
).sort_values(ascending=False)


In [None]:
# Total 'Количество' per 'ТоварЦеноваяГруппа' value, largest first.
df.groupby('ТоварЦеноваяГруппа')['Количество'].sum().sort_values(ascending=False)

In [None]:
# First 16 rows of the feature frame from 2016-01-01 onward.
time_series[dt.datetime(2016,1,1):].head(16)

In [None]:
# First rows of the target series from 2016-01-01 onward.
y[dt.datetime(2016,1,1):].head()

In [None]:
# The ll-th percentile of the target series.
np.percentile(y, ll)

In [None]:
# Current local date and time.
dt.datetime.now()

In [None]:
# Column names of the feature frame.
time_series.columns

In [None]:
# Raw feature-importance array of clf_grid.best_estimator_.
clf_grid.best_estimator_.feature_importances_