# Demand prediction

In [175]:
import math

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

%matplotlib inline

Задача: прогнозирование спроса. Читаем входные данные из файлов.

In [176]:
# Load the training set, the test set and the submission template.
# NOTE: the files carry a .tsv extension but are parsed with read_csv's
# default separator.
train, test, sample_submission = (
    pd.read_csv(file_name)
    for file_name in ("train.tsv", "test.tsv", "sample_submission.tsv")
)

In [177]:
def correct_week(row):
    """Move a (year, week, shift) triple back by ``shift`` weeks, in place.

    The triple is addressed by position: element 0 is the year, element 1
    the week and element 2 the shift. The shift is subtracted from the week;
    if the result is not positive, the week wraps into the previous year.

    Parameters
    ----------
    row : pandas.Series or mutable sequence
        Three values in the order year, week, shift. It is modified in place.

    Returns
    -------
    The same ``row`` object, after modification.
    """
    # Plain ``row[1]`` on a label-indexed Series relies on pandas' deprecated
    # positional fallback (and is a label lookup on integer-labelled rows),
    # so Series are addressed through .iloc; lists/arrays are indexed directly.
    cells = row.iloc if hasattr(row, 'iloc') else row

    cells[1] = cells[1] - cells[2]

    if cells[1] <= 0:
        # Wrap into the previous year.
        # NOTE(review): assumes 52-week years and a shift below 52, while the
        # feature table also has a week_53 column -- confirm this is intended.
        cells[0] = cells[0] - 1
        cells[1] = cells[1] + 52
    return row

In [178]:
# Build the "leak" table from the test set alone:
#   1. move every test row back by its own shift (year/week corrected in place),
#   2. keep one row per (item_id, year, week) together with its f30 feature,
#   3. join those rows onto the unshifted test rows that share the same keys,
#   4. use 1.61 * f30 as the answer for the matched rows.
week_columns = ['year', 'week', 'shift']
leak_keys = ['item_id', 'year', 'week']

X_leak = test.copy()
X_leak[week_columns] = X_leak[week_columns].apply(correct_week, axis=1)
is_first_occurrence = ~X_leak.duplicated(leak_keys)
X_leak = X_leak[is_first_occurrence].get(leak_keys + ['f30'])

y_leak = test.copy().get(['Num'] + leak_keys)

merged = pd.merge(X_leak, y_leak, on=leak_keys)
merged['y'] = merged['f30'] * 1.61
merged = merged.sort_values('Num')

In [179]:
# Keep only the row identifier and the leaked answer; this frame is merged
# into the submission at the very end of the notebook.
leak = merged.get(['Num', 'y'])

Так получили ответы примерно для трети тестовых данных. При обучении leak дальше не используем: преобразуем данные и обучаемся честно, а слитые значения подставим только при формировании итогового файла.

Так как мы минимизируем SMAPE, которая ближе к MAE, чем к MSE, нужно преобразовать $y$ (ниже берём корень четвёртой степени), чтобы ошибка, минимизируемая при обучении, была ближе к целевой метрике.

In [180]:
# Fourth-root transform of the target (undone with ** 4.0 before submission).
# The column is overwritten, so re-running this cell applies the root again.
train['y'] = train['y'].map(lambda demand: demand ** 0.25)

Преобразуем категориальные признаки year, week и shift в бинарные с помощью pd.get_dummies. Обучаемся на всей выборке.

In [181]:
frac = 1 # fraction of learning examples used for model fitting

train = train.sample(frac=frac, random_state=42)
# Remember where the training rows end so the combined frame can be split
# back correctly for any `frac` (previously a hard-coded 72457).
n_train = len(train)

print(train.shape)
print(test.shape)

# Stack train and test so that get_dummies produces the same set of indicator
# columns for both. Train rows come first; the test rows get NaN in 'y'.
full = pd.concat([train, test])

print(full.shape)

full = pd.get_dummies(full, prefix=['year', 'week', 'shift'], columns=['year', 'week', 'shift'])

# Split back by position: the first n_train rows are the training sample.
train = full[:n_train]
test = full[n_train:]

test = test.drop(['y'], axis=1)

print(train.shape)
print(test.shape)

# Feature matrix and (already transformed) target for model fitting.
X = train.drop(['Num', 'y'], axis=1)
y = train['y']
print(len(X), len(y))

(72457, 66)
(2016, 65)
(74473, 66)
(72457, 123)
(2016, 122)
72457 72457


Посчитаем матрицу корреляции и найдём сильно коррелирующие признаки (корреляция выше 0.99). Само удаление этих признаков в следующей ячейке закомментировано.

In [182]:
# Pairwise correlations between all feature columns (pandas default method).
correlation_matrix = X.corr()
correlation1, correlation2 = [], []

# Visit every pair of columns whose correlation exceeds 0.99, once per pair
# (first < second by column name). The smaller name goes to correlation1 and
# the larger to correlation2, each list keeping first-seen order.
for first in correlation_matrix.columns:
    strongly_correlated = correlation_matrix.index[correlation_matrix[first] > 0.99]
    for second in strongly_correlated:
        if not first < second:
            continue
        if first not in correlation1:
            correlation1.append(first)
        if second not in correlation2:
            correlation2.append(second)

# Each flagged pair has one member in either list, so dropping a whole list
# breaks every pair; choose the shorter one (correlation1 on a tie).
columns_to_remove = correlation1 if len(correlation1) <= len(correlation2) else correlation2

print(columns_to_remove)

['f1', 'f10', 'f11', 'f12', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'f2', 'f20', 'f21', 'f22', 'f23', 'f24', 'f25', 'f26', 'f27', 'f28', 'f29', 'f3', 'f30', 'f34', 'f35', 'f36', 'f37', 'f38', 'f39']


In [183]:
#X = X.drop(columns_to_remove, axis=1)
#test = test.drop(columns_to_remove, axis=1)

Нормировка значений (StandardScaler) в ячейке ниже закомментирована, поэтому признаки остаются в исходном масштабе. Итоговые признаки получились такими:

In [184]:
# Show the first five rows of the feature matrix (head() defaults to 5 rows).
print(X.head())
# Standardisation is currently switched off:
#X = preprocessing.StandardScaler().fit_transform(X, y)

             f1       f10      f11       f12       f13       f14      f15  \
44283  129441.0  150970.0  72710.0  110530.0  128170.0  139070.0  91350.0   
50871    4162.0    3270.0   8200.0    7580.0    9066.0    6906.0   8184.0   
13810   24931.0   29780.0  31855.0   54106.0   18690.0   16835.0  27255.0   
10062   11505.0    7720.0  11150.0   11370.0   15980.0    7990.0  12120.0   
37186       0.0       0.0      0.0       0.0       0.0       0.0      0.0   

            f16       f17       f18   ...     week_47  week_48  week_49  \
44283  116210.0  129225.0  162615.0   ...           0        0        0   
50871    9144.0    9404.0    7075.0   ...           0        0        0   
13810   25706.0   24705.0   24015.0   ...           0        0        0   
10062   12370.0   21010.0    5730.0   ...           0        0        0   
37186       0.0       0.0       0.0   ...           0        0        0   

       week_50  week_51  week_52  week_53  shift_1  shift_2  shift_3  
44283        0 

Запустим алгоритм Random Forest Regressor. Gradient Boosting Regressor показал результаты хуже, чем Random Forest.

In [185]:
%%time
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

model_rf = RandomForestRegressor(n_estimators=300, random_state=43)
model_rf.fit(X, y)

preds = model_rf.predict(test.drop(['Num'], axis=1))

print len(preds)
print len(sample_submission)

2016
2016
CPU times: user 36min 21s, sys: 4.92 s, total: 36min 26s
Wall time: 38min 32s


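Оценим результат кросс-валидацией. Параметр scoring не задан, поэтому cross_val_score использует метрику по умолчанию для регрессора ($R^2$), а не SMAPE.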

In [38]:
%%time
score = cross_val_score(model_rf, X, y, cv=3).mean()

KeyboardInterrupt: 

In [53]:
# Mean of the per-fold cross-validation scores computed in the cell above.
print(score)

0.909940112657


Наконец, объединяем со слитыми данными и записываем результат в файл.

In [186]:
# Put the model predictions into the submission frame. They are still on the
# transformed (fourth-root) target scale at this point.
sample_submission['y'] = preds

In [187]:
# Undo the fourth-root transform that was applied to the training target.
# The column is overwritten, so this cell must run exactly once per prediction.
sample_submission['y'] = sample_submission['y'].map(lambda root: root ** 4.0)

In [188]:
# Display the first five rows of the submission (head() defaults to 5 rows).
sample_submission.head()

Unnamed: 0,Num,y
0,348622,2279.456667
1,348623,17417.236667
2,348624,170581.586667
3,348625,19187.906667
4,348626,99.716667


In [189]:
# In GBM you can get some negative predictions:
# list them here (an empty frame means there are none).
print(sample_submission[sample_submission['y'] < 0])

Empty DataFrame
Columns: [Num, y]
Index: []


In [190]:
# Replace every prediction that is not strictly positive with 0.0.
sample_submission['y'] = sample_submission['y'].map(lambda pred: 0.0 if not pred > 0 else pred)

In [191]:
# Stack the model predictions and the leaked answers. DataFrame.append was
# removed in pandas 2.0; pd.concat is the equivalent that works everywhere.
output = pd.concat([sample_submission, leak])
# The leak rows come last, so keep='last' lets a leaked answer override the
# model prediction for the same Num.
output = output.drop_duplicates(subset=['Num'], keep='last')
output = output.sort_values('Num')

In [192]:
# Write the final submission without the index column.
# NOTE: the file has a .tsv extension but is written comma-separated, the
# same way the input files are read.
output.to_csv("baseline_submission3.tsv", sep=',', index=False)