## This is my first public notebook. It adds no new ideas, only explanations and steps meant to clear up common misunderstandings that beginners run into when working on this task.

## If you find this work useful, please support it by upvoting this notebook.


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the read-only Kaggle input directory
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Load the data.

In [None]:
# The raw `date` column is stored day-first (dd.mm.yyyy). Without dayfirst=True pandas
# reads ambiguous values such as 02.01.2013 month-first (1 February instead of 2 January),
# which scatters rows into the wrong months.
train = pd.read_csv(
    '../input/competitive-data-science-predict-future-sales/sales_train.csv',
    delimiter=',',
    parse_dates=['date'],  # convert the column from object to datetime while loading
    dayfirst=True,
)
# Chronological order; assignment instead of inplace keeps the cell safe to re-run
train = train.sort_values(by=['date'], ascending=True)
train

Let's look at the data in more detail: some of the values are impossible (negative daily sales counts), so let's replace them with zero.


In [None]:
# List the distinct negative values that occur in the daily sales counts
negative_sales = train['item_cnt_day'] < 0
train.loc[negative_sales, 'item_cnt_day'].unique()

In [None]:
# Set every negative daily count to zero. Clipping at 0 handles any negative value,
# not only the ones that were enumerated by hand, and leaves all other values untouched.
train["item_cnt_day"] = train["item_cnt_day"].clip(lower=0)

In [None]:
# Sanity check: rows that still have a negative daily count (an empty frame is expected)
train[train['item_cnt_day'] < 0]

Also, the 'date_block_num' column appears to be filled incorrectly: months 34 and 35 show up. Therefore, we will recompute it from the date and replace it with the correct values.

In [None]:
# Month index counted from January 2013 (0 = Jan 2013, 12 = Jan 2014, ...), derived from `date`.
# Plain year/month arithmetic avoids Series.view(), which is deprecated in recent pandas.
# NOTE: the result is only as good as the parsed dates -- the raw file stores them day-first
# (dd.mm.yyyy), so values above 33 here indicate that `date` was parsed month-first.
train['date_block_num'] = (train['date'].dt.year - 2013) * 12 + (train['date'].dt.month - 1)

Create a pivot table with fill_value = 0.

In [None]:
# One row per (shop_id, item_id) pair and one column per month holding the summed sales;
# months without any sales for a pair become 0.
# `values` is passed as a scalar so the month columns stay a flat Index: with a list the
# columns become a two-level MultiIndex, and merging that with the single-level `test`
# frame is rejected by pandas >= 2.0 ("Not allowed to merge between different levels").
dataset = pd.pivot_table(
    train,
    index=['shop_id', 'item_id'],
    columns='date_block_num',
    values='item_cnt_day',
    aggfunc='sum',
    fill_value=0,
).reset_index()
dataset

In [None]:
# Read the competition's test file
test_csv_path = '../input/competitive-data-science-predict-future-sales/test.csv'
test = pd.read_csv(test_csv_path, sep=',')
test

In [None]:
# Keep the ID column aside, then remove it from `test`
ID = test['ID']
test = test.drop(columns=['ID'])
test

The train set contains shop/item pairs that do not appear in the test set and, conversely, the test set contains pairs that are missing from train. So we build a common table containing exactly the rows required for the submission.

In [None]:
# Left join: keep every (shop_id, item_id) row of `test`, in test order;
# pairs that have no match in the pivot table get NaN in the month columns
dataset = test.merge(dataset, how='left', on=['item_id', 'shop_id'])
dataset

In [None]:
# Total number of missing cells in the merged table
dataset.isna().sum().sum()

The two simplest options for filling the gaps are zero and the column mean. The mean performed slightly better.

In [None]:
# Alternative (disabled): fill the gaps with 0 instead of the column mean,
# then re-count the remaining missing cells.
# dataset.fillna(0,inplace = True)
# dataset.isnull().sum().sum()

In [None]:
# Replace each missing value with the mean of its own column
column_means = dataset.mean()
dataset = dataset.fillna(column_means)
dataset

In [None]:
# Drop the key columns so that only the monthly sales columns remain
dataset = dataset.drop(columns=['shop_id', 'item_id'])
# Label the remaining columns '0', '1', ... by position. Generating the labels from the
# actual width replaces a hand-typed list of 34 names, which raises a length-mismatch
# error as soon as the number of month columns differs.
dataset.columns = [str(position) for position in range(dataset.shape[1])]
dataset

In [None]:
# Create train and test from the month columns:
#   X_train -> every column except the last one
#   y_train -> the last column only (kept two-dimensional)
#   X_test  -> every column except the first one (same width as X_train, shifted by one)
last_position = dataset.shape[1] - 1
X_train = dataset.iloc[:, :last_position]
y_train = dataset.iloc[:, last_position:]
X_test = dataset.iloc[:, 1:]



In [None]:
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor
from catboost import Pool
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from keras import optimizers
from keras.models import Sequential, Model
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.layers import Dense, LSTM, RepeatVector, TimeDistributed, Flatten, Dropout,BatchNormalization
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

You can use a random forest (RF) with these parameters, but it takes an extremely long time to train.

In [None]:
%%time
pipe_RF =  Pipeline([("RF", RandomForestRegressor(random_state = 42))])
params = {'RF__n_estimators': [1250],
          'RF__min_samples_split': [6],
          'RF__max_depth': [4],
          'RF__max_features': ['auto']},
RF = GridSearchCV(pipe_RF,params,scoring='neg_mean_squared_error',return_train_score=True,n_jobs=-1,cv = 5)
RF.fit(X_train,y_train)
print(f'Best_score = {RF.best_score_}')
print(f'Best_params = {RF.best_params_}')

## FC Model

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the training features: learn the per-column statistics, then apply them.
# The result is a NumPy array, so X_train is no longer a DataFrame after this cell.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_train

In [None]:
# Fully connected regressor: four hidden stages of Dense(relu) -> BatchNormalization -> Dropout(0.2)
# with 512, 256, 128 and 64 units, followed by a single linear output unit.
hidden_units = (512, 256, 128, 64)

model_FC = Sequential()
for stage, units in enumerate(hidden_units):
    if stage == 0:
        # only the first layer declares the input width
        model_FC.add(Dense(units, input_dim=X_train.shape[1], activation='relu'))
    else:
        model_FC.add(Dense(units, activation='relu'))
    model_FC.add(BatchNormalization())
    model_FC.add(Dropout(0.2))
model_FC.add(Dense(1, activation='linear'))

# Mean squared error loss, Adam optimizer with its default settings
model_FC.compile(optimizer='adam', loss='mean_squared_error')
model_FC.summary()

In [None]:
# Train the network for 30 epochs in mini-batches of 4096 rows
model_FC.fit(x=X_train, y=y_train, epochs=30, batch_size=4096, verbose=2)

In [None]:
# Scale the test features with the scaler that was already fitted on the training features.
# Fitting a second scaler on X_test would standardise the inputs with different
# means/variances than the ones the network was trained on.
# np.asarray drops the column labels: X_test holds the columns shifted by one position,
# so its labels do not match the ones seen during fit (scikit-learn may emit a harmless
# "does not have valid feature names" warning here).
X_test = scaler.transform(np.asarray(X_test))
X_test

In [None]:
# Reload the test file so that its ID column is available again for the submission
test = pd.read_csv(
    '../input/competitive-data-science-predict-future-sales/test.csv',
    sep=',',
)
test

In [None]:
# Create the submission: predict with the fully connected model and clip the
# predictions to the range [0, 20]
submission_pfs = model_FC.predict(X_test)
submission_pfs = submission_pfs.clip(0, 20)
# One row per test ID; ravel() flattens the (n, 1) prediction array into a 1-D column
submission = pd.DataFrame({'ID': test['ID'], 'item_cnt_month': submission_pfs.ravel()})
# Save as a CSV file. The explicit .csv extension makes the output recognisable as a
# CSV submission (the original name had no extension).
submission.to_csv('FC_or_another_model.csv', index=False)

## This model achieved a score of 1.079 (7300+ place)
