In [None]:
# Print the full path of every file found under the read-only "../input/" directory
import os
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [None]:
# scientific computing libraries
import pandas as pd                                      
import numpy as np                    
from scipy import optimize, stats        

# visualisation libraries
import matplotlib.pyplot as plt                      
import seaborn as sns                

# algorithmic library
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from lightgbm import LGBMRegressor

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.utils import resample
from sklearn.pipeline import Pipeline

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Flatten, Dropout
from tensorflow.keras.layers import LeakyReLU, PReLU, ELU
from keras.utils import np_utils

## Data Acquisition

In [None]:
# Load sales_train.csv, report its (rows, columns) shape and preview the first rows.
train_csv = '/kaggle/input/competitive-data-science-predict-future-sales/sales_train.csv'
df_train = pd.read_csv(train_csv)
print(df_train.shape)
df_train.head(n=5)

In [None]:
# Load shops.csv, report its (rows, columns) shape and preview the first rows.
shops_csv = '/kaggle/input/competitive-data-science-predict-future-sales/shops.csv'
df_shops = pd.read_csv(shops_csv)
print(df_shops.shape)
df_shops.head(n=5)

In [None]:
# Load items.csv, report its (rows, columns) shape and preview the first rows.
items_csv = '/kaggle/input/competitive-data-science-predict-future-sales/items.csv'
df_items = pd.read_csv(items_csv)
print(df_items.shape)
df_items.head(n=5)

In [None]:
# Load item_categories.csv, report its (rows, columns) shape and preview the first rows.
item_categories_csv = '/kaggle/input/competitive-data-science-predict-future-sales/item_categories.csv'
df_item_categories = pd.read_csv(item_categories_csv)
print(df_item_categories.shape)
df_item_categories.head(n=5)

In [None]:
# Load test.csv, report its (rows, columns) shape and preview the first rows.
test_csv = '/kaggle/input/competitive-data-science-predict-future-sales/test.csv'
df_test = pd.read_csv(test_csv)
print(df_test.shape)
df_test.head(n=5)

## Data Preprocessing

In [None]:
# Preview the first five rows of the training frame before preprocessing.
df_train.head(n=5)

In [None]:
# Show the dtype of every column in the training frame.
df_train.dtypes

In [None]:
# Count the missing values in each column of the training frame.
df_train.isna().sum()

In [None]:
# Discard the two columns that the monthly aggregation below does not use.
df_train = df_train.drop(columns=['date_block_num', 'item_price'])

In [None]:
# Parse the day-first date strings, then collapse every date to its 'YYYY-MM'
# month label. The vectorised .dt accessor does the formatting in one pass
# instead of calling a Python lambda once per row.
df_train['date'] = pd.to_datetime(df_train['date'], dayfirst=True).dt.strftime('%Y-%m')
df_train.head()

In [None]:
# Group df_train by month label, then shop_id, then item_id, and sum the
# remaining column(s) within each group; the result is ordered by these keys.
group_keys = ['date', 'shop_id', 'item_id']
df = df_train.groupby(group_keys).sum()
df.head(n=5)


In [None]:
# Reshape to wide form: one row per (shop_id, item_id) pair and one column per
# month label holding item_cnt_day, with 0 where a pair has no entry for that
# month. The index levels are then turned back into ordinary columns.
df = df.pivot_table(
    values='item_cnt_day',
    index=['shop_id', 'item_id'],
    columns='date',
    fill_value=0,
).reset_index()
df.head(n=5)

In [None]:
# Left-join the wide monthly table onto the test rows by ['shop_id', 'item_id'],
# then discard the 'ID' and '2013-01' columns.
df_test = (
    df_test
    .merge(df, how='left', on=['shop_id', 'item_id'])
    .drop(columns=['ID', '2013-01'])
)
df_test.head(n=5)

In [None]:
# Test rows without a match in the left join have NaN in every month column;
# replace those gaps with 0.
df_test = df_test.fillna(value=0)
df_test.head(n=5)

### We now have the trainable dataset `df_test`: each row is a sample, and the columns are the monthly sales for 2013–2015

## Model Development

In [None]:
# Pull the '2015-10' column of df_test out as a NumPy array and show its shape.
Y_train_1 = df_test.loc[:, '2015-10'].to_numpy()
print(Y_train_1.shape)

In [None]:
# Train on the original wide table `df` rather than on `df_test`:
# the '2015-10' column is the target and every other column is a feature.
Y_train = df['2015-10'].values
X_train = df.drop(['2015-10'], axis = 1)

# df_test holds the same kind of columns shifted one month forward ('2013-01'
# was dropped, '2015-10' kept), so its column labels differ from X_train's.
# Relabel them positionally so estimators fitted on X_train accept X_test:
# recent scikit-learn releases reject mismatched feature names in predict().
# Assigning the labels raises ValueError if the column counts ever differ.
X_test = df_test.copy()
X_test.columns = X_train.columns

print(X_train.shape, Y_train.shape)
print(X_test.shape)

In [None]:
# Disabled experiment: the code sits inside a bare string literal, so it is
# never executed. It would standardise X_train and X_test before fitting.
# NOTE(review): as written it fits a separate StandardScaler on X_test; if this
# is re-enabled, the scaler fitted on X_train should presumably be reused.
'''X_train = StandardScaler().fit(X_train).transform(X_train)
X_test = StandardScaler().fit(X_test).transform(X_test)'''

In [None]:
# 将训练样本分成训练集和测试集
x_train, x_test, y_train, y_test = train_test_split( X_train, Y_train, test_size=0.2, random_state=4)
print ('Train set:', x_train.shape,  y_train.shape)
print ('Test set:', x_test.shape,  y_test.shape)

In [None]:
# Baseline model: linear regression fitted on the training split.
LR = LinearRegression()
LR.fit(x_train,y_train)

# Mean squared error on the data the model saw and on the held-out split.
print('Train set mse:', mean_squared_error(y_train, LR.predict(x_train)))
print('Test set mse:', mean_squared_error(y_test, LR.predict(x_test)))
# score() is evaluated on the held-out split so the figure matches its
# 'Test set' label.
print('Test set score:', LR.score(x_test,y_test))

In [None]:
# Random forest with 100 trees fitted on the training split.
# random_state is fixed so that repeated runs give the same forest and metrics.
RFR = RandomForestRegressor(n_estimators = 100, random_state = 4)
RFR.fit(x_train,y_train)

# Mean squared error on the data the model saw and on the held-out split.
print('Train set mse:', mean_squared_error(y_train, RFR.predict(x_train)))
print('Test set mse:', mean_squared_error(y_test, RFR.predict(x_test)))
# score() is evaluated on the held-out split so the figure matches its
# 'Test set' label.
print('Test set score:', RFR.score(x_test,y_test))

In [None]:
# Disabled experiment: the code sits inside a bare string literal, so it is
# never executed. It would fit a GradientBoostingRegressor (50 estimators,
# depth 3) and print train/test MSE plus a score.
# NOTE(review): the line labelled 'Test set score' scores (x_train, y_train);
# switch it to the held-out split if this cell is re-enabled.
'''GBR = GradientBoostingRegressor(n_estimators = 50 , max_depth = 3)
GBR.fit(x_train,y_train)

print('Train set mse:', mean_squared_error(y_train, GBR.predict(x_train)))
print('Test set mse:', mean_squared_error(y_test, GBR.predict(x_test)))
print('Test set score:', GBR.score(x_train,y_train))'''

In [None]:
# Disabled experiment: the code sits inside a bare string literal, so it is
# never executed. It would fit an LGBMRegressor (200 estimators, learning rate
# 0.03, depth 8) and print train/test MSE plus a score.
# NOTE(review): the line labelled 'Test set score' scores (x_train, y_train);
# switch it to the held-out split if this cell is re-enabled.
'''model=LGBMRegressor(n_estimators=200, learning_rate=0.03, max_depth=8)
model.fit(x_train,y_train)

print('Train set mse:', mean_squared_error(y_train, model.predict(x_train)))
print('Test set mse:', mean_squared_error(y_test, model.predict(x_test)))
print('Test set score:', model.score(x_train,y_train))'''

In [None]:
# Run the fitted linear model over the X_test feature matrix.
prediction = LR.predict(X=X_test)

In [None]:
# Round every predicted value to the nearest integer.
prediction = [round(value) for value in prediction]
# Preview only the first few values; echoing the whole list would flood the
# cell output with one entry per test row.
prediction[:10]

## Submission

In [None]:
# Load sample_submission.csv as the template for the output file, report its
# (rows, columns) shape and preview the first rows.
submission_csv = '/kaggle/input/competitive-data-science-predict-future-sales/sample_submission.csv'
df_submission = pd.read_csv(submission_csv)
print(df_submission.shape)
df_submission.head(n=5)

In [None]:
# Put the predictions into the 'item_cnt_month' column of the template and
# write the result to prediction.csv without the row index.
df_submission = df_submission.assign(item_cnt_month=prediction)
df_submission.to_csv(path_or_buf='prediction.csv', index=False)
df_submission.head(n=5)