**Goal**:   
Predict total sales for every product and store in the next month.   
**Data**:   
Time-series dataset consisting of daily sales data provided by the Russian software firm 1C Company.    
   
   **Model**:   
   Ridge Regression for future monthly item count.   


In [None]:
# Python 3 environment with analytics libraries installed,
# as defined by the kaggle/python Docker image

import numpy as np 
import pandas as pd 

import os
# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for path in (os.path.join(dirname, entry) for entry in filenames):
        print(path)

In [None]:
#more libraries

#visualization
import matplotlib.pyplot as plt 
%matplotlib inline
import seaborn as sns 

# ignore error messages
import warnings
warnings.filterwarnings("ignore")


In [None]:
# Single place to change when the competition data lives somewhere else;
# the six CSV paths below resolve to exactly the same locations as before.
DATA_DIR = '../input/competitive-data-science-predict-future-sales'

# Daily sales history and the (shop, item) pairs to predict.
train = pd.read_csv(f'{DATA_DIR}/sales_train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')

# Template of the expected submission file.
submission = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

# Lookup tables for items, item categories and shops.
items = pd.read_csv(f'{DATA_DIR}/items.csv')
item_cats = pd.read_csv(f'{DATA_DIR}/item_categories.csv')
shops = pd.read_csv(f'{DATA_DIR}/shops.csv')

In [None]:
# Check the training data for NaNs: per-column missing counts are summed
# into one grand total for the whole frame.
train.isna().sum().sum()

In [None]:
# Convert the sale date from text to datetime64.
# The competition's sales_train.csv writes dates day-first (dd.mm.yyyy)
# -- NOTE(review): confirm against the raw file. Without an explicit format
# pandas guesses month-first, so "02.01.2013" becomes 1 February (older
# pandas) or the conversion raises on the first day > 12 (pandas >= 2).
# Every date-derived feature below (dayofyear / year / month) depends on this.
train['date'] = pd.to_datetime(train['date'], format='%d.%m.%Y')

In [None]:
# Derive calendar features from the parsed sale date:
# ordinal day of the year, calendar year and calendar month.
train['dayofyear'] = train['date'].dt.dayofyear
train['year'] = train['date'].dt.year
train['month'] = train['date'].dt.month

In [None]:
# Look for possible outliers: box plots of the raw training frame's columns,
# with the x tick labels rotated 45 degrees so the column names stay readable.

boxplot =train.boxplot( rot=45)

In [None]:
# Remove price outliers: keep only the rows whose item_price is below 40000.
df1 = train.loc[train['item_price'].lt(40000)]

# Verify how many rows / columns remain after the filter.
df1.shape

In [None]:
# Correlation heat-map of the numeric columns only. df1 still carries the
# datetime `date` column, which older pandas silently dropped from corr()
# but pandas >= 2 refuses to correlate; selecting numeric dtypes explicitly
# gives the same table on every version.
df1.select_dtypes(include='number').corr().style.background_gradient(cmap='mako')

In [None]:
# Pie chart of how many sales records fall in each calendar month
# (all years pooled), one fixed colour per month.
colors = ('steelblue', 'pink', 'lightseagreen', 'darkorange', 'gray', 'purple',
          'peru', 'crimson', 'navy', 'forestgreen', 'tan', 'yellow')
records_per_month = df1.groupby('month')['month'].count()
records_per_month.plot(
    kind='pie',
    title='Group-By Month',
    colors=colors,
    figsize=(7,7),
    autopct='%0.1f',
)

**December** seems to be the most active month for sales.   

In [None]:
# Working frame for the analysis: every df1 column listed here is kept,
# which leaves out the raw `date` column.
analysis_columns = [
    'date_block_num', 'shop_id', 'item_id', 'item_price',
    'item_cnt_day', 'dayofyear', 'year', 'month',
]
df2 = df1[analysis_columns]

In [None]:
# Number of sales records per calendar year.
df2['year'].value_counts()

In [None]:
# Look for possible outliers in a single column: box plot of the daily
# item count only.

boxplot4 =df2.boxplot(column=['item_cnt_day'])

In [None]:
# Remove count outliers: keep only the rows with a daily item count below 1000.
df2 = df2.loc[df2['item_cnt_day'].lt(1000)]

# Verify how many rows / columns remain after the filter.
df2.shape

In [None]:
# One frame per calendar year, each a row filter of df2.
df2013, df2014, df2015 = (df2[df2['year'] == yr] for yr in (2013, 2014, 2015))

In [None]:
# Pairwise column correlations for the 2013 rows, shaded by strength.
df2013.corr().style.background_gradient(cmap='GnBu')

In [None]:
# Total units sold in each calendar month of 2013, as a line plot.
ax = df2013.groupby('month')['item_cnt_day'].sum().plot(figsize=(12, 5), color='peru')
ax.set_ylabel('Item count')
ax.set_title(" 2013 Item count per day, per Month", fontsize=18)

In [None]:
# Pairwise column correlations for the 2014 rows, shaded by strength.
df2014.corr().style.background_gradient(cmap='Spectral')

In [None]:
# Total units sold in each calendar month of 2014, as a line plot.
ax = df2014.groupby('month')['item_cnt_day'].sum().plot(figsize=(12, 5), color='darkgreen')
ax.set_ylabel('Item count')
ax.set_title("2014 Item count per day, per Month", fontsize=18)

In [None]:
# Pairwise column correlations for the 2015 rows, shaded by strength.
df2015.corr().style.background_gradient(cmap='cool')

In [None]:
# Total units sold in each calendar month of 2015, as a line plot.
ax = df2015.groupby('month')['item_cnt_day'].sum().plot(figsize=(12, 5), color='darkorange')
ax.set_ylabel('Item count')
ax.set_title("2015 Item count per day, per Month", fontsize=18)

In [None]:
# One frame per calendar month (January .. December), each a row filter of
# df2 pooling all years.
(dfjan, dffeb, dfmar, dfapl, dfmay, dfjun,
 dfjul, dfaug, dfsep, dfoct, dfnov, dfdec) = (
    df2[df2['month'] == month_no] for month_no in range(1, 13)
)

In [None]:
# Daily item counts of the January rows, one category per year.
sns.catplot(x='year', y='item_cnt_day', data=dfjan)

In [None]:
# Split the January rows into one frame per year.
dfjan2013, dfjan2014, dfjan2015 = (
    dfjan[dfjan['year'] == yr] for yr in (2013, 2014, 2015)
)

In [None]:
# Inspect the January 2013 rows.
dfjan2013

In [None]:
# Number of sales records (rows) on each day of January 2013. This counts
# rows per day-of-year value; it does not sum the units sold.
itemPerDayJan2013=dfjan2013.dayofyear.value_counts()

In [None]:
# Total units sold across all shops and items in January 2013.
item_cnt_monthjan2013 = dfjan2013['item_cnt_day'].sum()
item_cnt_monthjan2013

In [None]:
# Total units sold across all shops and items in January 2014.
item_cnt_month_jan2014 = dfjan2014['item_cnt_day'].sum()
item_cnt_month_jan2014

In [None]:
# Total units sold across all shops and items in January 2015.
item_cnt_month_jan2015 = dfjan2015['item_cnt_day'].sum()
item_cnt_month_jan2015

In [None]:
# Create an identifier column holding each row's index label.
df2 = df2.assign(index1=df2.index)

In [None]:
# Give the identifier column its final name.
df2 = df2.rename(columns={"index1": "ID"})

In [None]:
# Training frame with ID first, followed by the raw and calendar features.
# An explicit copy is taken because later cells add and overwrite columns on
# MyTrainDF (item_cnt_month, the scaled columns); without it those writes
# target a selection of df2 and trigger pandas' SettingWithCopy warning,
# which this notebook's global warning filter would hide.
MyTrainDF = df2[['ID', 'shop_id', 'item_id', 'item_price', 'item_cnt_day',
                 'date_block_num', 'dayofyear', 'year', 'month']].copy()

In [None]:
# Inspect the assembled training frame.
MyTrainDF

## finding: item_count_month

In [None]:
# Repeat the per-year split for the remaining months: each monthly frame is
# filtered into its 2013 / 2014 / 2015 rows.
YEARS = (2013, 2014, 2015)

dfdec2013, dfdec2014, dfdec2015 = (dfdec[dfdec['year'] == yr] for yr in YEARS)
dffeb2013, dffeb2014, dffeb2015 = (dffeb[dffeb['year'] == yr] for yr in YEARS)
dfmar2013, dfmar2014, dfmar2015 = (dfmar[dfmar['year'] == yr] for yr in YEARS)
dfapl2013, dfapl2014, dfapl2015 = (dfapl[dfapl['year'] == yr] for yr in YEARS)
dfmay2013, dfmay2014, dfmay2015 = (dfmay[dfmay['year'] == yr] for yr in YEARS)
dfjun2013, dfjun2014, dfjun2015 = (dfjun[dfjun['year'] == yr] for yr in YEARS)
dfjul2013, dfjul2014, dfjul2015 = (dfjul[dfjul['year'] == yr] for yr in YEARS)
dfaug2013, dfaug2014, dfaug2015 = (dfaug[dfaug['year'] == yr] for yr in YEARS)
dfsep2013, dfsep2014, dfsep2015 = (dfsep[dfsep['year'] == yr] for yr in YEARS)
dfoct2013, dfoct2014, dfoct2015 = (dfoct[dfoct['year'] == yr] for yr in YEARS)
dfnov2013, dfnov2014, dfnov2015 = (dfnov[dfnov['year'] == yr] for yr in YEARS)

In [None]:
# (year, month) -> total item_cnt_day, filled on first use by _month_totals().
# Re-running this cell resets it, so it is rebuilt from the current df2.
_MONTH_TOTALS_CACHE = {}


def _month_totals():
    """Return {(year, month): summed item_cnt_day} for df2, computing it once."""
    if not _MONTH_TOTALS_CACHE:
        grouped = df2.groupby(['year', 'month'])['item_cnt_day'].sum()
        _MONTH_TOTALS_CACHE.update(grouped.to_dict())
    return _MONTH_TOTALS_CACHE


def f(row, totals=None):
    """Return the row's share of all units sold in its (year, month).

    The value is ``row['item_cnt_day']`` divided by the summed
    ``item_cnt_day`` of every df2 row in the same calendar month and year.
    It is therefore a fraction of the month's total, not a unit count.

    The previous implementation spelled out one branch per month/year and
    re-summed the matching per-month frame on every call. Those frames are
    all row filters of df2, so one grouped sum of df2 gives the same
    denominators and is computed a single time.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'year', 'month' and 'item_cnt_day'.
    totals : dict, optional
        Mapping ``(year, month) -> monthly total``. Defaults to the totals
        derived from df2; pass it explicitly to use the function without df2.

    Returns
    -------
    number
        The share, or 0 when the row's (year, month) has no known total.
    """
    if totals is None:
        totals = _month_totals()
    # Float-valued year/month (as produced by a row-wise DataFrame.apply on a
    # mixed int/float frame) hash and compare equal to the integer keys.
    month_total = totals.get((row['year'], row['month']))
    if month_total is None:
        return 0
    return row['item_cnt_day'] / month_total

In [None]:
# Each row's share of the units sold in its (year, month): item_cnt_day
# divided by that month's total. This is the same quantity the row-wise
# helper f() yields for the months present in the data, but computed in one
# vectorised pass instead of calling Python code once per row.
# Note this is a fraction of the monthly total, not a monthly unit count.
month_totals = MyTrainDF.groupby(['year', 'month'])['item_cnt_day'].transform('sum')
MyTrainDF['item_cnt_month'] = MyTrainDF['item_cnt_day'] / month_totals

In [None]:
# Pairwise column correlations of the training frame, including the new
# item_cnt_month column, shaded by strength.
MyTrainDF.corr().style.background_gradient(cmap='gist_yarg')

In [None]:
# Scatter of every row's item_cnt_month value against its year.
x = MyTrainDF['year']
y = MyTrainDF['item_cnt_month']
colors = 'darkorange'

fig, ax = plt.subplots()
ax.scatter(x, y, c=colors, alpha=0.2)
ax.set_title('Item count by month per year')
ax.set_xlabel('year')
ax.set_ylabel('item count per mo')
plt.show()

In [None]:
# Daily item count of every row against its calendar month.
MyTrainDF.plot.scatter(x='month', y='item_cnt_day', color='red')

## Regressions   
Regression is a supervised-learning task in which a model learns a mapping from input features to a continuous numeric output, such as 0.1 or 6.2.


### Other types of regressions tried:  

    Lasso Regression   
    ElasticNet Regression    
    
The results with these models were not better than the other ones displayed here.

## 1. linear regression

In [None]:
#ML models
from sklearn import metrics
from sklearn import preprocessing
from sklearn.model_selection import train_test_split 

from sklearn.linear_model import LinearRegression


from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import cross_val_score

In [None]:
# Linear regression inputs: one feature column and one target column, each
# as an (n_rows, 1) array.
X = MyTrainDF[['item_cnt_day']].to_numpy()
y = MyTrainDF[['item_cnt_month']].to_numpy()

In [None]:
# Split the data: 60% of the rows for training, 40% held out for evaluation
# (random_state fixes the shuffle so the split is reproducible).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)

# NOTE(review): the target item_cnt_month was computed as item_cnt_day divided
# by a per-month total, and item_cnt_day is the only feature here, so this fit
# largely recovers that division rather than forecasting future sales.
regressor = LinearRegression()  
regressor.fit(X_train, y_train) #train the model

In [None]:
# Fitted parameters of the linear model: intercept first, then the slope.
print(regressor.intercept_, regressor.coef_, sep='\n')

In [None]:
# Predict the target for the held-out rows with the fitted linear model.
pred1 = regressor.predict(X_test)

In [None]:
# Side-by-side table of the held-out target values and the linear model's
# predictions, both flattened to one dimension.
dfLinReg = pd.DataFrame({'Actual': y_test.ravel(), 'Predicted': pred1.ravel()})
dfLinReg

In [None]:
# Held-out points (blue) with the fitted regression line (gray) drawn over them.
plt.scatter(X_test, y_test,  color='blue')
plt.plot(X_test, pred1, color='gray', linewidth=2)
plt.show()

In [None]:
# Bar chart comparing actual and predicted values for the first ten
# held-out rows, with major and minor grid lines styled separately.
first10preds = dfLinReg.iloc[:10]
c = ('darkgreen', 'blueviolet')
ax = first10preds.plot(kind='bar', figsize=(9,6), color=c)
ax.grid(which='major', linestyle='-', linewidth='0.3', color='green')
ax.grid(which='minor', linestyle=':', linewidth='0.5', color='black')
plt.show()

In [None]:
# RMSE of the linear model on its held-out rows. At this point the target is
# still unscaled; the later models are scored after MinMax scaling.
linreg_rmse = np.sqrt(metrics.mean_squared_error(y_test, pred1))
print('Root Mean Squared Error:', linreg_rmse)

### scale the data

In [None]:
# Inspect the training frame before its columns are rescaled.
MyTrainDF

In [None]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

# Rescale every column except ID, overwriting the originals in place.
# NOTE(review): the scaler is fitted on the full frame before the train/test
# split and also rescales the target, so held-out rows influence the scaling
# and all later scores and predictions are in scaled units.
scaled_columns = [
    'shop_id', 'item_id', 'item_price', 'item_cnt_day',
    'date_block_num', 'dayofyear', 'year', 'month', 'item_cnt_month',
]
MyTrainDF[scaled_columns] = scaler.fit_transform(MyTrainDF[scaled_columns])

# Feature matrix and target vector for the remaining models.
feature_columns = ['shop_id', 'item_id', 'item_cnt_day', 'dayofyear', 'month']
X = MyTrainDF[feature_columns].to_numpy()
y = MyTrainDF['item_cnt_month'].to_numpy()

# 70% of the rows for training, 30% held out; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=5
)

## 2. Ridge Regression 

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge

#automated model using SCALED data

def plot_results(name, y_train, y_pred, num_to_plot=1000000, lims=(0.001,0.2), figsize=(6,5)):
    """Scatter predictions against actual values and show the RMSE in the title.

    Parameters
    ----------
    name : str
        Model label shown in the plot title.
    y_train : array-like
        Actual target values that ``y_pred`` is scored against. Despite the
        name, any actual values may be passed (this notebook passes its
        held-out ``y_test``).
    y_pred : array-like
        Predicted target values, aligned with ``y_train``.
    num_to_plot : int
        Number of leading points drawn; the score always uses every point.
    lims : tuple of float
        (min, max) applied to both axes and used as the endpoints of the
        reference diagonal.
    figsize : tuple
        Figure size handed to ``plt.figure``.
    """
    plt.figure(figsize=figsize)
    # RMSE via an explicit square root, matching how the other cells of this
    # notebook compute it. The ``squared=False`` shortcut of
    # mean_squared_error is deprecated and removed in recent scikit-learn.
    score = np.sqrt(mean_squared_error(y_train, y_pred))
    plt.scatter(y_train[:num_to_plot] , y_pred[:num_to_plot], color="goldenrod")
    # Diagonal reference line: points on it are predicted exactly.
    plt.plot(lims, lims)
    plt.ylim(lims)
    plt.xlim(lims)
    plt.title(f'{name}: {score:0.5f}', fontsize=16)
    plt.show()

modelName = ["Ridge Regression"]

# The estimators live under their own name. Previously the list was called
# `model` and the loop variable below overwrote it, so running this cell a
# second time tried to iterate over a fitted Ridge object and failed.
models = [Ridge(alpha = 0.01)]

# Fit each model on the training rows, predict the held-out rows and plot
# predicted vs. actual. `model` and `y_pred` keep the last model's results.
for name, model in zip(modelName, models):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    plot_results(name, y_test, y_pred)


In [None]:
# Side-by-side table of the held-out target values and the Ridge predictions.
ridge = pd.DataFrame({'Actual': y_test.ravel(), 'Predicted': y_pred.ravel()})
ridge

In [None]:
# Bar chart comparing actual and Ridge-predicted values for the first ten
# held-out rows.
first10preds = ridge.iloc[:10]
c = ('salmon', 'dimgray')
ax = first10preds.plot(kind='bar', figsize=(9,6), color=c)
ax.grid(which='major', linestyle='-', linewidth='0.3', color='green')
ax.grid(which='minor', linestyle=':', linewidth='0.5', color='black')
plt.show()

### 3. Decision Tree Regression 

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Depth-limited decision tree with a fixed seed, fitted on the scaled
# training rows. This rebinds `regressor`, which earlier held the linear model.
regressor = DecisionTreeRegressor(random_state=2, max_depth=5)
regressor.fit(X_train, y_train)

In [None]:
# Decision tree predictions for the held-out rows.
pred2= regressor.predict(X_test)

In [None]:
# Side-by-side table of the held-out target values and the tree predictions.
tree = pd.DataFrame({'Actual': y_test, 'Predicted': pred2.ravel()})
tree

In [None]:
# Line chart comparing actual and tree-predicted values for the first ten
# held-out rows.
first10preds = tree.iloc[:10]
c = ('peru', 'rosybrown')
ax = first10preds.plot(kind='line', figsize=(10,8), color=c)
ax.grid(which='major', linestyle='-', linewidth='0.3', color='green')
ax.grid(which='minor', linestyle=':', linewidth='0.5', color='black')
plt.show()

In [None]:
# RMSE of the decision tree on the held-out rows (scaled target).
tree_rmse = np.sqrt(metrics.mean_squared_error(y_test, pred2))
print('Root Mean Squared Error:', tree_rmse)

The decision tree regressor under-performed.

### Results:   
The Ridge Regression performed best.   

In [None]:
# Wrap the Ridge predictions in a frame.
# NOTE(review): y_pred holds predictions for the held-out split of the
# training data, in MinMax-scaled units; it is not a prediction for the
# (shop, item) pairs loaded from test.csv.
prediction = pd.DataFrame(data=y_pred)
prediction

In [None]:
# Name the prediction column the way the submission file expects.
prediction = prediction.rename(columns={0: 'item_cnt_month'})

In [None]:
# Create the ID column from the row position of each prediction.
# NOTE(review): these IDs are positions in y_pred, not the ID values of test.csv.
prediction = prediction.assign(ID=prediction.index)

In [None]:
# Order the columns as ID, item_cnt_month and keep the first 214200 rows.
prd2 = prediction.loc[:, ['ID', 'item_cnt_month']].iloc[:214200]
prd2

In [None]:
# The submission file must have 214200 rows.

# index=False keeps the DataFrame index out of the file, so the CSV holds
# only the ID and item_cnt_month columns.

# Write the results frame to CSV in the working directory.
prd2.to_csv('predictionDF.csv',index=False)