In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


import glob
from functools import reduce


%pip install deep_translator
from deep_translator import GoogleTranslator as gt

#text clustering library
%pip install textpack
from textpack import tp


from sklearn.preprocessing import StandardScaler
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.seasonal import seasonal_decompose



#ML libraries
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split



import warnings
warnings.filterwarnings('ignore')

#figsizes for all plots
figsize = (18, 5)

# Loading all files from the directory as DataFrames

In [None]:
import os

# Directory that is walked recursively for input files.
INPUT_DIR = '/kaggle/input'

# Names of the notebook-level DataFrame variables created below (e.g. 'sales_train_set').
filesList = []

for dirname, _, filenames in os.walk(INPUT_DIR):
    for filename in filenames:
        # Derive the variable stem from the file name itself rather than slicing the
        # full path at a fixed offset, so the cell keeps working when the dataset
        # folder has a different name or depth.
        stem, ext = os.path.splitext(filename)
        if ext.lower() != '.csv':
            continue  # only CSV files can be loaded with read_csv

        varName = '{:s}_set'.format(stem)

        # Bind the frame to the name '<file stem>_set' without exec(), so a path
        # containing quotes or backslashes cannot break (or inject into) the statement.
        globals()[varName] = pd.read_csv(os.path.join(dirname, filename))

        filesList.append(varName)

filesList

# Join data into consolidated datasets to work better

I will join the tables, transforming the OLTP-like (normalized) schema into an OLAP-like (denormalized) schema, so that the data is easier to work with.

In [None]:
# Parse the day.month.year strings once on the raw sales table; the merged frame
# below inherits the resulting datetime column.
sales_train_set['date'] = pd.to_datetime(sales_train_set['date'], format="%d.%m.%Y")

# Attach item, shop and category attributes to every sales row.
data = (
    sales_train_set
    .merge(items_set, on='item_id', how='left')
    .merge(shops_set, on='shop_id', how='left')
    .merge(item_categories_set, on='item_category_id', how='left')
)

data.head()

## EDA - Exploratory Data Analysis

Now that I have the data all in one set, let's move on to the exploratory analysis step to better understand what's in our hands.

### Translate the name of the stores to understand better

I noticed that there are some store names that seem to be duplicates, as they are very similar. So I merged these stores into one ID.


In [None]:
# Build the translator once and reuse it for every shop name instead of
# constructing a new GoogleTranslator object per row.
# NOTE(review): this cell needs network access; consider caching the result.
translator = gt(source='ru', target='en')
shops_set['translated'] = shops_set.shop_name.apply(translator.translate)

In [None]:
shops_set[[ 'shop_id', 'translated']].sort_values(by='translated')

### Investigate whether stores in shopping centers perform better

I didn't find any significant difference between the two groups of stores, in general.

In [None]:
# Attach the English shop name to every sales row. A 'translated' column left over
# from a previous run of this cell is dropped first; otherwise the merge would emit
# 'translated_x'/'translated_y' and the cells that read 'translated' would fail.
data = pd.merge(
    data.drop(columns='translated', errors='ignore'),
    shops_set[['shop_id', 'translated']],
    on='shop_id',
    how='left',
)

In [None]:
# Total units sold per (translated) shop name, best sellers first.
# The column is selected *before* summing: summing the whole frame also tries to
# aggregate the datetime 'date' column (a TypeError on pandas >= 2.0) and
# concatenates every string column for nothing.
data.groupby('translated')['item_cnt_day'].sum().sort_values(ascending=False)

In [None]:
# Split the sales rows by whether the translated shop name mentions the keyword.
wordToAnalyse = 'shopping'
isKeywordShop = data['translated'].str.contains(wordToAnalyse, case=False)

group1 = data.loc[isKeywordShop]    # shops whose name contains the keyword
group2 = data.loc[~isKeywordShop]   # every other shop

# Difference between the mean daily item count of the two groups
print('Difference between groups:', group1.item_cnt_day.mean() - group2.item_cnt_day.mean())


After analysing the shops dataset, I noticed that some stores have near-identical names, so I will assign the same ID to each such pair.

In [None]:
# Shop ids judged to be duplicates of another shop, mapped duplicate -> id kept.
duplicateShopIds = {10: 11, 0: 57, 1: 58}
data['shop_id'] = data['shop_id'].replace(duplicateShopIds)

### Looking for nulls in data

I found no null data, which is great.

In [None]:
# Percentage of missing values per column, worst column first
data.isna().mean().mul(100).sort_values(ascending=False)

### The 'item_cnt_day' and 'price' features

In [None]:
# Summary statistics of the item price, rounded to whole numbers
print('item price description:')
data['item_price'].describe().round()

In [None]:
# Summary statistics of the daily item count, rounded to whole numbers
print('item count description:')
data['item_cnt_day'].describe().round()

In [None]:
# Share of rows holding a negative value. The mean of a boolean mask is
# (number of negative rows) / (number of rows). The previous version divided the
# row count by the *sum* of the column, which is not a proportion.
perc1 = round((data['item_cnt_day'] < 0).mean(), 5)
perc2 = round((data['item_price'] < 0).mean(), 5)

print('Proportion of negative values on item_cnt_day feature:', perc1*100, '%')
print('Proportion of negative values on item_price feature:', perc2*100, '%')


There are negative numbers in both variables and a lot of outliers, especially in [item_price]. The negative values could mean returned items, differences found in inventory, etc. 
For this study I will cap the outliers and replace the remaining negative values with the mode (for [item_cnt_day]) or with 0 (for [item_price]).

In [None]:
# Distribution of item_cnt_day with the outliers still present
fig, ax = plt.subplots(figsize=figsize)
ax.set_title('Outliers of item_cnt_day before adjust')
sns.boxplot(x=data['item_cnt_day'], color='red', ax=ax)

In [None]:
def removeOutliers(data, column, q1, q3):
    """Cap the outliers of ``data[column]`` and store the result in a new column.

    The bounds follow the IQR rule: with ``iqr`` the distance between the two
    requested quantiles, values above ``q3 + 1.5 * iqr`` or below
    ``q1 - 1.5 * iqr`` are set to the respective bound. Any value that is still
    negative after capping is replaced by the mode of the capped column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding ``column``. It is modified in place: the column
        ``column + '_Ol'`` is added (or overwritten).
    column : str
        Name of the numeric column to process.
    q1, q3 : float
        Lower and upper quantile (between 0 and 1) used to build the bounds.

    Returns
    -------
    None
        The bounds, the new column name and a rounded ``describe()`` of the new
        column are printed.
    """
    q1_value = data[column].quantile(q1)
    q3_value = data[column].quantile(q3)
    iqr = q3_value - q1_value

    # Upper and lower limits
    upper = q3_value + 1.5 * iqr
    lower = q1_value - 1.5 * iqr

    print("Lower bound:", lower)
    print("Upper bound:", upper)

    # Capping: a value outside the limits is set to the limit it crossed
    newColumn = column + '_Ol'
    data[newColumn] = np.where(data[column] > upper, upper,
                               np.where(data[column] < lower, lower,
                                        data[column]))

    print("=" * 50)
    print('New Column name is:', newColumn)

    # mode() returns a Series (sorted ascending) that holds several values on a
    # tie and is empty for an all-NaN column; float(Series) fails in both cases,
    # so pick the first mode explicitly.
    modes = data[newColumn].mode()
    if not modes.empty:
        # Remaining negative values are replaced by the mode
        data[newColumn] = data[newColumn].mask(data[newColumn] < 0, float(modes.iloc[0]))

    print("=" * 50)
    print('item count description:')
    print(round(data[newColumn].describe()))


In [None]:
removeOutliers(data, 'item_cnt_day', 0.01, 0.9)

In [None]:
# Negative prices are raised to 0; every other price is left untouched
data['item_price'] = data['item_price'].clip(lower=0)
data['item_price'].describe().round()

#### Plot [item_cnt_day] over time to verify seasonality and trends 

In [None]:
# Total units sold per month block.
# The column is selected before summing so only item_cnt_day is aggregated;
# summing the whole frame also hits the datetime 'date' column, which raises
# a TypeError on pandas >= 2.0.
data.groupby('date_block_num')['item_cnt_day'].sum().plot(figsize=figsize, title='Item count sum over time "months"')

In [None]:
# Sum of item prices per month block (column selected before summing so the
# datetime and string columns are not aggregated as well).
data.groupby('date_block_num')['item_price'].sum().plot(figsize=figsize, title='Item price sum over time "months"')

In [None]:
# Index the sales rows by date so they can be resampled by calendar month
data = data.set_index('date')

# Monthly total of units sold (despite the "Mean" in the name, this is a sum)
dataTimeMean = data[['item_cnt_day']].resample('M').sum()

# Split the monthly series into its components
decompose = seasonal_decompose(dataTimeMean, extrapolate_trend=12)

obs = decompose.observed      # Observed series
trend = decompose.trend       # Trend
seazon = decompose.seasonal   # Seasonal component
random = decompose.resid      # Residual (error)

In [None]:
# One stacked panel per decomposition component, sharing the time axis
fig, axes = plt.subplots(4, 1, figsize=(15,8), sharex=True)
fig.suptitle('Decompose of the sum of Item Count over months')

for panel, component in zip(axes, (obs, trend, seazon, random)):
    sns.lineplot(x=component.index, y=component, ax=panel, data=component)

In [None]:
#Seazonality analysis (De-trending)
dataTimeMean.diff(1).plot(figsize=figsize, title='Seazonality over months')

In [None]:
#Seazonality of item counts
dataTimeMean.item_cnt_day.diff(1).groupby(dataTimeMean.index.month).sum().plot(kind='bar', figsize=figsize, title='Seazonality of item count over months')

In [None]:
dataTimeMean.groupby(dataTimeMean.index.month).mean().plot(kind='bar')

I then noticed a strong element of seasonality, indicating that in January the sum of items is always lower, after a large peak in December.
I also observed a general downward trend in the number of items, as shown by the downward curve of the 'trend' chart. 

### Best stores classification

I will define the best stores within the dataset

In [None]:
# Total units sold per store, best store first.
# Only item_cnt_day is aggregated; summing the whole frame would also
# concatenate every string column (item/shop/category names) per store.
data.groupby('shop_id')['item_cnt_day'].sum().sort_values(ascending=False).plot(kind='bar', figsize=figsize, title='Item count by store')

I noticed that in the general analysis, stores 31, 25, 54 and 28 had the best performance in terms of the number of items sold. However, we know that the dataset has stores that were not operational during the entire period, so we will also evaluate the average sales taking into account the number of days that the store was operating (period between the first and last store registration in the dataset), which gives insight into which stores did best in the time they were in operation. 

#### Feature creation (operating days)

In [None]:
# First and last sale date of every shop that actually appears in `data`
# (the frame is indexed by date at this point). Grouping on the real shop ids
# replaces the loop over range(number of shops), which assumed the ids were
# exactly 0..n-1 and filtered the whole frame twice per shop.
saleDates = pd.Series(data.index, index=data['shop_id'].to_numpy())
salesSpan = saleDates.groupby(level=0).agg(['min', 'max'])
opDays = (salesSpan['max'] - salesSpan['min']).dt.days

listShop = salesSpan.index.tolist()
listDays = opDays.tolist()

# Indexed by shop id so that plots of this frame are labelled with the shop id.
daysOps = pd.DataFrame({'shop_id': listShop,
                        'opDays': listDays},
                       index=listShop)

# Mean of item count per operating day. Shops whose first and last sale fall on
# the same day have opDays == 0; they become NaN (and are dropped below)
# instead of producing an infinite mean.
itemsPerShop = data.groupby('shop_id')['item_cnt_day'].sum()
daysOps['meanByOpDays'] = itemsPerShop / daysOps['opDays'].where(daysOps['opDays'] > 0)
daysOps = daysOps.dropna(axis=0)

daysOps.sort_values(by='meanByOpDays', ascending=False).head()

In [None]:
#item count by store (mean)
daysOps['meanByOpDays'].sort_values(ascending=False).plot(kind='bar', figsize=figsize, title='Item count by store operating days')

## Tests for predictions

I'll start with predictions, testing linear regression algorithms

#### Cluster similar items using Text Pack

In [None]:
itemCluster = tp.TextPack(items_set, ['item_name'], match_threshold=0.1, ngram_remove=r'[,-./]', ngram_length=3)
itemCluster.run(column_name='clustered')

In [None]:
print('Clustering reduced the items by text similatiry into {} categories'.format(items_set.clustered.nunique()))

In [None]:
items_set['category'] = pd.factorize(items_set.clustered)[0]

In [None]:
items_set = items_set.drop('clustered', axis=1)
items_set

In [None]:
# Merge the text-cluster id ('category') of every item into the sales data.
# The date index is moved to a column for the merge (merge discards the index)
# and restored afterwards. A 'category' column left over from a previous run is
# dropped first so that re-running the cell does not create
# 'category_x'/'category_y'.
data = (
    data.reset_index()
        .drop(columns='category', errors='ignore')
        .merge(items_set[['category', 'item_id']], on='item_id', how='left')
        .set_index('date')
)

In [None]:
data.head()

In [None]:
data = data[['date_block_num','shop_id','item_id','item_price','item_category_id','item_cnt_day_Ol', 'category']]
data

#### Disregarding stores with more than X days inoperative

In [None]:
# Find the shops whose last sale is more than INACTIVE_DAYS days before the end
# of the data and leave them out of the modelling set.
INACTIVE_DAYS = 30

# Last sale date of every shop id that is actually present. Iterating over
# range(number of distinct ids) skipped the highest ids whenever the ids were not
# contiguous (they are not, once duplicate shops share an id) and used the name
# `max` for a variable, shadowing the builtin for the rest of the notebook.
lastSaleByShop = pd.Series(data.index, index=data['shop_id'].to_numpy()).groupby(level=0).max()
daysSinceLastSale = (data.index.max() - lastSaleByShop).dt.days

listShopsDown = daysSinceLastSale[daysSinceLastSale > INACTIVE_DAYS].index.tolist()

print('Shops to desconsider:', listShopsDown)

data_pred = data[~data['shop_id'].isin(listShopsDown)]  # remove those stores from the data set
data_pred

#### Partial autocorrelation function (PACF) to determine the autocorrelation between lags

In [None]:
# If we would use autoregression:
MAX_LAGS = 10
lenOfDataToPredict = len(data_pred)

# The partial autocorrelation can only be computed for fewer lags than half the
# sample size, and the lag count has to be an integer. The previous branch
# produced a float for short series and kept 10 lags for series of 10-21 rows,
# where that is too many.
lags = max(1, min(MAX_LAGS, lenOfDataToPredict // 2 - 1))

plot_pacf(data_pred.item_cnt_day_Ol, lags=lags)
plt.show()

#### Feature creation (lags and month of the year)

In [None]:
# Sort chronologically inside every (shop, item) pair. The date is moved out of
# the index for this so it can be used as a sort key and so the row labels are
# unique while the lag columns are assigned.
data_pred = data_pred.reset_index().sort_values(by=['shop_id', 'item_id', 'date'])

# Lag features for the autoregression: the item count of the 1st/2nd/3rd previous
# record OF THE SAME shop and item. The shift is done per group; a plain shift
# over the sorted frame carried the last values of one product into the first
# rows of the next product.
for lag in (1, 2, 3):
    data_pred['x{}'.format(lag)] = (
        data_pred.groupby(['shop_id', 'item_id'])['item_cnt_day_Ol'].shift(lag)
    )

# NOTE(review): the first records of every pair have no history. They are filled
# with 1 (the same value the prediction cell uses for missing lags) rather than
# dropped, because dropping would remove every pair with three or fewer records.
# Confirm this is the intended policy.
lagColumns = ['x1', 'x2', 'x3']
data_pred[lagColumns] = data_pred[lagColumns].fillna(1)

data_pred = data_pred.set_index('date')

# Feature engineering: calendar month of the sale
data_pred['month'] = data_pred.index.month

data_pred = data_pred.dropna(axis=0)

data_pred.head()

#### Dummies and normalization

In [None]:
# One-hot encode the month and the item cluster; the first level of each is dropped
data_pred = pd.get_dummies(data_pred, columns=['month'], drop_first=True, prefix="m")
data_pred = pd.get_dummies(data_pred, columns=['category'], drop_first=True, prefix="cat_")

# Standardise item_price: the scaler is fitted on the whole column and applied to it
priceColumn = data_pred['item_price'].values.reshape(-1,1)
scaler = StandardScaler()
toNormalizeData = scaler.fit_transform(priceColumn)

# Put the scaled values back into the dataset
data_pred['item_price'] = toNormalizeData

data_pred

In [None]:
# Group to predict by month, not by day as we have daily data
data_pred = data_pred.groupby(['date_block_num',
                               'shop_id',
                               'item_id']).sum().reset_index()

# Summing turned every one-hot indicator into "number of daily rows", so cap them
# back to 0/1. This now covers the item-cluster indicators (cat__*) as well as the
# month indicators (m_*): the cluster indicators were left summed in the training
# data while the prediction data had them capped. The columns are discovered by
# prefix, so a month or cluster that is absent from the data no longer raises a
# KeyError.
listOfOHColumns = [c for c in data_pred.columns if str(c).startswith(('m_', 'cat__'))]
data_pred[listOfOHColumns] = data_pred[listOfOHColumns].clip(upper=1)

In [None]:
# Lookup table item_id -> item_category_id, built from the item and category tables
tempItems = items_set.merge(item_categories_set, on='item_category_id', how='left')[
    ['item_id', 'item_category_id']
]
tempItems

In [None]:
# The monthly aggregation summed item_category_id, which makes the column
# meaningless, so it is removed. The previous version dropped it, merged the
# correct value back in and immediately dropped it again; the net result is the
# same single drop, without a merge over the full frame. errors='ignore' keeps the
# cell re-runnable.
# NOTE(review): if the category id was meant to stay as a feature, the test data
# would need the same column; confirm the intent.
data_pred = data_pred.drop(columns='item_category_id', errors='ignore')
data_pred

#### Train and test split

In [None]:
# Seed of the random split. It is set to 1, the value the split was hard-coded
# with, and is now actually passed to train_test_split.
randomState = 1

# Features / target
X = data_pred.copy()
y = X.pop('item_cnt_day_Ol')

# NOTE(review): this is a random split of time-ordered data, so rows from later
# months can end up in the training part; a split on date_block_num would give a
# more honest validation score.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size = 0.2,
                                                    random_state = randomState
                                                    )

#### Best performance model (LGBM)

After testing a few models using a Pipeline, I decided to focus on the LGBM model, as it showed the best scores.

In [None]:
# Fit LightGBM on the training part and score it on the held-out part
lgbm = LGBMRegressor(learning_rate= 0.2,
                     max_depth= 20,
                     num_leaves= 150,
                     subsample= 0.4).fit(X_train, y_train)

predictions_LGBM = np.around(lgbm.predict(X_test), decimals=1)

# RMSE is the square root of the MSE. The `squared=False` shortcut of
# mean_squared_error is no longer available in current scikit-learn, so the
# root is taken explicitly, which works on every version.
rmse_LGBM = np.sqrt(mean_squared_error(y_test, predictions_LGBM))

print('RMSE for lgbm was: \n', rmse_LGBM)
print('==============='*5)

In [None]:
# Predictions for every row (training and validation rows alike) next to the actual values
data_pred['pred_1'] = np.around(lgbm.predict(X), decimals=1)
data_pred[['item_cnt_day_Ol', 'pred_1']].plot(figsize=figsize, title='Plot with predictions and the actual data')

# Explicit square root instead of the removed `squared=False` argument
rmse_all = np.sqrt(mean_squared_error(data_pred['item_cnt_day_Ol'], data_pred['pred_1']))
print('RMSE for lgbm on all data was: \n', rmse_all)

In [None]:
data_pred.query('shop_id==2')[['item_cnt_day_Ol', 'pred_1']].plot(figsize=figsize, title='Plot with predictions and the actual data')

### Final model to run predictions:

In [None]:
# Final model: same hyper-parameters, refitted on all available rows
finalParams = {
    'learning_rate': 0.2,
    'max_depth': 20,
    'num_leaves': 150,
    'subsample': 0.4,
}
lgbm = LGBMRegressor(**finalParams)
lgbm.fit(X, y)


The dataset used for the final predictions needs the same variables that were created for the training data. These variables do not exist in the original test file, so I add them here so that the model receives the same features at prediction time.

In [None]:
# Item-cluster indicator columns present in the training data
catColumns = [c for c in data_pred.columns if str(c).startswith('cat__')]

# One feature row per (item, shop): the first one found in the training data.
# De-duplicating BEFORE the merge gives the same rows as merging everything and
# then de-duplicating on ID, without building the many-to-many intermediate frame
# (one row per test row per month of history) and without a second merge.
knownFeatures = (
    data_pred[['item_id', 'shop_id', 'item_price'] + catColumns]
    .drop_duplicates(['item_id', 'shop_id'])
)
test_set_merge = pd.merge(test_set, knownFeatures, on=['item_id', 'shop_id'], how='left')

# Items/shops without history get 0 for every feature
test_set = test_set_merge.fillna(0)

# Insert next month index
test_set['date_block_num'] = data_pred.date_block_num.max() + 1

# Month indicators: the forecast month is November
FORECAST_MONTH = 11
for month in range(2, 13):
    test_set['m_{}'.format(month)] = int(month == FORECAST_MONTH)

# Lag columns start empty and are filled from the sales history later. NaN keeps
# them numeric; None would create object-dtype columns.
for lagColumn in ('x1', 'x2', 'x3'):
    test_set[lagColumn] = np.nan

# Visualize
test_set.query('item_id == 5233 & shop_id == 5')

In [None]:
#reorder columns to match the training dataset
test_set = test_set[X_test.columns]

In [None]:
#concatenating the datasets, so I will be able to fill lags with actual data from previous months
temp_test = pd.concat([data_pred, test_set], keys=["x", "y"])
temp_test

In [None]:
#just to vizualize if the concat did it well
temp_test.sort_values(by=['shop_id', 'item_id', 'date_block_num']).head(20)

In [None]:
# Cap the item-cluster indicators at 1: values of 1 or more become 1, smaller
# values stay as they are
listOfOHColumns = ['cat__{}'.format(number) for number in range(1, 12)]

temp_test[listOfOHColumns] = temp_test[listOfOHColumns].clip(upper=1)

In [None]:
# Creating features for the autoregression technique: order every (shop, item)
# history chronologically, with the row to predict last.
temp_test = temp_test.sort_values(by=['shop_id', 'item_id', 'date_block_num'])

# Fill the empty lags of the rows to predict with the item counts of the previous
# records OF THE SAME shop and item. The shift is done per group; a plain shift
# over the sorted frame gave pairs without enough history the sales of whatever
# product happened to be sorted just before them.
for lag in (1, 2, 3):
    lagColumn = 'x{}'.format(lag)
    previousCount = temp_test.groupby(['shop_id', 'item_id'])['item_cnt_day_Ol'].shift(lag)
    temp_test[lagColumn] = temp_test[lagColumn].fillna(previousCount)

# Keep only the rows to predict (key 'y') and fill residual NaNs with 1
temp_test = temp_test.loc['y'].fillna(1)

In [None]:
test_set = temp_test[X.columns].sort_index()

### Prediction of the trained model using the given test dataset 

In [None]:
# Predict with exactly the training feature columns, in the training order.
# Selecting X.columns keeps the cell re-runnable: once 'pred' has been added,
# passing the whole frame would hand the model one column too many.
test_set['pred'] = np.around(lgbm.predict(test_set[X.columns]), decimals=1)

In [None]:
# Items in the test set that never appear in the training data
isUnseenItem = ~test_set['item_id'].isin(data_pred['item_id'])
itemsNotInTrain = test_set.loc[isUnseenItem, 'item_id']

# Set the prediction to 0 for those items only. `mask` replaces the rows where the
# condition is True. The previous `where(...)` call did the opposite: it kept the
# predictions of the unseen items and overwrote every other prediction with 0.
test_set['pred'] = test_set['pred'].mask(isUnseenItem, 0)

In [None]:
test_set

In [None]:
test_set['pred'].describe()

The final result for this notebook was a RMSE of 1.24015

# References

https://github.com/lukewhyte/textpack

https://www.kaggle.com/code/deinforcement/top-1-predict-future-sales-features-lightgbm/

https://github.com/seatgeek/thefuzz