In [None]:
import warnings 
warnings.filterwarnings("ignore")

# Base libraries
import os
import numpy as np
import pandas as pd
import re
import string
import math
from IPython.display import display_html
import tqdm
import wandb


## visualization libraries
import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.patches as patches
import seaborn as sns
!pip install pywaffle
from pywaffle import Waffle

%matplotlib inline
sns.set(style="darkgrid")
pd.set_option('display.float_format', lambda x: '%.2f' % x)

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# read data
# Every competition file lives in the same directory; keeping that directory in
# one constant means the notebook can be pointed at another copy of the data by
# editing a single line instead of six.
DATA_DIR = '/kaggle/input/competitive-data-science-predict-future-sales'

sales_train = pd.read_csv(DATA_DIR + '/sales_train.csv')
shops = pd.read_csv(DATA_DIR + '/shops.csv')
items = pd.read_csv(DATA_DIR + '/items.csv')
item_categories = pd.read_csv(DATA_DIR + '/item_categories.csv')
test = pd.read_csv(DATA_DIR + '/test.csv')
submission = pd.read_csv(DATA_DIR + '/sample_submission.csv')

### File descriptions
- sales_train.csv - the training set. Daily historical data from January 2013 to October 2015.
- test.csv - the test set. You need to forecast the sales for these shops and products for November 2015.
- sample_submission.csv - a sample submission file in the correct format.
- items.csv - supplemental information about the items/products.
- item_categories.csv - supplemental information about the item categories.
- shops.csv - supplemental information about the shops.
### Data fields
- ID - an Id that represents a (Shop, Item) tuple within the test set
- shop_id - unique identifier of a shop
- item_id - unique identifier of a product
- item_category_id - unique identifier of item category
- item_cnt_day - number of products sold. You are predicting a monthly amount of this measure
- item_price - current price of an item
- date - date in format dd/mm/yyyy
- date_block_num - a consecutive month number, used for convenience. January 2013 is 0, February 2013 is 1,..., October 2015 is 33
- item_name - name of item
- shop_name - name of shop
- item_category_name - name of item category

### Let's join the datasets


In [None]:
# Before merging, preview each table; start with the raw sales records.
sales_train.head()

In [None]:
# Preview the shops table; it is joined to the sales records on shop_id.
shops.head()

In [None]:
# Preview the items table; it is joined to the categories on item_category_id
# and to the sales records on item_id.
items.head()

In [None]:
# Preview the item categories table; it is joined to the items on item_category_id.
item_categories.head()

In [None]:
# Build one flat frame: sales joined with their shop, and items joined with
# their category, then both combined on item_id. pd.merge uses an inner join
# by default, so rows without a match on the key are dropped.
merge_1 = pd.merge(sales_train, shops, on="shop_id")
merge_2 = pd.merge(items, item_categories, on="item_category_id")
df = pd.merge(merge_1, merge_2, on="item_id")
df.head()

def eda(data):
    """Print a quick plain-text overview of a DataFrame.

    Sections are printed in this order: the first five rows, the
    ``DataFrame.info()`` summary, the column dtypes, the per-column
    missing-value counts (twice, under two headings) and the shape.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to summarise. It is only read, never modified.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    print("----------Top-5 Records----------")
    print(data.head(5))
    print("-----------Information-----------")
    # info() writes its report itself and returns None, so wrapping it in
    # print() appended a stray "None" line to the output.
    data.info()
    print("-----------Data Types-----------")
    print(data.dtypes)
    print("----------Missing value-----------")
    print(data.isnull().sum())
    print("----------Null value-----------")
    # isna() is an alias of isnull(), so this section repeats the counts above;
    # it is kept so the printed report keeps its existing layout.
    print(data.isna().sum())
    print("----------Shape of Data----------")
    print(data.shape)
    
eda(df)
#train = sales.join(items, on='item_id', rsuffix='_').join(shops, on='shop_id', rsuffix='_').join(item_categories, on='item_category_id', rsuffix='_').drop(['item_id_', 'shop_id_', 'item_category_id_'], axis=1)

In [None]:
# Summary statistics of the merged frame (used below to spot suspicious values).
df.describe()

In [None]:
# describe() reported a minimum item_price of -1; list the rows with a negative price.
negative_price_mask = df["item_price"].lt(0)
print(df.loc[negative_price_mask])

In [None]:
# (rows, columns) of the merged frame.
df.shape

In [None]:
# Check for nulls or duplicates.
# A cell only renders its last expression, so the null check is printed
# explicitly; before, its value was computed and then silently discarded.
print("Columns containing nulls:", df.isnull().any().sum())
# Counts of rows that are (True) / are not (False) exact duplicates of an earlier row.
df.duplicated().value_counts()

In [None]:
# The data description gives the date layout as day, month, year. Without
# dayfirst=True pandas reads an ambiguous value such as 02/01/2013 as
# February 1st, which corrupts every date-based aggregation below.
df['date'] = pd.to_datetime(df['date'], dayfirst=True)

In [None]:
# Report the first and the last calendar day covered by the training data.
earliest, latest = df['date'].min(), df['date'].max()
print('Min date from train set: %s' % earliest.date())
print('Max date from train set: %s' % latest.date())

Questions:
- Which year had the most sales?
- How do sales evolve over the months?
- Which shop sells the most?
- Which item is sold the most?
- Which category is the most important?
- What are the revenues per year and per month?

In [None]:
# Units sold and summed item prices per calendar year.
# The two columns are selected with a list: selecting several GroupBy columns
# with a bare tuple (['a', 'b'] without the inner brackets) raises on pandas >= 2.0.
# The year label is built with the vectorised .dt accessor instead of a
# Python-level apply over every row.
df.groupby(df['date'].dt.strftime('%Y'))[['item_cnt_day', 'item_price']].sum()

In [None]:
# Sales over the months: number of sales records in each month block.
fig, ax = plt.subplots(figsize=(20, 5))
sns.countplot(x="date_block_num", data=df, palette="husl", ax=ax)
ax.set_title("Count of Sales each month")
plt.show()

In [None]:
# The 25 calendar days with the highest total number of units sold.
daily_units = df.groupby('date')['item_cnt_day'].sum()
daily_units.sort_values(ascending=False).head(25)

In [None]:
# Number of distinct shops, items and item categories in the merged frame.
print("There are", df['shop_id'].nunique(), "unique shops.")
print("There are", df['item_id'].nunique(), "unique items")
# This line counts item_category_id, so it must not be labelled "items" again.
print("There are", df['item_category_id'].nunique(), "unique item categories")

In [None]:
# Number of sales records in each shop.
fig, ax = plt.subplots(figsize=(20, 5))
sns.countplot(x="shop_id", data=df, palette="husl", ax=ax)
ax.set_title("Count of Sales on each Shop")
plt.show()

In [None]:
# The 25 item categories that appear in the most sales records.
category_counts = df['item_category_id'].value_counts()
category_counts.head(25)

In [None]:
# The 25 items that appear in the most sales records.
item_counts = df['item_id'].value_counts()
item_counts.head(25)

In [None]:
# Revenue of each sales record: units sold multiplied by the item price.
df['Revenues'] = df['item_cnt_day'].mul(df['item_price'])

In [None]:
# Revenues by year.
# The year label comes from the vectorised .dt accessor rather than a
# Python-level apply over every row; the group labels are the same strings.
df.groupby(df['date'].dt.strftime('%Y'))['Revenues'].sum()

In [None]:
# Revenues by month name (all years pooled together), highest first.
# The month label comes from the vectorised .dt accessor rather than a
# Python-level apply over every row; the group labels are the same strings.
df.groupby(df['date'].dt.strftime('%B'))['Revenues'].sum().sort_values(ascending=False)

### The task is to forecast the total amount of products sold in every shop for the test set.

I'm going to use only the numerical columns of the dataset,
and I have to aggregate the daily records so that the dataset contains monthly data only.