In [None]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import tensorflow as tf
from sklearn import preprocessing

import os
# Walk the Kaggle input directory and print the full path of every file in it.
kaggle_input_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_path in kaggle_input_paths:
    print(input_path)

## Load all the data files

In [None]:
# Every competition CSV sits in the same Kaggle input directory.
DATA_DIR = '/kaggle/input/competitive-data-science-predict-future-sales'

# Read each table into its own DataFrame (same files, same order as before).
items = pd.read_csv(f'{DATA_DIR}/items.csv')
sample_submission = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')
item_categories = pd.read_csv(f'{DATA_DIR}/item_categories.csv')
sales_train = pd.read_csv(f'{DATA_DIR}/sales_train.csv')
shops = pd.read_csv(f'{DATA_DIR}/shops.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')

## Checking the size of individual files

* *Task* - forecast the total amount of products sold in every shop for the test set.
* *Problem* - The list of shops and products changes every month

* Items, item_categories and shops are all supplementary information.

### So, we need to focus on exploring the sales train dataset

In [None]:
# Report the (rows, columns) shape of every loaded table.
# Each line reads the shape of its own DataFrame; previously all but the
# sample_submission line printed items.shape.
print('shape of Items : ', items.shape)
print('shape of sample_submission: ', sample_submission.shape)
print('shape of item_categories : ', item_categories.shape)
print('shape of sales_train : ', sales_train.shape)
print('shape of shops : ', shops.shape)
print('shape of test : ', test.shape)

## Exploring the training dataset

In [None]:
# Providing information about all the columns in the dataset.
# info() prints its report directly and returns None, so it is called as its
# own statement instead of being packed into a tuple with describe().
sales_train.info()

# Summary statistics; kept as the cell's last expression so the notebook
# renders it as a table rather than as part of a `(None, ...)` tuple repr.
sales_train.describe()

Among the 6 columns of the training set, we have identified the following 2 columns as the important features:

1. item_id
2. shop_id

They both will have unique elements that will determine the solution of the problem.

In [None]:
# Distinct item and shop identifiers that appear in the sales dataset
unique_items = sales_train['item_id'].unique()
unique_shops = sales_train['shop_id'].unique()

# Print how many distinct values each identifier column holds
for label, distinct_values in (
    ('Number of unique items : ', unique_items),
    ('Number of unique shops :', unique_shops),
):
    print(label, len(distinct_values))

## Data visualization

### Here we try to group the data based on the following factors 
1. Date
2. Shop ID
3. Item price

In [None]:
# Grouping the data based on individual columns: totals per month block,
# per shop and per price point.
# numeric_only=True keeps the non-numeric `date` column out of the sum;
# without it recent pandas versions try to aggregate the date strings too.
df_d = sales_train.groupby(['date_block_num'], as_index=False).sum(numeric_only=True)
df_s = (
    sales_train.groupby(['shop_id'], as_index=False)
    .sum(numeric_only=True)
    .sort_values('item_cnt_day', ascending=False)
)
df_i = sales_train.groupby(['item_price'], as_index=False).sum(numeric_only=True)

# Plotting the dataframes: one panel per grouping, stacked vertically.
sns.set(rc={'figure.figsize': (15, 18)})
fig, axes = plt.subplots(3, 1)

sns.lineplot(x='date_block_num', y='item_cnt_day', data=df_d, ax=axes[0])
axes[0].set_title('Total items sold per month block')

# seaborn sorts numeric categories by value, which would discard the
# descending-sales ordering of df_s; pass that ordering explicitly.
sns.barplot(
    x='shop_id',
    y='item_cnt_day',
    data=df_s,
    order=df_s['shop_id'].tolist(),
    ax=axes[1],
)
axes[1].set_title('Total items sold per shop (highest first)')

sns.lineplot(x='item_price', y='item_cnt_day', data=df_i, ax=axes[2])
axes[2].set_title('Total items sold per item price')

fig.tight_layout()
plt.show()



## Check whether there are null values in any of the columns

In [None]:
# Count the missing values in each of the sales columns, one line per column.
null_check_columns = [
    'date',
    'date_block_num',
    'shop_id',
    'item_id',
    'item_price',
    'item_cnt_day',
]
for column_name in null_check_columns:
    missing_count = sales_train[column_name].isnull().sum()
    print(f'number of null values in {column_name} : ', missing_count)

## Creating a column named month for better understanding of the data