In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file found.
for folder, _, files in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, f) for f in files):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Read data, merge CSVs and fix data types**

In [None]:
# Read data
# Folder holding the competition CSVs; defined once so both reads stay in sync.
DATA_DIR = '/kaggle/input/competitive-data-science-predict-future-sales'
sales = pd.read_csv(os.path.join(DATA_DIR, 'sales_train.csv'))
items = pd.read_csv(os.path.join(DATA_DIR, 'items.csv'))

# Merge sales and items so that we can see each item's category.
# item_name is not used later, so it is removed from the lookup table before the
# merge instead of being copied onto every sales row and dropped afterwards.
# validate='many_to_one' makes pandas raise a MergeError if an item_id appears
# more than once in items, which would otherwise silently duplicate sales rows.
sales = sales.merge(
    items.drop(columns='item_name'),
    on='item_id',
    how='left',
    validate='many_to_one',
)

In [None]:
# Fix misidentified data types
# Parse dates with an explicit day-first format: without it, an ambiguous value
# such as 02.01.2013 is read month-first (or parsing fails on later rows).
# NOTE(review): dd.mm.yyyy is assumed from the competition's data description —
# confirm against sales.head() before relying on it.
sales['date'] = pd.to_datetime(sales['date'], format='%d.%m.%Y')

# These columns are identifiers/labels rather than quantities, so store them as strings.
id_columns = ['date_block_num', 'shop_id', 'item_id']
sales = sales.astype({column: str for column in id_columns})

**Basic data details**

In [None]:
# Basic checks: preview, missing values per column, dtypes and duplicate row count.
# Each entry pairs the heading to print with a callable producing the value shown under it.
basic_checks = (
    ("-------First rows of data-------", lambda: sales.head()),
    ("-------Amount of missing rows-------", lambda: sales.isna().sum()),
    ("-------Data types-------", lambda: sales.dtypes),
    ("-------Duplicate checks-------", lambda: sales.duplicated().sum()),
)
for heading, compute in basic_checks:
    print(heading)
    print(compute())

In [None]:
# Look at the rows that have an exact copy elsewhere in the frame.
# Author's conclusion: this seems OK — the same shop can sell the same product
# on the same day for the same price.
dup_mask = sales.duplicated(keep=False)  # keep=False flags every member of a duplicate group
duplicates = sales.loc[dup_mask]
print(duplicates.head())
del duplicates, dup_mask

**Check for outliers**

In [None]:
# Check outliers in items sold quantity
# There are some super weird outliers: one day with -20 sales (returns?), one with 2000 sales?
# Consider dropping them to see the impact on RMSE.
negative_cnt = sales[sales.item_cnt_day < 0]
print(negative_cnt.head())
print(negative_cnt.shape)

# Both panels plot the same column, so each one gets a title to tell them apart.
fig, ax = plt.subplots(2)
sns.boxplot(data=negative_cnt, y='item_cnt_day', ax=ax[0])
ax[0].set_title('item_cnt_day: negative values only')
sns.boxplot(data=sales, y='item_cnt_day', ax=ax[1])
ax[1].set_title('item_cnt_day: all rows')
fig.tight_layout()  # keep the lower title from colliding with the upper panel

In [None]:
# Inspect the item price distribution for outliers.
# Author's observation: one item priced at about 300,000 stands apart — seems like a clear outlier.
sns.boxplot(x='item_price', data=sales)

# Pull out the rows whose price exceeds the cutoff so they can be examined directly.
price_cutoff = 250000
is_extreme_price = sales['item_price'] > price_cutoff
weird_price = sales.loc[is_extreme_price]
print(weird_price.shape)
print(weird_price.head())

**Understanding how frequently items and shops occur in our data**

In [None]:
# Count how many rows (sale records) each item has; this is a record count, not units sold.
item_counts = sales.groupby('item_id')['item_id'].count()

# Print, in order: the fewest records for any item, the most records for any
# item, and the average number of records per item.
# Values noted by the original author for this dataset: 1, 31,340 and 134.6.
for summary in (item_counts.min, item_counts.max, item_counts.mean):
    print(summary())

In [None]:
# Kernel density estimate of the per-item record counts.
# Author's reading of the plot: the counts are not normally distributed, so a
# transformation of this data may be needed to improve predictions.
sns.displot(kind='kde', data=item_counts)

In [None]:
# Count how many rows (sale records) each shop has
shop_counts = sales.groupby(['shop_id']).shop_id.count()
print("--- Sales/rows per shop ---")
print(shop_counts.head())
print("--- Amount of Shops ---")
print(shop_counts.shape[0])  # one entry per distinct shop_id
print("--- Least amount of sales per shop ---")
print(shop_counts.min())
print("--- Most amount of sales per shop ---")
print(shop_counts.max())
# The heading here used to repeat the "Most amount" label although the value printed is the mean.
print("--- Average amount of sales per shop ---")
print(shop_counts.mean())