# Exploratory Data Analysis

#### Load modules, list files in ../input

In [None]:
import numpy as np
import pandas as pd
import os
%matplotlib inline
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's "darkgrid" style to the plots drawn below.
sns.set(style="darkgrid")
# Print the names of the entries found in the ../input directory.
print(os.listdir("../input"))

#### Read data

In [None]:
# Load every input table; the paths are listed in the same order as the
# names they are unpacked into.
items, item_categories, sales_train, shops, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/items.csv",
        "../input/item_categories.csv",
        "../input/sales_train.csv",
        "../input/shops.csv",
        "../input/test.csv",
    )
)

#### Join data

In [None]:
# Attach item, category and shop attributes to every sales row.
# Each step is an inner join on the corresponding id column.
df_train = (
    sales_train
    .merge(items, how="inner", on="item_id")
    .merge(item_categories, how="inner", on="item_category_id")
    .merge(shops, how="inner", on="shop_id")
)

df_train.head()

#### Drop redundant columns and coerce data types

In [None]:
# The id columns are no longer needed once the name columns are joined in.
df_train = df_train.drop(columns=["shop_id", "item_id", "item_category_id"])

# Parse the date column explicitly instead of casting to "datetime64":
#   * pandas 2.x rejects the unit-less "datetime64" dtype in astype();
#   * without a format, day-first strings can be misread as month-first.
# NOTE(review): assumes dates are written as dd.mm.yyyy in sales_train.csv;
# a different layout raises here rather than parsing silently wrong.
df_train["date"] = pd.to_datetime(df_train["date"], format="%d.%m.%Y")

# Pin the remaining columns to the dtypes used by the analysis below.
df_train = df_train.astype({"date_block_num": "int64",
                            "item_price": "float64",
                            "item_cnt_day": "int64",
                            "item_name": "object",
                            "item_category_name": "object",
                            "shop_name": "object"})

df_train.head()

#### Dimensions

In [None]:
# shape is (row count, column count), which matches the placeholder order.
print("Rows: {}, Columns: {}".format(*df_train.shape))

#### Extract month, day of week, and year features

In [None]:
# Derive calendar features with the vectorized .dt accessor rather than a
# per-row Python lambda.
# Year is kept as a four-digit string, as before.
df_train["year"] = df_train["date"].dt.strftime("%Y")
# month_name()/day_name() return English names by default, independent of the
# process locale, so they always match the English orderings used for the
# bar charts below.
df_train["month"] = df_train["date"].dt.month_name()
df_train["dow"] = df_train["date"].dt.day_name()

#### Items sold by date

In [None]:
# Total units sold on each calendar date (columns: date, item_cnt_day).
df_date = (
    df_train
    .groupby("date", as_index=False)["item_cnt_day"]
    .sum()
)

In [None]:
# Wide, short canvas for the daily sales time series.
plt.figure(figsize=(15, 3))
sns.lineplot(data=df_date, x="date", y="item_cnt_day")
plt.xlabel("Date")
plt.ylabel("Items Sold")
plt.title("Items Sold by Date")
plt.show()

#### Items sold by day of week

In [None]:
# Total units sold per day of the week (columns: dow, item_cnt_day).
df_dow = (
    df_train
    .groupby("dow", as_index=False)["item_cnt_day"]
    .sum()
)

In [None]:
# Bars are drawn in calendar order, Sunday first. All seven days must be
# listed: seaborn only plots the categories named in `order`, so a missing day
# would be dropped from the chart without warning.
dow_order = ["Sunday", "Monday", "Tuesday", "Wednesday",
             "Thursday", "Friday", "Saturday"]

plt.figure(figsize=(7,5));
sns.barplot(x="dow", y="item_cnt_day", data=df_dow, order=dow_order);
plt.title("Items Sold by Day of Week");
plt.xlabel("Day of Week");
plt.ylabel("Items Sold");
plt.show()

#### Items sold by month

In [None]:
# Total units sold per month name (columns: month, item_cnt_day).
df_month = (
    df_train
    .groupby("month", as_index=False)["item_cnt_day"]
    .sum()
)

In [None]:
# Calendar order for the x axis; without it the bars would follow the row
# order of df_month.
month_order = [
    "January", "February", "March",
    "April", "May", "June",
    "July", "August", "September",
    "October", "November", "December",
]

plt.figure(figsize=(10, 4))
sns.barplot(data=df_month, x="month", y="item_cnt_day", order=month_order)
plt.xticks(rotation=45)  # tilt the month names
plt.xlabel("Month")
plt.ylabel("Items Sold")
plt.title("Items Sold by Month")
plt.show()

#### Items sold by item category

In [None]:
# Total units sold per item category (columns: item_category_name,
# item_cnt_day).
df_item = (
    df_train
    .groupby("item_category_name", as_index=False)["item_cnt_day"]
    .sum()
)

In [None]:
# Horizontal bars, one per item category; the tall figure keeps the category
# labels readable.
plt.figure(figsize=(7,20));
sns.barplot(x="item_cnt_day", y="item_category_name", data=df_item);
# The y axis shows item_category_name, so the labels say "category", not
# "item name".
plt.title("Items Sold by Item Category");
plt.xlabel("Items Sold");
plt.ylabel("Item Category");
plt.show()

#### Items sold by store name

In [None]:
# Total units sold per shop (columns: shop_name, item_cnt_day).
df_store = (
    df_train
    .groupby("shop_name", as_index=False)["item_cnt_day"]
    .sum()
)

In [None]:
# Horizontal bars, one per shop; the tall figure keeps the shop names readable.
plt.figure(figsize=(7,20));
sns.barplot(x="item_cnt_day", y="shop_name", data=df_store)
plt.title("Items Sold by Store Name");
plt.xlabel("Items Sold");
# The y axis shows shop_name, so the label names the store rather than its id.
plt.ylabel("Store Name");
plt.show()

#### Next Steps:  Simple time series analysis