In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
warnings.filterwarnings("ignore")


# Walk the Kaggle input directory and load the competition CSV files.
# Files are matched on their base name rather than on a substring of the
# full path, so a directory or unrelated file whose path merely contains
# e.g. "train.csv" is no longer picked up by mistake.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        file_name = os.path.join(dirname, filename)
        if filename == "train.csv":
            df = pd.read_csv(file_name)        # 'df' is the train dataset
        elif filename == "test.csv":
            test = pd.read_csv(file_name)      # 'test' is the test dataset
        elif filename.endswith("submission.csv"):
            sample = pd.read_csv(file_name)    # 'sample' is the sample submission dataset
            

This is my new way of reading the datasets; share your thoughts about it in the **comments** section.

#  Introduction to Dataframes

In [None]:
# Preview the first five rows of the train dataset.
df.head(5)

In [None]:
# Preview the first five rows of the test dataset.
test.head(5)

# No missing values 

In [None]:
# Count the missing values in every column of the train dataset.
df.isna().sum()

In [None]:
# Count the missing values in every column of the test dataset.
test.isna().sum()

**Neither dataset has missing values, which makes our job easier and our results more accurate.**

# Making dataframes simpler

In [None]:
def clean_store(item):
    """Strip the leading 'Kaggle' prefix from a store name.

    'KaggleMart' => 'Mart'. A name that does not start with the prefix is
    returned unchanged, so re-running the cleaning cell on already-cleaned
    data does not chop off further characters.
    """
    prefix = "Kaggle"
    return item[len(prefix):] if item.startswith(prefix) else item

In [None]:
def clean_product(product):
    """Strip the leading 'Kaggle ' prefix from a product name.

    'Kaggle Hat' => 'Hat'. A name that does not start with the prefix is
    returned unchanged, so re-running the cleaning cell on already-cleaned
    data does not chop off further characters.
    """
    prefix = "Kaggle "
    return product[len(prefix):] if product.startswith(prefix) else product

In [None]:
# Shorten the store names in both the train and the test dataset.
df["store"] = df["store"].map(clean_store)
test["store"] = test["store"].map(clean_store)

In [None]:
# Shorten the product names in both the train and the test dataset.
df["product"] = df["product"].map(clean_product)
test["product"] = test["product"].map(clean_product)

In [None]:
# List the distinct store names left after cleaning.
df["store"].unique()

In [None]:
# List the distinct product names left after cleaning.
df["product"].unique()

**Mug** 
![kaggle mug](https://storage.googleapis.com/kaggle-competitions/kaggle/3333/media/kaggle_dataviz_mug.png)


**Hat** 
![Kaggle Hat](https://storage.googleapis.com/kaggle-avatars/images/4089076-gr.jpg)

**Sticker** 
![Kaggle Stickers](https://heads0rtai1s.github.io/pics/rstudioconf20_2.jpg)

In [None]:
# List the distinct countries present in the train dataset.
df["country"].unique()

**Finland, Norway and Sweden are Nordic countries**

![Map](https://i.pinimg.com/736x/b9/41/6e/b9416ea36ee695f15f4a15d173df21bc--interactive-map-the-map.jpg)

In [None]:
# Convert the 'date' column of both datasets to datetime values so the
# .dt accessor can be used in the sections below.
df["date"] = pd.to_datetime(df["date"])
test["date"] = pd.to_datetime(test["date"])

# Visualization by years

In [None]:
# Mean number of sold items per calendar year, kept as a one-column DataFrame.
sale_year = df["date"].dt.year
years = df.groupby([sale_year])[["num_sold"]].mean()


In [None]:
# Bar chart of the yearly mean sales.
ax = sns.barplot(x=years.index, y=years["num_sold"])
ax.set_title("Product selling through years")
ax.set_xlabel("Years")
ax.set_ylabel("Number of Sold Products")
plt.show()

# Visualization by months

In [None]:
# One Series per year (2015-2018): mean number of sold items per calendar month.
months = [
    yearly.groupby(yearly["date"].dt.month)["num_sold"].mean()
    for yearly in (df[df["date"].dt.year == year] for year in range(2015, 2019))
]

In [None]:
# Draw one line per year showing the mean monthly sales.
# x and y are passed by keyword: positional data vectors were deprecated in
# seaborn 0.11 and raise a TypeError from seaborn 0.12 onwards.
labels = [2015,2016,2017,2018]
for label, monthly_mean in zip(labels, months):
    sns.lineplot(x=monthly_mean.index, y=monthly_mean.values, label=label)

# Title and axis labels only need to be set once, after all lines are drawn.
plt.title("Product selling through months")
plt.xlabel("Months")
plt.ylabel("Number of Sold Products")
plt.show()
    
    

* April & December are the most popular months for selling
**Look at the holidays in those months:**

Common holidays in spring season:

Shrovetide=>Mar, 8th

Palm Sunday=>Apr, 10th

Good Friday=>Apr, 15th

Maundy Thursday=>Apr, 14th

Easter Sunday=>Apr, 17th

Easter Monday=>Apr, 18th

Ascension of Christ=>May, 26th

# Visualization by weeks

A single year has **52 weeks** (some ISO years have 53)

![51 weeks](https://avatars.mds.yandex.net/i?id=e550d6b86c28b0e5ad1af0f283f37b4d-2352855-images-thumbs&n=13)

In [None]:
# One Series per year (2015-2018): mean number of sold items per ISO week.
# `Series.dt.week` was deprecated in pandas 1.1 and removed in pandas 2.0;
# `dt.isocalendar().week` is its replacement and yields the same ISO week
# numbers (the first days of January can therefore fall in week 52 or 53).
weeks = []
for i in range(2015, 2019):
    sorted_by_weeks = df.loc[(df.date.dt.year == i)]
    # isocalendar() returns an unsigned nullable dtype; cast to plain int so
    # the grouping key is an ordinary integer, as it was with `.dt.week`.
    iso_week = sorted_by_weeks.date.dt.isocalendar().week.astype(int)
    weeks.append(sorted_by_weeks.groupby(iso_week)['num_sold'].mean())

In [None]:
# Draw one line per year showing the mean weekly sales.
# x and y are passed by keyword: positional data vectors were deprecated in
# seaborn 0.11 and raise a TypeError from seaborn 0.12 onwards.
labels = [2015,2016,2017,2018]
for label, weekly_mean in zip(labels, weeks):
    sns.lineplot(x=weekly_mean.index, y=weekly_mean.values, label=label)

# Title and axis labels only need to be set once, after all lines are drawn.
plt.title("Product selling through weeks")
plt.xlabel("Weeks")
plt.ylabel("Number of Sold Products")
plt.show()
    

# Visualization by days of week 
 Monday=0, 
 Sunday=6

In [None]:
# One Series per year (2015-2018): mean number of sold items per day of week
# (Monday=0 ... Sunday=6).
days_of_week = [
    yearly.groupby(yearly["date"].dt.day_of_week)["num_sold"].mean()
    for yearly in (df[df["date"].dt.year == year] for year in range(2015, 2019))
]

In [None]:
# Draw one line per year showing the mean sales for each day of the week.
# x and y are passed by keyword: positional data vectors were deprecated in
# seaborn 0.11 and raise a TypeError from seaborn 0.12 onwards.
labels = [2015,2016,2017,2018]
for label, daily_mean in zip(labels, days_of_week):
    sns.lineplot(x=daily_mean.index, y=daily_mean.values, label=label)

# Title and axis labels only need to be set once, after all lines are drawn.
plt.title("Product selling through days of week")
plt.xlabel("Days of week")
plt.ylabel("Number of Sold Products")
plt.show()

# Visualization by countries

In [None]:
# Mean number of sold items for each country.
ax = sns.lineplot(data=df, x='country', y='num_sold')
ax.set_title("Ratio of product selling and countries")
ax.set_xlabel("Name of countries")
ax.set_ylabel("Number of Sold Products")

In the line graph above, Norway is the most popular country.

Let's look at the **GDP of Nordic countries:**
![GDP of Nordic countries](http://climatepositions.com/wp-content/uploads/2015/09/GDP-Finland-Sweden-Norw-Denm-Russ..png)

# Visualization by products

In [None]:
# Mean number of sold items for each product.
ax = sns.lineplot(data=df, x='product', y='num_sold')
ax.set_title("Ratio of product selling and products` name")
ax.set_xlabel("Name of products")
ax.set_ylabel("Number of Sold Products")

In [None]:
# Split the train dataset into one frame per product.
product_name = df["product"]
hats = df[product_name == "Hat"]
mugs = df[product_name == "Mug"]
stickers = df[product_name == "Sticker"]

# Mean number of sold items per calendar month, separately for each product.
hat_products = hats.groupby(hats["date"].dt.month)["num_sold"].mean()
mug_products = mugs.groupby(mugs["date"].dt.month)["num_sold"].mean()
sticker_products = stickers.groupby(stickers["date"].dt.month)["num_sold"].mean()


In [None]:
# Mean monthly sales, one line per product.
sns.lineplot(x=mug_products.index, y=mug_products.values, label ='Mugs')
sns.lineplot(x=hat_products.index, y=hat_products.values, label ='Hats')
sns.lineplot(x=sticker_products.index, y=sticker_products.values, label ='Stickers')

# Title and axis labels so the figure can be read on its own, like the
# other plots in this notebook.
plt.title("Product selling through months by product")
plt.xlabel("Months")
plt.ylabel("Number of Sold Products")
plt.show()


Hats are very popular in spring.
Customers buy hats out of personal interest, or as protection against rain and other weather.
However, according to the source: 

https://www.worlddata.info/europe/sweden/climate.php,

https://www.worlddata.info/europe/norway/climate.php,

https://www.worlddata.info/europe/finland/climate.php

Rainy days are not the reason!

# Bar plot of 'country', 'product' and 'num_sold'

In [None]:
# Mean sales per country, with one bar per product.
ax = sns.barplot(data=df, x='country', y='num_sold', hue='product')
ax.set_title("Ratio of product selling and countries among products")
ax.set_xlabel("Name of countries")
ax.set_ylabel("Number of Sold Products")
plt.show()

# Bar chart of 'country', 'store' and 'num_sold'

In [None]:
# Mean sales per country, with one bar per store.
ax = sns.barplot(data=df, x='country', y='num_sold', hue='store')
ax.set_title("Ratio of product selling and countries among stores")
ax.set_xlabel("Name of countries")
ax.set_ylabel("Number of Sold Products")
plt.show()

# Bar chart of 'store', 'product' and 'num_sold'

In [None]:
# Mean sales per store, with one bar per product.
sns.barplot(x='store', y='num_sold', hue='product', data=df)
plt.title("Ratio of product selling among stores")  # fixed the "storess" typo
plt.xlabel("Name of store")
plt.ylabel("Number of Sold Products")
plt.show()

# Conclusion

* Each visualization has many helpful features for further work in TPS-Jan 2022 competition
* If you want to modify it, feel free to use the **Copy & Edit** button
* Thank you for your attention!

# Do not forget to:
* Share your ideas about my notebook in the **comments** section! 
* Upvote, if current notebook was useful!
* Mention me in **comments** section!

Regards,

[Sardor Abdirayimov](https://www.kaggle.com/sardorabdirayimov)