# Kaggle - Tabular Playground Series - Jan 2022

# 1. Modules

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

# 2. Data

In [None]:
# Locations of the competition CSV files inside the Kaggle input directory

DATA_DIR = '../input/tabular-playground-series-jan-2022'

train_path = f'{DATA_DIR}/train.csv'
test_path = f'{DATA_DIR}/test.csv'

In [None]:
# Read the training CSV into a DataFrame and preview its first five rows

train = pd.read_csv(train_path)
train.head(5)

In [None]:
# Data type of each column in the training set

train.dtypes

To analyze sales by time period, I add columns for the year, month, week, day of the month, and day of the year, all derived from the `date` column.

In [None]:
# Parse the date column once and derive every calendar feature from the result,
# instead of re-parsing the same column for each new feature.
parsed_dates = pd.to_datetime(train['date'])

train['year'] = parsed_dates.dt.year
train['month'] = parsed_dates.dt.month
# Series.dt.week was deprecated in pandas 1.1 and removed in pandas 2.0;
# dt.isocalendar().week is the supported replacement. It returns UInt32, so
# cast back to int64 to keep the dtype that .dt.week used to produce.
train['week'] = parsed_dates.dt.isocalendar().week.astype('int64')
train['day_month'] = parsed_dates.dt.day
train['day_year'] = parsed_dates.dt.dayofyear  # day number within the year: 1 to 365 (366 in leap years)

# Show a random sample of rows to check the new columns
train.sample(10)

In [None]:
# Read the test CSV into a DataFrame and preview its first five rows

test = pd.read_csv(test_path)
test.head(5)

# 3. Analysis - Train dataset

## 3.1 General Analysis

In [None]:
# Summary statistics of the training set, as computed by DataFrame.describe()

train.describe()

In [None]:
# Count of missing values in each column (isna is the same check as isnull)

missing_mask = train.isna()
missing_mask.sum()

In [None]:
# Bar chart: number of training rows for each country

country_count_ax = sns.countplot(data=train, x="country")
country_count_ax.set_title('Records per country')

In [None]:
# Bar chart: number of training rows for each store

store_count_ax = sns.countplot(data=train, x="store")
store_count_ax.set_title('Records per store')

In [None]:
# Bar chart: number of training rows for each product

product_count_ax = sns.countplot(data=train, x="product")
product_count_ax.set_title('Records per product')

In [None]:
# Density of num_sold, with one curve per country

plt.figure(figsize=(12, 8))
country_kde_ax = sns.kdeplot(data=train, x="num_sold", hue="country")
country_kde_ax.set_title('KDE solds per country')

In [None]:
# Density of num_sold, with one curve per store

plt.figure(figsize=(12, 8))
store_kde_ax = sns.kdeplot(data=train, x="num_sold", hue="store")
store_kde_ax.set_title('KDE solds per store')

In [None]:
# Density of num_sold, with one curve per product

plt.figure(figsize=(12, 8))
product_kde_ax = sns.kdeplot(data=train, x="num_sold", hue="product")
product_kde_ax.set_title('KDE solds per product')

In [None]:
# Line plot of num_sold against the day of the year

plt.figure(figsize=(12, 8))
day_year_ax = sns.lineplot(data=train, x="day_year", y="num_sold")
day_year_ax.set_title('Sales according to the day of the year')

In [None]:
# Line plot of num_sold against the day of the year, one line per country

plt.figure(figsize=(14, 8))
day_year_country_ax = sns.lineplot(
    data=train,
    x="day_year",
    y="num_sold",
    hue="country",
)
day_year_country_ax.set_title('Sales according to the day of the year and the country')

In [None]:
# Line plot of num_sold against the day of the year, one line per store

plt.figure(figsize=(14, 8))
day_year_store_ax = sns.lineplot(
    data=train,
    x="day_year",
    y="num_sold",
    hue="store",
)
day_year_store_ax.set_title('Sales according to the day of the year and the store')

In [None]:
# Line plot of num_sold against the day of the year, one line per product

plt.figure(figsize=(14, 8))
day_year_product_ax = sns.lineplot(
    data=train,
    x="day_year",
    y="num_sold",
    hue="product",
)
day_year_product_ax.set_title('Sold by type of product depending on the day of the year')

In [None]:
# Box plot of the num_sold distribution for each country

sns.catplot(
    data=train,
    kind="box",
    x="country",
    y="num_sold",
    height=8,
    aspect=1,
)
plt.title('Boxplot - Sold per country')

In [None]:
# Box plot of num_sold for each country, split by year

sns.catplot(
    data=train,
    kind="box",
    x="country",
    y="num_sold",
    hue="year",
    height=10,
    aspect=1,
)
plt.title('Boxplot - Sold per country and per year')

In [None]:
# Box plot of num_sold for each country, split by store

sns.catplot(
    data=train,
    kind="box",
    x="country",
    y="num_sold",
    hue="store",
    height=10,
    aspect=1,
)
plt.title('Boxplot - Sold by country and by store')

In [None]:
# Box plot of num_sold for each store, split by year

sns.catplot(
    data=train,
    kind="box",
    x="store",
    y="num_sold",
    hue="year",
    height=10,
    aspect=1,
)
plt.title('Boxplot - Sold per store and per year')

In [None]:
# Box plot of num_sold for each country, split by product

sns.catplot(
    data=train,
    kind="box",
    x="country",
    y="num_sold",
    hue="product",
    height=10,
    aspect=1,
)
plt.title('Boxplot - Sold by country and by product')

In [None]:
# Box plot of num_sold for each product, split by year

sns.catplot(
    data=train,
    kind="box",
    x="product",
    y="num_sold",
    hue="year",
    height=10,
    aspect=1,
)
plt.title('Boxplot - Sold per product and per year')

In [None]:
# Box plot of num_sold for each store, split by product

sns.catplot(
    data=train,
    kind="box",
    x="store",
    y="num_sold",
    hue="product",
    height=10,
    aspect=1,
)
plt.title('Boxplot - Sold per store and per product')

## 3.2  Country Analysis

In [None]:
# Rows of the training set whose country is Finland

is_finland = train['country'] == 'Finland'
train_finland = train[is_finland]
train_finland.head()

In [None]:
# Rows of the training set whose country is Norway

is_norway = train['country'] == 'Norway'
train_norway = train[is_norway]
train_norway.head()

In [None]:
# Rows of the training set whose country is Sweden

is_sweden = train['country'] == 'Sweden'
train_sweden = train[is_sweden]
train_sweden.head()

In [None]:
# Distinct country values, in order of first appearance; the per-country loops iterate over these
country_column = train['country']
countrys = country_column.unique()

### Sales by year

In [None]:
# One figure per country: density of num_sold, with one curve per year
for country_name in countrys:
    country_sales = train[train['country'] == country_name]
    plt.figure(figsize=(12, 8))
    year_kde_ax = sns.kdeplot(data=country_sales, x="num_sold", hue="year", palette="tab10")
    year_kde_ax.set_title(' KDE - Solds by year - ' + country_name)

###  Sold by Months

In [None]:
# One figure per country: box plot of num_sold for each month
for country_name in countrys:
    country_sales = train[train['country'] == country_name]
    sns.catplot(
        data=country_sales,
        kind="box",
        x="month",
        y="num_sold",
        height=8,
        aspect=1,
    )
    plt.title('Boxplot - Sold by month - ' + country_name)

###  Sold by Months and store

In [None]:
# Quantity sold per month and store

# One figure per country: box plot of num_sold for each month, split by store
for country_name in countrys:
    country_sales = train[train['country'] == country_name]
    sns.catplot(
        data=country_sales,
        kind="box",
        x="month",
        y="num_sold",
        hue="store",
        height=10,
        aspect=1,
    )
    plt.title('Boxplot - Sold by month and store - ' + country_name)

###  Sold by week of the year

In [None]:
# Quantity sold per week

# One figure per country: box plot of num_sold for each week of the year
for country_name in countrys:
    country_sales = train[train['country'] == country_name]
    sns.catplot(
        data=country_sales,
        kind="box",
        x="week",
        y="num_sold",
        height=12,
        aspect=1,
    )
    plt.title('Boxplot - Sold by week - ' + country_name)

### Sold by day of the month

In [None]:
# Quantity sold per day of the month

# One figure per country: box plot of num_sold for each day of the month
for country_name in countrys:
    country_sales = train[train['country'] == country_name]
    sns.catplot(
        data=country_sales,
        kind="box",
        x="day_month",
        y="num_sold",
        height=10,
        aspect=1,
    )
    plt.title('Boxplot - Sold by day of the month - ' + country_name)

### Sales by store depending on the day of the year

In [None]:
# One figure per country: num_sold against the day of the year, one line per store
for country_name in countrys:
    country_sales = train[train['country'] == country_name]
    plt.figure(figsize=(14, 8))
    store_line_ax = sns.lineplot(data=country_sales, x="day_year", y="num_sold", hue="store")
    store_line_ax.set_title('Sales according to the day of the year and store - ' + country_name)

### Sales by product depending on the day of the year

In [None]:
# One figure per country: num_sold against the day of the year, one line per product
for country_name in countrys:
    country_sales = train[train['country'] == country_name]
    plt.figure(figsize=(14, 8))
    product_line_ax = sns.lineplot(data=country_sales, x="day_year", y="num_sold", hue="product")
    product_line_ax.set_title('Sales according to the day of the year and product - ' + country_name)