In [None]:

import pandas as pd
import numpy as np


import matplotlib.pyplot as plt
import seaborn as sns

# Index
- 1. [Descriptive analysis (univariate analysis)](#descriptive-analysis) 
    - 1.1 [Common Data](#common-data)
        - 1.1.1 [Data type](#common-data)
        - 1.1.2 [Missing value](#common-data)
    - 1.2 [Numerical Data](#numerical-data)
        - 1.2.1 Quantile statistics
        - 1.2.2 Descriptive analysis
        - 1.2.3 [Distribution Analysis](#distribution-analysis)
    - 1.3 [Categorical Data](#categorical-data)
        - 1.3.1 [Cardinality](#categorical-data)
        - 1.3.2 [Unique count](#categorical-data)
- 2. [Some basic Understanding about the data](#basic-uderstanding)
- 3. [Correlation Analysis (bivariate analysis)](#Correlation-analysis)


# Basic

In [None]:
# Load the competition training set and preview the first rows.
train_csv_path = '../input/tabular-playground-series-jan-2022/train.csv'
data = pd.read_csv(train_csv_path)
data.head()

In [None]:
# Report the size of the training frame (rows, then columns).
n_samples, n_columns = data.shape
print("data number of samples---->", n_samples)
print("data number of columns---->", n_columns)

In [None]:
# Basic feature engineering: split the date into calendar components so that
# seasonality can be analysed per component (day, month, year, weekday, ...).
data['date'] = pd.to_datetime(data["date"])

data["day"] = data['date'].dt.day
data["month"] = data['date'].dt.month
data["year"] = data['date'].dt.year

data['dayofweek'] = data['date'].dt.dayofweek  # day of the week, Monday = 0 to Sunday = 6
# NOTE: despite its name, this column holds the NUMBER OF DAYS in the month (28-31);
# the day within the month is already stored in "day". The column name is kept so
# that later cells relying on the column layout keep working.
data['dayofmonth'] = data['date'].dt.days_in_month
data['dayofyear'] = data['date'].dt.dayofyear  # day in the year
# `Series.dt.weekofyear` was removed in pandas 2.0; `dt.isocalendar().week` is the
# replacement. It returns a nullable UInt32 column, so cast back to a plain integer.
data['weekofyear'] = data['date'].dt.isocalendar().week.astype("int64")  # ISO week of the year

# data.drop(["date"],axis=1,  inplace = True) # change the original data
data.head(5)

<a id="descriptive-analysis"></a>
## 1. Descriptive analysis (Univariate Analysis)

<a id="common-data"></a>
### 1.1 Common Data Analysis

In [None]:
def Common_data_analysis(data):
    print("{:=^100}".format(" Common data analysis "))
    # default settings.
    column = data.columns
    total_samples = data.shape[0]
    value_dict = {}
    
    # calculate values.
    missing_values = data.isnull().sum().values
    missing_value_percentage = [round((col_missing_count / total_samples) * 100, 2) for col_missing_count in missing_values ]
    datatype = [data.iloc[:,i].dtype for i in range(data.shape[1])]
    
    categorical_data = list(data.loc[:,data.dtypes == 'object'].columns)
    numerical_data = [d for d  in column if d not in categorical_data]
    # print the diff datatype and count.
    print()
    print("Numerical data list {} ---> total {} numerical values".format(numerical_data, len(numerical_data)))
    print("Categorical data list {} ---> total {} categorical values".format(categorical_data, len(categorical_data)))
    print()
    
    # organise values.
    value_dict["data type"] = datatype
    value_dict["Missing Value"] = missing_values
    value_dict["% of Missing value"] = missing_value_percentage
    df = pd.DataFrame(value_dict, columns = value_dict.keys(), index = column)
    
    # make a highlight for col has high missing value percentage. (>55% say)
    # the particular row will be highlighted if it above missing value threshold.
    def highlight_high_missing_value(sample):
        threshold = 0.55
        style = sample.copy()
        highlight = 'background-color: red;'
        if sample[2] > threshold:
            style[:] = highlight
        else:
            style[:] = ''
        return style
    df = df.style.apply(highlight_high_missing_value, axis = 1)
    display(df)
    return (column, categorical_data, numerical_data)

# Run the common analysis and keep the returned column groupings
# (numerical_data is passed to numerical_data_analysis further down).
data_columns, categorical_data, numerical_data = Common_data_analysis(data)


In [None]:
# # visualization for missing value.
# plt.figure(figsize=(15,10))
# sns.heatmap(data.isnull().transpose(),
#             cbar_kws={'label': 'Missing Data'})
# plt.title('Heatmap showing Missing Values ', weight = 'bold', size = 20, color = 'red')
# plt.xticks(size = 12)
# plt.yticks(size = 12)
# plt.show()

### Observation.
There are only 3 categorical columns and, happily, there are no missing values :) 

<a id="numerical-data"></a>
### 1.2 Numerical Data

In [None]:
def numerical_data_analysis(data, numerical_data):
    """Display a table of summary statistics for the numerical columns.

    The table has one row per column of ``data`` (the ``"date"`` column, if
    present, is skipped). Columns listed in ``numerical_data`` get min, max,
    range, mean, standard deviation, quartiles, inter-quartile range, kurtosis
    and skewness; every other column is filled with the string ``"NA"``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to analyse.
    numerical_data : collection of column names
        Columns for which statistics are computed.

    Returns
    -------
    None
        The table is shown with ``display`` and not returned.
    """
    print("{:=^100}".format(" Numerical data analysis "))

    # Skip "date" without requiring it to exist (list.remove would raise ValueError).
    column = [col for col in data.columns if col != "date"]

    def per_column(statistic):
        """Apply `statistic` to each numerical column; use "NA" for the others."""
        return [statistic(data[col]) if col in numerical_data else "NA" for col in column]

    # Dict insertion order defines the column order of the displayed table.
    df_dict = {
        "min": per_column(lambda s: s.min()),
        "max": per_column(lambda s: s.max()),
        "range(max-min)": per_column(lambda s: s.max() - s.min()),
        "mean/average": per_column(lambda s: s.mean()),
        "standard deviation": per_column(lambda s: s.std()),
        "Q1": per_column(lambda s: s.quantile(0.25)),
        "meadian/Q2": per_column(lambda s: s.quantile(0.50)),
        "Q3": per_column(lambda s: s.quantile(0.75)),
        "Inter quantile range": per_column(lambda s: s.quantile(0.75) - s.quantile(0.25)),
        "kurtosis": per_column(lambda s: s.kurtosis()),
        "Skewness": per_column(lambda s: s.skew()),
    }

    df = pd.DataFrame(df_dict, columns=df_dict.keys(), index=column)
    display(df)

# Display the summary-statistics table; columns not listed in numerical_data are filled with "NA".
numerical_data_analysis(data, numerical_data)

### Observation.
Skewness is an important measure here because it tells us how symmetric each feature's distribution is. A skewness value within [-1, +1] indicates an approximately symmetric distribution (as a normal distribution is), whereas values outside that range indicate strong skew. All of our features fall within this range, so they are more or less symmetric.

<a id="distribution-analysis" ></a>
### 1.2.3 Distribution Analysis

In [None]:
# find the distubution of the data. ( visualization would be so good)
column = data.columns
plt.figure
fig, ax = plt.subplots(3,4 , figsize=(15,15))
# we have 9 numerical values.
col, row = 4,3
col_count = 0
for r in range(row):
    for c in range(col):
        sns.histplot(data=data, x=column[col_count], ax=ax[r,c])
        col_count +=1


### Observation.
Almost all the features are uniformly distributed. 
All the categorical values are uniformly distributed --> meaning each class in the categorical data has the same number of samples :)
num_sold has a few outliers and is slightly skewed to the right. But since it is the target value, we don't need to handle that skewness :)


The data looks clean under univariate analysis. No further preprocessing, such as missing-value handling or skewness handling, needs to be done.

<a id="categorical-data"></a>
### 1.3 Categorical data.

We found that all the categorical data are uniformly distributed --> so we don't need a cardinality analysis anymore.

<a id="basic-uderstanding" ></a>
## 2. Some basic Understanding about the data

### 1. sales based on country

In [None]:
# Bar plot of num_sold for each country.
sns.barplot(x="country", y="num_sold", data=data)

Norway has the highest sales.

### 2. sales on every year 

In [None]:
# Bar plot of num_sold for each country, split by year.
sns.barplot(x="country", y="num_sold", hue="year", data=data)


Sales increased every year in every country. This is normal: year after year people love to buy things, so sales increase. 

### 3. yearly sales based on stores

In [None]:
# Bar plot of num_sold for each country, split by store.
sns.barplot(x="country", y="num_sold", hue="store", data=data)

KaggleRama is doing better in all the years.
KaggleRama has roughly 40% higher sales than KaggleMart.
This can be because of many reasons --
- KaggleRama may have larger stores with lots of items (we don't have information about the stock)
- KaggleRama may have more branches than KaggleMart (we don't have that information either)
- Good quality and service could also be a reason.

### 4. Monthly Store sales

In [None]:
# Total num_sold per (date, store), drawn as one line per store.
plt.figure(figsize=(15, 8))
group_by_data = data.groupby(['date', 'store'])[['num_sold']].sum()
sns.lineplot(x="date", y="num_sold", hue="store", data=group_by_data)

In [None]:
# Total num_sold per (month, store), drawn as one line per store.
plt.figure(figsize=(5, 5))
group_by_data = data.groupby(['month', 'store'])[['num_sold']].sum()
sns.lineplot(x="month", y="num_sold", hue="store", data=group_by_data)

We can clearly see that sales increase in months 3-5 and in month 12 every year at both stores. <br/>
The likely reason is festivals (seasons). <br/>
Sales start increasing at a high rate in month 11 and are very high in month 12 (this may be because of Christmas).

### 5. Product based sales

In [None]:
# Total num_sold per (month, product), drawn as one line per product.
plt.figure(figsize=(5, 5))
group_by_data = data.groupby(['month', 'product'])[['num_sold']].sum()
sns.lineplot(x="month", y="num_sold", hue="product", data=group_by_data)

The Kaggle Hat has larger sales than the other two products. <br/>
Kaggle Hat sales increase in months 3-6 and 12. The other products' sales neither increase nor decrease much; they stay stable. <br/>
All product sales decrease in February, which also deserves attention (but we can't tell for sure what the reason may be).

### 5.2 Product based on each country sales 

In [None]:
# One subplot per country; in each, plot num_sold against month for every product.
fig, ax = plt.subplots(3, 1, figsize=(10, 12))
country = ["Norway", "Sweden", "Finland"]
products = ["Kaggle Hat", "Kaggle Mug", "Kaggle Sticker"]
for axis, country_name in zip(ax, country):
    in_country = data["country"] == country_name
    for product_name in products:
        # Rows for this (country, product) pair, in their original order.
        subset = data[in_country & (data["product"] == product_name)]
        axis.plot(subset["month"], subset["num_sold"], label=product_name)
    axis.title.set_text("Prodcut based sales on country --> {}".format(country_name))
    axis.set_ylabel('Sales')
    axis.set_xlabel('Month')
    axis.legend()

### 5.3 product based on stores

In [None]:
#g = data.groupby(["store", "product"]).agg(num_sold=('num_sold','sum'))
# Bar plot of num_sold for each store, split by product.
sns.barplot(x="store", y="num_sold", hue="product", data=data)

### 6 which days of month has more sales

In [None]:
# num_sold against day of month, one line per product.
# The former `g = data.groupby("day").sum()` was removed: its result was never
# used, and on pandas >= 2.0 summing the datetime "date" column raises TypeError.
sns.lineplot(data=data, x="day", y="num_sold", hue="product")

Interesting: people buy a little more at month ends :) (can't tell the exact reason for this behaviour)

### 7. Week days based sales

In [None]:
# num_sold against day of week (Monday = 0 ... Sunday = 6), one line per product.
# The former `g = data.groupby("dayofweek").sum()` was removed: its result was never
# used, and on pandas >= 2.0 summing the datetime "date" column raises TypeError.
sns.lineplot(data=data, x="dayofweek", y="num_sold", hue="product")

Not surprisingly, sales are high at weekends. There are lots of possible reasons for this.

<a id="Correlation-analysis"></a>
## 3. Correlation Analysis

In [None]:
# Pairwise correlation of the numeric columns, shown as an annotated heatmap.
fig = plt.figure(figsize=(18, 12))
# Restrict to numeric columns explicitly: since pandas 2.0, DataFrame.corr() no longer
# drops string columns silently and raises on them instead.
numeric_corr = data.select_dtypes(include="number").corr()
# Correlations lie in [-1, 1]; vmin=0 would render every negative value like 0,
# so use the full range centred on 0.
sns.heatmap(data=numeric_corr, annot=True, vmin=-1, vmax=1, center=0)

Not a single feature is strongly correlated with the target value. We need to do something about this in the feature engineering stage.

In [None]:
# Feature engineering loading ....