In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
%matplotlib inline
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
import time
import os
from itertools import product
from pylab import rcParams
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_log_error
from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess, Fourier

# A report on Kaggle sales in Sweden, Finland and Norway

### We will go through the sales of each country to see which of the three has the best turnover of Kaggle products.
### We also want to see which product is the most sold.

### In short, this report goes into detail about the Kaggle sales in these three countries.

![](https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSaTEtuo6o3PiveRP343REE0_VpnBNpjSPxWqvh1zcMulH6hz2MwTaDDHY&s=10)

In [None]:
import os

# Print the full path of every file found below the Kaggle input directory,
# so the available dataset files are visible in the cell output.
for root, _subdirs, file_names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in file_names):
        print(path)

# Folder holding the competition CSV files (relative to the working directory).
datafolder = '../input/tabular-playground-series-jan-2022/'

In [None]:
# Read the three competition files from `datafolder`, in the order
# test -> train -> sample submission.
df_test, df_train, sample_submission = (
    pd.read_csv(os.path.join(datafolder, csv_name))
    for csv_name in ('test.csv', 'train.csv', 'sample_submission.csv')
)

In [None]:
# Preview the first rows of the training data (rendered as the cell output).
df_train.head()

In [None]:
from IPython.core.display import HTML

def multi_table(table_list):
    """Place several tables next to each other in a single HTML table row.

    Parameters
    ----------
    table_list : list
        Objects that provide a ``_repr_html_()`` method; each one is rendered
        into its own ``<td>`` cell, in list order.

    Returns
    -------
    HTML
        A one-row table (white background) with one cell per input table.
    """
    cells = []
    for table in table_list:
        cells.append('<td>' + table._repr_html_() + '</td>')
    row_markup = ''.join(cells)
    return HTML('<table><tr style="background-color:white;">' + row_markup + '</tr></table>')

In [None]:
# Working copy of the training data without any row that contains a missing value.
clean=df_train.dropna()

In [None]:
# Show the column names of the cleaned training data.
clean.columns

In [None]:
# One value-count table per column of the cleaned training data, keyed by column name.
# A tuple (not a set literal) is iterated so the dict is built in the same order
# on every run; set iteration order for strings changes between kernel sessions.
clean_nunique = {
    var: clean[var].value_counts().to_frame()
    for var in ('row_id', 'date', 'country', 'store', 'product', 'num_sold')
}

In [None]:
multi_table=([clean_nunique['row_id'],clean_nunique['date'],clean_nunique['country'], 
            clean_nunique['store'],clean_nunique['product'],clean_nunique['num_sold']])


In [None]:
multi_table 

In [None]:
 MONTHS = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
LINEWIDTH=2
ALPHA=.6

dfp = clean[['row_id','date' ,'country','store','product','num_sold']].copy()


In [None]:
# Extract the year and the month from the date column into independent columns.
dfp['date'] = pd.to_datetime(dfp['date'])
# Vectorised .dt accessors are used throughout instead of a per-row Python lambda.
dfp['year_month'] = dfp['date'].dt.strftime('%Y-%m')
dfp['year'] = dfp['date'].dt.year
dfp['month'] = dfp['date'].dt.month
dfp

In [None]:
# Total items sold per (year, month).
# `num_sold` is selected BEFORE summing so that only this column is aggregated;
# summing the whole frame would also try to sum the datetime and text columns.
dfp_trend = dfp.groupby(['year', 'month'])['num_sold'].sum().reset_index()

In [None]:
# Total items sold per (year, month, country).
# `num_sold` is selected BEFORE summing so that only this column is aggregated;
# summing the whole frame would also try to sum the datetime and text columns.
dfp_trend_country = dfp.groupby(['year', 'month', 'country'])['num_sold'].sum().reset_index()

## Sales per Country

In [None]:
# Display the monthly totals per country (rendered as the cell output).
dfp_trend_country 

### Plot of Sales per Month from 2015 till 2018

In [None]:
# Monthly totals over all countries, one line per year (2015-2018), so the
# seasonal pattern can be compared across years.
year_colors = {2015: 'steelblue', 2016: 'seagreen', 2017: 'pink', 2018: 'blue'}

fig, ax = plt.subplots(figsize=(16, 6))
for year, color in year_colors.items():
    yearly = dfp_trend[dfp_trend.year == year]
    # Plot against the month NUMBER so a year with fewer than 12 rows still
    # lands on the correct months instead of failing on a length mismatch.
    ax.plot(yearly.month, yearly.num_sold, '-o', color=color,
            linewidth=LINEWIDTH, alpha=ALPHA, label=str(year))

# Month numbers 1..12 on the axis, labelled with their names.
ax.set_xticks(range(1, len(MONTHS) + 1))
ax.set_xticklabels(MONTHS)
ax.set_title('Sales per month')
# num_sold is a count of items, not a currency amount.
ax.set_ylabel('Number of items sold')
ax.grid(axis='y', color='gray', alpha=.2)

for spine in ax.spines.values():
    spine.set_visible(False)
ax.legend(loc=2, title='Legend')
plt.show()

## Country-Specific Sales

### Finland 
### Norway 
### Sweden 




In [None]:
# Monthly totals restricted to Finland; the last line renders them as the cell output.
is_finland = dfp_trend_country['country'] == 'Finland'
dfp_trend_country2 = dfp_trend_country.loc[is_finland]
dfp_trend_country2

### Finland Monthly Sales

In [None]:
# Finland: monthly items sold, one line per year (2015-2018).
year_colors = {2015: 'steelblue', 2016: 'seagreen', 2017: 'pink', 2018: 'blue'}

fig, ax = plt.subplots(figsize=(16, 6))
for year, color in year_colors.items():
    yearly = dfp_trend_country2[dfp_trend_country2.year == year]
    # Plot against the month NUMBER so a year with fewer than 12 rows still
    # lands on the correct months instead of failing on a length mismatch.
    ax.plot(yearly.month, yearly.num_sold, '-o', color=color,
            linewidth=LINEWIDTH, alpha=ALPHA, label=str(year))

# Month numbers 1..12 on the axis, labelled with their names.
ax.set_xticks(range(1, len(MONTHS) + 1))
ax.set_xticklabels(MONTHS)
ax.set_title('FINLAND SALES PER MONTH')
# num_sold is a count of items, not a currency amount.
ax.set_ylabel('Number of items sold')
ax.grid(axis='y', color='gray', alpha=.2)

for spine in ax.spines.values():
    spine.set_visible(False)
ax.legend(loc=2, title='Legend')
plt.show()

 #### In Finland, around 70k items were sold in Jan 2018, with sales peaking at around 75k in Dec.
 #### The graph clearly shows that sales have been increasing since 2015.
 #### In Jan 2015 sales were just below 60k, and they were slightly above 60k at the end of that year.
 

### Norway Monthly Sales 

In [None]:
# Monthly totals restricted to Norway; the last line renders them as the cell output.
is_norway = dfp_trend_country['country'] == 'Norway'
dfp_trend_country3 = dfp_trend_country.loc[is_norway]
dfp_trend_country3

In [None]:
# Norway: monthly items sold, one line per year (2015-2018).
year_colors = {2015: 'steelblue', 2016: 'seagreen', 2017: 'pink', 2018: 'blue'}

fig, ax = plt.subplots(figsize=(16, 6))
for year, color in year_colors.items():
    yearly = dfp_trend_country3[dfp_trend_country3.year == year]
    # Plot against the month NUMBER so a year with fewer than 12 rows still
    # lands on the correct months instead of failing on a length mismatch.
    ax.plot(yearly.month, yearly.num_sold, '-o', color=color,
            linewidth=LINEWIDTH, alpha=ALPHA, label=str(year))

# Month numbers 1..12 on the axis, labelled with their names.
ax.set_xticks(range(1, len(MONTHS) + 1))
ax.set_xticklabels(MONTHS)
ax.set_title('NORWAY SALES PER MONTH')
# num_sold is a count of items, not a currency amount.
ax.set_ylabel('Number of items sold')
ax.grid(axis='y', color='gray', alpha=.2)

for spine in ax.spines.values():
    spine.set_visible(False)
ax.legend(loc=2, title='Legend')
plt.show()

 #### In Norway, just above 115k items were sold in Jan 2018, with sales peaking at around 120k in Dec.
 #### The graph clearly shows that sales have been increasing since 2015.
 #### In Jan 2015 sales were just below 100k, and they were at 100k at the end of that year.
 #### Its lowest monthly sales were around 70k, making Norway a better performer than both Finland and Sweden; Finland, for comparison, has a recorded sales low of just above 40k.
 

### Sweden Monthly Sales 

In [None]:
# Monthly totals restricted to Sweden; the last line renders them as the cell output.
is_sweden = dfp_trend_country['country'] == 'Sweden'
dfp_trend_country4 = dfp_trend_country.loc[is_sweden]
dfp_trend_country4

In [None]:
# Sweden: monthly items sold, one line per year (2015-2018).
year_colors = {2015: 'steelblue', 2016: 'seagreen', 2017: 'pink', 2018: 'blue'}

fig, ax = plt.subplots(figsize=(16, 6))
for year, color in year_colors.items():
    yearly = dfp_trend_country4[dfp_trend_country4.year == year]
    # Plot against the month NUMBER so a year with fewer than 12 rows still
    # lands on the correct months instead of failing on a length mismatch.
    ax.plot(yearly.month, yearly.num_sold, '-o', color=color,
            linewidth=LINEWIDTH, alpha=ALPHA, label=str(year))

# Month numbers 1..12 on the axis, labelled with their names.
ax.set_xticks(range(1, len(MONTHS) + 1))
ax.set_xticklabels(MONTHS)
ax.set_title('SWEDEN SALES PER MONTH')
# num_sold is a count of items, not a currency amount.
ax.set_ylabel('Number of items sold')
ax.grid(axis='y', color='gray', alpha=.2)

for spine in ax.spines.values():
    spine.set_visible(False)
ax.legend(loc=2, title='Legend')
plt.show()

 #### In Sweden, the highest number of items sold was just above 85k, in Dec 2017.
 #### The graph clearly shows that sales have been increasing since 2015.
 #### In Jan 2015 sales were around 70k, and they were slightly above 70k at the end of that year.
 #### Its lowest monthly sales were around 50k, in Sep, making Sweden a better performer than Finland, which has a recorded sales low of just above 40k.
 

In [None]:
# Additional plotting libraries: seaborn and plotly.
import seaborn as sns
# Apply seaborn's default theme.
# NOTE(review): this presumably only restyles matplotlib figures created after
# this cell, so the plots above and below may look different — confirm.
sns.set()
import plotly.graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff

## Country Distribution 

In [None]:
# Share of rows in the cleaned data contributed by each country, drawn as a donut chart.
tmp = clean['country'].value_counts()
labels = tmp.index.to_numpy()
sizes = (tmp / tmp.sum() * 100).to_numpy()  # row counts converted to percentages

donut = go.Pie(labels=labels, values=sizes, hole=.4)
# Text annotation positioned at the centre of the donut.
center_note = dict(text='Country Distribution', x=0.5, y=0.52, font_size=40, showarrow=False)

fig = go.Figure(data=[donut])
fig.update_layout(title_text="COUNTRIES ", annotations=[center_note])
fig.show()

## Sales Comparison, Monthly for all Countries 

In [None]:
# 2018 only: monthly items sold, one line per country (Finland, Norway, Sweden).
country_colors = {'Finland': 'seagreen', 'Norway': 'pink', 'Sweden': 'blue'}
sales_2018 = dfp_trend_country[dfp_trend_country.year == 2018]

fig, ax = plt.subplots(figsize=(16, 6))
for country, color in country_colors.items():
    monthly = sales_2018[sales_2018.country == country]
    # Plot against the month NUMBER so a country with fewer than 12 rows still
    # lands on the correct months instead of failing on a length mismatch.
    ax.plot(monthly.month, monthly.num_sold, '-o', color=color,
            linewidth=LINEWIDTH, alpha=ALPHA, label=country)

# Month numbers 1..12 on the axis, labelled with their names.
ax.set_xticks(range(1, len(MONTHS) + 1))
ax.set_xticklabels(MONTHS)
ax.set_title('Sales Comparison 2018')
# num_sold is a count of items, not a currency amount.
ax.set_ylabel('Number of items sold')
ax.grid(axis='y', color='gray', alpha=.2)

for spine in ax.spines.values():
    spine.set_visible(False)
ax.legend(loc=2, title='Legend')
plt.show()

#### In 2018 the performance gap is clear: Norway outperformed both Finland and Sweden throughout the entire year.
#### On average, Norway outsells each of the other two countries by tens of thousands of items each month (for example, roughly 115k against 70k for Finland in Jan 2018).
#### Sales in Finland are the lowest overall and need more attention.
#### Customer incentives or promotional content may be needed to boost overall sales in Finland, and possibly in Sweden as well.

In [None]:
# 2015 only: monthly items sold, one line per country (Finland, Norway, Sweden).
country_colors = {'Finland': 'seagreen', 'Norway': 'pink', 'Sweden': 'blue'}
sales_2015 = dfp_trend_country[dfp_trend_country.year == 2015]

fig, ax = plt.subplots(figsize=(16, 6))
for country, color in country_colors.items():
    monthly = sales_2015[sales_2015.country == country]
    # Plot against the month NUMBER so a country with fewer than 12 rows still
    # lands on the correct months instead of failing on a length mismatch.
    ax.plot(monthly.month, monthly.num_sold, '-o', color=color,
            linewidth=LINEWIDTH, alpha=ALPHA, label=country)

# Month numbers 1..12 on the axis, labelled with their names.
ax.set_xticks(range(1, len(MONTHS) + 1))
ax.set_xticklabels(MONTHS)
ax.set_title('Sales Comparison 2015')
# num_sold is a count of items, not a currency amount.
ax.set_ylabel('Number of items sold')
ax.grid(axis='y', color='gray', alpha=.2)

for spine in ax.spines.values():
    spine.set_visible(False)
ax.legend(loc=2, title='Legend')
plt.show()

In [None]:
# Total items sold per product, as a one-column DataFrame indexed by product.
# `num_sold` is selected BEFORE summing so that only this column is aggregated;
# summing the whole frame would also try to sum the datetime and text columns.
Product_Sales = dfp.groupby('product')['num_sold'].sum().to_frame()

### Most sold Kaggle Product

In [None]:
# Rank the products by total items sold (highest first) and draw the first
# ten rows of that ranking as a bar chart.
Sorted_Prod = Product_Sales.sort_values(by='num_sold', ascending=False)
Sorted_Prod.head(10).plot.bar(figsize=(13, 6), color='red', title='Sales per product')



#### The most sold product is the Kaggle Hat, with sales above 500k, followed by the Mug with sales in excess of 300k; the Sticker is the lowest-performing of the three products, with sales of about 150k.


### Modeling Sales 
#### tgmd method.

In [None]:

# Fill the submission by exact lookup: for every test row, take the value in the
# LAST column of the FIRST training row whose feature columns are identical.
# Feature columns are all columns after the leading id column, matched by position.
# Test rows without an identical training row keep the value already present
# in sample_submission.
#
# A single hash-based merge is used instead of a triple nested Python loop over
# .iloc (len(test) x len(train) x n_columns scalar lookups); the result is the same.
# Assumes df_test has at least one feature column and that the feature columns
# have matching dtypes in both frames.
# NOTE(review): if no test row has an exact twin in train (for example when the
# test dates lie outside the training period) nothing is written here — confirm
# that this lookup is the intended prediction method.
key_names = list(df_test.columns[1:])

# Training-side lookup table: the same column positions, renamed to the test
# column names, plus the positional index of every training row.
train_keys = df_train.iloc[:, 1:1 + len(key_names)].copy()
train_keys.columns = key_names
train_keys['_train_pos'] = np.arange(len(df_train))
train_keys = (
    train_keys
    .dropna(subset=key_names)                         # with `==`, a missing value never equals anything
    .drop_duplicates(subset=key_names, keep='first')  # only the first matching training row counts
)

# The right-hand keys are unique, so the left merge yields exactly one row per
# test row, in test order.
matched_pos = df_test[key_names].merge(train_keys, on=key_names, how='left')['_train_pos']
has_match = matched_pos.notna().to_numpy()

if has_match.any():
    test_rows = np.flatnonzero(has_match)
    train_rows = matched_pos.to_numpy()[has_match].astype(int)
    # Values are read straight from df_train so their original dtype is preserved.
    sample_submission.iloc[test_rows, 1] = df_train.iloc[train_rows, -1].to_numpy()

In [None]:
# Write the submission file. The method has to be CALLED: a bare
# `sample_submission.to_csv` only displays the bound method and writes nothing.
# index=False keeps the DataFrame index out of the CSV.
sample_submission.to_csv('submission.csv', index=False)