# 0.1 Importing/Installing libraries

In [None]:
!pip install -q jupyterthemes
!jt -t onedork

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import numpy as np
import scipy
from datetime import datetime
from jupyterthemes import jtplot
# NOTE(review): presumably applies the jupyterthemes plotting style to matplotlib
# so that figures match the notebook theme — confirm against the jtplot docs.
jtplot.style()


# 0.2 Reading Data

In [None]:
# Both competition files are read from the same Kaggle input directory.
DATA_DIR = '../input/tabular-playground-series-jan-2022'

train_df, test_df = (
    pd.read_csv(f'{DATA_DIR}/{split}.csv') for split in ('train', 'test')
)

print(f'Size of train dataset: {len(train_df)}')

# 1 Exploration of each column

## 1.1 Date Column

In [None]:
# Earliest and latest values found in the raw (string) date column.
date_col = train_df['date']
first_date, last_date = date_col.min(), date_col.max()
print(f"Start date: {first_date}")
print(f"End date: {last_date}")

### Creating NumDaysFromStart and datetime Column

In [None]:
# Parse the '%Y-%m-%d' date strings once, vectorised, instead of calling
# datetime.strptime row by row (twice) in Python-level loops.
train_df['datetime'] = pd.to_datetime(train_df['date'], format='%Y-%m-%d')

# Measure from the earliest date in the data rather than from whatever happens
# to sit in row 0, so the offsets are still correct if the rows are not sorted.
start_date = train_df['datetime'].min()
train_df['NumDaysFromStart'] = (train_df['datetime'] - start_date).dt.days

### Checking if data is a contiguous range of dates

In [None]:
# The dates are contiguous when the distinct day offsets are exactly 0, 1, ..., n-1.
# Sort first: unique() returns values in order of appearance, so a complete but
# unsorted frame would otherwise be reported as non-contiguous.
day_offsets = np.sort(train_df['NumDaysFromStart'].unique())
np.array_equal(day_offsets, np.arange(len(day_offsets)))

## 1.2 Other Columns (Country, Store and Product)

In [None]:
country_col = train_df['country']
print(f"Unique values in country: \n{country_col.unique()}\n")
# Observation: same amount of data in each country
print(f"Value counts in country: \n{country_col.value_counts()}")

In [None]:
store_col = train_df['store']
print(f"Unique values in store: \n{store_col.unique()}\n")
# Observation: same amount of data in each store
print(f"Value counts in store: \n{store_col.value_counts()}")

In [None]:
product_col = train_df['product']
print(f"Unique values in product: \n{product_col.unique()}\n")
# Observation: same amount of data in each product
print(f"Value counts in product: \n{product_col.value_counts()}")

## Conclusion: Training data contains <u>all</u> combinations of (date, country, store, product)
18 combinations for each day (3 countries × 2 stores × 3 products), no missing values :)

#  2 Studying how num_sold is affected by all other variables

In [None]:
def add_month_labels(ax=None):
    """Put one major x-axis tick on every month, labelled with its abbreviated name.

    Parameters
    ----------
    ax : matplotlib.axes.Axes, optional
        Axes whose x-axis is formatted. When omitted, the current pyplot axes
        (``plt.gca()``) are used, so existing zero-argument calls keep working.
    """
    if ax is None:
        ax = plt.gca()

    # MonthLocator places the ticks; the '%b' formatter renders them as Jan, Feb, ...
    ax.xaxis.set_major_locator(mdates.MonthLocator())
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%b'))

## 2.1 num_sold vs date

In [None]:
%matplotlib inline
mean_sold_per_day = train_df.groupby('datetime')['num_sold'].mean()
# new_years = train_df[train_df['date'].str.match('\d{4}-01-01')]['NumDaysFromStart'].unique()

plt.figure(figsize = (40,20))

plt.plot(mean_sold_per_day)
plt.title('Mean num_sold over time')
plt.ylabel('Mean num_sold')
plt.xlabel('Month')
add_month_labels()

#### Looks periodical with a slight positive gradient (perhaps growth of the website)
#### Spikes probably caused by holidays (biggest spikes are caused by New Year/Christmas)
#### Oscillates within a range of values, resembling a sine wave
#### Oscillations may be caused by variations within each week (e.g. weekdays vs weekends)
#### Let's try to fit a sine curve to it!

## Fitting sin curve to data using MSE

In [None]:
# code taken from here: https://newbedev.com/how-do-i-fit-a-sine-curve-to-my-data-with-pylab-and-np
def fit_sin(tt, yy):
    '''Fit ``A * sin(w*t + p) + c`` to a time sequence by least squares.

    Parameters
    ----------
    tt : array-like
        Sample times. Uniform spacing is assumed: only ``tt[1] - tt[0]`` is
        used to build the FFT frequency grid for the initial guess.
    yy : array-like
        Observed values, one per entry of ``tt``.

    Returns
    -------
    dict
        Keys "amp", "omega", "phase", "offset", "freq", "period", "fitfunc"
        (callable evaluating the fitted curve), "maxcov" (largest entry of the
        parameter covariance matrix) and "rawres" (``(guess, popt, pcov)``).

    Raises
    ------
    ValueError
        If ``tt`` and ``yy`` have different shapes or hold fewer than two samples.
    '''
    # Import the submodule explicitly: a bare ``import scipy`` does not
    # guarantee that ``scipy.optimize`` is loaded on every SciPy version.
    from scipy.optimize import curve_fit

    tt = np.array(tt)
    yy = np.array(yy)
    if tt.shape != yy.shape:
        raise ValueError("tt and yy must have the same shape")
    if tt.size < 2:
        raise ValueError("at least two samples are required to estimate the spacing")

    # Initial guess: dominant non-zero FFT frequency, std-based amplitude, mean offset.
    ff = np.fft.fftfreq(len(tt), (tt[1]-tt[0]))   # assume uniform spacing
    Fyy = abs(np.fft.fft(yy))
    guess_freq = abs(ff[np.argmax(Fyy[1:])+1])   # excluding the zero frequency "peak", which is related to offset
    guess_amp = np.std(yy) * 2.**0.5
    guess_offset = np.mean(yy)
    guess = np.array([guess_amp, 2.*np.pi*guess_freq, 0., guess_offset])

    def sinfunc(t, A, w, p, c):
        return A * np.sin(w*t + p) + c

    popt, pcov = curve_fit(sinfunc, tt, yy, p0=guess)
    A, w, p, c = popt
    f = w/(2.*np.pi)

    def fitfunc(t):
        # Evaluate the model at the fitted parameters.
        return sinfunc(t, A, w, p, c)

    return {"amp": A, "omega": w, "phase": p, "offset": c, "freq": f, "period": 1./f, "fitfunc": fitfunc, "maxcov": np.max(pcov), "rawres": (guess,popt,pcov)}
# The integer day index 0..n-1 is used as the time axis for the fit.
day_index = np.arange(len(mean_sold_per_day))
fit = fit_sin(day_index, mean_sold_per_day)
fitted_curve = fit['fitfunc'](day_index)

print( "Amplitude=%(amp)s, Period=%(period)s, phase=%(phase)s, offset=%(offset)s, Max. Cov.=%(maxcov)s" % fit )

#### Period is very close to 365 days!

In [None]:
# Distinct dates in order of appearance; reused as the x-axis by later cells.
dates = train_df['datetime'].unique()

fig, ax = plt.subplots(figsize=(40, 20))
ax.plot(mean_sold_per_day, label='num_sold')
ax.plot(dates, fitted_curve, label='sin')
add_month_labels()
ax.set_title('Mean num_sold over time, with sin curve')
ax.set_ylabel('Mean num_sold')
ax.set_xlabel('Month')
ax.legend()

#### Fits pretty well at first, but fails to capture the small positive gradient
#### Tends to 'hug' the upper part of the curve, most likely due to holiday outliers

## Plotting by day of week

#### Creating DayofWeek and DayType columns

In [None]:
# Derive the weekday from the parsed timestamps instead of hard-coding that the
# first date is a Thursday, so the labels stay correct if the start date changes.
train_df['DayOfWeek'] = train_df['datetime'].dt.dayofweek + 1  # Mon is 1, Tues is 2 ... Sun is 7
# Saturday (6) and Sunday (7) form the weekend.
train_df['DayType'] = np.where(train_df['DayOfWeek'] >= 6, 'weekend', 'weekday')

In [None]:
# Daily mean of num_sold, split first by day type and then by day of week.
mean_sold_by_daytype, mean_sold_by_daynum = (
    train_df.groupby([key, 'datetime'])['num_sold'].mean()
    for key in ('DayType', 'DayOfWeek')
)

In [None]:
days = ['Mon', 'Tues', 'Wed', 'Thurs', 'Fri', 'Sat', 'Sun']

fig, ax = plt.subplots(figsize=(40, 20))
# DayOfWeek values run from 1 to 7, so number the labels starting at 1.
for daynum, day_label in enumerate(days, start=1):
    ax.plot(mean_sold_by_daynum[daynum], label=day_label)

ax.plot(mean_sold_per_day, label='All')
ax.set_title('Mean num_sold over time, by day of week')
ax.set_ylabel('Mean num_sold')
ax.set_xlabel('Month')
add_month_labels()
ax.legend()

#### Very distinct separation between weekdays and weekends
#### Fridays are also consistently higher than other weekdays

### Weekday vs Weekends

In [None]:
daytypes = ['weekday', 'weekend']

fig, ax = plt.subplots(figsize=(40, 20))
ax.plot(dates, mean_sold_per_day, label='All')
for daytype in daytypes:
    # x-values: the distinct dates on which this day type occurs.
    is_daytype = train_df['DayType'] == daytype
    daytype_dates = train_df.loc[is_daytype, 'datetime'].unique()
    ax.plot(daytype_dates, mean_sold_by_daytype[daytype], label=daytype)

ax.set_title('Mean num_sold over time, by day type')
ax.set_ylabel('Mean num_sold')
ax.set_xlabel('Month')
add_month_labels()
ax.legend()

#### Clear separation between weekdays and weekends

## 2.2 num_sold vs product

In [None]:
# Overall mean of num_sold for each product, shown as a bar chart.
mean_sold_per_product = train_df.groupby('product')['num_sold'].mean()

fig, ax = plt.subplots(figsize=(30, 15))
ax.set_ylabel('num_sold')
ax.set_title('Mean sold per product')
mean_sold_per_product.plot.bar(ax=ax)

#### Clear preference for Kaggle Hat

### num_sold over time for each product

In [None]:
products = ['Kaggle Hat', 'Kaggle Mug', 'Kaggle Sticker']
# Daily mean of num_sold within each product.
mean_sold_per_product_vs_date = train_df.groupby(['product', 'date'])['num_sold'].mean()

fig, ax = plt.subplots(figsize=(40, 20))
for product in products:
    product_series = mean_sold_per_product_vs_date[product]
    ax.plot(dates, product_series, label=product)

ax.set_title('Mean num_sold over time, by product')
ax.set_ylabel('Mean num_sold')
ax.set_xlabel('Month')
add_month_labels()
ax.legend()

* Kaggle Hats display the most sinusoidal properties
    * Possibly because it is most affected by seasons, more hats are needed during winter
* Kaggle Mugs are only slightly affected by seasons
* Kaggle Stickers are not affected at all by seasons
* Very clear separation between each product

In [None]:
# Daily mean of num_sold for every (product, day type) pair.
mean_sold_per_product_and_daytype_vs_date = train_df.groupby(['product', 'DayType', 'date'])['num_sold'].mean()

fig, ax = plt.subplots(figsize=(40, 20))
for product in products:
    per_product = mean_sold_per_product_and_daytype_vs_date[product]
    for daytype in daytypes:
        # x-values: the distinct dates on which this day type occurs.
        daytype_dates = train_df.loc[train_df['DayType'] == daytype, 'datetime'].unique()
        ax.plot(daytype_dates, per_product[daytype], label=f'{product}/{daytype}')

ax.set_title('Mean num_sold over time, by product and day type')
ax.set_ylabel('Mean num_sold')
ax.set_xlabel('Month')
add_month_labels()
ax.legend()

#### Kaggle Sticker graphs now look almost constant on non-outlier regions

## 2.3 num_sold vs country

In [None]:
# Overall mean of num_sold for each country, shown as a bar chart.
mean_sold_per_country = train_df.groupby('country')['num_sold'].mean()

fig, ax = plt.subplots(figsize=(30, 15))
ax.set_ylabel('num_sold')
ax.set_title('Mean sold per country')
mean_sold_per_country.plot.bar(ax=ax)

### num_sold vs date for each country

In [None]:
countries = ['Finland', 'Norway', 'Sweden']
# Daily mean of num_sold within each country.
num_sold_by_country_and_date = train_df.groupby(['country', 'date'])['num_sold']
mean_sold_per_country_vs_date = num_sold_by_country_and_date.mean()

In [None]:
# Plot against the actual dates rather than a hard-coded np.arange(1461), and
# label the figure the same way as the other time-series plots in this notebook.
fig, ax = plt.subplots(figsize=(40, 20))
for country in countries:
    ax.plot(dates, mean_sold_per_country_vs_date[country], label=country)

ax.set_title('Mean num_sold over time, by country')
ax.set_ylabel('Mean num_sold')
ax.set_xlabel('Month')
add_month_labels()
ax.legend()

## 2.4 num_sold vs store

In [None]:
# Overall mean of num_sold for each store, shown as a bar chart.
mean_sold_per_store = train_df.groupby('store')['num_sold'].mean()

fig, ax = plt.subplots(figsize=(30, 15))
ax.set_ylabel('num_sold')
ax.set_title('Mean sold per store')
mean_sold_per_store.plot.bar(ax=ax)

In [None]:
stores = ['KaggleMart', 'KaggleRama']
# Daily mean of num_sold within each store.
num_sold_by_store_and_date = train_df.groupby(['store', 'datetime'])['num_sold']
mean_sold_per_store_vs_date = num_sold_by_store_and_date.mean()

In [None]:
# `stores` is already defined in the previous cell, so it is not redefined here.
# Title and axis labels are added to match the other time-series plots.
fig, ax = plt.subplots(figsize=(40, 20))
for store in stores:
    ax.plot(dates, mean_sold_per_store_vs_date[store], label=store)

ax.set_title('Mean num_sold over time, by store')
ax.set_ylabel('Mean num_sold')
ax.set_xlabel('Month')
add_month_labels()
ax.legend()