# **Simple Time Series Clustering (KMeans)**


In this notebook, I try basic time series clustering and a few other time series analysis techniques.


**References - Thanks for sharing information.**

https://www.kaggle.com/learn/time-series

https://www.kaggle.com/bextuychiev/every-pandas-function-to-manipulate-time-series/notebook

https://www.kaggle.com/izzettunc/introduction-to-time-series-clustering


In [None]:
!pip install tslearn

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns  # visualization tool
import os

from tslearn.clustering import KShape
from tslearn.preprocessing import TimeSeriesScalerMeanVariance
from tslearn.utils import to_time_series_dataset
from tslearn.clustering import TimeSeriesKMeans

from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor


# **Load Data**

In [None]:
# Read the five competition tables in a fixed order and bind each one to its
# notebook-level name.
data, items, shops, tests, item_categories = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/competitive-data-science-predict-future-sales/sales_train.csv',
        '../input/competitive-data-science-predict-future-sales/items.csv',
        '../input/competitive-data-science-predict-future-sales/shops.csv',
        '../input/competitive-data-science-predict-future-sales/test.csv',
        '../input/competitive-data-science-predict-future-sales/item_categories.csv',
    )
)

Check the content of the data.


In [None]:
# Join every sales row with the item table and the shop table.
merged = pd.merge(pd.merge(data, items, on='item_id'), shops, on='shop_id')

merged = merged.loc[:, ['date', 'shop_id', 'item_id', 'item_category_id', 'item_cnt_day']].copy()

# NOTE(review): the competition's sales_train.csv is understood to store dates
# day-first (dd.mm.yyyy). Without dayfirst=True an ambiguous value such as
# 02.01.2013 is read as 1 February. Confirm against the raw file.
merged['datetime'] = pd.to_datetime(merged['date'], dayfirst=True)

# drop() returns a new frame, so the result must be assigned back. Otherwise the
# raw string column stays in `merged` and leaks into the groupby aggregations
# that follow.
merged = merged.drop('date', axis=1)

merged.head()

# **Visualize Overview Trends**

It looks like a gradual upward trend.


In [None]:
plot_params = dict(
    color="0.75",
    style=".-",
    markeredgecolor="0.25",
    markerfacecolor="0.25",
    legend=False,
)

# Set Matplotlib defaults
plt.style.use("seaborn-whitegrid")
plt.rc("figure", autolayout=True, figsize=(11, 4))
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=16,
    titlepad=10,
)
plot_params = dict(
    color="0.75",
    style=".-",
    markeredgecolor="0.25",
    markerfacecolor="0.25",
)
%config InlineBackend.figure_format = 'retina'

# Mean daily count per (day, item category), averaged again per calendar month.
# numeric_only=True restricts the aggregation to numeric columns, so a string
# column still present in `merged` cannot break (or be dragged through) the mean.
df = merged.groupby(['datetime', 'item_category_id'], as_index=False).mean(numeric_only=True)
df = df.set_index('datetime')
df = df.resample('M').mean()
# Time dummy: 0, 1, 2, ... one step per month.
df['Time'] = np.arange(len(df.index))

# Training data
X = df.loc[:, ['Time']]  # features
y = df.loc[:, 'item_cnt_day']  # target

# Train the model
model = LinearRegression()
model.fit(X, y)

# Store the fitted values as a time series with the same time index as
# the training data
y_pred = pd.Series(model.predict(X), index=X.index)

ax = y.plot(**plot_params)
ax = y_pred.plot(ax=ax, linewidth=3)
ax.set_title('Time Plot of Monthly sales');

# **Lag features**

I've tried everything, but I can't seem to find an effective lag.

In [None]:
# Add the monthly series shifted by 1, 3 and 12 steps as candidate lag features.
for months_back in (1, 3, 12):
    df[f'Lag_{months_back}'] = df['item_cnt_day'].shift(months_back)
df.head()


In [None]:

# Regress the monthly series on its 12-step lag.
X = df.loc[:, ['Lag_12']].dropna()  # rows without a lag value are dropped
y = df.loc[:, 'item_cnt_day']  # create the target
y, X = y.align(X, join='inner')  # keep only the target rows that still have a feature

model = LinearRegression()
model.fit(X, y)

y_pred = pd.Series(model.predict(X), index=X.index)

# Scatter of lag vs. target with the fitted line on top.
fig, ax = plt.subplots()
ax.plot(X['Lag_12'], y, '.', color='0.25')
ax.plot(X['Lag_12'], y_pred)
ax.set_aspect('equal')
ax.set(xlabel='Lag_12', ylabel='item sale count')
ax.set_title('Lag Plot of Monthly sales');

In [None]:
# Observed series first, then the lag-model fit drawn onto the same axes.
ax = y.plot(**plot_params)
ax = y_pred.plot(ax=ax)

# **Item Categories**

Next, let's look at sales trends by item category.
(There are a lot of categories, so we plot them ten at a time.)
The active time periods and sales volumes vary a lot between categories, so it is difficult to read trends at a glance.
Some spikes are also visible in places.


In [None]:
# Mean daily count per (day, item category). numeric_only=True keeps any
# non-numeric column that is still in `merged` out of the aggregation.
itemcats = merged.groupby(['datetime', 'item_category_id'], as_index=False).mean(numeric_only=True)
itemcats = itemcats.set_index('datetime')

# One figure per band of ten category ids. The upper bound is read from the
# data, so ids above a hard-coded limit are not silently left out.
band = 10
max_category = int(itemcats['item_category_id'].max())
for categories in range(0, max_category + 1, band):
    div = itemcats.query(f'{categories} <= item_category_id < {categories + band}')
    if div.empty:
        # No category in this band has any sales; nothing to draw.
        continue
    ax = div.pivot(columns='item_category_id', values='item_cnt_day').resample('M').agg(['mean']).plot()
    ax.set_title(f'Item categories {categories}-{categories + band - 1}')

# **Process for Outliers**


Some outliers should be removed. For now, I will work out a way to automatically remove values that lie more than a certain distance from the moving average.


In [None]:
# Temporary values used by the outlier helpers defined in this cell.
span = 30      # smoothing setting handed to pandas' ewm()
threshold = 2  # number of exponentially weighted std devs that counts as an outlier

def plot_outlier(ts):
    """Plot a series with its EWMA and mark the points flagged as outliers.

    A point is flagged when its distance from the exponentially weighted mean
    exceeds ``threshold`` times the exponentially weighted standard deviation.
    ``span`` and ``threshold`` are read from the notebook-level settings.

    Parameters
    ----------
    ts : pandas.Series or pandas.DataFrame
        Values to inspect; the index is used as the x axis.

    Returns
    -------
    matplotlib.figure.Figure
        The figure holding the original values, the EWMA and the outliers.
    """
    fig, ax = plt.subplots()
    # The first positional parameter of ewm() is `com`, not `span`, so the
    # setting must be passed by keyword to get the intended smoothing.
    window = ts.ewm(span=span)
    ewm_mean = window.mean()
    ewm_std = window.std()
    ax.plot(ts, label='original')
    ax.plot(ewm_mean, label='ewma')

    outlier = ts[(ts - ewm_mean).abs() > ewm_std * threshold]
    ax.scatter(outlier.index, outlier, label='outlier')
    ax.legend()
    ax.set_title('EWMA and Outlier')
    return fig

def subst_outlier(ts):
    """Return a copy of ``ts`` with outliers replaced by the EWMA value.

    A value is treated as an outlier when its distance from the exponentially
    weighted mean exceeds ``threshold`` times the exponentially weighted
    standard deviation. ``span`` and ``threshold`` are read from the
    notebook-level settings. The input object is not modified.

    Parameters
    ----------
    ts : pandas.Series or pandas.DataFrame
        Values to clean.

    Returns
    -------
    pandas.Series or pandas.DataFrame
        Same shape and labels as ``ts``.
    """
    cleaned = ts.copy()
    # The first positional parameter of ewm() is `com`, not `span`, so the
    # setting must be passed by keyword to get the intended smoothing.
    window = cleaned.ewm(span=span)
    ewm_mean = window.mean()
    ewm_std = window.std()
    is_outlier = (cleaned - ewm_mean).abs() > ewm_std * threshold
    cleaned[is_outlier] = ewm_mean  # Replace outliers with moving average values
    return cleaned

def _plot_bands_subst_outlier(df, id_col, band=10):
    """Plot outlier-smoothed monthly totals of ``item_cnt_day``, ``band`` ids per figure.

    The id range is taken from ``df[id_col]`` rather than a fixed limit, so
    every id present in the data ends up in some figure.
    """
    if df.empty:
        return
    upper = int(df[id_col].max())
    for sp in range(0, upper + 1, band):
        div = df.query(f'{sp} <= {id_col} < {sp + band}')
        if div.empty:
            # No id in this band has any rows; nothing to draw.
            continue
        wd = div.pivot(columns=id_col, values='item_cnt_day')
        wd = wd.resample('M').sum()
        subst_outlier(wd).plot()


def plot_items_subst_outlier(df):
    """Plot monthly totals per item category after outlier substitution.

    ``df`` must be indexed by date and contain ``item_category_id`` and
    ``item_cnt_day`` columns, with one row per (date, category).
    """
    _plot_bands_subst_outlier(df, 'item_category_id')


def plot_shops_subst_outlier(df):
    """Plot monthly totals per shop after outlier substitution.

    ``df`` must be indexed by date and contain ``shop_id`` and
    ``item_cnt_day`` columns, with one row per (date, shop).
    """
    _plot_bands_subst_outlier(df, 'shop_id')

In [None]:
# Mean daily count per (day, item category). numeric_only=True keeps any
# non-numeric column that is still in `merged` out of the aggregation.
outl = merged.groupby(['datetime', 'item_category_id'], as_index=False).mean(numeric_only=True)
outl = outl.set_index('datetime')

# Sample item category No.8.
outl = outl.query('item_category_id == 8')
wd = outl.pivot(columns='item_category_id', values='item_cnt_day').resample('M').agg(['mean'])

plot_outlier(wd)
ax = subst_outlier(wd).plot()
ax.set_title('After Delete Outlier');  # trailing ';' hides the Text(...) repr


# **Shops**


Next, let's look at the sales trends for each store.

It seems that outliers need to be processed here as well.

In [None]:
# Daily totals per shop. numeric_only=True keeps any non-numeric column that is
# still in `merged` out of the sum.
vd = merged.groupby(['datetime', 'shop_id'], as_index=False).sum(numeric_only=True)
vd = vd.set_index('datetime')

def plot_shops(df):
    """Plot monthly ``item_cnt_day`` totals per shop, ten shop ids per figure.

    ``df`` must be indexed by date and contain ``shop_id`` and ``item_cnt_day``
    columns, with one row per (date, shop). The shop id range is read from the
    data, so no shop is left out by a fixed upper limit.
    """
    band = 10
    if df.empty:
        return
    upper = int(df['shop_id'].max())
    for sp in range(0, upper + 1, band):
        div = df.query(f'{sp} <= shop_id < {sp + band}')
        if div.empty:
            # No shop in this band has any rows; nothing to draw.
            continue
        wd = div.pivot(columns='shop_id', values='item_cnt_day')
        wd = wd.resample('M').sum()
        wd.plot()

plot_shops(vd)

# **Time Series Clustering**


There seems to be a certain degree of similarity in the sales trends for both item categories and shops.
So I try grouping them by time series clustering.

I'm trying out some patterns. The number of clusters and the clustering method may not be the best choice.

In [None]:
def clustering(df, cl_count):
    """Cluster the columns of ``df`` with tslearn's TimeSeriesKMeans.

    The values are transposed so that each column of ``df`` becomes one series,
    rescaled with TimeSeriesScalerMeanVariance(mu=0, std=1), and then clustered
    into ``cl_count`` groups. The random state comes from the notebook-level
    ``seed``.

    Returns the result of ``fit_predict`` on the scaled data (the cluster labels).
    """
    # One row per series, as expected by to_time_series_dataset.
    series_by_row = to_time_series_dataset(df.values.T)
    normalised = TimeSeriesScalerMeanVariance(mu=0.0, std=1.).fit_transform(series_by_row)
    kmeans = TimeSeriesKMeans(n_clusters=cl_count, verbose=True, random_state=seed)
    return kmeans.fit_predict(normalised)
    

In [None]:
seed = 0
np.random.seed(seed)

# Temporary value
shop_cluster_count = 4
item_cluster_count = 3

# Get shop clusters: monthly totals per shop, outliers smoothed, then k-means.
clst = merged.groupby(['datetime', 'shop_id'], as_index=False).sum(numeric_only=True)
clst = clst.set_index('datetime')
clst = clst.pivot(columns='shop_id', values='item_cnt_day').resample('M').sum()
clst = subst_outlier(clst)
# Labels come back in the column order of `clst`. Attach them by shop_id rather
# than by row position, so the result does not depend on the row order of
# `shops` or on every shop having sales. Shops without sales get NaN.
shop_labels = pd.Series(clustering(clst, shop_cluster_count), index=clst.columns)
shops['shop_cluster'] = shops['shop_id'].map(shop_labels)

# Get item category clusters, same procedure keyed on item_category_id.
clst = merged.groupby(['datetime', 'item_category_id'], as_index=False).sum(numeric_only=True)
clst = clst.set_index('datetime')
clst = clst.pivot(columns='item_category_id', values='item_cnt_day').resample('M').sum()
clst = subst_outlier(clst)
category_labels = pd.Series(clustering(clst, item_cluster_count), index=clst.columns)
item_categories['item_category_cluster'] = item_categories['item_category_id'].map(category_labels)

# **Visualize Clusters**


Visualise the clusters. Unfortunately, there is no very clear difference between them.

In [None]:
def _plot_cluster_members(df, cluster_col, id_col, cluster_count, title):
    """Draw one figure per cluster showing the monthly totals of its members.

    Rows of ``df`` are selected by ``df[cluster_col] == k`` for each k in
    ``range(cluster_count)``, pivoted to one column per ``id_col`` value,
    summed per month and passed through ``subst_outlier`` before plotting.
    """
    for sp in range(cluster_count):
        div = df[df[cluster_col] == sp]
        if div.empty:
            # An empty frame cannot be plotted; skip clusters with no rows.
            continue
        wd = div.pivot(columns=id_col, values='item_cnt_day').resample('M').sum()
        ax = subst_outlier(wd).plot()
        ax.set_title(f'{title} {sp}')
        ax.get_legend().remove()


def plot_items_cluster(df):
    """Plot monthly sales of the item categories in each item-category cluster.

    ``df`` must be indexed by date and contain ``item_category_cluster``,
    ``item_category_id`` and ``item_cnt_day``. The number of clusters is read
    from the notebook-level ``item_cluster_count``.
    """
    _plot_cluster_members(
        df, 'item_category_cluster', 'item_category_id',
        item_cluster_count, 'Item category cluster',
    )


def plot_shops_cluster(df):
    """Plot monthly sales of the shops in each shop cluster.

    ``df`` must be indexed by date and contain ``shop_cluster``, ``shop_id``
    and ``item_cnt_day``. The number of clusters is read from the
    notebook-level ``shop_cluster_count``.
    """
    _plot_cluster_members(
        df, 'shop_cluster', 'shop_id',
        shop_cluster_count, 'Shop cluster',
    )

For shops, the clusters are separated according to the distribution of sales peaks.

In [None]:
# Rebuild the sales table, this time carrying the cluster labels along.
item_merged = pd.merge(items, item_categories, on='item_category_id')
merged = pd.merge(pd.merge(data, item_merged, on='item_id'), shops, on='shop_id')
merged = merged.loc[:, ['date', 'shop_id', 'item_id', 'item_category_id',
                        'shop_cluster', 'item_category_cluster', 'item_cnt_day']].copy()

# NOTE(review): the competition's sales_train.csv is understood to store dates
# day-first (dd.mm.yyyy). Without dayfirst=True an ambiguous value such as
# 02.01.2013 is read as 1 February. Confirm against the raw file.
merged['datetime'] = pd.to_datetime(merged['date'], dayfirst=True)

# The raw 'date' column is kept on purpose: the XGBoost feature cell further
# down selects merged['date']. (A bare `merged.drop('date', axis=1)` whose
# result was thrown away used to sit here and had no effect.)

merged.head()

In [None]:
# Keep shop_cluster as a grouping key. If it were summed together with the
# sales, the label would become (label x number of rows) and the
# `== cluster` filter inside plot_shops_cluster would pick the wrong rows.
# Only the sales count is aggregated.
vd = merged.groupby(['datetime', 'shop_id', 'shop_cluster'], as_index=False)['item_cnt_day'].sum()
vd = vd.set_index('datetime')

plot_shops_cluster(vd)

The item category clusters seem less clear-cut.

In [None]:
# Keep item_category_cluster as a grouping key. If it were summed together
# with the sales, the label would become (label x number of rows) and the
# `== cluster` filter inside plot_items_cluster would pick the wrong rows.
# Only the sales count is aggregated.
vd = merged.groupby(
    ['datetime', 'item_category_id', 'item_category_cluster'], as_index=False
)['item_cnt_day'].sum()
vd = vd.set_index('datetime')

plot_items_cluster(vd)

# **Model building and prediction**


It's a simple prediction that averages a linear regression and an XGBoost model.

In [None]:
# LinearRegression model

def LinearRegressionPred(data, start="2013-01-01", end="2015-10-31"):
    """Fit a linear trend to monthly sales totals and extrapolate one step.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows for a single series, indexed by a DatetimeIndex, with an
        ``item_cnt_day`` column. The frame is not modified.
    start, end : str
        Bounds of the monthly grid the series is laid onto. Months inside the
        grid that have no rows count as zero sales.

    Returns
    -------
    numpy.ndarray
        One-element array with the prediction for the step after the last
        month in the grid.
    """
    # Selecting the column as a frame already yields a new object, so no copy
    # of the whole input is needed.
    itemsum = data.loc[:, ['item_cnt_day']].resample('M').sum()
    didx = pd.DataFrame(index=pd.date_range(start=start, end=end, freq="M"))
    itemsum = itemsum.merge(didx, how="outer", left_index=True, right_index=True).fillna(0)
    itemsum['Time'] = np.arange(len(itemsum.index))

    X = itemsum.loc[:, ['Time']]  # features
    y = itemsum.loc[:, 'item_cnt_day']  # target

    model = LinearRegression()
    model.fit(X, y)

    # Predict with the same column name the model was fitted on; an unnamed
    # column makes scikit-learn complain about mismatching feature names.
    next_step = pd.DataFrame({'Time': [itemsum['Time'].max() + 1]})
    return model.predict(next_step)


In [None]:
# Sales history restricted to the training window. The index is sorted first
# because label-based slicing needs a monotonic DatetimeIndex.
features = (
    merged.loc[:, ['datetime', 'shop_id', 'item_id', 'item_cnt_day']]
    .set_index('datetime')
    .sort_index()
    .loc['2013-01-01':'2015-10-31']
)

# Only (shop, item) pairs that occur in the test set need a model. Filter the
# history once instead of scanning the whole table for every test row.
test_pairs = tests[['shop_id', 'item_id']].drop_duplicates()
history = (
    features.reset_index()
    .merge(test_pairs, on=['shop_id', 'item_id'], how='inner')
    .set_index('datetime')
)

# One trend model per pair. LinearRegressionPred returns a one-element array.
pair_pred = {
    pair: float(np.ravel(LinearRegressionPred(sales))[0])
    for pair, sales in history.groupby(['shop_id', 'item_id'])
}

# Assign the whole column at once. Writing to `row` inside iterrows() only
# changes a temporary copy, which left pred_line at 0 for every row.
# Pairs with no sales history have an all-zero series, i.e. a prediction of 0.
tests['pred_line'] = [
    pair_pred.get(pair, 0.0)
    for pair in zip(tests['shop_id'], tests['item_id'])
]

In [None]:
# Columns fed to the model, in the same order for training and prediction.
feature_cols = ['shop_id', 'item_id', 'shop_cluster', 'item_category_cluster', 'item_category_id']

# --- Training set -----------------------------------------------------------
features = merged[['date'] + feature_cols + ['item_cnt_day']].copy()
# NOTE(review): the competition's sales_train.csv is understood to store dates
# day-first (dd.mm.yyyy). Without dayfirst=True the `month` feature below is
# wrong for ambiguous dates (day and month swapped). Confirm against the raw file.
features['datetime'] = pd.to_datetime(features['date'], dayfirst=True)
features = features.drop('date', axis=1)
features = features.set_index('datetime')
features['month'] = features.index.month

target = features.pop('item_cnt_day')

xgb = XGBRegressor()
xgb.fit(features, target)

# --- Test set ---------------------------------------------------------------
# Left joins keep exactly one output row per test row, in test order, so the
# predictions can be written back to `tests` by position. An inner join would
# drop any test row whose item or shop has no match.
df_tests = pd.merge(pd.merge(tests, item_merged, on='item_id', how='left'),
                    shops, on='shop_id', how='left')
assert len(df_tests) == len(tests), "test rows were duplicated by the merge"

features = df_tests[feature_cols].copy()
features['datetime'] = pd.to_datetime('2015-11-30')  # the month being predicted
features = features.set_index('datetime')
features['month'] = features.index.month

tests['pred_xgb'] = xgb.predict(features)

# **Make Submission**

In [None]:
# Equal-weight blend of the two model outputs, written out in submission format.
tests['item_cnt_month'] = tests['pred_xgb'].add(tests['pred_line']).div(2)
tests.loc[:, ['ID', 'item_cnt_month']].to_csv('submission.csv', index=False)