<h1 style="font-size:30px; text-align:center; margin-bottom:30px;"><span style="color:SteelBlue">Rossmann Sales Prediction:</span> 01 Data Exploration and Cleaning </h1>

## 1. Overall data structure

### 1.1 Data overview

In [None]:
from __future__ import print_function  # Compatability with Python 3

# NumPy for numerical computing
import numpy as np

# Pandas for DataFrames
import pandas as pd
pd.set_option('display.max_columns', 100)
from pandas import DataFrame
# TimeGrouper was deprecated in pandas 0.21 and removed in 0.25. Keep the name
# available so the time-series cells below run on both old and new pandas.
try:
    from pandas import TimeGrouper
except ImportError:
    def TimeGrouper(freq, **kwargs):
        """Compatibility shim: build the equivalent ``pd.Grouper(freq=...)``."""
        return pd.Grouper(freq=freq, **kwargs)

# Matplotlib for visualization
from matplotlib import pyplot as plt
# display plots in the notebook
%matplotlib inline 

# Seaborn for easier visualization
import seaborn as sns

# datetime
import datetime

In [None]:
# Load data from CSV
df = pd.read_csv("../input/train.csv")
df2 = pd.read_csv("../input/store.csv")

In [None]:
# Dataframe dimensions
print(df.shape)
print(df2.shape)

In [None]:
# Column datatypes
print(df.dtypes,'\n')
print(df2.dtypes)

In [None]:
# Display first 5 rows of df
df.head()

In [None]:
# Display last 5 rows of data
df.tail()

In [None]:
df2.head()

In [None]:
df2.tail()

### 1.2 Early data cleaning & feature engineering

#### 1.2.1 Drop unwanted observations

In [None]:
# Drop duplicates
df = df.drop_duplicates()
df2 = df2.drop_duplicates()
print(df.shape)
print(df2.shape)

In [None]:
# drop closed observation
df = df[df.Open != 0]

In [None]:
len(df[df.Customers == 0])

In [None]:
df[df.Customers == 0].sort_values(by=['Store'])

In [None]:
len(df[df.Sales == 0])

In [None]:
# after checking the data, decide to drop sales == 0 observations
df = df[df.Sales != 0]

In [None]:
print(df.shape)

In [None]:
# Average spend per customer for each store-day.
# df is a filtered slice at this point, so build the column with assign() (which
# returns a new frame) instead of setting it on the slice; this avoids
# SettingWithCopyWarning.
# NOTE(review): rows with Customers == 0 would produce inf here; confirm none remain.
df = df.assign(AvgPurchasing=df.Sales / df.Customers)

## 2. Study of numerical features

### 2.1 Distribution of numerical features

In [None]:
# Plot histogram grid
df.hist(xrot=-45,figsize=(10,10))
# Clear the text "residue"
plt.show()

In [None]:
# Plot histogram grid
df2.hist(xrot=-45,figsize=(10,10))
# Clear the text "residue"
plt.show()

In [None]:
# Summarize numerical features
df.describe()

In [None]:
# Promo2Since[Year/Week] - describes the year and calendar week when the store started participating in Promo2
df2.describe()

### 2.2 Outliers of numerical features or target

In [None]:
# Box plot of 'Sales'
plt.figure(figsize=(4,3))
sns.boxplot(y='Sales', data=df)

In [None]:
plt.figure(figsize=(4,3))
sns.boxplot(y='Customers', data=df)

In [None]:
plt.figure(figsize=(4,3))
sns.boxplot(y='AvgPurchasing', data=df)

In [None]:
df[df.Sales < 1000][['Store','Sales']].describe()

In [None]:
df.groupby('Store')['Sales'].mean().sort_values()

In [None]:
df[df.Store == 652]['Sales'].describe()

In [None]:
df[df.Store == 652]['Sales'].sort_values()

###### To do 1: 1) Note that there's a way to cluster the stores based on the level of sales.  2) we can later decide whether to remove outliers based on Q3-Q1  range for Sales, Customers, AvgPurchasing.

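###### The outlier helpers below are currently applied to `Sales` only (see the drop cell that follows them); `Customers` and `AvgPurchasing` outliers are not removed for now.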

In [None]:
# Renumber rows 0..n-1 so that the positions returned by find_outlier_index match
# the labels df.drop expects. drop=True discards the old index instead of keeping
# it as an extra 'index' column (which would otherwise show up in later plots,
# correlations and the saved CSV, and make this cell fail when re-run).
df = df.reset_index(drop=True)

In [None]:
def find_low_high(feature, data=None, k=3):
    """Compute per-store outlier fences for one numeric column.

    For every store the fences are ``Q1 - k*IQR`` (low) and ``Q3 + k*IQR`` (high),
    where Q1/Q3 are the 25th/75th percentiles of ``feature`` within that store.

    Parameters
    ----------
    feature : str
        Name of the column to analyse.
    data : DataFrame, optional
        Frame holding a 'Store' column and ``feature``. Defaults to the
        notebook-level ``df``.
    k : number, optional
        IQR multiplier; the default 3 is the value used throughout this notebook.

    Returns
    -------
    dict
        ``{'low': DataFrame[Store, low], 'high': DataFrame[Store, high]}``,
        one row per store.
    """
    if data is None:
        data = df
    # Group once and reuse the grouped object for both quartiles.
    per_store = data.groupby('Store')[feature]
    q1 = per_store.quantile(0.25)
    q3 = per_store.quantile(0.75)
    spread = k * (q3 - q1)
    low = (q1 - spread).rename('low').reset_index()
    high = (q3 + spread).rename('high').reset_index()
    return {'low': low, 'high': high}

In [None]:
def find_outlier_index(feature, data=None, k=3):
    """Return the index labels of rows whose ``feature`` is a per-store outlier.

    A row is an outlier when its value lies strictly below ``Q1 - k*IQR`` or
    strictly above ``Q3 + k*IQR`` of its own store (the same fences that
    ``find_low_high`` computes with the default ``k``).

    Parameters
    ----------
    feature : str
        Name of the column to test.
    data : DataFrame, optional
        Frame holding a 'Store' column and ``feature``. Defaults to the
        notebook-level ``df``.
    k : number, optional
        IQR multiplier (default 3).

    Returns
    -------
    list
        Index labels of ``data`` that can be passed straight to ``data.drop``.
    """
    if data is None:
        data = df
    per_store = data.groupby('Store')[feature]
    q1 = per_store.quantile(0.25)
    q3 = per_store.quantile(0.75)
    spread = k * (q3 - q1)
    # Map each row's store onto its fences. Unlike a merge this keeps data's own
    # index, so the labels returned are correct even without a fresh RangeIndex.
    stores = data['Store']
    low = stores.map(q1 - spread)
    high = stores.map(q3 + spread)
    values = data[feature]
    is_outlier = (values < low) | (values > high)
    return list(data.index[is_outlier])

In [None]:
len(find_outlier_index("Sales"))

In [None]:
# Remove only the per-store Sales outliers counted in the previous cell
# (1113 rows when this note was written); other features are left untouched.
sales_outlier_labels = find_outlier_index("Sales")
df = df.drop(sales_outlier_labels, axis=0)

In [None]:
df.shape

### 2.3 Box-cox transformation for numerical features and target

In [None]:
# Box-Cox transform the target and the two customer-based columns, overwriting the
# raw values in df. Called without a lambda, boxcox returns the transformed values
# together with the fitted lambda; lam1/lam2/lam3 are kept so the transform can be
# inverted later.
# NOTE(review): boxcox needs strictly positive input - presumably ensured by the
# earlier removal of closed and zero-sales rows; confirm Customers has no zeros.
from scipy.stats import boxcox
df['Sales'], lam1 = boxcox(df.Sales)
df['Customers'], lam2 = boxcox(df.Customers)
df['AvgPurchasing'], lam3 = boxcox(df.AvgPurchasing)

In [None]:
print(lam1)
df.Sales.hist(figsize=(4,2))
plt.show()

print(lam2)
df.Customers.hist(figsize=(4,2))
plt.show()

print(lam3)
df.AvgPurchasing.hist(figsize=(4,2))
plt.show()

### 2.4 Missing values of numerical features

In [None]:
print(df.select_dtypes(exclude=['object']).isnull().sum(),'\n')
print(df2.select_dtypes(exclude=['object']).isnull().sum())

In [None]:
# For the competition data, inspect the 3 stores with a missing CompetitionDistance
df2[df2['CompetitionDistance'].isnull()]

In [None]:
# Flag, then fill, the missing numeric competition data.
# The indicator must be built BEFORE the fill: once the NaNs have been replaced by
# 0, isnull() is False everywhere and every *_missing flag would be 0.
for col in ['CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'CompetitionDistance']:
    flag = col + '_missing'
    # Only create the flag once, so re-running this cell after the fill does not
    # overwrite a correct flag with zeros.
    if flag not in df2.columns:
        df2[flag] = df2[col].isnull().astype(int)
    # Assign the result back; an in-place fillna on df2.<col> acts on an
    # intermediate Series and is not guaranteed to update df2.
    df2[col] = df2[col].fillna(0)

In [None]:
# Check: are Promo2SinceWeek, Promo2SinceYear and PromoInterval NaN if and only if
# Promo2 is 0? This counts the NaNs per column among the rows where Promo2 == 0.
df2[df2['Promo2']==0][['Promo2SinceWeek','Promo2SinceYear','PromoInterval']].isnull().sum()

In [None]:
# Fill the NaNs with 0: they are not really missing, the store simply does not
# take part in Promo2. Assign the result back instead of using an in-place fillna
# on df2.<col>, which acts on an intermediate Series and may leave df2 unchanged.
for col in ['Promo2SinceWeek', 'Promo2SinceYear', 'PromoInterval']:
    df2[col] = df2[col].fillna(0)

In [None]:
df2.isnull().sum()

## 3. Study of categorical features

### 3.1 Distribution of categorical features

In [None]:
# One horizontal count plot per holiday column of the sales table
for holiday_col in ['SchoolHoliday', 'StateHoliday']:
    plt.figure(figsize=(4,4))
    sns.countplot(y=holiday_col, data=df)
    plt.show()

In [None]:
for feature in df2.dtypes[df2.dtypes=='object'].index:
    plt.figure(figsize=(4,4))
    sns.countplot(y=feature, data=df2)
    plt.show()

### 3.2 Categorical features cleaning

###### 3.2.1 Structural errors

In [None]:
# Display unique values of 'StateHoliday'
df.StateHoliday.unique()

In [None]:
# Unify the two spellings of "no holiday": turn the integer 0 into the string '0'.
# The result is assigned back through assign(); an in-place replace on
# df.StateHoliday acts on an intermediate Series and may leave df unchanged.
df = df.assign(StateHoliday=df.StateHoliday.replace(0, '0'))

###### 3.2.2 Missing values

In [None]:
# Display number of missing values by feature (categorical)
print(df.select_dtypes(include=['object']).isnull().sum(), '\n')
print(df2.select_dtypes(include=['object']).isnull().sum())

###### To do 2: 1) StateHoliday, StoreType and Assortment need to be one-hot encoded after all the cleaning and feature engineering; 2) CompetitionOpenSinceMonth, etc. may need to be converted to type int in order to match Year and Month.

## 4. Sales, customers, average purchasing segmented by categorical features

### 4.1 Sales on stateholiday are higher, with much more customers but lower avg purchasing

In [None]:
plt.figure(figsize=(4,4))
sns.boxplot(y='Sales', x='StateHoliday', data=df)

In [None]:
plt.figure(figsize=(4,4))
sns.boxplot(y='Customers', x='StateHoliday', data=df)

In [None]:
plt.figure(figsize=(4,4))
sns.boxplot(y='AvgPurchasing', x='StateHoliday', data=df)

### 4.2 SchoolHoliday seems to have little impact on sales. 
(Note that all schools are closed on public holidays and weekends.)

In [None]:
plt.figure(figsize=(4,4))
sns.boxplot(y='Sales', x='SchoolHoliday', data=df)

### 4.3 the transformed sales are usually between 10-14

In [None]:
plt.figure(figsize=(4,4))
sns.boxplot(y='Sales', x='Store', data=df)

### 4.4 DoW pattern of sales

In [None]:
plt.figure(figsize=(4,4))
sns.boxplot(y='Sales', x='DayOfWeek', data=df)

### 4.5 Promo effect: more sales, more customers, more avg purchasing

In [None]:
plt.figure(figsize=(4,4))
sns.boxplot(y='Sales', x='Promo', data=df)

In [None]:
plt.figure(figsize=(4,4))
sns.boxplot(y='Customers', x='Promo', data=df)

In [None]:
plt.figure(figsize=(4,4))
sns.boxplot(y='AvgPurchasing', x='Promo', data=df)

### 4.6 Joining the 2 tables for exploration purpose

In [None]:
# Attach the store attributes to every sales row (left join on Store).
# merge() leaves df and df2 untouched - no index swapping, no dropped column - so
# this cell can be re-run, and the result already has a fresh 0..n-1 index.
df_combined = df.merge(df2, on='Store', how='left')
df_combined.head()

### 4.7 Sales v.s. storetype and assortment

In [None]:
# note that the order from the most to the least number in each type: a,d,c,b
sns.boxplot(y='Sales', x='StoreType', data=df_combined)

In [None]:
sns.boxplot(y='Sales', x='Assortment', data=df_combined)

In [None]:
sns.factorplot(data=df_combined, x="StoreType", y="Sales", col="Assortment")

In [None]:
# only 9 stores has assortment == 'b'
df_combined[df_combined.Assortment == 'b'].Store.unique()

In [None]:
# only 17 stores has StoreType == 'b'
df_combined[df_combined.StoreType == 'b'].Store.unique()

In [None]:
g = sns.FacetGrid(df_combined, col="StoreType")
g.map(sns.distplot, "Sales")

## 5. relationship between numerical features and targets

In [None]:
g = sns.FacetGrid(df_combined, col="StoreType")
g.map(plt.scatter, "Customers", "Sales")

In [None]:
sns.lmplot(x='Customers', y='Sales', data=df_combined, hue='StoreType',fit_reg=False)

In [None]:
sns.lmplot(x='AvgPurchasing', y='Sales', data=df_combined, hue='StoreType',fit_reg=False)

In [None]:
sns.lmplot(x='Customers', y='Sales', data=df_combined, hue='Assortment',fit_reg=False)

In [None]:
sns.lmplot(x='AvgPurchasing', y='Sales', data=df_combined, hue='Assortment',fit_reg=False)

In [None]:
sns.lmplot(x='Customers', y='Sales', data=df_combined, hue='Promo',fit_reg=False)

In [None]:
# Calculate correlations between numeric features.
# Select the numeric columns explicitly: pandas >= 2.0 no longer drops object
# columns silently in corr() and raises on them instead.
correlations = df_combined.select_dtypes(include=[np.number]).corr()

In [None]:
# Generate a mask for the upper triangle (diagonal included).
# Use the builtin bool: the np.bool alias was removed in NumPy 1.24.
mask = np.zeros_like(correlations, dtype=bool)
mask[np.triu_indices_from(mask)] = True

In [None]:
# Make the figsize 9 x 8
plt.figure(figsize=(9,8))
# Plot heatmap of annotated correlations (shown as percentages, upper triangle hidden by mask)
sns.heatmap(correlations*100, annot=True, fmt='.0f',mask = mask, cbar=False)

## 6. Time series exploration

### 6.1 Date related feature engineering

In [None]:
def get_date_features(df):
    """Parse 'Date' and add calendar columns derived from it.

    The frame is modified in place and also returned. 'Date' is converted to
    datetime and the columns 'Year', 'Month', 'Day', 'Quarter' and 'Week'
    (ISO week number) are added.

    Parameters
    ----------
    df : DataFrame
        Frame with a 'Date' column that ``pd.to_datetime`` can parse.

    Returns
    -------
    DataFrame
        The same object that was passed in.
    """
    df['Date'] = pd.to_datetime(df['Date'])
    dates = df['Date'].dt
    df['Year'] = dates.year
    df['Month'] = dates.month
    df['Day'] = dates.day
    df['Quarter'] = dates.quarter
    if hasattr(dates, 'isocalendar'):
        # .dt.week was removed in pandas 2.0; isocalendar() is its replacement.
        # It returns a nullable UInt32, so cast back to the dtype .dt.week produced.
        week = dates.isocalendar().week
        df['Week'] = week.astype('float64') if week.isna().any() else week.astype('int64')
    else:
        # Older pandas (< 1.1) only offers the .week accessor.
        df['Week'] = dates.week

    return df

In [None]:
df_combined = get_date_features(df_combined)

### 6.2 Typical store sales study

In [None]:
def get_series(Store_i, data=None):
    """Return one store's sales as a Series indexed by date.

    Parameters
    ----------
    Store_i : int
        Store id to select.
    data : DataFrame, optional
        Frame with 'Store', 'Date' and 'Sales' columns. Defaults to the
        notebook-level ``df_combined``.

    Returns
    -------
    Series
        'Sales' values of the store, indexed by 'Date', in the row order of
        ``data``. Always a Series, even when the store has a single row (the
        former ``.T.squeeze()`` collapsed that case to a scalar).
    """
    if data is None:
        data = df_combined
    store_rows = data.loc[data.Store == Store_i, ['Date', 'Sales']]
    return store_rows.set_index('Date')['Sales']

In [None]:
for i in df_combined.StoreType.unique():
    print(i, df_combined[df_combined.StoreType == i].Store[:1])

In [None]:
new_series_2 = get_series(2)
new_series_85 = get_series(85)
new_series_1 = get_series(1)
new_series_13 = get_series(13)

In [None]:
# Daily sales of the four example stores, one wide figure each
for store_series in (new_series_2, new_series_85, new_series_1, new_series_13):
    plt.figure(figsize=(16,2))
    store_series.plot(style = 'k--')
    plt.show()

In [None]:
# Store 2: one panel of daily sales per calendar year.
new_series_2.index = pd.to_datetime(new_series_2.index)
# pd.Grouper replaces TimeGrouper (removed in pandas 0.25). It is passed on its own
# rather than in a one-element list so each group key stays a plain Timestamp;
# with a list, pandas >= 2.0 yields 1-tuples and `name.year` would fail.
groups = new_series_2.groupby(pd.Grouper(freq="A"))
yearly = list(groups)
plt.figure(figsize=(20,3))
print("Store2 Daily Sales Plot")
# Size the grid from the number of years instead of a hard-coded 311 code.
for row, (name, group) in enumerate(yearly, start=1):
    plt.subplot(len(yearly), 1, row)
    group.plot()
    plt.title(name.year)

In [None]:
# Store 2: for each year, one figure with a panel of daily sales per quarter.
# pd.Grouper replaces TimeGrouper (removed in pandas 0.25); the three copy-pasted
# blocks are folded into one loop over the years.
print("Store2 Daily Sales Plot")
for year in ['2013', '2014', '2015']:
    groups = new_series_2.loc[year].groupby([pd.Grouper(freq="A"), pd.Grouper(freq="Q")])
    plt.figure(figsize=(20,3))
    # 4 rows: at most four quarters per year
    for row, (name, group) in enumerate(groups, start=1):
        plt.subplot(4, 1, row)
        group.plot()
        plt.title(name)

In [None]:
# Store 2, 2013: daily sales per month, six months per figure.
# pd.Grouper replaces TimeGrouper (removed in pandas 0.25).
groups = new_series_2.loc['2013'].groupby([pd.Grouper(freq="A"), pd.Grouper(freq="M")])
monthly = list(groups)
print("Store2 Daily Sales Plot")
# First figure: months 1-6, second figure: the remaining months
for half in (monthly[:6], monthly[6:]):
    plt.figure(figsize=(15,6))
    for row, (name, group) in enumerate(half, start=1):
        plt.subplot(6, 1, row)
        group.plot()
        plt.title(name)

###### sales time series basically has a weekly seasonality, with typical pattern around DoW, MoY

### 6.3 All store sales time series study

In [None]:
df_combined.Date = pd.to_datetime(df_combined.Date)

In [None]:
# Per-date aggregates of Sales across all stores (group once, aggregate five ways)
sales_by_date = df_combined.groupby(['Date'])['Sales']
daily_sales_sum = sales_by_date.sum()
daily_sales_mean = sales_by_date.mean()
daily_sales_median = sales_by_date.median()
daily_sales_max = sales_by_date.max()
daily_sales_min = sales_by_date.min()

In [None]:
# Monthly totals of the all-store daily sum, one figure per year.
# pd.Grouper replaces TimeGrouper (removed in pandas 0.25).
print("All stores total monthly sales - by Year")
for year in ['2013', '2014', '2015']:
    plt.figure(figsize=(16,2))
    daily_sales_sum.loc[year].groupby([pd.Grouper(freq="A"), pd.Grouper(freq="M")]).sum().plot()
    plt.show()

In [None]:
# 2013: all-store daily total sales per month, six months per figure.
# pd.Grouper replaces TimeGrouper (removed in pandas 0.25).
groups = daily_sales_sum.loc["2013"].groupby([pd.Grouper(freq="A"), pd.Grouper(freq="M")])
monthly = list(groups)

print("2013 All Store Daily Total Sales Plot - by Month")
# First figure: months 1-6, second figure: the remaining months
for half in (monthly[:6], monthly[6:]):
    plt.figure(figsize=(15,6))
    for row, (name, group) in enumerate(half, start=1):
        plt.subplot(6, 1, row)
        group.plot()
        plt.title(name)

In [None]:
# 2014: all-store daily MEAN sales per month, six months per figure.
# pd.Grouper replaces TimeGrouper (removed in pandas 0.25).
groups = daily_sales_mean.loc["2014"].groupby([pd.Grouper(freq="A"), pd.Grouper(freq="M")])
monthly = list(groups)

# The series plotted is daily_sales_mean, so the label says "Mean" (it previously
# said "Total", copied from the 2013 cell).
print("2014 All Store Daily Mean Sales Plot - by Month")
# First figure: months 1-6, second figure: the remaining months
for half in (monthly[:6], monthly[6:]):
    plt.figure(figsize=(15,6))
    for row, (name, group) in enumerate(half, start=1):
        plt.subplot(6, 1, row)
        group.plot()
        plt.title(name)

In [None]:
# 2015: all-store daily MEAN sales per month, six months per figure.
# pd.Grouper replaces TimeGrouper (removed in pandas 0.25).
groups = daily_sales_mean.loc["2015"].groupby([pd.Grouper(freq="A"), pd.Grouper(freq="M")])
monthly = list(groups)

# The series plotted is daily_sales_mean, so the label says "Mean" (it previously
# said "Total", copied from the 2013 cell).
print("2015 All Store Daily Mean Sales Plot - by Month")
# First figure: months 1-6, second figure: the remaining months
for half in (monthly[:6], monthly[6:]):
    plt.figure(figsize=(15,6))
    for row, (name, group) in enumerate(half, start=1):
        plt.subplot(6, 1, row)
        group.plot()
        plt.title(name)

### 6.4 Statistics over window period

In [None]:
# Mean / median / max / min of Sales per (Year, Month), drawn on one set of axes
plt.figure(figsize=(10,3))
a = plt.subplot(1,1,1)
sales_per_month = df_combined.groupby(['Year','Month'])['Sales']
for stat in ['mean', 'median', 'max', 'min']:
    groups = sales_per_month.agg(stat)
    groups.plot(label = stat)

# Reverse the legend so 'min' is listed first and 'mean' last
handles, labels = a.get_legend_handles_labels()
a.legend(handles[::-1], labels[::-1])
plt.title("overall sales: monthly statistics",color='blue')
plt.show()

In [None]:
# Mean / median / max / min of Sales per Year, drawn on one set of axes
plt.figure(figsize=(10,3))
a = plt.subplot(1,1,1)
sales_per_year = df_combined.groupby(['Year'])['Sales']
for stat in ['mean', 'median', 'max', 'min']:
    groups = sales_per_year.agg(stat)
    groups.plot(label = stat)

# Reverse the legend so 'min' is listed first and 'mean' last
handles, labels = a.get_legend_handles_labels()
a.legend(handles[::-1], labels[::-1])
plt.title("overall sales: yearly statistics",color='blue')
plt.show()

In [None]:
# Store 2: weekly mean / median / max / min of daily sales on one set of axes.
# pd.Grouper replaces TimeGrouper (removed in pandas 0.25).
plt.figure(figsize=(10,3))
a = plt.subplot(1,1,1)
weekly = new_series_2.groupby(pd.Grouper(freq="W"))
for stat in ['mean', 'median', 'max', 'min']:
    groups = weekly.agg(stat)
    groups.plot(label = stat)

# Reverse the legend so 'min' is listed first and 'mean' last
handles, labels = a.get_legend_handles_labels()
a.legend(handles[::-1], labels[::-1])
plt.show()

In [None]:
# Store 2: monthly mean / median / max / min of daily sales on one set of axes.
# pd.Grouper replaces TimeGrouper (removed in pandas 0.25).
plt.figure(figsize=(10,3))
a = plt.subplot(1,1,1)
monthly = new_series_2.groupby(pd.Grouper(freq="M"))
for stat in ['mean', 'median', 'max', 'min']:
    groups = monthly.agg(stat)
    groups.plot(label = stat)

# Reverse the legend so 'min' is listed first and 'mean' last
handles, labels = a.get_legend_handles_labels()
a.legend(handles[::-1], labels[::-1])
plt.show()

In [None]:
# Store 2: quarterly mean / median / max / min of daily sales on one set of axes.
# pd.Grouper replaces TimeGrouper (removed in pandas 0.25).
plt.figure(figsize=(10,3))
a = plt.subplot(1,1,1)
quarterly = new_series_2.groupby(pd.Grouper(freq="Q"))
for stat in ['mean', 'median', 'max', 'min']:
    groups = quarterly.agg(stat)
    groups.plot(label = stat)

# Reverse the legend so 'min' is listed first and 'mean' last
handles, labels = a.get_legend_handles_labels()
a.legend(handles[::-1], labels[::-1])
plt.show()

### 6.5 Time series lag plot

In [None]:
def lag_n_plot(series, n):
    """Scatter a series against itself lagged by ``n`` observations.

    Draws on the current matplotlib axes: y is the value at t, x is the value
    ``n`` rows earlier. The lag is positional (``Series.shift`` without a
    frequency), so it equals ``n`` calendar days only if the series has no
    missing dates. The first ``n`` points have no lagged partner (NaN).

    Parameters
    ----------
    series : Series
        Time-ordered values; any series name is accepted.
    n : int
        Number of rows to shift by.
    """
    # Plot the two Series directly instead of building a frame whose columns are
    # renamed from 'Sales' - that approach broke for any series not named 'Sales'.
    lagged = series.shift(n)
    plt.title('Lag %d plot' %(n))
    plt.scatter(x = lagged, y = series, alpha = 0.5)

In [None]:
# Lag plots of the all-store daily sum, one panel per lag in a single row
print('lag plot of All Store daily sales sum')
plt.figure(figsize=(16,2))
for panel, lag in enumerate([1, 7, 14, 28, 90], start=1):
    plt.subplot(1, 5, panel)
    lag_n_plot(daily_sales_sum, lag)

In [None]:
# Lag plots of the all-store daily mean, one panel per lag in a single row
print('lag plot of All Store daily sales mean')
plt.figure(figsize=(16,2))
for panel, lag in enumerate([1, 7, 14, 28, 90], start=1):
    plt.subplot(1, 5, panel)
    lag_n_plot(daily_sales_mean, lag)

In [None]:
# Lag plots of store 2's daily sales, one panel per lag in a single row
print('lag plot of Store2 daily sales')
plt.figure(figsize=(16,2))
for panel, lag in enumerate([1, 7, 14, 28, 90], start=1):
    plt.subplot(1, 5, panel)
    lag_n_plot(new_series_2, lag)

### 6.6 Time series autocorrelation plot

In [None]:
from pandas.plotting import autocorrelation_plot

In [None]:
print('autocorrelation plot of Store2 daily sales')
plt.figure(figsize=(20,4))
plt.xticks([x for x in range(900) if x % 28 == 0]) 
autocorrelation_plot(new_series_2)
plt.show()

In [None]:
print('autocorrelation plot of All Store daily sales mean')
plt.figure(figsize=(20,4))
plt.xticks([x for x in range(900) if x % 28 == 0])  
autocorrelation_plot(daily_sales_mean)
plt.show()

In [None]:
print('autocorrelation plot of All Store daily sales sum')
plt.figure(figsize=(20,4))
plt.xticks([x for x in range(900) if x % 28 == 0]) 
autocorrelation_plot(daily_sales_sum)
plt.show()

###### we can see from above that the sales have strong autocorrelation with lag7, lag 14, lag 28, lag 365.

## 7. Save the table

In [None]:
# Write the cleaned, joined table for the next notebook. The row index is not
# written; index=False is the documented boolean (None only worked by being falsy).
df_combined.to_csv('df_combined_cleaned.csv', index=False)

###### Next, go to Module 2: Feature Engineering.