In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from matplotlib import pyplot as plt 
import seaborn as sns
%matplotlib inline

In [None]:
# Load the sales table (with `Date` parsed as datetime) and the store table.
train_df = pd.read_csv('../input/rossmann-store-sales/train.csv', low_memory=False, parse_dates=['Date'])
store_df = pd.read_csv('../input/rossmann-store-sales/store.csv')
# Join the store columns onto every sales row. `validate` makes the merge fail loudly
# if store_df ever contains a Store id more than once, which would otherwise
# silently duplicate sales rows in merged_df.
merged_df = train_df.merge(store_df, on='Store', validate='many_to_one')
merged_df.head()

In [None]:
# Print train_df's column names, dtypes and non-null counts.
train_df.info()

In [None]:
# Preview the first five rows of store_df.
store_df.head(5)

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for merged_df.
merged_df.describe()

In [None]:
# Print merged_df's column names, dtypes and non-null counts.
merged_df.info()

## Visualization

In [None]:
# Print store_df's column names, dtypes and non-null counts.
store_df.info()

In [None]:
# Store: draw the Store column of store_df against the row index.
ax = sns.lineplot(x=store_df.index, y=store_df['Store'])
ax.set(xlabel='Index')
ax.set_title('Store values')

In [None]:
# StoreType: pie chart of how often each StoreType value occurs in store_df.
store_types = store_df['StoreType'].value_counts().sort_values(ascending=False)
ax = store_types.plot(kind='pie', figsize=(10, 10), startangle=90, autopct="%.1f%%")
ax.set_title('StoreType pie chart')

In [None]:
# Assortment: pie chart of how often each Assortment value occurs in store_df.
assortments = store_df['Assortment'].value_counts().sort_values(ascending=False)
ax = assortments.plot(kind='pie', figsize=(10, 10), startangle=90, autopct="%.1f%%")
ax.set_title('Assortment pie chart')

In [None]:
# CompetitionDistance: histogram of the column over all rows of store_df.
fig, ax = plt.subplots(figsize=(15, 5))
g = sns.histplot(x='CompetitionDistance', data=store_df, ax=ax)
g.set_title('CompetitionDistance Histogram')

In [None]:
# CompetitionOpenSinceYear: histogram of the column over all rows of store_df.
fig, ax = plt.subplots(figsize=(15, 5))
g = sns.histplot(x='CompetitionOpenSinceYear', data=store_df, ax=ax)
g.set_title('CompetitionOpenSinceYear Histogram')

In [None]:
# Promo2: pie chart of how often each Promo2 value occurs in store_df.
promo2s = store_df['Promo2'].value_counts().sort_values(ascending=False)
ax = promo2s.plot(kind='pie', figsize=(10, 10), startangle=90, autopct="%.1f%%")
ax.set_title('Promo2 Pie chart')

In [None]:
# Promo2SinceYear: histogram with one bar per distinct year (discrete bins).
fig, ax = plt.subplots(figsize=(15, 5))
g = sns.histplot(x='Promo2SinceYear', data=store_df, discrete=True, ax=ax)
g.set_title('Promo2SinceYear Histogram')

In [None]:
# PromoInterval: pie chart of how often each PromoInterval value occurs in store_df.
promo_intervals = store_df['PromoInterval'].value_counts().sort_values(ascending=False)
ax = promo_intervals.plot(kind='pie', figsize=(10, 10), startangle=90, autopct="%.1f%%")
ax.set_title('PromoInterval pie chart')

In [None]:
# Date: draw the Date column of train_df against the row index.
ax = train_df['Date'].plot(kind='line', title='Date values')
ax.set_xlabel('Index')
ax.set_ylabel('Date')

In [None]:
# DayOfWeek: pie chart of how often each DayOfWeek value occurs in train_df.
day_of_weeks = train_df['DayOfWeek'].value_counts().sort_values(ascending=False)
ax = day_of_weeks.plot(kind='pie', figsize=(10, 10), startangle=90, autopct="%.1f%%")
ax.set_title('DayOfWeek pie chart')

In [None]:
# Sales: histogram over merged_df, with the count axis on a log scale.
fig, ax = plt.subplots(figsize=(15, 5))
g = sns.histplot(x='Sales', data=merged_df, ax=ax)
g.set_yscale('log')
ax.set_title('Sales Histogram')

In [None]:
# Customers: histogram over merged_df, with the count axis on a log scale.
fig, ax = plt.subplots(figsize=(15, 5))
g = sns.histplot(x='Customers', data=merged_df, ax=ax)
g.set_yscale('log')
ax.set_title('Customers Histogram')

In [None]:
# Open: pie chart of how often each Open value occurs in train_df.
opens = train_df['Open'].value_counts().sort_values(ascending=False)
ax = opens.plot(kind='pie', figsize=(10, 10), startangle=90, autopct="%.1f%%")
ax.set_title('Open pie chart')

In [None]:
# Promo: pie chart of how often each Promo value occurs in train_df.
promos = train_df['Promo'].value_counts().sort_values(ascending=False)
ax = promos.plot(kind='pie', figsize=(10, 10), startangle=90, autopct="%.1f%%")
ax.set_title('Promo pie chart')

In [None]:
# StateHoliday: pie chart of how often each StateHoliday value occurs in train_df.
state_holidays = train_df['StateHoliday'].value_counts().sort_values(ascending=False)
fig, ax = plt.subplots(figsize=(7, 7))
_, labels, pct_texts = ax.pie(
    state_holidays,
    labels=state_holidays.index,
    autopct="%.1f%%",
    startangle=160,
    rotatelabels=True,
    pctdistance=0.75,
)
# Rotate each percentage text to the same angle as the wedge label it belongs to.
for wedge_label, wedge_pct in zip(labels, pct_texts):
    wedge_pct.set_rotation(wedge_label.get_rotation())
ax.set_title('State Holiday pie chart')
ax.axis('equal')
fig.tight_layout()
plt.show()

In [None]:
# SchoolHoliday: pie chart of how often each SchoolHoliday value occurs in train_df.
school_holidays = train_df['SchoolHoliday'].value_counts().sort_values(ascending=False)
ax = school_holidays.plot(kind='pie', startangle=90, figsize=(10, 10), autopct="%.1f%%")
ax.set_title('SchoolHoliday pie chart')

### Correlation

In [None]:
# Pairwise correlation of the numeric columns only. merged_df also holds non-numeric
# columns (e.g. the parsed `Date`), and `DataFrame.corr()` raises on those in
# pandas >= 2.0 instead of silently skipping them.
corr = merged_df.select_dtypes(include='number').corr()
# Boolean mask hiding the strictly-upper triangle (k=1 keeps the diagonal visible);
# the matrix is symmetric, so the hidden cells only mirror the shown ones.
mask = np.triu(np.ones_like(corr, dtype=bool), k=1)
sns.heatmap(corr, linewidths=.5, cmap="Paired", mask=mask)

In [None]:
# Compare the mean of Customers between rows whose CompetitionDistance is above the
# median and rows whose CompetitionDistance is at or below it.
med_distance = merged_df['CompetitionDistance'].median()
distance = merged_df['CompetitionDistance']
is_far = distance > med_distance
# `NaN > med_distance` evaluates to False, so rows with a missing distance must be
# excluded explicitly; otherwise they are silently counted as "lower distance".
is_near = distance.notna() & ~is_far
lower_med, higher_med = merged_df.index[is_near], merged_df.index[is_far]
# Kept as a {False: ..., True: ...} mapping of row labels, like the former `.groups` result.
competition_distance_df = {False: lower_med, True: higher_med}
print(f"lower distance = {merged_df.loc[lower_med, 'Customers'].mean()}\nhigher distance = {merged_df.loc[higher_med, 'Customers'].mean()}")
# => Author's observation: stores far from a competitor have fewer customers per day on
# average than stores close to a competitor; possibly those areas are more densely populated.

In [None]:
# StateHoliday vs Open: mean of Open per StateHoliday value
# (the share of open days, assuming Open is a 0/1 flag).
state_holidays = merged_df.groupby('StateHoliday')['Open']
state_holiday_open_rates = state_holidays.mean()
# StateHoliday values are unordered categories, so draw bars instead of a connecting
# line, which would suggest a trend between the categories.
ax = state_holiday_open_rates.plot.bar(title='Open rate by StateHoliday')
ax.set_ylabel('Mean of Open')
# => Author's observation: on state holidays a, b and c the stores are almost always closed.

In [None]:
# Effect of a drugstore's participation in Promo2 on its sales.
fig, ax = plt.subplots(figsize=(15, 10))
sns.boxplot(data=merged_df, x="Promo2", y="Sales", ax=ax)
# => Drugstores that take part in Promo2 have lower sales than those that do not,
# possibly because these drugstores already had very low sales to begin with.

In [None]:
# DayOfWeek vs Sales: bar plot of Sales per DayOfWeek value over all rows of merged_df.
fig, ax = plt.subplots(figsize=(15, 10))
# The freshly created axes is the current one; pass it explicitly for clarity.
sns.barplot(data=merged_df, x="DayOfWeek", y="Sales", ax=ax)
# => Sunday sales is low

In [None]:
# Promo vs Sales, restricted to rows where Open == 1.
sns.boxplot(x='Promo', y='Sales', data=merged_df.loc[merged_df['Open'].eq(1)])
# => Promo helps increase sales

In [None]:
# Customers vs Sales: scatter plot over all rows of merged_df.
sns.relplot(x='Customers', y='Sales', data=merged_df, kind='scatter')
# => TODO: record the conclusion drawn from this plot.