# Librerías usadas:

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly as py
import cufflinks as cf
import plotly.express as px
from plotly.offline import download_plotlyjs,iplot,plot,init_notebook_mode
# Enable Plotly's notebook rendering mode.
init_notebook_mode(connected=True)
# go_offline has to be *called*: the bare attribute reference that was here
# evaluated the function object and discarded it, so it never ran.
cf.go_offline()
import warnings
warnings.filterwarnings('ignore')
import scipy.stats as st
from scipy.stats import ttest_1samp,ttest_ind,ttest_rel,f_oneway,chi2_contingency

# Lectura de datos

In [None]:
# Load the four input tables (three of them are zipped CSV files) from the
# ../input directory into DataFrames.
features = pd.read_csv('../input/walmart-recruiting-store-sales-forecasting/features.csv.zip')
stores = pd.read_csv('../input/walmart-recruiting-store-sales-forecasting/stores.csv')
test = pd.read_csv('../input/walmart-recruiting-store-sales-forecasting/test.csv.zip')
train = pd.read_csv('../input/walmart-recruiting-store-sales-forecasting/train.csv.zip')

## Datos

In [None]:
# Print the info() summary of each raw table, with a separator line between
# consecutive tables (info() itself returns None, which is printed as well).
raw_tables = (('Features',features),('stores',stores),('train',train),('test',test))
for position,(title,frame) in enumerate(raw_tables):
  if position > 0:
    print('***'*40,'\n')
  print(title)
  print(frame.info())


## Shape de los datos

In [None]:
# Print the (rows, columns) shape of each raw table.
raw_tables = (('Features',features),('stores',stores),('train',train),('test',test))
last_position = len(raw_tables)-1
for position,(title,frame) in enumerate(raw_tables):
  print(title)
  print(frame.shape)
  if position < last_position:
    print('***'*40,'\n')
  else:
    # The closing separator is printed without the extra blank line.
    print('***'*40)


## Primeras 5 filas

In [None]:
features.head()

In [None]:
stores.head()

In [None]:
train.head()

In [None]:
test.head()

## Unir datos

In [None]:
# Attach the per-store attributes to every feature row (inner join on Store).
df1 = features.merge(stores,how='inner',on='Store')

## Unión para entrenar y probar al mismo tiempo

In [None]:
# Join the enriched feature table onto the train and test rows using the same
# three-column key for both.
join_keys = ['Date','Store','IsHoliday']
df_train = df1.merge(train,how='inner',on=join_keys)
df_test = df1.merge(test,how='inner',on=join_keys)

## Para diferenciar prueba y entrenamiento

In [None]:
# Tag every row with its origin so both sets can be told apart once they are
# concatenated.
for frame,origin in ((df_train,'train'),(df_test,'test')):
  frame['train/test'] = origin

## Agregando columna de ventas semanales



In [None]:
df_test['Weekly_Sales'] = np.nan

## Concatenando datos de entrenamiento y prueba


> Concatenating the data sets to perform data preprocessing on the whole data.



In [None]:
data = pd.concat([df_train,df_test],axis=0,ignore_index=True)

In [None]:
data.shape

In [None]:
data.info()

In [None]:
data.describe()

In [None]:
# Count the rows whose weekly sales are zero or negative. Nothing is removed
# in this cell; it only reports the shape of the matching subset.
data[data['Weekly_Sales']<=0].shape

In [None]:

# Remove training rows with non-positive weekly sales; test rows are kept.
nonpositive_train_idx = data.index[(data['Weekly_Sales']<=0)&(data['train/test']=='train')]
data = data.drop(nonpositive_train_idx)

## Valores nulos

In [None]:
data.isnull().sum()

In [None]:
# Percentage of missing values per column
null_counts = data.isnull().sum()
total_rows = data.shape[0]
(null_counts/total_rows)*100

In [None]:
# Replace missing markdown values with 0.
# The columns are selected by name instead of by position (iloc[:, 4:9]) so
# the cell keeps targeting the right columns if the column order changes.
# NOTE(review): assumes the features table names them MarkDown1..MarkDown5 -
# confirm against features.csv.
markdown_cols = ['MarkDown1','MarkDown2','MarkDown3','MarkDown4','MarkDown5']
data[markdown_cols] = data[markdown_cols].fillna(0)

In [None]:
# Forward-fill missing CPI / Unemployment values with the previous row's value.
# DataFrame.ffill() replaces the deprecated fillna(method='ffill') spelling and
# produces the same result.
data[['CPI','Unemployment']] = data[['CPI','Unemployment']].ffill()

In [None]:
# Comprobar valores nulos de nuevo 
data.isnull().sum()

In [None]:
# Binary flag: 1 when the markdown columns of a row sum to a non-zero value,
# 0 otherwise. Columns are addressed by name rather than by position, and the
# comparison is vectorised instead of a per-row lambda.
# NOTE(review): assumes the columns are named MarkDown1..MarkDown5 - confirm.
markdown_cols = ['MarkDown1','MarkDown2','MarkDown3','MarkDown4','MarkDown5']
data['markdown'] = (data[markdown_cols].sum(axis=1) != 0).astype('int64')

In [None]:
data.head()

In [None]:
# Drop the raw markdown columns now that the binary 'markdown' flag exists.
# Dropping by name (instead of iloc[:, 4:9]) means re-running this cell raises
# a KeyError rather than silently removing the next five columns.
# NOTE(review): assumes the columns are named MarkDown1..MarkDown5 - confirm.
markdown_cols = ['MarkDown1','MarkDown2','MarkDown3','MarkDown4','MarkDown5']
data = data.drop(columns=markdown_cols)
data.head()

In [None]:
# Relative frequency of each department, computed once and reused below.
dept_share = data['Dept'].value_counts(normalize=True)
print(round(dept_share,4).describe(percentiles=[0.20,0.40,0.60,0.80]))

# Lookup table Dept -> Dept_freq, joined back onto every row.
dep = pd.DataFrame(dept_share.values,columns=['Dept_freq'])
dep['Dept'] = dept_share.index
data_n = pd.merge(data,dep,on='Dept',how='inner')

# Bucket the frequency into five ordered labels, then discard the raw
# frequency column.
bins = [0,0.0115,0.0136,0.0149,0.0153,np.inf]
labels = ['rare','less frequent','moderately frequent','very frequent','most frequent']
data_n['Dep_type'] = pd.cut(data_n['Dept_freq'],bins=bins,labels=labels)
data_n = data_n.drop('Dept_freq',axis=1)

In [None]:
# Parse the Date column into datetime values using the YYYY-MM-DD format
data_n['Date'] = pd.to_datetime(data_n['Date'],format='%Y-%m-%d',)


In [None]:
# Only the week number and the year of each date are needed
from datetime import date as dt  # not used in this cell; kept in case other cells rely on the name
# Series.dt.week was deprecated in pandas 1.1 and removed in 2.0.
# dt.isocalendar().week yields the same ISO week numbers; it is cast back to
# int64 because isocalendar() returns an unsigned nullable integer dtype.
data_n['Week'] = data_n['Date'].dt.isocalendar().week.astype('int64')
data_n['year'] = data_n['Date'].dt.year

In [None]:
data_n[data_n['IsHoliday']==True][['Date','Week','year','IsHoliday']].drop_duplicates()

In [None]:
# Date has been decomposed into Week/year above, so the raw column is dropped
data_n = data_n.drop(columns=['Date'])

In [None]:
data_n.head()

In [None]:
# Report the number of rows and columns
n_rows,n_cols = data_n.shape[0],data_n.shape[1]
print('Num filas = {}\nNum columnas = {}'.format(n_rows,n_cols))

In [None]:
# Treat the identifier/flag columns as categorical by casting them to object
as_object_cols = ['Store','markdown','Dept','Dep_type','IsHoliday']
data_n[as_object_cols] = data_n[as_object_cols].astype('object')


In [None]:
# Column-name indexes: object-dtype (categorical) columns without the
# 'train/test' marker, and numeric columns
cat_col = data_n.select_dtypes(include='object').columns.drop('train/test')
num_col = data_n.select_dtypes(include='number').columns

## Boxplots

In [None]:
# One horizontal boxplot per numeric column, stacked vertically.
plt.figure(figsize=(15,20))
n_plots = len(num_col)  # size the grid from the data instead of a fixed 13 rows
for i,col in enumerate(num_col,1):
  plt.subplot(n_plots,1,i)
  # Pass the data as x=: a bare positional Series is interpreted differently
  # (or rejected) by newer seaborn releases.
  sns.boxplot(x=data_n[col])
  plt.ylabel(col)
plt.show()

In [None]:
# Share of values outside the Tukey fences (1.5 * IQR beyond the quartiles)
# for every numeric column.
for col in num_col:
  values = data_n[col]
  q1,q3 = values.quantile([0.25,0.75])
  iqr = q3-q1
  uw = q3+1.5*iqr
  lw = q1-1.5*iqr
  # Vectorised count replaces the per-element Python loop; NaN compares False
  # on both sides, so missing values are still not counted as outliers.
  count = int(((values<lw)|(values>uw)).sum())
  # The label says "percentage", so scale the fraction by 100.
  print('Porcentaje de outliers '+col+' ={}'.format(100*count/values.shape[0]))

In [None]:
# Correlation matrix of the numeric columns only: DataFrame.corr() raises on
# object columns in pandas >= 2.0 instead of silently skipping them.
cr = data_n.select_dtypes(include='number').corr()
plt.figure(figsize=(10,10), dpi=80)
# Only correlations with |r| >= 0.4 are shown; the rest are masked to NaN.
sns.heatmap(cr[(cr>=0.4)|(cr<=-0.4)],annot=True,cmap='coolwarm')

In [None]:
import matplotlib.pylab as pylab
# Global matplotlib font sizes: extra-large axis labels/titles and extra-small
# tick labels for the plots that follow.
params = {'axes.labelsize':'x-large',
          'axes.titlesize':'x-large',
          'xtick.labelsize':'x-small',
          'ytick.labelsize':'x-small'}
pylab.rcParams.update(params)

In [None]:
# Count plots of the low-cardinality categorical columns on a 2x2 grid.
fig,axes = plt.subplots(2,2,figsize=(15,10),dpi=100)
low_card_cols = cat_col.drop(['Store','Dept'])
for idx,col in enumerate(low_card_cols):
  row,col1 = divmod(idx,2)  # grid position of the idx-th plot
  target_ax = axes[row,col1]
  sns.countplot(x = data_n[col],ax=target_ax)
plt.show()

In [None]:
# Count plots of the two high-cardinality columns, one per row.
fig,axes = plt.subplots(2,1,figsize=(25,8),dpi=100)
for row,col in enumerate(['Store','Dept']):
  target_ax = axes[row]
  sns.countplot(x = data_n[col],ax=target_ax,palette='icefire')
plt.show()

In [None]:
# Split the combined frame back into its train and test parts via the marker
origin = data_n['train/test']
data_n_train = data_n[origin=='train']
data_n_test = data_n[origin=='test']

In [None]:
# Switch the tick labels to 'medium' size for the following plots; axis
# label/title sizes stay extra-large.
params1 = {'axes.labelsize':'x-large',
          'axes.titlesize':'x-large',
          'xtick.labelsize':'medium',
          'ytick.labelsize':'medium'}
pylab.rcParams.update(params1)

In [None]:
# Mean weekly sales for every (year, week) pair of the training data, drawn
# as one line.
data_n_train.groupby(['year','Week']).agg({'Weekly_Sales':['mean']}).plot(figsize=(18,8))
plt.ylabel('Average weekly Sales')
plt.show()

In [None]:
# Mean weekly sales per week number, one series per year.
week_sales_2010 = data_n_train[data_n_train['year']==2010].groupby('Week')['Weekly_Sales'].agg('mean')
week_sales_2011 = data_n_train[data_n_train['year']==2011].groupby('Week')['Weekly_Sales'].agg('mean')
week_sales_2012 = data_n_train[data_n_train['year']==2012].groupby('Week')['Weekly_Sales'].agg('mean')
plt.figure(figsize=(18,8),dpi=100)
for year_label,series in (('2010',week_sales_2010),('2011',week_sales_2011),('2012',week_sales_2012)):
  # x/y must be keywords: seaborn >= 0.12 no longer accepts them positionally.
  # Labelling each line ties the legend entry to the right series.
  sns.lineplot(x=series.index,y=series.values,label=year_label)
plt.legend()
plt.ylabel('Average Weekly Sales')
# arange's stop is exclusive, so 53 is needed to get a tick for week 52.
plt.xticks(np.arange(1,53,step=1))
plt.show()

In [None]:
#
plt.figure(figsize=(15,10), dpi=100)
sns.barplot(x='Store',y='Weekly_Sales',data=data_n_train,color='grey')


In [None]:
# Training weekly sales grouped by store id
store_grp = data_n_train['Weekly_Sales'].groupby(data_n_train['Store'])



**Hipótesis**

    H0: Datos normales
    H1: Datos no normales

In [None]:
# Shapiro-Wilk normality test of weekly sales for every store.
# Iterating the groupby yields each store id that is actually present, so the
# loop no longer assumes the ids are exactly 1..N.
for store_id,store_sales in store_grp:
  print('Store '+str(store_id))
  print()
  print(st.shapiro(store_sales))
  print('***'*40)

**Hypothesis for checking equality of means between samples:**

    H0: Medias iguales
    H1: Medias distintas

In [None]:
# Kruskal-Wallis test across the weekly-sales samples of all stores.
# The samples are taken from the groupby itself instead of 45 hard-coded
# get_group calls, so the cell works for any set of store ids.
st.kruskal(*[store_sales for _store_id,store_sales in store_grp])

In [None]:
plt.figure(figsize=(20,10), dpi=100)
sns.barplot(x='Dept',y='Weekly_Sales',data=data_n_train)


In [None]:
# Weekly sales per store Type as boxplots, with outlier points hidden
plt.figure(figsize=(15,10))
sns.boxplot(x='Type',y='Weekly_Sales',data=data_n_train,showfliers=False)


In [None]:
# Weekly-sales samples of the training data, one per store type
train_sales = data_n_train['Weekly_Sales']
store_type = data_n_train['Type']
df_typeA = train_sales[store_type=='A']
df_typeB = train_sales[store_type=='B']
df_typeC = train_sales[store_type=='C']


In [None]:
# Shapiro-Wilk p-value for each store-type sample (A, B, C in that order)
type_samples = (df_typeA,df_typeB,df_typeC)
for i,sample in enumerate(type_samples):
  shapiro_result = st.shapiro(sample)
  s,p = shapiro_result[0],shapiro_result[1]
  print('The P-value for the above test for the sample '+str(i)+' ={}'.format(p))

In [None]:
print(st.kruskal(df_typeA,df_typeB,df_typeC))

In [None]:
plt.figure(figsize=(15,8), dpi=100)
sns.boxplot(x='Dep_type',y='Weekly_Sales',data=data_n_train,hue='IsHoliday',showfliers=False)

In [None]:
# Side by side: weekly sales per department-frequency class as a boxplot
# (left) and as a violin plot split by the holiday flag (right).
plt.figure(figsize=(20,8), dpi=100)
plt.subplot(1,2,1)
sns.boxplot(x='Dep_type',y='Weekly_Sales',data=data_n_train)
plt.subplot(1,2,2)
sns.violinplot(x='Dep_type',y='Weekly_Sales',data=data_n_train,hue='IsHoliday')

In [None]:
# Weekly-sales samples, one per department-frequency class.
# df_dt4 and df_dt5 previously repeated the 'moderately frequent' filter
# (copy-paste slip), so 'less frequent' and 'rare' were never tested and the
# same sample entered the tests three times.
df_dt1 = data_n_train[data_n_train['Dep_type']=='most frequent']['Weekly_Sales']
df_dt2 = data_n_train[data_n_train['Dep_type']=='very frequent']['Weekly_Sales']
df_dt3 = data_n_train[data_n_train['Dep_type']=='moderately frequent']['Weekly_Sales']
df_dt4 = data_n_train[data_n_train['Dep_type']=='less frequent']['Weekly_Sales']
df_dt5 = data_n_train[data_n_train['Dep_type']=='rare']['Weekly_Sales']

In [None]:
# Shapiro-Wilk p-value for each department-frequency sample
dep_type_samples = (df_dt1,df_dt2,df_dt3,df_dt4,df_dt5)
for i,sample in enumerate(dep_type_samples):
  shapiro_result = st.shapiro(sample)
  s,p = shapiro_result[0],shapiro_result[1]
  print('The P-value for the above test for the sample '+str(i)+' ={}'.format(p))

In [None]:
print(st.kruskal(df_dt1,df_dt2,df_dt3,df_dt4,df_dt5))

In [None]:
# Average sales of each store on regular weeks vs. holiday weeks.
# The bars are drawn on an explicitly created axes: calling plt.figure() and
# then DataFrame.plot(figsize=...) left the first figure empty because pandas
# opened a second one.
holiday_fig,holiday_ax = plt.subplots(figsize=(15,10),dpi=100)
pd.crosstab(index = data_n_train['Store'],columns = data_n_train['IsHoliday'],values = data_n_train['Weekly_Sales'],aggfunc='mean').plot(kind='bar',ax=holiday_ax)

In [None]:
data_n_train['Weekly_Sales'].groupby(data_n_train['markdown']).mean().plot(kind='bar')

In [None]:
# Training weekly sales grouped by the binary markdown flag
mark_grp = data_n_train['Weekly_Sales'].groupby(data_n_train['markdown'])

In [None]:
# Shapiro-Wilk p-value of weekly sales for each markdown flag value
n_flag_values = data_n_train['markdown'].nunique()
for i in range(n_flag_values):
  print('markdown = {}'.format(i))
  print()
  flag_sales = mark_grp.get_group(i)
  shapiro_result = st.shapiro(flag_sales)
  print('P-value = {}'.format(shapiro_result[1]))
  print('***'*40)

In [None]:
# Mann-Whitney U test between sales without (0) and with (1) markdowns
sales_without_md = mark_grp.get_group(0)
sales_with_md = mark_grp.get_group(1)
mw_result = st.mannwhitneyu(sales_without_md,sales_with_md)
print('P-value = {}'.format(mw_result[1]))

In [None]:
# Weekly sales against each continuous feature on a 2x2 grid of scatter plots
plt.figure(figsize=(10,10), dpi=100)
scatter_features = ['Temperature','Fuel_Price','CPI','Unemployment']
for panel,feature in enumerate(scatter_features,1):
  plt.subplot(2,2,panel)
  sns.scatterplot(x=feature,y='Weekly_Sales',data=data_n_train)
plt.show()

In [None]:
# Modelling frame for training: drop the origin marker and the year
df_new_train = data_n_train.drop(columns=['train/test','year'])
df_new_train.head()

In [None]:
# Modelling frame for test: same as train, and without the (empty) target
df_new_test = data_n_test.drop(columns=['train/test','year','Weekly_Sales'])
df_new_test.head()

In [None]:
df_new_train = df_new_train.sample(frac=1,random_state=10).reset_index(drop=True)
df_new_train.head()

In [None]:
df_new_test = df_new_test.sample(frac=1,random_state=10).reset_index(drop=True)
df_new_test.head()

In [None]:
df_new_train.info()

In [None]:
df_new_test.info()

In [None]:
# Remove the identifier columns from both modelling frames
id_cols = ['Dept','Store']
df_new_train = df_new_train.drop(columns=id_cols)
df_new_test = df_new_test.drop(columns=id_cols)


In [None]:
# Ordinal-encode the categorical columns of the training frame.
# Each result is cast to int64 explicitly: relying on replace() to downcast an
# object column to integers is deprecated pandas behaviour, and without the
# cast the columns can stay object-typed and break the numeric steps later.
df_new_train['IsHoliday'] = df_new_train['IsHoliday'].replace({False:0,True:1}).astype('int64')
df_new_train['Type'] = df_new_train['Type'].replace({'A':3,'B':2,'C':1}).astype('int64')
df_new_train['Dep_type'] = df_new_train['Dep_type'].replace({'most frequent':5,'very frequent':4,'moderately frequent':3,'less frequent':2,'rare':1}).astype('int64')
df_new_train['markdown'] = df_new_train['markdown'].astype('int64')


In [None]:
# Ordinal-encode the categorical columns of the test frame with the same
# mappings used for training.
# Each result is cast to int64 explicitly: relying on replace() to downcast an
# object column to integers is deprecated pandas behaviour.
df_new_test['IsHoliday'] = df_new_test['IsHoliday'].replace({False:0,True:1}).astype('int64')
df_new_test['Type'] = df_new_test['Type'].replace({'A':3,'B':2,'C':1}).astype('int64')
df_new_test['Dep_type'] = df_new_test['Dep_type'].replace({'most frequent':5,'very frequent':4,'moderately frequent':3,'less frequent':2,'rare':1}).astype('int64')
df_new_test['markdown'] = df_new_test['markdown'].astype('int64')


In [None]:
df_new_train.head()

In [None]:
df_new_test.head()

In [None]:
df_new_train.dtypes

In [None]:
# Cap (winsorize) feature outliers: values beyond the Tukey fences are pulled
# back to the fence; no rows are removed. The target and the flag/week columns
# are left untouched.
for col in df_new_train.columns.drop(['Weekly_Sales','IsHoliday','markdown','Week']):
  q1,q3 = df_new_train[col].quantile([0.25,0.75])
  iqr = q3-q1
  lw = q1-1.5*iqr
  uw = q3+1.5*iqr
  # A single vectorised clip replaces two row-wise apply(lambda) passes.
  df_new_train[col] = df_new_train[col].clip(lower=lw,upper=uw)



In [None]:
# Boxplots of the capped feature columns, to confirm that no outliers remain
box_cols = df_new_train.columns.drop(['Weekly_Sales','IsHoliday','markdown','Week'])
plt.figure(figsize=(15,10))
for i,col in enumerate(box_cols,1):
  # Grid height follows the number of columns instead of a hard-coded 7.
  plt.subplot(len(box_cols),1,i)
  # x= keyword: a bare positional Series is interpreted differently (or
  # rejected) by newer seaborn releases.
  sns.boxplot(x=df_new_train[col])
  plt.ylabel(col)
# Lay the figure out once, after all subplots exist.
plt.tight_layout()
plt.show()

In [None]:
# Distribution of every column of the training frame.
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale
# is the replacement seaborn recommends.
dist_cols = df_new_train.columns
n_rows = -(-len(dist_cols)//3)  # ceiling division: three plots per row
plt.figure(figsize=(15,15))
for i,col in enumerate(dist_cols,1):
  plt.subplot(n_rows,3,i)
  sns.histplot(x=df_new_train[col],kde=True,stat='density')
# Lay the figure out once, after all subplots exist.
plt.tight_layout()

# Se crearon dos conjuntos de datos con y sin outliers

In [None]:
# Tukey fences for the target: lw/uw lie 1.5 * IQR below the first quartile
# and above the third quartile of Weekly_Sales.
q1,q3 = df_new_train['Weekly_Sales'].quantile([0.25,0.75])
iqr = q3-q1
lw = q1-1.5*iqr
uw = q3+1.5*iqr

In [None]:
# Data set without target outliers.
# between() is inclusive on both ends, so rows lying exactly on a fence are
# kept here: the outlier definition is strictly < lw or > uw, and with strict
# inequalities on both sides those rows belonged to neither data set.
df_outliersna = df_new_train[df_new_train['Weekly_Sales'].between(lw,uw)]

In [None]:
df_outliersna.head()

In [None]:
df_outliersna.info()

In [None]:
df_outliersna.describe()

In [None]:
# Distribution of every column of the outlier-free data set.
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale
# is the replacement seaborn recommends.
dist_cols = df_outliersna.columns
n_rows = -(-len(dist_cols)//3)  # ceiling division: three plots per row
plt.figure(figsize=(15,20))
for i,col in enumerate(dist_cols,1):
  plt.subplot(n_rows,3,i)
  sns.histplot(x=df_outliersna[col],kde=True,stat='density')
# Lay the figure out once, after all subplots exist.
plt.tight_layout()

In [None]:
# Annotated correlation heatmap of the outlier-free data set
corr_no_outliers = df_outliersna.corr()
plt.figure(figsize=(10,10))
sns.heatmap(corr_no_outliers,annot=True,cmap='coolwarm')
plt.show()

In [None]:
# Data set containing only the target outliers (Weekly_Sales strictly outside
# the lw/uw fences)
df_outliers = df_new_train[(df_new_train['Weekly_Sales']<lw)|(df_new_train['Weekly_Sales']>uw)]

In [None]:
df_outliers.head()

In [None]:
df_outliers.info()

In [None]:
df_outliers.describe()

In [None]:
# Distribution of every column of the outliers-only data set.
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale
# is the replacement seaborn recommends.
dist_cols = df_outliers.columns
n_rows = -(-len(dist_cols)//3)  # ceiling division: three plots per row
plt.figure(figsize=(15,20))
for i,col in enumerate(dist_cols,1):
  plt.subplot(n_rows,3,i)
  sns.histplot(x=df_outliers[col],kde=True,stat='density')
# Lay the figure out once, after all subplots exist.
plt.tight_layout()

In [None]:
# Annotated correlation heatmap of the outliers-only data set
corr_outliers = df_outliers.corr()
plt.figure(figsize=(10,10))
sns.heatmap(corr_outliers,annot=True,cmap='coolwarm')
plt.show()

# Modelo total

In [None]:

# Natural-log transform of every column except the encoded categorical ones.
# NOTE: the transform is applied in place, so running this cell twice logs
# the data twice.
log_cols = df_new_train.columns.drop(['IsHoliday','Type','markdown','Dep_type'])
df_new_train[log_cols] = np.log(df_new_train[log_cols])

In [None]:
# Distribution of every column after the log transform.
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale
# is the replacement seaborn recommends.
dist_cols = df_new_train.columns
n_rows = -(-len(dist_cols)//3)  # ceiling division: three plots per row
plt.figure(figsize=(15,15))
for i,col in enumerate(dist_cols,1):
  plt.subplot(n_rows,3,i)
  sns.histplot(x=df_new_train[col],kde=True,stat='density')
# Lay the figure out once, after all subplots exist.
plt.tight_layout()

# Modelo

In [None]:
# Independent variables (every column except the target) and dependent
# variable (Weekly_Sales)
x = df_new_train.drop('Weekly_Sales',axis=1)
y = df_new_train['Weekly_Sales']

In [None]:
import statsmodels.api as sm
# OLS does not add an intercept by itself, so a constant column is appended
# to the design matrix before fitting.
xc = sm.add_constant(x)
model = sm.OLS(y,xc).fit()
model.summary()

In [None]:
# Refit the OLS model without the markdown column
xc1 = xc.drop(columns=['markdown'])
ols_without_markdown = sm.OLS(y,xc1)
model1 = ols_without_markdown.fit()
model1.summary()

In [None]:
model1.params

In [None]:
resids = model1.resid

In [None]:
# model1 was fitted on xc1 (the design matrix without 'markdown'), so the
# predictions must use xc1 too; xc has one column more than the model has
# coefficients.
y_pred = model1.predict(xc1)

In [None]:
# Jarque-Bera normality test on the residuals.
# The statistic and the p-value are reported separately; the whole result
# tuple used to be printed under the "P-value" label.
jb_result = st.jarque_bera(resids)
print('JB statistic = {}, P-value = {}'.format(jb_result[0],jb_result[1]))

In [None]:
# Residual distribution with a fitted normal curve overlaid for comparison.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases - confirm
# the installed version still provides it (histplot has no fit= option).
sns.distplot(resids,fit=st.norm)

In [None]:
st.probplot(resids,plot=plt)
plt.show()

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif
# Variance inflation factor of every design-matrix column, largest first
design_matrix = xc.values
vif_scores = [vif(design_matrix,position) for position in range(xc.shape[1])]
pd.DataFrame(vif_scores,index = xc.columns,columns=['vif']).sort_values('vif',ascending=False)

In [None]:
import statsmodels.stats.api as sms
# Goldfeld-Quandt heteroscedasticity test.
# resids come from model1, which was fitted on xc1, so the test is run
# against that same design matrix rather than xc (which still holds the
# dropped 'markdown' column).
print(sms.het_goldfeldquandt(resids,xc1))

In [None]:
print(sm.stats.diagnostic.linear_rainbow(model1))

In [None]:
df2 = df_new_train.sample(frac=0.1,random_state = 10).reset_index(drop=True)
df2.head()

In [None]:
df2.shape

In [None]:
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV,cross_val_score,KFold,train_test_split
# Features/target of the sampled frame df2, split 80/20 with a fixed seed.
# Note that x and y are rebound here and no longer refer to the OLS inputs.
x = df2.drop('Weekly_Sales',axis=1)
y = df2['Weekly_Sales']
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state = 42)

In [None]:
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,StackingClassifier
import lightgbm as lgb

In [None]:
# One estimator instance per regression algorithm, all with default
# hyper-parameters (no random_state is set on any of them).
lr = LinearRegression()
dtr = DecisionTreeRegressor()
rfr = RandomForestRegressor()
lgbmr = lgb.LGBMRegressor()

In [None]:
def model_res(algo,x_train=x_train,x_test=x_test,y_train=y_train,y_test=y_test):
  algo.fit(x_train,y_train)
  cof_df = pd.DataFrame(algo.coef_,index=x_train.columns,columns=['Coefs_lr'])
  print(cof_df)
  print()
  print('Intercept = {}'.format(algo.intercept_))
  print('***'*40)
  y_pred_train = algo.predict(x_train)
  y_pred_test = algo.predict(x_test)

  print('Evolución de modelo con datos de entrenamiento')
  print('R-cuad = {}'.format(r2_score(y_train,y_pred_train)))
  print('RMSE = {}'.format(np.sqrt(mean_squared_error(y_train,y_pred_train))))
  print('MAE = {}'.format(mean_absolute_error(y_train,y_pred_train)))
  print('***'*40)
  print('Evolución de modelo con los de pruebas')
  print('R-cuad = {}'.format(r2_score(y_test,y_pred_test)))
  print('RMSE = {}'.format(np.sqrt(mean_squared_error(y_test,y_pred_test))))
  print('MAE = {}'.format(mean_absolute_error(y_test,y_pred_test)))