**Importing all the libraries that we will need**

In [30]:
#Import necessary packages for data analysis
import gc; gc.enable()
import os
import random as rd # random jitter for map markers
from operator import add # elementwise addition

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing, linear_model, metrics
#Machine Learning Algorithms
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
import folium # plotting data on interactive maps

sns.set(style = 'whitegrid', color_codes = True)
%matplotlib inline

**Loading the Data**

In [2]:
# Directory holding the competition CSV files.
PATH = "../input/favorita-grocery-sales-forecasting"
# List the available input files with the standard library rather than
# shelling out to `ls`, so the cell does not depend on an external binary.
print("\n".join(sorted(os.listdir(PATH))))

In [3]:
# Explicit dtypes for the two large sales tables; `onpromotion` is read as a
# string so that df_transform can map 'True'/'False' and leave blanks missing.
dtypes = {'id':'int64', 'item_nbr':'int32', 'store_nbr':'int8', 'onpromotion':str}


def _read_input(name, **kwargs):
    """Read one CSV from the input directory PATH, forwarding kwargs to pd.read_csv."""
    return pd.read_csv(os.path.join(PATH, name), **kwargs)


# Every raw table, keyed by a short name. Entries are deleted from this dict
# as soon as they have been merged, to release memory.
data = {
    'train_1': _read_input('train.csv', dtype=dtypes, parse_dates=['date']),
    'test': _read_input('test.csv', dtype=dtypes, parse_dates=['date']),
    'items': _read_input('items.csv'),
    'stores': _read_input('stores.csv'),
    'trans': _read_input('transactions.csv', parse_dates=['date']),
    'holidays': _read_input('holidays_events.csv', dtype={'transferred':str}, parse_dates=['date']),
    'oil': _read_input('oil.csv', parse_dates=['date']),
    }

**Data processing**

In [4]:
# Keep only rows dated 16-31 August (of any year) from the raw training table.
second_half_august = (data['train_1']['date'].dt.month == 8) & (data['train_1']['date'].dt.day > 15)
# .copy() makes `train` an independent frame, so the column assignment below
# is not a chained assignment on a slice of the raw table.
train = data['train_1'][second_half_august].copy()
del data['train_1'], second_half_august; gc.collect();
# Floor negative unit_sales at zero, then move the target to log1p space.
# clip() builds a new array instead of writing into `.values`, which is
# read-only when pandas Copy-on-Write is enabled.
target = train['unit_sales'].clip(lower=0.).to_numpy()
train['unit_sales'] = np.log1p(target)

def df_lbl_enc(df):
    """Label-encode every object-dtype column of ``df`` in place.

    Each such column is replaced by the integer codes produced by a fresh
    ``preprocessing.LabelEncoder`` and its name is printed. The same frame
    object is returned.
    """
    text_columns = [name for name in df.columns if df[name].dtype == 'object']
    for name in text_columns:
        df[name] = preprocessing.LabelEncoder().fit_transform(df[name])
        print(name)
    return df

def df_transform(df):
    """Add calendar features and numeric encodings to a merged sales frame.

    Writes ``yea``/``mon``/``day`` columns, replaces ``date`` with the day of
    week, maps ``onpromotion`` ('False'/'True') to 0/1 and ``perishable``
    (0/1) to the weights 1.0/1.25. The input frame is modified in place; the
    returned frame is a copy in which every missing value is -1.
    """
    stamps = pd.to_datetime(df['date'])
    calendar_parts = (
        ('yea', stamps.dt.year),
        ('mon', stamps.dt.month),
        ('day', stamps.dt.day),
    )
    for name, values in calendar_parts:
        df[name] = values
    # `date` keeps its column position but now holds the weekday number.
    df['date'] = stamps.dt.dayofweek

    promo_codes = {'False': 0, 'True': 1}
    perishable_weights = {0:1.0, 1:1.25}
    df['onpromotion'] = df['onpromotion'].map(promo_codes)
    df['perishable'] = df['perishable'].map(perishable_weights)
    return df.fillna(-1)
#Items data
# Encode the text columns of the items table, then attach the item
# attributes to every train/test row.
data['items'] = df_lbl_enc(data['items'])
train = pd.merge(train, data['items'], how='left', on=['item_nbr'])
test = pd.merge(data['test'], data['items'], how='left', on=['item_nbr'])
del data['test']; gc.collect();
del data['items']; gc.collect();
#Transactions data
train = pd.merge(train, data['trans'], how='left', on=['date','store_nbr'])
test = pd.merge(test, data['trans'], how='left', on=['date','store_nbr'])
del data['trans']; gc.collect();
# Replace negative counts by 0.00015 and move to log1p space. Rows with no
# matching transactions record stay missing. mask() returns a new Series, so
# nothing is written into `.values` (read-only under pandas Copy-on-Write).
target = train['transactions'].mask(train['transactions'] < 0., 0.00015).to_numpy()
train['transactions'] = np.log1p(target)
#Stores data
data['stores'] = df_lbl_enc(data['stores'])
train = pd.merge(train, data['stores'], how='left', on=['store_nbr'])
test = pd.merge(test, data['stores'], how='left', on=['store_nbr'])
del data['stores']; gc.collect();
#Holidays data
# Only national entries are used. .copy() gives an independent frame so the
# `transferred` assignment is not made on a slice.
national = data['holidays'].loc[data['holidays']['locale'] == 'National', ['date','transferred']].copy()
national['transferred'] = national['transferred'].map({'False': 0, 'True': 1})
# Collapse to one row per date (1 if any entry on that date was transferred):
# a date listed more than once would otherwise duplicate every train/test row
# of that date in the left merges below.
data['holidays'] = national.groupby('date', as_index=False)['transferred'].max()
del national
train = pd.merge(train, data['holidays'], how='left', on=['date'])
test = pd.merge(test, data['holidays'], how='left', on=['date'])
del data['holidays']; gc.collect();
#Oil data
train = pd.merge(train, data['oil'], how='left', on=['date'])
test = pd.merge(test, data['oil'], how='left', on=['date'])
del data['oil']; gc.collect();
#Join data
# Apply the shared feature engineering to both frames.
train = df_transform(train)
test = df_transform(test)
# Stage-1 features: everything except the row id, the sales target, the
# perishable weight and the transactions column predicted in this stage.
stage1_excluded = ('id', 'unit_sales', 'perishable', 'transactions')
col = [name for name in train.columns if name not in stage1_excluded]
# Rows from 2016 are held out for validation (x2); every other year trains (x1).
is_2016 = train['yea'] == 2016
x1 = train[~is_2016]
x2 = train[is_2016]
del train, is_2016
gc.collect();
# Stage-1 targets: the transactions column of each split.
y1, y2 = x1['transactions'].values, x2['transactions'].values

In [5]:
# Preview the training split (rows whose year is not 2016).
x1.head()

In [6]:
# Preview the validation split (rows from 2016).
x2.head()

*Unit sales per promotion*

In [7]:
# Bar chart of unit_sales (stored as log1p values) per promotion flag on the
# training split. After df_transform the flag is 0, 1 or -1 (missing).
fig, ax = plt.subplots(figsize=(30, 4))
sns.barplot(x='onpromotion', y='unit_sales', data=x1, ax=ax)
ax.set(title='Unit sales per promotion flag',
       xlabel='onpromotion (0 = no, 1 = yes, -1 = missing)',
       ylabel='log1p(unit_sales)');

*Unit sales per store type*

In [8]:
# Bar chart of unit_sales (stored as log1p values) per store `type` on the
# training split. `type` shows integer codes if df_lbl_enc encoded it
# (it does so when the raw column is text -- TODO confirm).
fig, ax = plt.subplots(figsize=(30, 4))
sns.barplot(x='type', y='unit_sales', data=x1, ax=ax)
ax.set(title='Unit sales per store type',
       xlabel='store type',
       ylabel='log1p(unit_sales)');

*Number of records per city*

In [9]:
# Number of validation (2016) rows per `city` value.
# The column is named directly instead of passing a Series together with `data=`.
fig, ax = plt.subplots(figsize=(30, 4))
sns.countplot(x='city', data=x2, ax=ax)
ax.set(title='Validation rows per city', xlabel='city', ylabel='row count');

*Transactions*

In [10]:
# Line plot of the transactions column (log1p scale, -1 where missing) of the
# validation split, in row order.
t = x2['transactions']
# A single Axes has nothing to share an x-axis with, so `sharex` is dropped.
fig, axis1 = plt.subplots(figsize=(15, 8))
ax1 = t.plot(legend=True, ax=axis1, marker='o', title="Transactions")
ax1.set(xlabel='row index', ylabel='log1p(transactions)');

*Oil price*

In [11]:
# Oil price by day of month for the validation split.
# The price comes from a merge on date alone, so one value per day is enough;
# plotting every sales row would redraw the same point many times over.
# Days without a quote show the -1 placeholder written by df_transform.
oil_plot = x2.groupby('day')['dcoilwtico'].first()
ax = oil_plot.plot(figsize=(30,10), marker='o', title='Oil price (dcoilwtico)')
ax.set(xlabel='day of month', ylabel='dcoilwtico')
plt.show()

**Predict Sales : Random Forest**

In [13]:
# Two random forests of different size and depth, fitted in the next cell.
# random_state makes a re-run reproduce the same trees. warm_start is left at
# its default because with warm_start=True, re-running a fit cell on an
# already fitted model does not retrain it.
r3 = RandomForestRegressor(n_estimators=86, max_depth=4, n_jobs=-1,
                           verbose=0, random_state=42)

r4 = RandomForestRegressor(n_estimators=78, max_depth=3, n_jobs=-1,
                           verbose=0, random_state=42)

In [14]:
#Initialization
def NWRMSLE(y, pred, w):
    """Return the square root of the sample-weighted mean squared error.

    y    -- true values
    pred -- predicted values
    w    -- per-sample weights passed to ``metrics.mean_squared_error``
    """
    weighted_mse = metrics.mean_squared_error(y, pred, sample_weight=w)
    return weighted_mse ** 0.5
# Stage 1: predict the transactions column.
# Both forests are fitted on x1 and scored on the 2016 hold-out x2, weighting
# each row by its perishable weight.
#Model 1
r3.fit(x1[col], y1)
a3 = NWRMSLE(y2, r3.predict(x2[col]), x2['perishable'])

#Model 2
r4.fit(x1[col], y1)
a4 = NWRMSLE(y2, r4.predict(x2[col]), x2['perishable'])

N3 = str(a3)
N4 = str(a4)

# Fill the test-set transactions feature from r4. The original cell also
# predicted with r3 first, but that result was overwritten on the next line
# and never used.
test['transactions'] = r4.predict(test[col])

# Stage 2: predict unit_sales, with transactions now used as a feature.
col = [c for c in x1 if c not in ['id', 'unit_sales','city','cluster','perishable']]
y1 = x1['unit_sales'].values
y2 = x2['unit_sales'].values

# Fresh regressors for the sales target (a random forest and a gradient
# boosting model). They are seeded for reproducibility and built without
# warm_start so every fit() call retrains from scratch.
r3 = RandomForestRegressor(n_estimators=79, max_depth=5, n_jobs=-1,
                           verbose=0, random_state=42)

r4 = GradientBoostingRegressor(n_estimators=120, max_depth=3, learning_rate = 0.05,
                               verbose=0, random_state=42,
                               subsample= 0.65, max_features = 0.35)
#Fit both regressors to our training set and score them on the hold-out
r3.fit(x1[col], y1)
r4.fit(x1[col], y1)
a3 = NWRMSLE(y2, r3.predict(x2[col]), x2['perishable'])
a4 = NWRMSLE(y2, r4.predict(x2[col]), x2['perishable'])
print('model fit')

*Save Predictions Model 1*

In [15]:
#Output file
N3 = str(a3)
# a3 is the weighted RMSE on the log1p scale (lower is better), not an
# accuracy, so it is reported as an error and without the x100 scaling.
print('Validation NWRMSLE = ', a3)
test['unit_sales'] = r3.predict(test[col])
# Lower bound applied after undoing the log transform.
cut = 0.+1e-12 # 0.+1e-15
# The target was trained as log1p(unit_sales); expm1 is its exact inverse and
# is more accurate than exp(x) - 1 for values close to zero.
test['unit_sales'] = np.expm1(test['unit_sales']).clip(lower=cut)

output_file = 'sample_submission_RF1.csv'

# city and cluster are written next to the prediction for later inspection.
test[['id','unit_sales','city','cluster']].to_csv(output_file, index=False, float_format='%.2f')
print('file created')

*Save Predictions Model 2*

In [16]:
# part of the output file name
N4 = str(a4)
# a4 is the weighted RMSE on the log1p scale (lower is better), not an
# accuracy, so it is reported as an error and without the x100 scaling.
print('Validation NWRMSLE = ', a4)
test['unit_sales'] = r4.predict(test[col])
# Lower bound applied after undoing the log transform.
cut = 0.+1e-12 # 0.+1e-15
# The target was trained as log1p(unit_sales); expm1 is its exact inverse and
# is more accurate than exp(x) - 1 for values close to zero.
test['unit_sales'] = np.expm1(test['unit_sales']).clip(lower=cut)

# File name kept as-is because a later cell reads it back under this name.
output_file = 'sample_submission_FR2.csv'

test[['id','unit_sales','city','cluster']].to_csv(output_file, index=False, float_format='%.2f')
print('file created')

**Predict Sales : GradientBoosting**

In [17]:
# Two gradient-boosting variants for the unit-sales target, fitted in the next cell.
# random_state makes the row/feature subsampling reproducible. warm_start is
# left at its default because with warm_start=True, re-running a fit cell on
# an already fitted model does not retrain it.
r5 = GradientBoostingRegressor(n_estimators=110, max_depth=3, learning_rate = 0.05,
                               verbose=0, random_state=42,
                               subsample= 0.7, max_features = 0.35)

r6 = GradientBoostingRegressor(n_estimators=125, max_depth=4, learning_rate = 0.05,
                               verbose=0, random_state=42,
                               subsample= 0.85, max_features = 0.40)

In [18]:
# Feature list and targets for the unit-sales models, defined before fitting
# so this cell does not depend on values left over from the previous section.
col = [c for c in x1 if c not in ['id', 'unit_sales','city','cluster','perishable']]
y1 = x1['unit_sales'].values
y2 = x2['unit_sales'].values

r5.fit(x1[col], y1)
r6.fit(x1[col], y1)
a5 = NWRMSLE(y2, r5.predict(x2[col]), x2['perishable'])
a6 = NWRMSLE(y2, r6.predict(x2[col]), x2['perishable'])
N5 = str(a5)
N6 = str(a6)
# r5 and r6 predict unit_sales, so their output must not be stored in
# test['transactions']. The original cell did that, replacing the transactions
# feature with sales predictions before the same models predicted from it.
# The feature is left as the Random Forest section filled it.
print('model fit')

*Save Predictions Model 1*

In [19]:
N5 = str(a5)
# a5 is the weighted RMSE on the log1p scale (lower is better), not a precision.
print('Validation NWRMSLE = ', a5)
test['unit_sales'] = r5.predict(test[col])
# Lower bound applied after undoing the log transform.
cut = 0.+1e-12 # 0.+1e-15
# The target was trained as log1p(unit_sales); expm1 is its exact inverse and
# is more accurate than exp(x) - 1 for values close to zero.
test['unit_sales'] = np.expm1(test['unit_sales']).clip(lower=cut)

output_file = 'sample_submission_GB1.csv'

test[['id','unit_sales']].to_csv(output_file, index=False, float_format='%.2f')
print('file created')

*Save Predictions Model 2*

In [20]:
N6 = str(a6)
# a6 is the weighted RMSE on the log1p scale (lower is better), not a precision.
print('Validation NWRMSLE = ', a6)
test['unit_sales'] = r6.predict(test[col])
# Lower bound applied after undoing the log transform.
cut = 0.+1e-12 # 0.+1e-15
# The target was trained as log1p(unit_sales); expm1 is its exact inverse and
# is more accurate than exp(x) - 1 for values close to zero.
test['unit_sales'] = np.expm1(test['unit_sales']).clip(lower=cut)

output_file = 'sample_submission_GB2.csv'

# `cluster` is selected once. Listing it twice wrote a duplicated column.
test[['id','unit_sales','cluster','city']].to_csv(output_file, index=False, float_format='%.2f')
print('file created')

In [28]:
# Reload the first saved prediction file and preview its first rows.
pred_by_city1 = pd.read_csv('sample_submission_RF1.csv')
pred_by_city1.head()

In [27]:
# Reload the second saved prediction file and preview its first rows.
pred_by_city2 = pd.read_csv('sample_submission_FR2.csv')
pred_by_city2.head()

In [24]:
# Reload the GB2 prediction file and preview its first rows.
pred_by_city3 = pd.read_csv('sample_submission_GB2.csv')
pred_by_city3.head()

In [32]:
# Latitude/longitude of each city name used for the map markers.
store_locations={
 'Ambato' : [-1.2543408,-78.6228504],
 'Babahoyo' : [-1.801926,-79.53464589999999],
 'Cayambe' : [0.025,-77.98916659999998],
 'Cuenca' : [-2.9001285,-79.0058965],
 'Daule' : [-1.86218,-79.97766899999999],
 'El Carmen' : [-0.266667, -79.4333],
 'Esmeraldas' : [0.9681788999999998,-79.6517202],
 'Guaranda' : [-1.5904721,-78.9995154],
 'Guayaquil' : [-2.1709979,-79.92235920000002],
 'Ibarra' : [0.3391763,-78.12223360000002],
 'Latacunga' : [-0.7754954,-78.52064999999999],
 'Libertad' : [-2.2344458,-79.91122430000001],
 'Loja' : [-4.0078909,-79.21127690000003],
 'Machala' : [-3.2581112,-79.9553924],
 'Manta' : [-0.9676533,-80.70891010000003],
 'Playas' : [-2.6284683,-80.38958860000002],
 'Puyo' : [-1.4923925,-78.00241340000002],
 'Quevedo' : [-1.0225124,-79.46040349999998],
 'Quito' : [-0.1806532,-78.46783820000002],
 'Riobamba' : [-1.6635508,-78.65464600000001],
 'Salinas' : [-2.2233633,-80.958462],
 'Santo Domingo' : [-0.2389045,-79.17742679999998]
}

# Defining a color dictionary
# Kept under its own name so the feature list `col` used by the models is not
# overwritten when this cell runs.
marker_colors={'A':'red','B':'blue','C':'green','D':'pink','E':'beige',
     0:'red',1:'blue',2:'green',3:'darkblue',4:'pink',5:'beige',13:'black'}


def add_city_map(name,typ):
    """Add one marker for city ``name`` to ``map_Ecuador``, coloured by ``typ``.

    The position is the city's entry in ``store_locations`` plus a small
    random offset, so several markers for one city do not sit exactly on top
    of each other. A name without coordinates is skipped; a ``typ`` without a
    colour is drawn in gray.
    """
    base = store_locations.get(name)
    if base is None:
        return
    jitter = [(0.5-rd.random())/20,(0.5-rd.random())/20]
    folium.Marker(
         location=list(map(add,base,jitter)),
         icon=folium.Icon(color=marker_colors.get(typ, 'gray'), icon='cloud'),
    ).add_to(map_Ecuador)

map_Ecuador=folium.Map(location=[-1.233333, -78.516667],zoom_start=7)

# Enabling clustering (also replace map_ecuador by store_cluster in the add_city_map function)
# from folium.plugins import MarkerCluster
#store_cluster=MarkerCluster().add_to(map_Ecuador)

# The saved prediction files hold label-encoded city codes and no `type`
# column, so they cannot be matched against `store_locations`. The raw stores
# table is read again instead, giving one marker per store with its original
# city name and type.
stores_raw = pd.read_csv(os.path.join(PATH, 'stores.csv'))
for city_name, store_type in zip(stores_raw['city'], stores_raw['type']):
    add_city_map(city_name, store_type)
map_Ecuador