In [25]:
import pandas as pd 
import numpy as np 

import matplotlib.pyplot as plt 
import seaborn as sns; sns.set(style='ticks', color_codes=True)

from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.feature_selection import RFE
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold

import os 

In [26]:
print(os.listdir("../walmart-recruiting-store-sales-forecasting"))

['features.csv', 'sampleSubmission.csv', 'stores.csv', 'test.csv', 'train.csv']


In [27]:
df_stores       = pd.read_csv("../walmart-recruiting-store-sales-forecasting/stores.csv", sep=',',header=0)
df_features     = pd.read_csv("../walmart-recruiting-store-sales-forecasting/features.csv", sep=',',header=0)
df_samplesub    = pd.read_csv("../walmart-recruiting-store-sales-forecasting/sampleSubmission.csv", sep=',',header=0)
df_train        = pd.read_csv("../walmart-recruiting-store-sales-forecasting/train.csv", sep=',',header=0)
df_test         = pd.read_csv("../walmart-recruiting-store-sales-forecasting/test.csv", sep=',',header=0)

In [30]:
df_train_stores_features = df_train.merge(df_stores,how='left').merge(df_features,how='left')

In [31]:
df_train_stores_features['Date'] = pd.to_datetime(df_train_stores_features['Date'])

In [32]:
df_train_stores_features = df_train_stores_features.drop(columns=['Type'])

In [None]:
df_train_stores_features

In [None]:
df_stores

In [None]:
df_stores.isnull().sum()

In [None]:
df_features

In [None]:
df_features.isnull().sum()

In [None]:
df_samplesub

In [None]:
df_samplesub.isnull().sum()

## Explorando os Dados

In [None]:
df_train_stores_features.head()

In [None]:
def scatter(df_train_stores_features, column):
    plt.figure()
    plt.scatter(df_train_stores_features[column], df_train_stores_features['Weekly_Sales'])
    plt.ylabel('weeklySales')
    plt.xlabel(column)

In [None]:
df_train_stores_features.columns

In [None]:
scatter(df_train_stores_features, 'Fuel_Price')
scatter(df_train_stores_features, 'Size')
scatter(df_train_stores_features, 'CPI')
scatter(df_train_stores_features, 'Type')
scatter(df_train_stores_features, 'IsHoliday')
scatter(df_train_stores_features, 'Unemployment')
scatter(df_train_stores_features, 'Temperature')
scatter(df_train_stores_features, 'Store')
scatter(df_train_stores_features, 'Dept')


In [None]:
fig = plt.Figure(figsize=(18,14))
corr = df_train_stores_features.corr()
c = plt.pcolor(corr)
plt.yticks(np.arange(0.5, len(corr.index), 1), corr.index)
plt.xticks(np.arange(0.5, len(corr.columns), 1), corr.columns)
fig.colorbar(c)

In [None]:
df_train_stores_features.columns

In [None]:
sns.pairplot(df_train_stores_features,vars=['Weekly_Sales', 'Fuel_Price', 'Size', 'CPI', 'Dept', 'Temperature', 'Unemployment'])

In [None]:
#sns.pairplot(df_train_stores_features.fillna(0),vars=['Weekly_Sales', 'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5'])

In [None]:
for name, group in df_train_stores_features.groupby(["Store","Dept"]):
    plt.title(name)
    plt.scatter(range(len(group)), group["Weekly_Sales"])
    plt.show()
    break

## Manipulando Dados

In [33]:
df_train_stores_features = pd.get_dummies(df_train_stores_features, columns=["Type"])
df_train_stores_features[['MarkDown1','MarkDown2','MarkDown3','MarkDown4','MarkDown5']] = df_train_stores_features[['MarkDown1','MarkDown2','MarkDown3','MarkDown4','MarkDown5']].fillna(0)
df_train_stores_features['Month'] = pd.to_datetime(df_train_stores_features['Date']).dt.month
df_train_stores_features = df_train_stores_features.drop(columns=['Date','CPI','Fuel_Price','Unemployment','MarkDown3'])

KeyError: "None of [Index(['Type'], dtype='object')] are in the [columns]"

In [None]:
df_train_stores_features.head()

## Algoritimos

In [34]:
def knn():
    knn = KNeighborsRegressor(n_neighbors=10)
    return knn

def extraTreesRegressor():
    clf = ExtraTreesRegressor(n_estimators=100, max_features='sqrt', verbose=1, n_jobs=1)
    return clf

def randomForestRegressor():
    clf = RandomForestRegressor(n_estimators=100 , max_features='log2', verbose=1)
    return clf

def svm():
    clf = SVR(kernel='rgb', gamma='auto')
    return clf

def nn():
    clf = MLPRegressor(hidden_layer_sizes=(10,), activation='relu', verbose=3)
    return clf

def predict_(m, test_x):
    return pd.Series(m.predict(test_x))

def model_():
    return extraTreesRegressor()

def train_(train_x, train_y):
    m = model_()
    m.fit(train_x, train_y)
    return m

def train_and_predict(train_x, train_y, test_x):
    m = train_(train_x, train_y)
    return predict_(m, test_x), m

In [35]:
def calculate_error(test_y, predicted, weights):
    return mean_absolute_error(test_y, predicted, sample_weight=weights)

## K-Fold Cross Validation

In [36]:
kf = KFold(n_splits=5)
splited = []

for name, group in df_train_stores_features.groupby(['Store', 'Dept']):
    group = group.reset_index(drop=True)
    trains_x = []
    trains_y = []
    tests_x = []
    tests_y = []
    if group.shape[0] <= 5:
        f = np.array(range(5))
        np.random.shuffle(f)
        group['fold'] = f[:group.shape[0]]
        continue
    fold = 0
    for train_index, test_index in kf.split(group):
        group.loc[test_index, 'fold'] = fold
        fold += 1
    splited.append(group)

splited = pd.concat(splited).reset_index(drop=True)

In [37]:
splited.head()

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday,Size,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,fold
0,1,1,2010-02-05,24924.5,False,151315,42.31,2.572,,,,,,211.096358,8.106,0.0
1,1,1,2010-02-12,46039.49,True,151315,38.51,2.548,,,,,,211.24217,8.106,0.0
2,1,1,2010-02-19,41595.55,False,151315,39.93,2.514,,,,,,211.289143,8.106,0.0
3,1,1,2010-02-26,19403.54,False,151315,46.63,2.561,,,,,,211.319643,8.106,0.0
4,1,1,2010-03-05,21827.9,False,151315,46.5,2.625,,,,,,211.350143,8.106,0.0


In [38]:
best_model = None
error_cv = 0
best_error = np.iinfo(np.int32).max
for fold in range(5):
    df_train = splited.loc[splited['fold'] != fold]
    df_test = splited.loc[splited['fold'] == fold]
    train_y = df_train['Weekly_Sales']
    train_x = df_train.drop(columns=['Weekly_Sales', 'fold'])
    test_y = df_test['Weekly_Sales']
    test_x = df_test.drop(columns=['Weekly_Sales', 'fold'])
    print(df_train.shape, df_test.shape)
    predicted, model = train_and_predict(train_x, train_y, test_x)
    weights = test_x['IsHoliday'].replace(True,5).replace(False, 1)
    error = calculate_error(test_y, predicted, weights)
    error_cv += error
    print(fold, error)
    if error < best_error:
        print('Find Best Model: ')
        best_error = error
        best_model = model
error_cv /= 5

(335722, 16) (85552, 16)


DTypePromotionError: The DType <class 'numpy.dtypes.DateTime64DType'> could not be promoted by <class 'numpy.dtypes.Float64DType'>. This means that no common DType exists for the given inputs. For example they cannot be stored in a single array unless the dtype is `object`. The full list of DTypes is: (<class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.DateTime64DType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>)

In [None]:
error_cv

In [None]:
best_error

In [None]:
dataset_test = pd.read_csv("../walmart-recruiting-store-sales-forecasting/test.csv", names=['Store','Dept','Date','isHoliday'],sep=',', header=0)
features = pd.read_csv("../walmart-recruiting-store-sales-forecasting/features.csv",sep=',', header=0,
                       names=['Store','Date','Temperature','Fuel_Price','MarkDown1','MarkDown2','MarkDown3','MarkDown4',
                              'MarkDown5','CPI','Unemployment','IsHoliday']).drop(columns=['IsHoliday'])
stores = pd.read_csv("../walmart-recruiting-store-sales-forecasting/stores.csv", names=['Store','Type','Size'],sep=',', header=0)
dataset_test = dataset_test.merge(stores, how='left').merge(features, how='left')

In [None]:
dataset_test = pd.get_dummies(dataset_test, columns=["Type"])
dataset_test[['MarkDown1','MarkDown2','MarkDown3','MarkDown4', 'MarkDown5']] = dataset_test[['MarkDown1','MarkDown2','MarkDown3','MarkDown4','MarkDown5']].fillna(0)
dataset_test = dataset_test.fillna(0)
column_date = dataset_test['Date']
dataset_test['Month'] = pd.to_datetime(dataset_test['Date']).dt.month
dataset_test = dataset_test.drop(columns=["Date","CPI", "Fuel_Price", 'Unemployment', 'MarkDown3'])
dataset_test

In [None]:
predicted_test = best_model.predict(dataset_test)

In [None]:
dataset_test['weeklySales'] = predicted_test
dataset_test['Date'] = column_date
dataset_test['id'] = dataset_test['Store'].astype(str) + '_' +  dataset_test['Dept'].astype(str) + '_' +  dataset_test['Date'].astype(str)
dataset_test = dataset_test[['id', 'weeklySales']]
dataset_test = dataset_test.rename(columns={'id': 'Id', 'weeklySales': 'Weekly_Sales'})

In [None]:
dataset_test.to_csv('output.csv', index=False)