### Feature Proprocessing

In [116]:
import numpy as np
import math
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

In [117]:
store = pd.read_csv('store.csv')
train = pd.read_csv('train.csv',dtype={"StateHoliday": str})
test = pd.read_csv('test.csv')

In [118]:
"""
Functions that help initialise data and include new features to the data set to be trained
"""
def initialise_data(train, store):
    ## removed 0 sales because they are not used in grading
    train = train[train.Sales != 0]
    ## fill the N.A.N values
    store = store.fillna(0)
    ## combine all features together
    df = train.merge(store, on='Store')
    # Get labels and remove from dataframe
    labels = df.values[:,3]
    labels = np.array([labels]).T
    df = df.drop('Sales', axis=1)
    return (df, labels)

## converting dates into year, month, day and additional feature week of the year
def date_convert(df):
    df['Date']  = pd.to_datetime(df['Date'], errors='coerce')
    df['Year']  = df['Date'].dt.year
    df['Month'] = df['Date'].dt.month
    df['Day'] = df['Date'].dt.day
    df['WeekOfYear'] = df['Date'].dt.weekofyear
    return df

## adjust and standardise the mappings for all the categorical variables.
def mapping_encoding(df):
    mappings = {'0': 0, 'a': 1, 'b': 2, 'c': 3, 'd': 4}
    mappings_month = {1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun', 7:'Jul', 8:'Aug', 9:'Sept', 10:'Oct', 11:'Nov', 12:'Dec'}
    ## replace with values so they can be one hot encoded
    df.StoreType.replace(mappings, inplace=True)
    df.Assortment.replace(mappings, inplace=True)
    df.StateHoliday.replace(mappings, inplace=True)
    df.PromoInterval.replace(mappings, inplace=True)
    
    df['StateHoliday'] = LabelEncoder().fit_transform(df['StateHoliday'])
    df['Assortment']   = LabelEncoder().fit_transform(df['Assortment'])
    df['StoreType']    = LabelEncoder().fit_transform(df['StoreType'])
    return df

In [119]:
"""
Build Features in the order defined by feature_builders array
"""
df, labels = initialise_data(train, store)

feature_builders = [date_convert, mapping_encoding]

for i in range(len(feature_builders)):
    df = feature_builders[i](df)

In [120]:
"""
Functions Wrappers that return models
"""

arr = np.array([[1,2,3,4], [5,6,7,8], [9,10,11,12]])
mask = np.ones(3, dtype=bool)
mask[0:2] = False
print(mask)
print(arr[mask])

[False False  True]
[[ 9 10 11 12]]


In [126]:
from sklearn.svm import SVC
import math

def rmspe(pred, labels):
    return np.sqrt(np.mean((pred/labels-1) ** 2))

"""
Cross Validation Code
"""
def cv(df, labels, model):
    df = df.values
    num_rows = df.shape[0]
    K = 10
    cv_score = 0
    for i in range(K):
        # Get validation array
        start_val = math.floor(i/K * num_rows)
        end_val = math.floor((i+1)/K * num_rows)
        
        if K==10:
            end_val = num_rows
        print(start_val, end_val)
        
        df_val = df[start_val:end_val,:]
        labels_val = df[start_val:end_val,:]
        
        # Get training array by deleting rows for validation
        mask = np.ones(num_rows, dtype=bool)
        mask[start_val:end_val+1] = False
        df_train = df[mask]
        labels_train = labels[mask]
        print(mask)
        print(mask.shape)
        print(df.shape)
        print(labels.shape)
        
        fitted = model.fit(df_train, labels_train)
        pred = fitted.predict(df_val)
        
        cv_score += rmspe(pred, labels_val)

md = SVC()

print(cv(df, labels, md))

0 5468
[False False False ..., False False False]
(54688,)
(54688, 21)
(54688, 1)


ValueError: Found array with 0 sample(s) (shape=(0, 21)) while a minimum of 1 is required.

In [105]:
store1 = df.loc[df.Store == 1]
#store1.CompetitionOpenSinceYear
#plt.plot(store1.CompetitionOpenSinceMonth)
#plt.plot(store1.Sales)
plt.show()
s = test["Store"]
d = test["Date"]
test['Id'] = list(zip(test.Store, test.Date))
test.head()

Unnamed: 0,Store,DayOfWeek,Date,Customers,Open,Promo,StateHoliday,SchoolHoliday,Id
0,1,5,2015-07-31,555,1,1,0,1,"(1, 2015-07-31)"
1,2,5,2015-07-31,625,1,1,0,1,"(2, 2015-07-31)"
2,3,5,2015-07-31,821,1,1,0,1,"(3, 2015-07-31)"
3,4,5,2015-07-31,1498,1,1,0,1,"(4, 2015-07-31)"
4,5,5,2015-07-31,559,1,1,0,1,"(5, 2015-07-31)"


References:
- https://datascience.stackexchange.com/questions/9777/one-hot-vector-representation-vs-label-encoding-for-categorical-variables
- https://datascience.stackexchange.com/questions/9443/when-to-use-one-hot-encoding-vs-labelencoder-vs-dictvectorizor