In [1]:
from feat_competition_sales import *
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import time
import xgboost as xgb


Columns (7) have mixed types. Specify dtype option on import or set low_memory=False.



In [2]:
# Switch plotly into offline notebook mode so figures render inside this notebook.
# `offline` is not imported by name here -- presumably it comes in through the
# star import from feat_competition_sales (TODO confirm).
offline.init_notebook_mode(connected=True)

### Load and plot data

In [3]:
# Load data
store = pd.read_csv('data/store.csv')
# Read StateHoliday as text so pandas does not have to infer a dtype for it.
# (`np.str` was only an alias of the builtin `str` and no longer exists in
# recent NumPy releases, so the builtin is used directly.)
train = pd.read_csv('data/train.csv', dtype={'StateHoliday': str})

# Merge store-level attributes onto every training row
train = pd.merge(store, train, on='Store')
train = date_convert(train)

# Split by row position: first 80% -> Train, next 10% -> Val, last 10% -> Test
end_val = np.floor(0.8 * train.shape[0]).astype(int)
end_test = np.floor(0.9 * train.shape[0]).astype(int)

# Use positional, end-exclusive slicing. Label-based `.loc[a:b]` includes the
# end label, which would put the boundary rows into two splits at once (and
# would slice by label rather than by position if the index is not 0..n-1).
# `.copy()` makes each split an independent frame.
Train = train.iloc[:end_val].copy()
Val = train.iloc[end_val:end_test].copy()
Test = train.iloc[end_test:].copy()

In [4]:
#plot_hists(store, name='store')

In [5]:
#plot_hists(train, name='train')

### Feature engineering 

In [None]:
# Run the shared cleaning / feature-construction helper over each split,
# in the order Train, Val, Test, and rebind the three names to the results.
Train, Val, Test = [data_transformation(part) for part in (Train, Val, Test)]

In [40]:
# OHE
# One-hot encode every object-dtype column. The encoder is fitted on Train
# only; categories that appear only in Val/Test are encoded as all zeros
# (handle_unknown='ignore').
cols = Train.select_dtypes(include='object').columns.tolist()

# Cast to str so each column holds a single, consistent Python type
for col in cols:
    Train[col] = Train[col].astype(str)
    Val[col] = Val[col].astype(str)
    Test[col] = Test[col].astype(str)

ohe = OneHotEncoder(handle_unknown='ignore')
ohe_parts = {'train': [], 'val': [], 'test': []}

for col in cols:
    ohe.fit(np.array(Train[col]).reshape(-1, 1))
    # `categories_` is a list with one array per fitted feature; take the
    # single array and prefix each category with its source column so the
    # dummy columns are flat strings that cannot collide between features.
    ohe_names = ['%s_%s' % (col, cat) for cat in ohe.categories_[0]]
    for key, frame in (('train', Train), ('val', Val), ('test', Test)):
        dense = ohe.transform(np.array(frame[col]).reshape(-1, 1)).toarray()
        ohe_parts[key].append(pd.DataFrame(data=dense, columns=ohe_names))

# Concatenate once per split instead of growing a frame inside the loop.
# With no object columns the result is an empty DataFrame, as before.
ohe_train = pd.concat(ohe_parts['train'], axis=1) if cols else pd.DataFrame()
ohe_val = pd.concat(ohe_parts['val'], axis=1) if cols else pd.DataFrame()
ohe_test = pd.concat(ohe_parts['test'], axis=1) if cols else pd.DataFrame()

In [10]:
# Remove the columns listed in `cols` from all three splits, in place
for split in (Train, Val, Test):
    split.drop(columns=cols, inplace=True)

In [41]:
# Concat
# Attach the one-hot columns to each split. The ohe_* frames carry a fresh
# 0..n-1 index, so each split's index is reset to line the rows up by position.
# drop=True discards the old index; a plain reset_index() would insert it as
# an extra 'index' column, which would then be treated as a model feature.
Train = pd.concat([Train.reset_index(drop=True), ohe_train], axis=1)
Val = pd.concat([Val.reset_index(drop=True), ohe_val], axis=1)
Test = pd.concat([Test.reset_index(drop=True), ohe_test], axis=1)

In [12]:
# Write data to csv to reuse it fast
#Train.to_csv('data/Train.csv', index=False)
#Test.to_csv('data/Test.csv', index=False)
#Val.to_csv('data/Val.csv', index=False)

In [28]:
# Reload the three splits from their CSV caches, replacing the in-memory frames.
# Note: the cell that writes these files is currently commented out.
Train, Val, Test = map(pd.read_csv, ('data/Train.csv', 'data/Val.csv', 'data/Test.csv'))

In [30]:
# Create X_train, y_train and so on
# XGB
# Target is the 'Sales' column; every other column is used as a feature.
X_train = Train.drop(columns='Sales')
y_train = Train['Sales']
X_val = Val.drop(columns='Sales')
y_val = Val['Sales']
X_test = Test.drop(columns='Sales')
y_test = Test['Sales']

### Training

In [26]:
def _holdout_split(X, y, train_frac=0.8):
    """Split X and y by row position into a leading fit part and a trailing hold-out part.

    Returns numpy arrays ``(X_fit, y_fit, X_cv, y_cv)``; the first
    ``floor(train_frac * n_rows)`` rows are the fit part.
    """
    X_arr = np.array(X)
    y_arr = np.array(y)
    end = int(np.floor(train_frac * X_arr.shape[0]))
    return X_arr[:end], y_arr[:end], X_arr[end:], y_arr[end:]


def _checkpoint_scores(path, score):
    """Overwrite ``path`` with the string form of every score collected so far."""
    with open(path, 'w') as f:
        f.write(str(score))


def _rmse(y_true, y_pred):
    """Root mean squared error between targets and predictions."""
    return np.sqrt(mean_squared_error(y_true, y_pred))


def parameter_search(X, y, method, params, steps=None):
    """Search hyper-parameters for one model family on an 80/20 positional hold-out.

    The first 80% of the rows of ``X``/``y`` are used for fitting and the last
    20% for scoring (RMSE). After every evaluated candidate the full score list
    is rewritten to a text file in the working directory
    ('el_net_params.txt', 'rf_params.txt' or 'xgb_params.txt'), so partial
    results survive an interrupted run.

    Parameters
    ----------
    X, y : array-like
        Features and target; converted with ``np.array``.
    method : {'elastic', 'rf', 'xgboost', 'stack'}
        'elastic' runs a full grid search, 'rf' and 'xgboost' run a random
        search of ``steps`` draws. 'stack' is accepted but not implemented.
    params : dict
        Search space.
        * 'elastic': iterables under 'ratio' (l1_ratio) and 'alpha'.
        * 'rf': 'n_estimators' as (low, high) bounds; 'max_depth' and
          'max_features' as lists of choices.
        * 'xgboost': 'n_estimators' as (low, high) bounds; 'max_depth',
          'learning_rate', 'subsample', 'colsample_bytree',
          'colsample_bylevel' and 'reg_lambda' as lists of choices.
    steps : int, optional
        Number of random draws; required for 'rf' and 'xgboost'.

    Returns
    -------
    tuple
        The best candidate's parameters (score removed):
        'elastic' -> (alpha, ratio); 'rf' -> (n_estimators, max_depth,
        max_features); 'xgboost' -> (learning_rate, n_estimators, max_depth,
        subsample, colsample_bytree, colsample_bylevel, reg_lambda).

    Raises
    ------
    ValueError
        Unknown ``method``, ``params`` is not a dict, or no candidate was evaluated.
    TypeError
        ``steps`` is None for a random-search method.
    NotImplementedError
        ``method == 'stack'``.
    """
    valid = {'elastic', 'rf', 'xgboost', 'stack'}
    if method not in valid:
        raise ValueError("Method must be one of %r." % valid)

    if method == 'stack':
        # Fail explicitly; falling through to the return with nothing
        # assigned would only surface as an UnboundLocalError.
        raise NotImplementedError("Parameter search for method 'stack' is not implemented.")

    if not isinstance(params, dict):
        raise ValueError('params is not a dictionary, please support a dictionary of parameters')

    if method in ('rf', 'xgboost') and steps is None:
        raise TypeError("steps must be an integer for method %r." % method)

    X_train, y_train, X_cv, y_cv = _holdout_split(X, y)
    # Each entry is (*parameters, rmse); the score is always the last element.
    score = []

    if method == 'elastic':
        # Grid search for elastic net
        for ratio in params['ratio']:
            for alpha in params['alpha']:
                print(ratio, alpha)
                elastic = ElasticNet(alpha=alpha, l1_ratio=ratio)
                elastic.fit(X_train, y_train)
                preds = elastic.predict(X_cv)
                score.append((alpha, ratio, _rmse(y_cv, preds)))
                _checkpoint_scores('el_net_params.txt', score)

    elif method == 'rf':
        # Random search for RF
        for _ in tqdm(range(steps)):
            n_estimators = np.random.randint(params['n_estimators'][0], params['n_estimators'][1])
            max_depth = np.random.choice(params['max_depth'])
            max_features = np.random.choice(params['max_features'])
            print((n_estimators, max_depth, max_features))

            rf = RandomForestRegressor(n_estimators=n_estimators,
                                       max_depth=max_depth,
                                       max_features=max_features)
            rf.fit(X_train, y_train)
            preds = rf.predict(X_cv)
            score.append((n_estimators, max_depth, max_features, _rmse(y_cv, preds)))
            _checkpoint_scores('rf_params.txt', score)

    else:  # method == 'xgboost'
        # Random search for xgboost
        for _ in tqdm(range(steps)):
            n_estimators = int(np.floor(np.random.uniform(params['n_estimators'][0], params['n_estimators'][1])))
            max_depth = np.random.choice(params['max_depth'])
            lr = np.random.choice(params['learning_rate'])
            subsample = np.random.choice(params['subsample'])
            colsample_bytree = np.random.choice(params['colsample_bytree'])
            colsample_bylevel = np.random.choice(params['colsample_bylevel'])
            reg_lambda = np.random.choice(params['reg_lambda'])

            print((lr, n_estimators, max_depth, subsample, colsample_bytree, colsample_bylevel, reg_lambda))
            # Train & predict. reg_lambda must be handed to the model;
            # otherwise the sampled value is only logged and has no effect.
            xgb_model = xgb.XGBRegressor(learning_rate=lr,
                                         n_estimators=n_estimators,
                                         max_depth=max_depth,
                                         subsample=subsample,
                                         colsample_bytree=colsample_bytree,
                                         colsample_bylevel=colsample_bylevel,
                                         reg_lambda=reg_lambda,
                                         objective='reg:squarederror')

            # Early stopping monitors RMSE on the same hold-out that is scored below.
            # NOTE(review): newer xgboost releases expect eval_metric and
            # early_stopping_rounds on the constructor rather than fit() --
            # confirm against the installed version.
            xgb_model.fit(X_train, y_train,
                          eval_metric='rmse',
                          early_stopping_rounds=10,
                          eval_set=[(X_cv, y_cv)],
                          verbose=False)

            preds = xgb_model.predict(X_cv)
            score.append((lr, n_estimators, max_depth, subsample, colsample_bytree,
                          colsample_bylevel, reg_lambda, _rmse(y_cv, preds)))
            _checkpoint_scores('xgb_params.txt', score)

    if not score:
        raise ValueError('No parameter combination was evaluated; check params and steps.')

    # Get best score: lowest RMSE wins (first one on ties); strip the score itself
    best_params = min(score, key=lambda rec: rec[-1])[:-1]
    return best_params

In [153]:
# XGBoost
# Random-search space: lists are sampled uniformly, 'n_estimators' is a (low, high) range.
params = {
        'max_depth': [3, 4, 5, 6, 7],
        'learning_rate': [0.0005, 0.001, 0.05, 0.1],
        'subsample': [0.8, 0.9, 1.0],
        'colsample_bytree': [0.3, 0.4, 0.5, 0.6, 0.7, 0.8],
        'colsample_bylevel': [0.3, 0.5, 0.6, 0.7, 0.8],
        'reg_lambda' : [0.1, 0.5, 1.0, 2, 3, 4, 5],
        'n_estimators': [1000, 10000]
}

# Tune on a random subsample of at most 50000 rows. Sample WITHOUT replacement:
# parameter_search splits its input by position into fit / hold-out parts, so a
# row drawn twice could end up in both and leak into the score.
ind = np.random.choice(X_train.shape[0], size=min(50000, X_train.shape[0]), replace=False)
xgb_params = parameter_search(X_train.iloc[ind], y_train.iloc[ind], method='xgboost', params=params, steps=25)













  0%|          | 0/25 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A[A

(0.05, 8313, 4, 0.9, 0.3, 0.8, 4.0)














  4%|▍         | 1/25 [00:13<05:35, 13.96s/it][A[A[A[A[A[A[A[A[A[A[A[A

(0.05, 7034, 6, 0.8, 0.3, 0.8, 4.0)














  8%|▊         | 2/25 [00:32<05:53, 15.35s/it][A[A[A[A[A[A[A[A[A[A[A[A

(0.05, 8037, 3, 0.9, 0.3, 0.7, 1.0)














 12%|█▏        | 3/25 [00:43<05:09, 14.08s/it][A[A[A[A[A[A[A[A[A[A[A[A

(0.001, 5524, 7, 0.8, 0.3, 0.3, 3.0)














 16%|█▌        | 4/25 [02:40<15:40, 44.80s/it][A[A[A[A[A[A[A[A[A[A[A[A

(0.0005, 2952, 4, 0.8, 0.7, 0.8, 4.0)














 20%|██        | 5/25 [04:20<20:29, 61.45s/it][A[A[A[A[A[A[A[A[A[A[A[A

(0.1, 4606, 5, 1.0, 0.3, 0.6, 1.0)














 24%|██▍       | 6/25 [04:25<14:05, 44.49s/it][A[A[A[A[A[A[A[A[A[A[A[A

(0.001, 5180, 3, 0.8, 0.4, 0.8, 4.0)














 28%|██▊       | 7/25 [06:22<19:54, 66.36s/it][A[A[A[A[A[A[A[A[A[A[A[A

(0.1, 9988, 5, 0.9, 0.6, 0.5, 0.5)














 32%|███▏      | 8/25 [06:39<14:37, 51.61s/it][A[A[A[A[A[A[A[A[A[A[A[A

(0.05, 8997, 4, 0.8, 0.8, 0.6, 1.0)














 36%|███▌      | 9/25 [07:06<11:43, 43.98s/it][A[A[A[A[A[A[A[A[A[A[A[A

(0.1, 8210, 4, 0.9, 0.3, 0.5, 0.1)














 40%|████      | 10/25 [07:18<08:35, 34.37s/it][A[A[A[A[A[A[A[A[A[A[A[A

(0.0005, 1262, 5, 0.9, 0.6, 0.7, 2.0)














 44%|████▍     | 11/25 [08:01<08:37, 36.98s/it][A[A[A[A[A[A[A[A[A[A[A[A

(0.0005, 9459, 7, 1.0, 0.8, 0.8, 0.5)














 48%|████▊     | 12/25 [18:07<45:01, 207.79s/it][A[A[A[A[A[A[A[A[A[A[A[A

(0.1, 4421, 5, 0.8, 0.4, 0.5, 5.0)














 52%|█████▏    | 13/25 [18:13<29:28, 147.34s/it][A[A[A[A[A[A[A[A[A[A[A[A

(0.0005, 1342, 4, 1.0, 0.6, 0.8, 5.0)














 56%|█████▌    | 14/25 [18:52<21:01, 114.72s/it][A[A[A[A[A[A[A[A[A[A[A[A

(0.1, 1767, 7, 0.8, 0.7, 0.6, 1.0)














 60%|██████    | 15/25 [19:09<14:14, 85.41s/it] [A[A[A[A[A[A[A[A[A[A[A[A

(0.05, 7530, 4, 0.9, 0.7, 0.5, 2.0)














 64%|██████▍   | 16/25 [19:32<10:00, 66.75s/it][A[A[A[A[A[A[A[A[A[A[A[A

(0.05, 9615, 6, 0.9, 0.6, 0.7, 0.5)














 68%|██████▊   | 17/25 [20:01<07:23, 55.38s/it][A[A[A[A[A[A[A[A[A[A[A[A

(0.1, 5074, 6, 1.0, 0.3, 0.8, 1.0)














 72%|███████▏  | 18/25 [20:08<04:45, 40.82s/it][A[A[A[A[A[A[A[A[A[A[A[A

(0.05, 9561, 3, 1.0, 0.6, 0.8, 4.0)














 76%|███████▌  | 19/25 [20:24<03:19, 33.30s/it][A[A[A[A[A[A[A[A[A[A[A[A

(0.0005, 3648, 6, 1.0, 0.4, 0.5, 2.0)














 80%|████████  | 20/25 [22:06<04:30, 54.14s/it][A[A[A[A[A[A[A[A[A[A[A[A

(0.1, 8035, 5, 0.9, 0.5, 0.6, 0.1)














 84%|████████▍ | 21/25 [22:19<02:47, 41.84s/it][A[A[A[A[A[A[A[A[A[A[A[A

(0.001, 5479, 6, 0.8, 0.3, 0.3, 4.0)














 88%|████████▊ | 22/25 [24:11<03:07, 62.66s/it][A[A[A[A[A[A[A[A[A[A[A[A

(0.1, 2268, 6, 1.0, 0.3, 0.6, 2.0)














 92%|█████████▏| 23/25 [24:28<01:38, 49.05s/it][A[A[A[A[A[A[A[A[A[A[A[A

(0.1, 8109, 5, 0.8, 0.4, 0.6, 4.0)














 96%|█████████▌| 24/25 [24:40<00:37, 37.99s/it][A[A[A[A[A[A[A[A[A[A[A[A

(0.05, 5620, 6, 0.8, 0.3, 0.8, 0.5)














100%|██████████| 25/25 [24:59<00:00, 59.97s/it][A[A[A[A[A[A[A[A[A[A[A[A


In [154]:
# RF
# Random-search space: lists are sampled uniformly, 'n_estimators' is a (low, high) range.
params = {
        'max_depth': [3, 4, 5, 6, 7],
        'n_estimators': [5000, 15000],
        'max_features': [0.8, 0.9, 1]
}

# Tune on a random subsample of at most 50000 rows. Sample WITHOUT replacement:
# parameter_search splits its input by position into fit / hold-out parts, so a
# row drawn twice could end up in both and leak into the score.
ind = np.random.choice(X_train.shape[0], size=min(50000, X_train.shape[0]), replace=False)
rf_params = parameter_search(X_train.iloc[ind], y_train.iloc[ind], method='rf', params=params, steps=25)













  0%|          | 0/25 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A[A

(11276, 3, 0.8)














  4%|▍         | 1/25 [09:06<3:38:42, 546.77s/it][A[A[A[A[A[A[A[A[A[A[A[A

(12516, 7, 0.9)














  8%|▊         | 2/25 [32:25<5:07:34, 802.38s/it][A[A[A[A[A[A[A[A[A[A[A[A

(10089, 4, 0.8)














 12%|█▏        | 3/25 [43:03<4:36:08, 753.11s/it][A[A[A[A[A[A[A[A[A[A[A[A

(6339, 4, 1.0)














 16%|█▌        | 4/25 [51:22<3:56:50, 676.68s/it][A[A[A[A[A[A[A[A[A[A[A[A

(5946, 7, 0.8)














 20%|██        | 5/25 [1:01:13<3:37:04, 651.22s/it][A[A[A[A[A[A[A[A[A[A[A[A

(7111, 3, 1.0)














 24%|██▍       | 6/25 [1:08:22<3:05:04, 584.46s/it][A[A[A[A[A[A[A[A[A[A[A[A

(5695, 4, 1.0)














 28%|██▊       | 7/25 [1:15:52<2:43:11, 543.97s/it][A[A[A[A[A[A[A[A[A[A[A[A

(7575, 4, 0.8)














 32%|███▏      | 8/25 [1:23:51<2:28:40, 524.71s/it][A[A[A[A[A[A[A[A[A[A[A[A

(14387, 4, 0.9)














 36%|███▌      | 9/25 [1:40:53<2:59:40, 673.75s/it][A[A[A[A[A[A[A[A[A[A[A[A

(13548, 7, 0.9)














 40%|████      | 10/25 [2:06:11<3:51:47, 927.15s/it][A[A[A[A[A[A[A[A[A[A[A[A

(10178, 6, 0.8)














 44%|████▍     | 11/25 [2:21:08<3:34:14, 918.15s/it][A[A[A[A[A[A[A[A[A[A[A[A

(9092, 4, 0.9)














 48%|████▊     | 12/25 [2:31:56<3:01:21, 837.04s/it][A[A[A[A[A[A[A[A[A[A[A[A

(10118, 4, 0.8)














 52%|█████▏    | 13/25 [2:42:37<2:35:37, 778.14s/it][A[A[A[A[A[A[A[A[A[A[A[A

(7017, 3, 0.8)














 56%|█████▌    | 14/25 [2:48:18<1:58:35, 646.88s/it][A[A[A[A[A[A[A[A[A[A[A[A

(13526, 7, 1.0)














 60%|██████    | 15/25 [3:16:24<2:39:47, 958.78s/it][A[A[A[A[A[A[A[A[A[A[A[A

(12612, 7, 0.8)














 64%|██████▍   | 16/25 [3:37:22<2:37:16, 1048.51s/it][A[A[A[A[A[A[A[A[A[A[A[A

(9879, 4, 0.9)














 68%|██████▊   | 17/25 [3:49:04<2:05:55, 944.46s/it] [A[A[A[A[A[A[A[A[A[A[A[A

(12721, 6, 1.0)














 72%|███████▏  | 18/25 [4:12:20<2:06:01, 1080.15s/it][A[A[A[A[A[A[A[A[A[A[A[A

(9984, 6, 0.8)














 76%|███████▌  | 19/25 [4:26:57<1:41:55, 1019.23s/it][A[A[A[A[A[A[A[A[A[A[A[A

(11878, 5, 0.9)














 80%|████████  | 20/25 [4:43:49<1:24:44, 1016.87s/it][A[A[A[A[A[A[A[A[A[A[A[A

(10247, 6, 1.0)


KeyboardInterrupt: 

In [32]:
# Reduce the training features to 4 principal components; the PCA is fitted on
# X_train and the projected matrix is kept as X_train_pca.
# NOTE(review): no scaling step is visible in this notebook before the PCA
# (data_transformation is opaque from here) -- confirm the features are on
# comparable scales.
pca = PCA(n_components=4)
X_train_pca = pca.fit_transform(X_train)

In [34]:
# Elastic Net
# Full grid over l1 ratio x alpha. Only these two keys are read by
# parameter_search(method='elastic'), so the grid carries nothing else.
params = {
        'ratio': np.linspace(0.1, 1, 20),
        'alpha': [1e-5, 5e-5, 1e-4, 5e-4, 1e-3, 5e-3, 1e-2, 5e-2, 1e-1, 5e-1, 1, 5, 10, 20, 50, 100]
}

el_net_params = parameter_search(X_train_pca, y_train, method='elastic', params=params)

0.1 1e-05
0.1 5e-05
0.1 0.0001
0.1 0.0005
0.1 0.001
0.1 0.005
0.1 0.01
0.1 0.05
0.1 0.1
0.1 0.5
0.1 1
0.1 5
0.1 10
0.1 20
0.1 50
0.1 100
0.1473684210526316 1e-05
0.1473684210526316 5e-05
0.1473684210526316 0.0001
0.1473684210526316 0.0005
0.1473684210526316 0.001
0.1473684210526316 0.005
0.1473684210526316 0.01
0.1473684210526316 0.05
0.1473684210526316 0.1
0.1473684210526316 0.5
0.1473684210526316 1
0.1473684210526316 5
0.1473684210526316 10
0.1473684210526316 20
0.1473684210526316 50
0.1473684210526316 100
0.19473684210526315 1e-05
0.19473684210526315 5e-05
0.19473684210526315 0.0001
0.19473684210526315 0.0005
0.19473684210526315 0.001
0.19473684210526315 0.005
0.19473684210526315 0.01
0.19473684210526315 0.05
0.19473684210526315 0.1
0.19473684210526315 0.5
0.19473684210526315 1
0.19473684210526315 5
0.19473684210526315 10
0.19473684210526315 20
0.19473684210526315 50
0.19473684210526315 100
0.24210526315789474 1e-05
0.24210526315789474 5e-05
0.24210526315789474 0.0001
0.242105263157

In [35]:
# Print the elastic-net score checkpoint written by parameter_search:
# a list of (alpha, l1_ratio, rmse) tuples. Relies on a POSIX `cat` in the shell.
!cat el_net_params.txt

[(1e-05, 0.1, 1815.563687496391), (5e-05, 0.1, 1815.5636874966917), (0.0001, 0.1, 1815.5636874970673), (0.0005, 0.1, 1815.5636875000728), (0.001, 0.1, 1815.5636875038306), (0.005, 0.1, 1815.5636875338832), (0.01, 0.1, 1815.5636875714488), (0.05, 0.1, 1815.5636878720481), (0.1, 0.1, 1815.5636882382705), (0.5, 0.1, 1815.5636912435295), (1, 0.1, 1815.5636950001126), (5, 0.1, 1815.5637250531709), (10, 0.1, 1815.5637626204746), (20, 0.1, 1815.5638377583525), (50, 0.1, 1815.5640588872259), (100, 0.1, 1815.5644347074492), (1e-05, 0.1473684210526316, 1815.5636874964177), (5e-05, 0.1473684210526316, 1815.5636874968234), (0.0001, 0.1473684210526316, 1815.5636874973309), (0.0005, 0.1473684210526316, 1815.5636875013913), (0.001, 0.1473684210526316, 1815.5636875064652), (0.005, 0.1473684210526316, 1815.5636875470568), (0.01, 0.1473684210526316, 1815.5636875977966), (0.05, 0.1473684210526316, 1815.5636879943536), (0.1, 0.1473684210526316, 1815.5636885017502), (0.5, 0.1473684210526316, 1815.563692560