In [21]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import f1_score

from pprint import pprint
from utils.misc import label_gen_np, save_pred

In [2]:
import glob
models = glob.glob('./preds/*train.csv')

In [3]:
train_dfs = []
for model in models:
    mod_df = pd.concat((pd.read_csv(model), pd.read_csv(model.replace('train', 'external')))).reset_index(drop=True)
#     mod_df.columns = [models[0].split('_')[2] + "_" + cname for cname in mod_df.columns]
    train_dfs.append(mod_df)

In [4]:
# One Hot Encoding
features = pd.concat((train_dfs[i]['0'] for i in range(len(train_dfs))), axis=1)

# Extract features and labels
labels = pd.concat((pd.read_csv('./data/train.csv'), pd.read_csv('./data/HPAv18RBGY_wodpl.csv'))).reset_index(drop=True)
labels = labels['Target'].apply(label_gen_np)
labels = np.stack(labels.as_matrix())[:, 0]

# List of features for later use
feature_list = list(features.columns)

features = np.array(features)
labels = np.array(labels)

# Training and Testing Sets
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, 
                                                                            test_size = 0.25, random_state = 42)

In [5]:
print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)

Training Features Shape: (79258, 3)
Training Labels Shape: (79258,)
Testing Features Shape: (26420, 3)
Testing Labels Shape: (26420,)


In [2]:
rf = RandomForestClassifier(random_state = 42)

# Look at parameters used by our current forest
print('Parameters currently in use:\n')
pprint(rf.get_params())

Parameters currently in use:

{'bootstrap': True,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 10,
 'n_jobs': 1,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}


In [7]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 4)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 100, num = 4)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [8]:
%%time
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid,
                              n_iter = 5, scoring='f1', 
                              cv = 3, verbose=2, random_state=42, n_jobs=-1)

# Fit the random search model
rf_random.fit(train_features, train_labels)

Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed: 42.8min finished


Wall time: 46min 2s


In [9]:
rf_random.best_params_

{'bootstrap': True,
 'max_depth': 10,
 'max_features': 'sqrt',
 'min_samples_leaf': 4,
 'min_samples_split': 10,
 'n_estimators': 2000}

In [10]:
rf_random.best_score_

0.7943437806043752

In [11]:
yhat = rf_random.predict(test_features)

In [18]:
f1_score(yhat, test_labels)

0.795927106627968

In [25]:
maxf = 0
for j in range(test_features.shape[1]):
    for i in np.arange(-2, 2, 0.01):
        cf1 = f1_score(test_features[:,j]>i, test_labels)
        maxf = maxf if maxf > cf1 else cf1
maxf

0.788080662585524

In [26]:
feat = pd.concat((train_dfs[i] for i in range(len(train_dfs))), axis=1)

In [28]:
import time

In [29]:
time.time()

1546925698.9163165

In [30]:
from datetime import datetime

In [32]:
datetime.now().strftime('%Y_%m_%d_%H_%M_%S')

'2019_01_08_00_36_11'

## Combinig preds

In [7]:
f = np.load('./stacks/indi_10_models_16iter_label_21.npy')
b = np.load('./stacks/indi_10_models_16iter_label_20.npy')

In [25]:
f[:,21:] = b[:,21:]

In [27]:
f.sum(axis=0).plot

AttributeError: 'numpy.ndarray' object has no attribute 'plot'

In [28]:
save_pred(f, th=0.5, SUBM_OUT='./stacks/pehlajaani.csv', fill_empty=False)

Saved to  ./stacks/pehlajaani.csv


In [36]:
(f+0.5).astype('uint8')

array([[0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 1, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0]], dtype=uint8)

In [37]:
ex = pd.DataFrame(data=(f+0.5).astype('uint8'))

In [48]:
ex[ex.sum(axis=1)==0]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,18,19,20,21,22,23,24,25,26,27
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
21,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
30,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
36,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
38,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
44,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
45,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
48,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
52,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [49]:
bsub = pd.read_csv('./stacks/subm_ensemble_2019_01_05_22_36_59.csv')

In [52]:
bsub['Predicted'] = bsub['Predicted'].apply(label_gen_np)

In [54]:
bpreds = np.stack(bsub['Predicted'].values)

In [56]:
bpreds_df = pd.DataFrame(data=bpreds)

In [62]:
ex[ex.sum(axis=1)==0] = bpreds_df[ex.sum(axis=1)==0]

In [64]:
ex.values

array([[0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 1, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0]], dtype=uint8)

In [68]:
save_pred(ex.values.astype('float'), th=0.5, SUBM_OUT='./stacks/stacked_random_forest_nonzero.csv', fill_empty=False)

Saved to  ./stacks/stacked_random_forest_nonzero.csv


In [69]:
import keras

Using TensorFlow backend.


In [70]:
model.add(layers.Dense(50, activation = "relu", input_shape=(10000, )))

NameError: name 'model' is not defined