In [7]:
# ## Import Libraries
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer
from sklearn.preprocessing import Imputer
import tensorflow as tf



In [8]:
import matplotlib.pylab as plt
get_ipython().magic(u'matplotlib inline')
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 12, 4

In [3]:
# Define the gini metric - from https://www.kaggle.com/c/ClaimPredictionChallenge/discussion/703#5897
def gini(actual, pred):
    assert (len(actual) == len(pred))
    all = np.asarray(np.c_[actual, pred, np.arange(len(actual))], dtype=np.float)
    all = all[np.lexsort((all[:, 2], -1 * all[:, 1]))]
    totalLosses = all[:, 0].sum()
    giniSum = all[:, 0].cumsum().sum() / totalLosses

    giniSum -= (len(actual) + 1) / 2.
    return giniSum / len(actual)


def gini_normalized(actual, pred):
    return gini(actual, pred) / gini(actual, actual)

In [4]:
# Create an XGBoost-compatible Gini metric
def gini_xgb(preds, dtrain):
    labels = dtrain.get_label()
    gini_score = gini_normalized(labels, preds)
    return [('gini', gini_score)]

def gini_xgb_min(preds, dtrain):
    labels = dtrain.get_label()
    gini_score = gini_normalized(labels, preds)
    return [('gini', -1*gini_score)]

In [62]:
# ## Data exploration
train = pd.read_csv('train.csv')#pd.read_csv('train.csv', na_values=-1) # The na_values doesn't work somehow
test = pd.read_csv('test.csv')
pos_count = train[train['target']==1]['target'].count()
neg_count = train.shape[0] - pos_count
pos_neg_count_ratio = float(pos_count)/neg_count
print(pos_neg_count_ratio)

0.0378261885416


In [103]:
# Group different types of data
feats_intv = []
feats_ordi = []
feats_cat = []
feats_bin = []
for feat in train.columns:
    if '_cat' in feat:
        feats_cat.append(feat)
    elif '_bin' in feat:
        feats_bin.append(feat)
    elif train[feat].dtype == 'float':
        feats_intv.append(feat)
    elif feat != 'id' and feat != 'target':
        feats_ordi.append(feat)

In [104]:
print('Number of interval features: %d', len(feats_intv))
print('Number of ordinal features: %d', len(feats_ordi))
print('Number of categorical features: %d', len(feats_cat))
print('Number of binary features: %d', len(feats_bin))

('Number of interval features: %d', 10)
('Number of ordinal features: %d', 16)
('Number of categorical features: %d', 14)
('Number of binary features: %d', 17)


In [105]:
#Compute skewness of interval features and perform log transform
from scipy.stats import skew
skewed_feats = train[feats_intv].apply(lambda x: skew(x.dropna())) #compute skewness
print(skewed_feats)
skewed_feats = skewed_feats[abs(skewed_feats) > 0.5].index
#train[skewed_feats] = np.log1p(train[skewed_feats])
#test[skewed_feats] = np.log1p(test[skewed_feats]) #test data also needs to be transformed




ps_reg_01    -0.639471
ps_reg_02     1.281062
ps_reg_03    -1.013100
ps_car_12     1.067173
ps_car_13     1.700532
ps_car_14    -3.234060
ps_car_15    -2.216286
ps_calc_01   -0.001165
ps_calc_02    0.001010
ps_calc_03    0.000841
dtype: float64


In [106]:
#Define indice for features (i.e. predictors) and labels (i.e. target)
target ='target'
IDcol = 'id'
predictors = [x for x in train.columns if x not in [target, IDcol]]



In [167]:
sample_size = train.shape[0]
train_sample = train.sample(sample_size)
X_train = train_sample[predictors]
y_train = train_sample[target]
X_test = test[predictors]



In [168]:
#Create dummies for categorical features (try drop 'ps_car_11_cat')

#X_train.drop(['ps_car_11_cat'], axis=1, inplace=True)
#X_test.drop(['ps_car_11_cat'], axis=1, inplace=True)
#feats_cat.remove('ps_car_11_cat')
#X_train[feats_cat] = X_train[feats_cat].fillna(-1)

X_train.drop(['ps_car_14','ps_car_07_cat','ps_car_05_cat','ps_car_03_cat','ps_reg_03','ps_ind_05_cat'], axis=1, inplace=True)
X_test.drop(['ps_car_14','ps_car_07_cat','ps_car_05_cat','ps_car_03_cat','ps_reg_03','ps_ind_05_cat'], axis=1, inplace=True)
'''
feats_cat.remove('ps_car_07_cat')
feats_cat.remove('ps_car_05_cat')
feats_cat.remove('ps_car_03_cat')
feats_cat.remove('ps_ind_05_cat')
'''
X_train_cat = X_train[feats_cat].astype(object)
X_train_dummies = pd.get_dummies(X_train_cat)
X_train.drop(feats_cat, inplace=True, axis=1)
X_train = pd.concat([X_train, X_train_dummies], axis=1)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [169]:
from sklearn.preprocessing import StandardScaler
print feats_cat
#X_test[feats_cat] = X_test[feats_cat].fillna(-1)
X_test_cat = X_test[feats_cat].astype(object)
X_test_dummies = pd.get_dummies(X_test_cat)
X_test.drop(feats_cat, inplace=True, axis=1)
X_test = pd.concat([X_test, X_test_dummies], axis=1)
scaler = StandardScaler(with_mean=True, with_std=True).fit(X_train.values)
X_train = scaler.transform(X_train.values)
X_test.values = scaler.transform(X_test.values)


['ps_ind_02_cat', 'ps_ind_04_cat', 'ps_car_01_cat', 'ps_car_02_cat', 'ps_car_04_cat', 'ps_car_06_cat', 'ps_car_08_cat', 'ps_car_09_cat', 'ps_car_10_cat', 'ps_car_11_cat']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


AttributeError: can't set attribute

In [122]:

#Use stratified train_test_split due to the very imbalanced label classes
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train,
                                                  test_size = 0.2,
                                                  random_state = 1,
                                                  stratify = y_train)
eval_set=[(X_train, y_train), (X_val, y_val)]



In [123]:
tf.logging.set_verbosity(tf.logging.INFO)


In [210]:
def mlp_model_fn(features, labels, mode, params):
  """Model function for MLP."""
  
  #TODO: need works on this
  config = params


  # Input Layer
  
  input_layer = tf.reshape( features["x"], [-1, features["x"].shape[1] ] )
  #print ('feature x', features["x"])
  print ('feature x shape', features["x"].shape)
  #print ('reshape:', input_layer)
  print ('reshape shape:', input_layer.shape)
  #trans = tf.string_to_number(input_layer)
  #print ('trans reshape:', trans)
  #print ('reshape shape:', input_layer.shape)



  # Dense Layers

  hidden1 = tf.layers.dense(inputs=features["x"], units=config['n_hidden1'], activation=tf.nn.relu)
  training = tf.placeholder_with_default(False, shape=(), name='training')
  bn1 = tf.layers.batch_normalization(hidden1, momentum = 0.9)

  drop_h1 = tf.layers.dropout(
      inputs=bn1, rate=0.4, training=mode == tf.estimator.ModeKeys.TRAIN)
  hidden2 = tf.layers.dense(inputs=drop_h1, units=config['n_hidden2'], activation=tf.nn.relu)
  drop_h2 = tf.layers.dropout(
      inputs=hidden2, rate=0.4, training=mode == tf.estimator.ModeKeys.TRAIN)
  hidden3 = tf.layers.dense(inputs=drop_h2, units=config['n_hidden3'], activation=tf.nn.relu)
  drop_h3 = tf.layers.dropout(
      inputs=hidden3, rate=0.4, training=mode == tf.estimator.ModeKeys.TRAIN)

  logits = tf.layers.dense(inputs=drop_h3, units=config['nclasses'], activation=tf.nn.sigmoid)
  predictions = {
        # Generate predictions (for PREDICT and EVAL mode)
        "classes": tf.argmax(input=logits, axis=1),
        # Add `softmax_tensor` to the graph. It is used for PREDICT and by the
        # `logging_hook`.
        "probabilities": tf.nn.softmax(logits, name="softmax_tensor")
  }


  

  if mode == tf.estimator.ModeKeys.PREDICT:
    return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

  # Calculate Loss (for both TRAIN and EVAL modes)
  onehot_labels = tf.one_hot(indices=tf.cast(labels, tf.int32), depth=config['nclasses'])
  print (onehot_labels)
  print (logits)
  loss = tf.losses.softmax_cross_entropy(
        onehot_labels=onehot_labels, logits=logits)


  # Configure the Training Op (for TRAIN mode)
  if mode == tf.estimator.ModeKeys.TRAIN:
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
    train_op = optimizer.minimize(
        loss=loss,
        global_step=tf.train.get_global_step())
    return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

  # Add evaluation metrics (for EVAL mode)
  eval_metric_ops = {
      "accuracy": tf.metrics.accuracy(
          labels=labels, predictions=predictions["classes"])}
  return tf.estimator.EstimatorSpec(
      mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)

In [None]:

config = {}
config['nclasses'] = 2
config['model_dir'] = 'model/'
config['n_hidden1'] = 128
config['n_hidden2'] = 64
config['n_hidden3'] = 32


# Create the Estimator
mlp_classifier = tf.estimator.Estimator(
model_fn=mlp_model_fn, model_dir=config['model_dir'], params=config)

# Set up logging for predictions
tensors_to_log = {"probabilities": "softmax_tensor"}
logging_hook = tf.train.LoggingTensorHook(
  tensors=tensors_to_log, every_n_iter=50)

# Train the model
train_input_fn = tf.estimator.inputs.numpy_input_fn(
  x={"x": X_train},
  y=y_train.values,
  batch_size=100,
  num_epochs=None,
  shuffle=True)

mlp_classifier.train(
  input_fn=train_input_fn,
  steps=20000,
  hooks=[logging_hook])

# Evaluate the model and print results
eval_input_fn = tf.estimator.inputs.numpy_input_fn(
x={"x": X_val},
y=y_val.values,
num_epochs=1,
shuffle=False)



eval_results = mlp_classifier.evaluate(input_fn=eval_input_fn)
print(eval_results)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_task_type': 'worker', '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x12f5cd9d0>, '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_service': None, '_num_ps_replicas': 0, '_tf_random_seed': None, '_master': '', '_num_worker_replicas': 1, '_task_id': 0, '_log_step_count_steps': 100, '_model_dir': 'model/', '_save_summary_steps': 100}
('feature x shape', TensorShape([Dimension(100), Dimension(208)]))
('reshape shape:', TensorShape([Dimension(100), Dimension(208)]))
Tensor("one_hot:0", shape=(100, 2), dtype=float32)
Tensor("dense_4/Sigmoid:0", shape=(100, 2), dtype=float64)
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into model/model.ckpt.
INFO:tensorflow:probabilities = [[ 0.52348439  0.47651561]
 [ 0.57767591  0.4223240

In [None]:

pred_input_fn = tf.estimator.inputs.numpy_input_fn(
x={"x": X_test.values},
num_epochs=1,
shuffle=False)


#label = y_val.values
#label = tf.reshape(y_val.values, y_val.values.shape)
predictions = mlp_classifier.predict(input_fn=pred_input_fn)
#for p in predictions:
#    print p['classes']
predictions = list(p["probabilities"][1] for p in predictions)
#print predictions

In [None]:
#from sklearn.metrics import classification_report
#print classification_report(label, predictions)


In [209]:
def write_preds(test_id, preds, fname):
    pd.DataFrame({"id": test_id, "target": preds}).to_csv(fname, index=False, header=True)

write_preds(test['id'], predictions, 'porto_nn.csv')