# CuDF TF Demo
This notebook is kept for **archive purposes only.** Please **do not** expect it to run with the latest releases. The cell outputs are saved and shown below.  

This approach placed 17/8808 in the competition; see the [blog post](https://medium.com/rapids-ai/financial-data-modeling-with-rapids-5bca466f348) for details. 

In [1]:
import cudf as gd
import pandas as pd
import numpy as np
import tensorflow as tf
from collections import OrderedDict
import time
from tqdm import tqdm
import pickle
import random
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import warnings
# Suppress all Python warnings for the rest of the session.
warnings.filterwarnings("ignore")
# Print the library versions in use (the saved output below records them).
print('cudf version:',gd.__version__)
print('tensorflow version:',tf.__version__)

cudf version: 0.6.1+0.gbeb4ef3.dirty
tensorflow version: 1.13.1


### Please download data from https://www.kaggle.com/c/santander-customer-transaction-prediction/data

In [2]:
# Directory that holds train.csv and test.csv (read by the "Read csv" cell).
PATH = '../input'

# ETL

### Read csv

In [3]:
%%time
# One explicit schema shared by both files: an id, the label, and the 200
# anonymous features var_0 .. var_199. The header row is skipped because the
# column names are supplied here.
# NOTE(review): the same schema, including 'target', is applied to test.csv —
# confirm that file really has that column layout.
cols = ['ID_code', 'target'] + ['var_%d'%i for i in range(200)]
dtypes = ['int32', 'int32'] + ['float32'] * 200

train_gd = gd.read_csv('%s/train.csv'%PATH, names=cols, dtype=dtypes, skiprows=1)
test_gd = gd.read_csv('%s/test.csv'%PATH, names=cols, dtype=dtypes, skiprows=1)

CPU times: user 460 ms, sys: 92.9 ms, total: 553 ms
Wall time: 552 ms


In [4]:
# Preview the first rows; to_pandas() converts the cudf frame for display.
train_gd.head().to_pandas()

Unnamed: 0,ID_code,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,75153670,0,8.9255,-6.7863,11.9081,5.093,11.4607,-9.2834,5.1187,18.626602,...,4.4354,3.9642,3.1364,1.691,18.522701,-2.3978,7.8784,8.5635,12.7803,-1.0914
1,75153671,0,11.5006,-4.1473,13.858801,5.389,12.362201,7.0433,5.6208,16.5338,...,7.6421,7.7214,2.5837,10.9516,15.430499,2.0339,8.1267,8.7889,18.355999,1.9518
2,75153672,0,8.609301,-2.7457,12.0805,7.8928,10.5825,-9.0837,6.9427,14.6155,...,2.9057,9.7905,1.6704,1.6858,21.6042,3.1417,-6.5213,8.2675,14.7222,0.3965
3,75153673,0,11.0604,-2.1518,8.9522,7.1957,12.584599,-1.8361,5.8428,14.925,...,4.4666,4.743299,0.7178,1.4214,23.0347,-1.2706,-2.9275,10.2922,17.9697,-8.999599
4,75153674,0,9.8369,-1.4834,12.874599,6.6375,12.2772,2.4486,5.9405,19.2514,...,-1.4905,9.5214,-0.1508,9.194201,13.2876,-1.5121,3.9267,9.5031,17.9974,-8.8104


### create new features & normalize


In [5]:
%%time
# For each of the 200 raw features, add a companion feature 'new_var_<i>' in
# which values that occur only once in the training set are replaced by a
# column mean, then standardize both the raw and the new feature.
# All statistics (counts, mean, std) come from train_gd and are reused for test_gd.
for i in tqdm(range(200)):
    col = 'var_%d'%i
    new_col = 'new_%s'%col
    count_col = 'count_%s'%col
    
    # Number of occurrences of each distinct value of `col` in the training set.
    # NOTE(review): the code below reads this count under the name
    # 'count_var_<i>'; presumably that is how this cudf version names the
    # aggregated column — confirm before running on another version.
    df = train_gd.groupby(col).agg({col:'count'})
    df = df.reset_index()
    # Attach the train-derived counts to both frames, keyed on the feature value.
    # NOTE(review): a test value that never appears in train gets no count from
    # the left merge; how the comparisons below treat that is not visible here.
    train_gd = train_gd.merge(df,on=col,how='left')
    test_gd = test_gd.merge(df,on=col,how='left')
    
    # feature values with count==1 have a lot of noise
    # we can replace these values with mean value of the column
    # Multiplying by the boolean mask zeroes the values whose count is not > 1.
    train_gd[new_col] = train_gd[col] * (train_gd[count_col]>1)
    # mean/std are taken over the masked column, i.e. with those zeros included.
    mean = train_gd[new_col].mean()
    std = train_gd[new_col].std()
    # Helper column holding the scalar mean so it can be masked element-wise.
    train_gd['mean'] = mean
    # Fill the zeroed (count==1) positions with the mean ...
    train_gd[new_col] = train_gd[new_col] + train_gd['mean']*(train_gd[count_col]==1)
    # ... so that after standardization they become 0.
    train_gd[new_col] = (train_gd[new_col]-mean)/std
    # The raw feature is standardized with the same mean/std as the new one.
    train_gd[col] = (train_gd[col]-mean)/std
    
    # Same transformation for the test set, using the training mean/std.
    test_gd[new_col] = test_gd[col] * (test_gd[count_col]>1)
    test_gd['mean'] = mean
    test_gd[new_col] = test_gd[new_col] + test_gd['mean']*(test_gd[count_col]==1)
    test_gd[new_col] = (test_gd[new_col]-mean)/std
    test_gd[col] = (test_gd[col]-mean)/std
    

100%|██████████| 200/200 [01:19<00:00,  1.65it/s]

CPU times: user 56.4 s, sys: 23.2 s, total: 1min 19s
Wall time: 1min 19s





In [6]:
# Feature order is interleaved: var_0, new_var_0, var_1, new_var_1, ...
# so that each raw feature sits directly next to its engineered counterpart.
feas = [name
        for k in range(200)
        for name in ('var_%d'%k, 'new_var_%d'%k)]
X = train_gd[feas].to_pandas()
X.head()

Unnamed: 0,var_0,new_var_0,var_1,new_var_1,var_2,new_var_2,var_3,new_var_3,var_4,new_var_4,...,var_195,new_var_195,var_196,new_var_196,var_197,new_var_197,var_198,new_var_198,var_199,new_var_199
0,0.216244,0.0,0.920815,0.920815,0.630195,0.630195,1.346586,1.346586,1.246226,1.246226,...,-0.697538,-0.697538,1.280454,1.280454,0.679912,0.679911,0.245115,0.0,1.263861,0.0
1,1.030498,1.030498,-1.627658,-1.627658,0.311086,0.311086,0.516251,0.516251,1.112466,0.0,...,-1.434332,-1.434332,1.037152,1.037152,0.824626,0.824626,0.597224,0.597224,0.842878,0.842878
2,0.241152,0.241152,-0.58387,-0.58387,0.417921,0.417921,-0.28128,-0.28128,-0.507862,-0.507862,...,1.77162,1.77162,0.076485,0.076485,-0.3447,-0.3447,0.852539,0.852539,-3.854148,0.0
3,-0.427054,-0.427054,0.646435,0.646435,-0.062363,-0.062363,-0.286137,-0.286137,0.83508,0.83508,...,-0.02868,-0.02868,-0.758242,-0.758242,1.004489,0.0,0.478298,0.478297,-1.517619,-1.517619
4,1.320872,1.320872,-1.71252,-1.71252,-0.481917,-0.481917,-0.147458,-0.147458,-0.806044,0.0,...,0.895614,0.895614,1.719011,0.0,0.057048,0.057048,0.502301,0.502301,0.837644,0.0


In [7]:
# Pull the model inputs and labels off the GPU frames into numpy arrays.
# X is rebuilt from train_gd instead of `X = X.values`, so this cell can be
# re-run: the self-referential form fails the second time because X is then
# already an ndarray (and may already have been reshaped further down).
X = train_gd[feas].to_pandas().values
y = train_gd['target'].to_pandas().values
Xt = test_gd[feas].to_pandas().values
print(X.shape,y.shape,Xt.shape)

(200000, 400) (200000,) (200000, 400)


In [8]:
print(X.shape,y.shape,Xt.shape)
# The 400 columns are interleaved as (var_i, new_var_i) pairs, so a row-major
# reshape to [rows, 200, 2] groups each raw feature with its engineered twin.
B = X.shape[0]
X = np.reshape(X,[B,200,2])
# Use the test set's own row count: reusing B only worked because train and
# test happen to have the same number of rows.
Xt = np.reshape(Xt,[Xt.shape[0],200,2])
print(X.shape,y.shape,Xt.shape)

(200000, 400) (200000,) (200000, 400)
(200000, 200, 2) (200000,) (200000, 200, 2)


In [9]:
# Hold out 20% of the training rows for validation; stratify=y keeps the
# label proportions the same in both parts, and random_state fixes the split.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

# Define TF model

In [10]:
class groupNN:
    """Binary classifier on a TensorFlow 1.x graph, trained with mini-batches.

    The network (see ``_build``) takes inputs of shape ``[batch, 200, 2]``,
    applies one fully connected layer with ``hidden_units`` outputs, flattens
    the result to ``[batch, 200 * hidden_units]`` and maps it to a single
    logit per example.

    Keyword parameters (all optional, kept in ``self.params``):
        hidden_units  -- width of the first layer (default 16)
        learning_rate -- Adam learning rate (default 0.001)
        batch_size    -- mini-batch size (default 1024)
        epochs        -- number of training epochs (default 10)

    ``fit`` writes the trained weights to ``weight.p`` in the working
    directory and ``predict`` reads them back, so ``predict`` needs a prior
    ``fit`` (or an existing weight file).
    """

    def __init__(self,**params):
        self.params = params

    def fit(self,X,y):
        """Train on ``X`` (``[n, 200, 2]``) with one label per row in ``y``.

        Prints the mean batch loss after every epoch and saves the trainable
        variables with ``save`` once training is finished. Returns None.
        """
        self.X = X
        self.y = y
        tf.reset_default_graph()
        # build a tf computing graph
        logit = self._build()
        label = tf.placeholder(dtype=tf.int32,shape=[None]) # one integer label per example
        losst = self.get_loss(logit,label)
        opt_op = self.get_opt(losst)

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            sess.run(tf.local_variables_initializer())
            epoch = 1
            epoch_losses = []
            for Xb,yb,end_epoch in self._batch_gen(shuffle=True):
                loss,_ = sess.run([losst,opt_op],feed_dict={self.inputs:Xb, label:yb})
                epoch_losses.append(loss)
                if end_epoch:
                    print("Epoch %d train loss %.4f"%(epoch,np.mean(epoch_losses)))
                    epoch += 1
                    epoch_losses = []
            self.save(sess)

    def _build(self):
        """Create the input placeholder and the network; return the logits.

        The placeholder is stored in ``self.inputs`` so callers can feed it.
        The returned tensor has one logit per example (shape ``[batch]``).
        """
        netname = 'groupNN'
        self.inputs = tf.placeholder(dtype=tf.float32,shape=[None,200,2])
        B = tf.shape(self.inputs)[0]
        H = self.params.get('hidden_units', 16)
        with tf.variable_scope(netname):
            # The reshape below relies on this layer producing [B, 200, H].
            net = tf.contrib.layers.fully_connected(self.inputs,H)
            net = tf.reshape(net,[B,200*H])
            net = tf.contrib.layers.fully_connected(net,1,activation_fn=None)
            # Squeeze only the trailing unit axis. An unrestricted squeeze
            # would also drop the batch axis for a batch of one example and
            # return a scalar, which np.concatenate in predict cannot handle.
            return tf.squeeze(net,axis=1)

    def predict(self,X):
        """Return sigmoid probabilities for ``X`` as a 1-D numpy array.

        Rebuilds the graph, restores the weights written by ``save`` and runs
        a single unshuffled pass over ``X``, so the output order matches the
        row order of ``X``. ``self.X`` / ``self.y`` are overwritten, but
        ``self.params`` is left untouched.
        """
        print('predict')
        self.X = X
        self.y = None
        tf.reset_default_graph()
        # build a tf computing graph
        logit = self._build()
        prob = tf.nn.sigmoid(logit)
        preds = []
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            sess.run(tf.local_variables_initializer())
            self.load(sess)
            # epochs=1 is passed explicitly instead of writing it into
            # self.params, which would silently cut a later fit() to one epoch.
            for Xb,_,_ in self._batch_gen(shuffle=False, epochs=1):
                preds.append(sess.run(prob,feed_dict={self.inputs:Xb}))
        if not preds:
            # Empty input: np.concatenate([]) would raise.
            return np.zeros(0,dtype=np.float32)
        return np.concatenate(preds)

    def get_opt(self,loss):
        """Return an Adam training op that minimizes ``loss``."""
        learning_rate = self.params.get('learning_rate', 0.001)
        opt = tf.train.AdamOptimizer(learning_rate=learning_rate)
        return opt.minimize(loss)

    def get_loss(self, logit, label):
        """Return the mean sigmoid cross-entropy between logits and labels."""
        label = tf.cast(label,tf.float32)
        loss = tf.nn.sigmoid_cross_entropy_with_logits(logits=logit,labels=label)
        return tf.reduce_mean(loss)

    def save(self, sess):
        """Pickle every trainable variable (name -> numpy value) to 'weight.p'."""
        weights = {var.name: sess.run(var) for var in tf.trainable_variables()}
        # Context manager so the file is flushed and closed deterministically.
        with open('weight.p','wb') as f:
            pickle.dump(weights,f)

    def load(self, sess, path = 'weight.p'):
        """Assign the pickled weights at ``path`` to the trainable variables.

        Raises KeyError if a variable of the current graph is missing from
        the file.
        """
        # NOTE: pickle.load can execute arbitrary code; only load weight files
        # that this notebook produced itself.
        with open(path,'rb') as f:
            weights = pickle.load(f)
        for var in tf.trainable_variables():
            sess.run(var.assign(weights[var.name]))

    def _batch_gen(self, shuffle=True, epochs=None):
        """Yield ``(X_batch, y_batch, end_of_epoch)`` tuples over self.X / self.y.

        shuffle -- reshuffle the row order at the start of every epoch.
        epochs  -- number of passes; defaults to ``params['epochs']`` (10).

        ``y_batch`` is None when ``self.y`` is None. ``end_of_epoch`` is True
        on the last batch of each pass. The last batch may be smaller than
        ``batch_size`` but is never empty; nothing is yielded for empty input.
        """
        X,y = self.X, self.y
        B = self.params.get('batch_size', 1024)
        if epochs is None:
            epochs = self.params.get('epochs', 10)
        n = len(X)
        ids = list(range(n))
        # Ceiling division: `n//B + 1` produced an extra, empty batch whenever
        # n was an exact multiple of B.
        batches = (n + B - 1)//B
        for epoch in range(epochs):
            if shuffle:
                random.shuffle(ids)
            for i in range(batches):
                idx = ids[i*B:(i+1)*B]
                yb = y[idx] if y is not None else None
                yield X[idx],yb,i==batches-1

In [11]:
# Instantiate the model; batch_size is not given, so groupNN's default applies.
nn = groupNN(hidden_units=16,learning_rate=0.01,epochs=20)

In [12]:
# Train on the 80% split; per-epoch mean training loss is printed below.
nn.fit(X_train,y_train)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.



For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

Epoch 1 train loss 0.2783
Epoch 2 train loss 0.2145
Epoch 3 train loss 0.2076
Epoch 4 train loss 0.2038
Epoch 5 train loss 0.1985
Epoch 6 train loss 0.1955
Epoch 7 train loss 0.1941
Epoch 8 train loss 0.1925
Epoch 9 train loss 0.1920
Epoch 10 train loss 0.1918
Epoch 11 train loss 0.1932
Epoch 12 train loss 0.1893
Epoch 13 train loss 0.1882
Epoch 14 train loss 0.1885
Epoch 15 train loss 0.1886
Epoch 16 train loss 0.1896
Epoch 17 train loss 0.1890
Epoch 18 train loss 0.1882
Epoch 19 train loss 0.1866
Epoch 20 train loss 0.1886


In [13]:
# Score the held-out 20% and report ROC AUC against the true labels.
yp = nn.predict(X_valid)
print('Validation AUC',roc_auc_score(y_valid,yp))

predict
Validation AUC 0.9072589318554531


In [14]:
# Predict on the test set; note this overwrites the validation predictions in yp.
yp = nn.predict(Xt)

predict


In [15]:
# Pair each prediction with its id. Both come from test_gd in its current row
# order (Xt was extracted from the same frame), so they stay aligned.
# NOTE(review): ID_code was read as int32 in the read_csv cell — confirm the
# ids written here are in the format the competition expects.
submission = pd.DataFrame(
    {
        'ID_code': test_gd['ID_code'].to_pandas().values,
        'target': yp,
    }
)
submission.to_csv('submission.csv', index=False)