In [1]:
import random
import itertools
from itertools import cycle
import pandas as pd
import numpy as np
from scipy import interp, stats
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(style="white", color_codes=True)
import warnings
warnings.filterwarnings("ignore")
warnings.filterwarnings("ignore",category=DeprecationWarning)
import sklearn.ensemble as ske
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import *
import sklearn.preprocessing as prep

import xgboost
import tensorflow as tf

# Notebook-wide display settings for pandas frames and NumPy arrays.
def _format_float(value):
    """Render a float with exactly three decimal places."""
    return '%.3f' % value

pd.set_option('display.max_columns', None)   # never truncate columns
pd.set_option('display.max_rows', 100)
pd.set_option('display.float_format', _format_float)
# Plain (non-scientific) notation with two decimals when printing arrays.
np.set_printoptions(suppress=True, precision=2)



In [2]:
class GenericAutoencoder(object):
    """Fully connected feed-forward network whose layer widths come from ``ae_shape``.

    ``ae_shape[0]`` is the input width, ``ae_shape[-1]`` the output width and
    every value in between is a hidden layer. Hidden layer ``i`` applies
    ``transfer_function_arr[i]``; the output layer is linear. The network is
    trained on a squared-error loss between the output and a supplied target.

    The TensorFlow 1.x graph API is used (placeholders and an explicit session).
    The constructor opens a session; call :meth:`close` when finished with it.
    """

    def __init__(self, ae_shape = [2,1,2] , transfer_function_arr=[tf.nn.sigmoid], optimizer = tf.train.AdamOptimizer()):
        """Build the graph, create the training op and initialise all variables.

        Args:
            ae_shape: Layer widths, input first and output last (at least two entries).
            transfer_function_arr: One activation callable per hidden layer, in
                layer order. Extra entries are ignored.
            optimizer: Object exposing ``minimize(cost)``. The default instance
                is created once, when the class is defined, and is shared by
                every model that does not pass its own.

        Raises:
            ValueError: If ``ae_shape`` has fewer than two entries.
        """
        # Copy so that mutating the caller's list (or the shared default list)
        # later cannot change this model's recorded architecture.
        self.ae_shape = list(ae_shape)
        self.transfer_function_arr = list(transfer_function_arr)
        if len(self.ae_shape) < 2:
            raise ValueError("ae_shape must contain at least an input and an output width")

        n_hidden_layers = len(self.ae_shape) - 2
        # Width of the tensor that feeds the output layer; generate() samples this many values.
        self.n_hidden = self.ae_shape[-2]

        network_weights = self._initialize_weights()
        self.weights = network_weights

        self.x = tf.placeholder(tf.float32, [None, self.ae_shape[0]])
        # The target must have the same rank as the network output. A rank-1
        # target would broadcast (N, k) - (N,) into an N x N (or invalid)
        # difference and silently compute the wrong loss.
        self.y = tf.placeholder(tf.float32, [None, self.ae_shape[-1]])

        self.hidden = self.x
        self.all_output = [self.hidden]
        for index in range(n_hidden_layers):
            layer = str(index + 1)
            pre_activation = tf.add(tf.matmul(self.hidden, self.weights["w" + layer]),
                                    self.weights["b" + layer])
            self.hidden = self.transfer_function_arr[index](pre_activation)
            self.all_output.append(self.hidden)

        # The output layer is addressed by its own number rather than by the
        # loop variable, so a network with no hidden layers also works.
        output_layer = str(len(self.ae_shape) - 1)
        self.reconstruction = tf.add(tf.matmul(self.hidden, self.weights["w" + output_layer]),
                                     self.weights["b" + output_layer])
        self.all_output.append(self.reconstruction)

        # Half the summed squared error over the whole batch.
        self.cost = 0.5 * tf.reduce_sum(tf.pow(tf.subtract(self.reconstruction, self.y), 2.0))
        self.optimizer = optimizer.minimize(self.cost)

        init = tf.global_variables_initializer()
        self.sess = tf.Session()
        self.sess.run(init)

    def _initialize_weights(self):
        """Create one weight matrix ``w<k>`` and one bias vector ``b<k>`` per layer.

        Layers are numbered from 1. All values are drawn uniformly from [0, 1).
        """
        all_weights = dict()
        low = 0
        high = 1
        for index in range(0, len(self.ae_shape) - 1):
            layer = str(index + 1)
            all_weights["w" + layer] = tf.Variable(
                tf.random_uniform([self.ae_shape[index], self.ae_shape[index + 1]],
                                  minval=low, maxval=high, dtype=tf.float32))
            all_weights["b" + layer] = tf.Variable(
                tf.random_uniform([self.ae_shape[index + 1]],
                                  minval=low, maxval=high, dtype=tf.float32))
        return all_weights

    def _as_target_batch(self, Y):
        """Return ``Y`` as a float32 array of shape ``(n_rows, ae_shape[-1])``.

        Accepts anything ``np.asarray`` understands, including a flat vector of
        targets when the output width is 1.
        """
        return np.asarray(Y, dtype=np.float32).reshape(-1, self.ae_shape[-1])

    def partial_fit(self, X, Y):
        """Run one optimiser step on the batch ``(X, Y)`` and return its cost."""
        cost, _ = self.sess.run((self.cost, self.optimizer),
                                feed_dict={self.x: X, self.y: self._as_target_batch(Y)})
        return cost

    def calc_total_cost(self, X, Y=None):
        """Evaluate the cost on ``(X, Y)`` without updating any variable.

        Args:
            X: Input batch.
            Y: Target batch. May be omitted only when the output width equals
                the input width, in which case ``X`` is used as its own target.

        Raises:
            ValueError: If ``Y`` is omitted and the output width differs from
                the input width.
        """
        if Y is None:
            if self.ae_shape[0] != self.ae_shape[-1]:
                raise ValueError("Y is required when the output width differs from the input width")
            Y = X
        return self.sess.run(self.cost,
                             feed_dict={self.x: X, self.y: self._as_target_batch(Y)})

    def transform(self, X):
        """Return the activations of the last hidden layer for ``X``."""
        return self.sess.run(self.hidden, feed_dict={self.x: X})

    def generate(self, hidden = None):
        """Run only the output layer on ``hidden``.

        When ``hidden`` is omitted, a single row of ``n_hidden`` standard-normal
        values is sampled and used instead.
        """
        if hidden is None:
            hidden = self.sess.run(tf.random_normal([1, self.n_hidden]))
        return self.sess.run(self.reconstruction, feed_dict={self.hidden: hidden})

    def reconstruct(self, X):
        """Return the network output for ``X``."""
        return self.sess.run(self.reconstruction, feed_dict={self.x: X})

    def getWeights(self):
        """Return the current value of every parameter, keyed ``w<k>`` / ``b<k>``.

        Bias entries are included, as they always have been, so existing callers
        keep working; use :meth:`getBiases` for the biases alone.
        """
        return self.sess.run(self.weights)

    def getBiases(self):
        """Return the current value of the bias vectors only, keyed ``b<k>``."""
        biases = {name: var for name, var in self.weights.items() if name.startswith("b")}
        return self.sess.run(biases)

    def getAllOutPut(self,X):
        """Return the input, each hidden activation and the output for ``X``, in layer order."""
        return self.sess.run(self.all_output, feed_dict={self.x: X})

    def close(self):
        """Release the TensorFlow session held by this model."""
        self.sess.close()

In [3]:
def min_max_scale(data):
    """Fit a default-configured ``MinMaxScaler`` on ``data`` and return ``data`` transformed by it."""
    scaler = prep.MinMaxScaler()
    scaler.fit(data)
    return scaler.transform(data)

def standard_scale(data):
    """Fit a default-configured ``StandardScaler`` on ``data`` and return ``data`` transformed by it."""
    scaler = prep.StandardScaler()
    scaler.fit(data)
    return scaler.transform(data)

def get_random_block_from_data(x, y, batch_size):
    """Return a random contiguous window of ``batch_size`` aligned rows from ``x`` and ``y``.

    Args:
        x: Sliceable sequence of inputs.
        y: Sliceable sequence of targets, row-aligned with ``x``.
        batch_size: Number of consecutive rows to return.

    Returns:
        Tuple ``(x_block, y_block)`` cut from the same row range.

    Raises:
        ValueError: If ``batch_size`` exceeds ``len(x)``.
    """
    # Size the window from the argument itself, not from a notebook-level global.
    n_rows = len(x)
    if batch_size > n_rows:
        raise ValueError("batch_size (%d) exceeds the number of rows (%d)" % (batch_size, n_rows))
    # randint's upper bound is exclusive: the +1 makes the final window
    # reachable and allows batch_size == len(x).
    start_index = np.random.randint(0, n_rows - batch_size + 1)
    end_index = start_index + batch_size
    return (x[start_index:end_index], y[start_index:end_index])


In [4]:
# Load the pipe-delimited dataset.
# NOTE(review): absolute, machine-specific path - point this at a configurable
# data directory before sharing the notebook.
folder_path = "/Users/prasanna/Downloads/"
file_name = "DS_Tech_Review_Dataset (1).txt"
data = pd.read_csv(folder_path + file_name, sep="|")

age_cols = [col for col in data.columns.values if "AGE" in col]
relevant_age_cols = [col for col in data.columns.values if "AGE" in col and ("_" in col or "UP" in col)]

# Keep only columns that are populated in at least 10% of the rows.
data.dropna(thresh=len(data)*0.1, axis=1, inplace=True)

# Columns that get one-hot encoded below. They are excluded from the
# constant-column scan because get_dummies replaces them, so dropping them by
# their original name afterwards would raise a KeyError.
categorical_cols = ["product", "MAJOR_CREDIT_CARD_LIF"]

# A column carries no information when it holds a single value, or a single
# value plus missing entries.
same_value_cols = []
for col in data.columns.values:
    if col in categorical_cols:
        continue
    unique_values = data[col].unique()
    # pd.isnull also handles object-dtype columns, where np.isnan raises TypeError.
    if len(unique_values) == 1 or (len(unique_values) == 2 and pd.isnull(unique_values).any()):
        same_value_cols.append(col)

# Missing card information becomes its own category; every other gap becomes 0.
data["MAJOR_CREDIT_CARD_LIF"] = np.where(data["MAJOR_CREDIT_CARD_LIF"].isnull(),"NA", data["MAJOR_CREDIT_CARD_LIF"])
data.fillna(0,inplace=True)
data = pd.get_dummies(data, columns=categorical_cols)
data.drop(same_value_cols, axis=1, inplace=True)

# Split the target off from the feature matrix.
target_values = data["target"].astype('float32').copy()
data.drop("target", axis=1, inplace=True)
class_names = ["No","Yes"]
features_names = data.columns.values.tolist()
# DataFrame.as_matrix() was removed in pandas 1.0; .values works on old and new versions.
features_data = data.astype('float32').values
features_data.shape

(300000, 110)

In [None]:
# Training configuration.
training_epochs = 10
batch_size = 128*20
display_step = 5   # print the cost every `display_step` epochs

X_train = min_max_scale(features_data)
n_samples = len(X_train)

# Targets as plain NumPy arrays. pandas Series no longer has .reshape, and the
# row count is derived from the data instead of being hard-coded.
target_array = np.asarray(target_values, dtype=np.float32).ravel()
target_values_re = target_array.reshape(-1, 1)

autoencoder = GenericAutoencoder(
    # Input width follows the feature matrix; the network emits one value per row.
    ae_shape=[X_train.shape[1], 40, 15, 40, 1],
    transfer_function_arr=[tf.nn.sigmoid,tf.nn.sigmoid,tf.nn.sigmoid],
    optimizer=tf.train.AdamOptimizer(learning_rate=0.001))

# Train on random mini-batches rather than one full-data step per epoch, so
# each optimiser step touches only `batch_size` rows.
total_batch = max(1, n_samples // batch_size)
for epoch in range(training_epochs):
    epoch_cost = 0.
    for _ in range(total_batch):
        # Targets are passed as a flat vector, one value per input row.
        batch_xs, batch_ys = get_random_block_from_data(X_train, target_array, batch_size)
        cost = autoencoder.partial_fit(batch_xs, batch_ys)
        epoch_cost += cost
    # Mean cost per training row seen in this epoch.
    avg_cost = epoch_cost / (total_batch * batch_size)
    if epoch % display_step == 0:
        print("Epoch:", '%d,' % (epoch + 1),
              "Cost:", "{:.9f}".format(avg_cost))