<br><br>
<h1 align="center"> 08 Neural Networks </h1>
<br><br>

In [15]:
'''Import the libraries'''
import numpy as np
import scipy
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import HTML
import hashlib
import joblib
import datetime
import math
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import r2_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelBinarizer
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils import check_array
from sklearn.preprocessing import LabelEncoder
from scipy import sparse
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score
from sklearn.metrics import auc
from sklearn.metrics import confusion_matrix
import tensorflow as tf

'''Use the ggplot style'''
plt.style.use('fivethirtyeight')
colors = [[0,0,0], [230/255,159/255,0], [86/255,180/255,233/255], [0,158/255,115/255],
          [213/255,94/255,0], [0,114/255,178/255]]

In [2]:
'''Load the data. It's big, set low_memory=False'''
loan_data = pd.read_csv('/opt/apps/ml-data/bigdata/lending_club/loan_data.csv', low_memory=False)

In [3]:
'''Drop selected columns, not needed for ML'''
loan_data = loan_data.drop(['earliest_cr_line', 'issue_d', 'loan_status'], axis = 1)

In [4]:
'''Create the numeric target variable'''


def create_risk_n(risk_b):
    if risk_b == 'Risk':
        return 1
    elif risk_b == 'No Risk':
        return 0
    

loan_data['risk_n'] = loan_data['Risk'].apply(create_risk_n)

loan_data.reset_index(drop=True, inplace=True)

In [5]:
loan_data = loan_data.drop(['Risk'], axis = 1)
loan_data.reset_index(drop=True, inplace=True)

In [6]:
'''Split the data to 80% training set and 20% test set'''
stratified = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=42)

for train_set, test_set in stratified.split(loan_data, loan_data["risk_n"]):
    stratified_train = loan_data.loc[train_set]
    stratified_test = loan_data.loc[test_set]
    
print('Train Set Ratio \n', stratified_train["risk_n"].value_counts()/len(stratified_train))
print('Test Set Ratio \n', stratified_test["risk_n"].value_counts()/len(stratified_test))

Train Set Ratio 
 0    0.778608
1    0.221392
Name: risk_n, dtype: float64
Test Set Ratio 
 0    0.778608
1    0.221392
Name: risk_n, dtype: float64


In [7]:
'''Lets prepare the data'''
train_df = stratified_train
test_df = stratified_test

# Let's Shuffle the data
train_df = train_df.sample(frac=1).reset_index(drop=True)
test_df = test_df.sample(frac=1).reset_index(drop=True)

In [8]:
# Train set (Normal training dataset)
X_train = train_df.drop('risk_n', axis=1)
y_train = train_df['risk_n']


# Test Dataset
X_test = test_df.drop('risk_n', axis=1)
y_test = test_df['risk_n']

In [9]:
'''Separate numeric and categorical features'''
numeric = X_train.select_dtypes(exclude=["object"])
categorical = X_train.select_dtypes(["object"])

In [10]:
num_attribs = list(numeric)
cat_attribs = list(categorical)
'''Separate numeric and categorical features'''
numeric = X_test.select_dtypes(exclude=["object"])
categorical = X_test.select_dtypes(["object"])

In [11]:
class CategoricalEncoder(BaseEstimator, TransformerMixin):
    """Encode categorical features as a numeric array.
    The input to this transformer should be a matrix of integers or strings,
    denoting the values taken on by categorical (discrete) features.
    The features can be encoded using a one-hot aka one-of-K scheme
    (``encoding='onehot'``, the default) or converted to ordinal integers
    (``encoding='ordinal'``).
    This encoding is needed for feeding categorical data to many scikit-learn
    estimators, notably linear models and SVMs with the standard kernels.
    Read more in the :ref:`User Guide <preprocessing_categorical_features>`.
    Parameters
    ----------
    encoding : str, 'onehot', 'onehot-dense' or 'ordinal'
        The type of encoding to use (default is 'onehot'):
        - 'onehot': encode the features using a one-hot aka one-of-K scheme
          (or also called 'dummy' encoding). This creates a binary column for
          each category and returns a sparse matrix.
        - 'onehot-dense': the same as 'onehot' but returns a dense array
          instead of a sparse matrix.
        - 'ordinal': encode the features as ordinal integers. This results in
          a single column of integers (0 to n_categories - 1) per feature.
    categories : 'auto' or a list of lists/arrays of values.
        Categories (unique values) per feature:
        - 'auto' : Determine categories automatically from the training data.
        - list : ``categories[i]`` holds the categories expected in the ith
          column. The passed categories are sorted before encoding the data
          (used categories can be found in the ``categories_`` attribute).
    dtype : number type, default np.float64
        Desired dtype of output.
    handle_unknown : 'error' (default) or 'ignore'
        Whether to raise an error or ignore if a unknown categorical feature is
        present during transform (default is to raise). When this is parameter
        is set to 'ignore' and an unknown category is encountered during
        transform, the resulting one-hot encoded columns for this feature
        will be all zeros.
        Ignoring unknown categories is not supported for
        ``encoding='ordinal'``.
    Attributes
    ----------
    categories_ : list of arrays
        The categories of each feature determined during fitting. When
        categories were specified manually, this holds the sorted categories
        (in order corresponding with output of `transform`).
    Examples
    --------
    Given a dataset with three features and two samples, we let the encoder
    find the maximum value per feature and transform the data to a binary
    one-hot encoding.
    >>> from sklearn.preprocessing import CategoricalEncoder
    >>> enc = CategoricalEncoder(handle_unknown='ignore')
    >>> enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]])
    ... # doctest: +ELLIPSIS
    CategoricalEncoder(categories='auto', dtype=<... 'numpy.float64'>,
              encoding='onehot', handle_unknown='ignore')
    >>> enc.transform([[0, 1, 1], [1, 0, 4]]).toarray()
    array([[ 1.,  0.,  0.,  1.,  0.,  0.,  1.,  0.,  0.],
           [ 0.,  1.,  1.,  0.,  0.,  0.,  0.,  0.,  0.]])
    See also
    --------
    sklearn.preprocessing.OneHotEncoder : performs a one-hot encoding of
      integer ordinal features. The ``OneHotEncoder assumes`` that input
      features take on values in the range ``[0, max(feature)]`` instead of
      using the unique values.
    sklearn.feature_extraction.DictVectorizer : performs a one-hot encoding of
      dictionary items (also handles string-valued features).
    sklearn.feature_extraction.FeatureHasher : performs an approximate one-hot
      encoding of dictionary items or strings.
    """

    
    def __init__(self, encoding='onehot', categories='auto', dtype=np.float64,
                 handle_unknown='error'):
        self.encoding = encoding
        self.categories = categories
        self.dtype = dtype
        self.handle_unknown = handle_unknown

        
    def fit(self, X, y=None):
        """Fit the CategoricalEncoder to X.
        Parameters
        ----------
        X : array-like, shape [n_samples, n_feature]
            The data to determine the categories of each feature.
        Returns
        -------
        self
        """

        if self.encoding not in ['onehot', 'onehot-dense', 'ordinal']:
            template = ("encoding should be either 'onehot', 'onehot-dense' "
                        "or 'ordinal', got %s")
            raise ValueError(template % self.handle_unknown)

        if self.handle_unknown not in ['error', 'ignore']:
            template = ("handle_unknown should be either 'error' or "
                        "'ignore', got %s")
            raise ValueError(template % self.handle_unknown)

        if self.encoding == 'ordinal' and self.handle_unknown == 'ignore':
            raise ValueError("handle_unknown='ignore' is not supported for"
                             " encoding='ordinal'")

        X = check_array(X, dtype=np.object, accept_sparse='csc', copy=True)
        n_samples, n_features = X.shape

        self._label_encoders_ = [LabelEncoder() for _ in range(n_features)]

        for i in range(n_features):
            le = self._label_encoders_[i]
            Xi = X[:, i]
            if self.categories == 'auto':
                le.fit(Xi)
            else:
                valid_mask = np.in1d(Xi, self.categories[i])
                if not np.all(valid_mask):
                    if self.handle_unknown == 'error':
                        diff = np.unique(Xi[~valid_mask])
                        msg = ("Found unknown categories {0} in column {1}"
                               " during fit".format(diff, i))
                        raise ValueError(msg)
                le.classes_ = np.array(np.sort(self.categories[i]))

        self.categories_ = [le.classes_ for le in self._label_encoders_]

        return self

    
    def transform(self, X):
        """Transform X using one-hot encoding.
        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
            The data to encode.
        Returns
        -------
        X_out : sparse matrix or a 2-d array
            Transformed input.
        """
        X = check_array(X, accept_sparse='csc', dtype=np.object, copy=True)
        n_samples, n_features = X.shape
        X_int = np.zeros_like(X, dtype=np.int)
        X_mask = np.ones_like(X, dtype=np.bool)

        for i in range(n_features):
            valid_mask = np.in1d(X[:, i], self.categories_[i])

            if not np.all(valid_mask):
                if self.handle_unknown == 'error':
                    diff = np.unique(X[~valid_mask, i])
                    msg = ("Found unknown categories {0} in column {1}"
                           " during transform".format(diff, i))
                    raise ValueError(msg)
                else:
                    # Set the problematic rows to an acceptable value and
                    # continue `The rows are marked `X_mask` and will be
                    # removed later.
                    X_mask[:, i] = valid_mask
                    X[:, i][~valid_mask] = self.categories_[i][0]
            X_int[:, i] = self._label_encoders_[i].transform(X[:, i])

        if self.encoding == 'ordinal':
            return X_int.astype(self.dtype, copy=False)

        mask = X_mask.ravel()
        n_values = [cats.shape[0] for cats in self.categories_]
        n_values = np.array([0] + n_values)
        indices = np.cumsum(n_values)

        column_indices = (X_int + indices[:-1]).ravel()[mask]
        row_indices = np.repeat(np.arange(n_samples, dtype=np.int32),
                                n_features)[mask]
        data = np.ones(n_samples * n_features)[mask]

        out = sparse.csc_matrix((data, (row_indices, column_indices)),
                                shape=(n_samples, indices[-1]),
                                dtype=self.dtype).tocsr()
        if self.encoding == 'onehot-dense':
            return out.toarray()
        else:
            return out

In [12]:
'''
Transformation and Feature Scaling:
All transformation pipelines in one place.
'''
class DataFrameSelector1(BaseEstimator, TransformerMixin):
    
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return X[self.attribute_names].values

    
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names]


class CustomLabelBinarizer(BaseEstimator, TransformerMixin):
    
    def __init__(self, sparse_output=False):
          self.sparse_output = sparse_output
    
    def fit(self, X, y=None):
        self.enc = LabelBinarizer(sparse_output=self.sparse_output)
        self.enc.fit(X)
        return self
    
    def transform(self, X, y=None):
          return self.enc.transform(X)
        

class CustomBinarizer(BaseEstimator, TransformerMixin):
    
    def __init__(self, sparse_output=False):
          self.sparse_output = sparse_output
    
    def fit(self, X, y=None):
        self.label_binarizer_ = LabelBinarizer(sparse_output=self.sparse_output).fit(X)
        return self
    
    def transform(self, X):
        return self.label_binarizer_.transform(X)


num_pipeline = Pipeline([
                        ('selector', DataFrameSelector(num_attribs)),
                        ('std_scaler', StandardScaler()),
                        ])

cat_pipeline = Pipeline([
                        ('selector', DataFrameSelector(cat_attribs)),
                        ('encoder', CategoricalEncoder(encoding="onehot-dense")),
                        ])

full_pipeline = FeatureUnion(transformer_list=[
                            ("num_pipeline", num_pipeline),
                            ("cat_pipeline", cat_pipeline),
                            ])

In [13]:
'''Transform the data using pipeline'''
print("Raw Data: ", X_train.shape)
X_train_prepared = full_pipeline.fit_transform(X_train)
print("Prepared Data: ", X_train_prepared.shape)

Raw Data:  (669112, 34)
Prepared Data:  (669112, 159)


In [14]:
'''Transform the data using pipeline'''
print("Raw Data: ", X_test.shape)
X_test_prepared = full_pipeline.fit_transform(X_test)
print("Prepared Data: ", X_test_prepared.shape)

Raw Data:  (669112, 34)
Prepared Data:  (669112, 159)


In [16]:
'''Reset graph in case of reusing'''
def reset_graph(seed=42):
    tf.reset_default_graph()
    tf.set_random_seed(seed)
    np.random.seed(seed)
    
def fetch_batch(epoch, batch_index, batch_size, instances=X_train_prepared.shape[0]):
    np.random.seed(epoch * n_batches + batch_index)
    indices = np.random.randint(instances, size=batch_size)  
    X_batch = X_train_prepared[indices] 
    y_batch = y_train[indices]
    return X_batch, y_batch
    
reset_graph()


'''Neural Network with TensorFlow''' 
n_inputs = X_train_prepared.shape[1]
hidden1_amount = 66
hidden2_amount = 66
n_outputs = 2

'''Placeholders'''
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(tf.int32, shape=(None), name="y")

'''Architectural Structure'''
with tf.name_scope('dnn'):
    hidden1 = tf.layers.dense(X, hidden1_amount, activation=tf.nn.relu, name="first_layer")
    hidden2 = tf.layers.dense(hidden1, hidden2_amount, activation=tf.nn.relu, name="second_layer")
    logits = tf.layers.dense(hidden2, n_outputs, name="outputs")
    
'''Loss Functions'''
with tf.name_scope("loss"):
    cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y,logits=logits)
    loss = tf.reduce_mean(cross_entropy, name="loss")
    
'''Optimizer'''
learning_rate = 0.01

with tf.name_scope("train"):
    optimizer = tf.train.AdamOptimizer(learning_rate)
    best_op = optimizer.minimize(loss)
    
'''Evaluation'''
with tf.name_scope("eval"):
    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
    
init = tf.global_variables_initializer()

W1201 00:39:46.121841 140539129694016 deprecation.py:323] From <ipython-input-16-e2dda702bf04>:29: dense (from tensorflow.python.layers.core) is deprecated and will be removed in a future version.
Instructions for updating:
Use keras.layers.dense instead.
W1201 00:39:46.131278 140539129694016 deprecation.py:506] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [18]:
batch_size = 250
n_batches = int(np.ceil(X_train_prepared.shape[0]/ batch_size))
n_epochs = 10

with tf.Session() as sess:
    init.run()
    
    for epoch in range(n_epochs):
        for batch_index in range(n_batches):
            X_batch, y_batch = fetch_batch(epoch, batch_index, batch_size)
            sess.run(best_op, feed_dict={X: X_batch, y: y_batch})
        loss_val, acc_val = sess.run([loss, accuracy], feed_dict={X: X_test_prepared, y: y_test})
        print(epoch+1, "Loss:{:.5f}\t Accuracy:{:.3f}%".format(loss_val, acc_val * 100))
        
    saver = tf.train.Saver()
    saver.save(sess, "logs/trained_variables.ckpt")
    sess.close()

1 Loss:0.47246	 Accuracy:78.473%
2 Loss:0.47403	 Accuracy:78.451%
3 Loss:0.47213	 Accuracy:78.536%
4 Loss:0.47171	 Accuracy:78.525%
5 Loss:0.47194	 Accuracy:78.480%
6 Loss:0.47246	 Accuracy:78.546%
7 Loss:0.47311	 Accuracy:78.482%
8 Loss:0.47789	 Accuracy:78.526%
9 Loss:0.47352	 Accuracy:78.460%
10 Loss:0.47585	 Accuracy:78.436%
