## Predicting Loan Risk - German Credit Data 
### Logistic Regression - TensorFlow 


    ### Data Source: Kaggle, UCI

## TensorFlow Classification Implementation 

In [7]:
import pandas as pd

In [8]:
df = pd.read_csv('C:\\Users\\Rakib\\Downloads\\Loan Prediction-LogitReg-Skickit-TF-Github\\creditdata.csv')

In [9]:
df.head()

Unnamed: 0.1,Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,0,67,male,2,own,,little,1169,6,radio/TV,good
1,1,22,female,2,own,little,moderate,5951,48,radio/TV,bad
2,2,49,male,1,own,little,,2096,12,education,good
3,3,45,male,2,free,little,little,7882,42,furniture/equipment,good
4,4,53,male,2,free,little,little,4870,24,car,bad


In [10]:
df = df.drop(['Unnamed: 0', 'Checking account','Credit amount','Saving accounts'], axis=1)

    ## had to drop off these columns -- were getting error with these columns in model training
    ## used tf.feature_column.categorical_column_with_vocabulary_list - did not work for these columns
    ## will try to figure out what happened

In [11]:
df.columns

Index(['Age', 'Sex', 'Job', 'Housing', 'Duration', 'Purpose', 'Risk'], dtype='object')

In [15]:
df.columns

Index(['Age', 'Sex', 'Job', 'Housing', 'Duration', 'Purpose', 'Risk'], dtype='object')

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
Age         1000 non-null int64
Sex         1000 non-null object
Job         1000 non-null int64
Housing     1000 non-null object
Duration    1000 non-null int64
Purpose     1000 non-null object
Risk        1000 non-null object
dtypes: int64(3), object(4)
memory usage: 54.8+ KB


In [17]:
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Age,1000.0,35.546,11.375469,19.0,27.0,33.0,42.0,75.0
Job,1000.0,1.904,0.653614,0.0,2.0,2.0,2.0,3.0
Duration,1000.0,20.903,12.058814,4.0,12.0,18.0,24.0,72.0


    ## Fixing labels in Data

In [18]:
df['Risk'].unique()

array(['good', 'bad'], dtype=object)

In [19]:
def label_fix(label):
    if label=='bad':
        return 0
    else:
        return 1

In [20]:
df['Risk'] = df['Risk'].apply(label_fix)

In [21]:
df['Risk'].unique()

array([1, 0], dtype=int64)

    ## Splitting data set

In [22]:
from sklearn.model_selection import train_test_split

In [23]:
x_data = df.drop('Risk',axis=1)
y_labels = df['Risk']
X_train, X_test, y_train, y_test = train_test_split(x_data,y_labels,test_size=0.3,random_state=101)

In [24]:
df.columns

Index(['Age', 'Sex', 'Job', 'Housing', 'Duration', 'Purpose', 'Risk'], dtype='object')

In [25]:
import tensorflow as tf

In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
Age         1000 non-null int64
Sex         1000 non-null object
Job         1000 non-null int64
Housing     1000 non-null object
Duration    1000 non-null int64
Purpose     1000 non-null object
Risk        1000 non-null int64
dtypes: int64(4), object(3)
memory usage: 54.8+ KB


In [27]:
df['Sex'].unique()

array(['male', 'female'], dtype=object)

In [28]:
df['Housing'].unique()

array(['own', 'free', 'rent'], dtype=object)

In [29]:
df['Purpose'].unique()

array(['radio/TV', 'education', 'furniture/equipment', 'car', 'business',
       'domestic appliances', 'repairs', 'vacation/others'], dtype=object)

    ## Tensor Flow feature columns

In [30]:
sex = tf.feature_column.categorical_column_with_vocabulary_list("Sex", ['male', 'female'])
housing = tf.feature_column.categorical_column_with_vocabulary_list("Housing", ['own', 'free', 'rent'])
purpose = tf.feature_column.categorical_column_with_hash_bucket("Purpose", hash_bucket_size=8)

In [31]:
df.columns

Index(['Age', 'Sex', 'Job', 'Housing', 'Duration', 'Purpose', 'Risk'], dtype='object')

In [32]:
age = tf.feature_column.numeric_column("Age")
job = tf.feature_column.numeric_column("Job")
duration = tf.feature_column.numeric_column("Duration")

In [33]:
feat_cols = [age,sex,job,housing,duration,purpose]

In [34]:
input_func = tf.estimator.inputs.pandas_input_fn(x=X_train,y=y_train,
                                                batch_size=1000,
                                                num_epochs=None,
                                                shuffle=True)

In [35]:
model = tf.estimator.LinearClassifier(feature_columns=feat_cols)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_tf_random_seed': 1, '_model_dir': 'C:\\Users\\Rakib\\AppData\\Local\\Temp\\tmphe38_f07', '_session_config': None, '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_keep_checkpoint_max': 5, '_save_summary_steps': 100, '_save_checkpoints_secs': 600}


    ## Training Model

In [36]:
model.train(input_fn=input_func,steps=5000)

INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into C:\Users\Rakib\AppData\Local\Temp\tmphe38_f07\model.ckpt.
INFO:tensorflow:step = 1, loss = 693.146
INFO:tensorflow:global_step/sec: 136.348
INFO:tensorflow:step = 101, loss = 570.45 (0.735 sec)
INFO:tensorflow:global_step/sec: 127.807
INFO:tensorflow:step = 201, loss = 562.203 (0.784 sec)
INFO:tensorflow:global_step/sec: 139.598
INFO:tensorflow:step = 301, loss = 579.774 (0.714 sec)
INFO:tensorflow:global_step/sec: 139.738
INFO:tensorflow:step = 401, loss = 564.426 (0.716 sec)
INFO:tensorflow:global_step/sec: 139.416
INFO:tensorflow:step = 501, loss = 574.872 (0.717 sec)
INFO:tensorflow:global_step/sec: 139.326
INFO:tensorflow:step = 601, loss = 560.588 (0.720 sec)
INFO:tensorflow:global_step/sec: 138.83
INFO:tensorflow:step = 701, loss = 551.919 (0.720 sec)
INFO:tensorflow:global_step/sec: 139.236
INFO:tensorflow:step = 801, loss = 583.306 (0.716 sec)
INFO:tensorflow:global_step/sec: 139.964
INFO

<tensorflow.python.estimator.canned.linear.LinearClassifier at 0x2a1e25d60f0>

In [37]:
pred_fn = tf.estimator.inputs.pandas_input_fn(x=X_test,
                                             batch_size=len(X_test),
                                             shuffle=False)

In [38]:
prediction = list(model.predict(input_fn=pred_fn))

INFO:tensorflow:Restoring parameters from C:\Users\Rakib\AppData\Local\Temp\tmphe38_f07\model.ckpt-5000


In [39]:
prediction[0]

{'class_ids': array([1], dtype=int64),
 'classes': array([b'1'], dtype=object),
 'logistic': array([ 0.70546973], dtype=float32),
 'logits': array([ 0.87348199], dtype=float32),
 'probabilities': array([ 0.2945303 ,  0.70546973], dtype=float32)}

In [40]:
final_preds = []
for pred in prediction:
    final_preds.append(pred['class_ids'][0])

In [41]:
final_preds[:10]

[1, 1, 1, 1, 0, 1, 1, 1, 1, 1]

### Evaluation

In [42]:
from sklearn.metrics import classification_report

In [43]:
print(classification_report(y_test,final_preds))

             precision    recall  f1-score   support

          0       0.60      0.22      0.32        96
          1       0.72      0.93      0.81       204

avg / total       0.68      0.70      0.65       300



In [44]:
from sklearn.metrics import jaccard_similarity_score
jaccard_similarity_score(y_test, final_preds)

0.70333333333333337

In [45]:
from sklearn.metrics import log_loss
log_loss(y_test, final_preds)

10.246703563181054