# Kiva Project - Example Notebook

Predict whether a Kiva loan application will default.

- Stephen W. Thomas
- Used for MMAI 891.

# Preliminaries: Inspect and Set up environment

In [None]:
# Show the path of the `python` executable the notebook shell resolves to.
!which python

/usr/local/bin/python


In [None]:
# Print the version of that Python interpreter.
!python --version

Python 3.7.10


In [None]:
# Print the PYTHONPATH environment variable as seen by the shell.
!echo $PYTHONPATH

/env/python


In [None]:
!pip install unidecode textstat
!pip install -q -U keras-tuner
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
import kerastuner as kt

import pandas as pd


from sklearn.model_selection import train_test_split


Collecting unidecode
[?25l  Downloading https://files.pythonhosted.org/packages/9e/25/723487ca2a52ebcee88a34d7d1f5a4b80b793f179ee0f62d5371938dfa01/Unidecode-1.2.0-py2.py3-none-any.whl (241kB)
[K     |████████████████████████████████| 245kB 10.6MB/s 
[?25hCollecting textstat
[?25l  Downloading https://files.pythonhosted.org/packages/ca/b1/ab40a00b727a0d209402d1be6aa3f1bc75bd03678b59ace8507b08bf12f5/textstat-0.7.0-py3-none-any.whl (99kB)
[K     |████████████████████████████████| 102kB 7.6MB/s 
[?25hCollecting pyphen
[?25l  Downloading https://files.pythonhosted.org/packages/7c/5a/5bc036e01389bc6a6667a932bac3e388de6e7fa5777a6ff50e652f60ec79/Pyphen-0.10.0-py3-none-any.whl (1.9MB)
[K     |████████████████████████████████| 1.9MB 21.1MB/s 
[?25hInstalling collected packages: unidecode, pyphen, textstat
Successfully installed pyphen-0.10.0 textstat-0.7.0 unidecode-1.2.0
[K     |████████████████████████████████| 71kB 6.0MB/s 
[?25h  Building wheel for keras-tuner (setup.py) ... [?25

# Read Data

We'll read the data from the links that Uncle Steve provided.

In [None]:
# Download the two CSV files from Google Drive: the labeled training data
# first, then the file used as the test set.
TRAIN_URL = "https://drive.google.com/uc?export=download&id=1dzzVbgHphbCf7kvq9IKiIhwzmxPbuH4s"
TEST_URL = "https://drive.google.com/uc?export=download&id=1EVWfyqQOd_W2uTKrr4JTD2iFrEZHoOHT"
df_train, df_test = [pd.read_csv(url) for url in (TRAIN_URL, TEST_URL)]

# Model input: the `en_clean` text column of each frame.
X_train_raw, X_test_raw = df_train['en_clean'], df_test['en_clean']

# Target: the `defaulted` column of the training frame.
Y_train = df_train['defaulted']


# Pre-Process Data


In [None]:
# unidecode itself is installed by the `!pip install` line in the setup cell.
import nltk
# Fetch the NLTK stopword list and the WordNet corpus.
# NOTE(review): neither corpus is referenced in the cells visible here —
# confirm they are still needed before keeping these downloads.
nltk.download('stopwords')
nltk.download('wordnet')

import re
import unidecode

def preprocessing(ds):
    """Normalize one text string before tokenization.

    Steps, in order:
      1. drop every character that is neither a word character nor whitespace;
      2. transliterate the remainder to ASCII with ``unidecode``;
      3. lower-case the result;
      4. drop non-word characters once more.

    Lower-casing and the second strip run *after* transliteration because
    ``unidecode`` may emit capital letters or ASCII punctuation for characters
    that survived the first strip; doing them only beforehand would let those
    leak into the output.

    Args:
        ds: the raw text (a ``str``).

    Returns:
        The cleaned text as a ``str``.
    """
    # Remove punctuation and symbols (anything that is not \w or whitespace).
    ds = re.sub(r'[^\w\s]', '', ds)
    # Transliterate non-ASCII characters to their closest ASCII spelling.
    ds = unidecode.unidecode(ds)
    # Lower-case after transliteration so any capitals it produced are folded.
    ds = ds.lower()
    # Strip again: transliteration may have introduced ASCII punctuation.
    ds = re.sub(r'[^\w\s]', '', ds)

    return ds


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


# Apply Pre-processing and Split the data into Training and Validation

In [None]:
# Clean every description with `preprocessing` (passed directly; no lambda wrapper needed).
X_train = X_train_raw.apply(preprocessing)
X_test = X_test_raw.apply(preprocessing)

# Split into training and validation parts. The labels are read from
# df_train['defaulted'] rather than from the `Y_train` name, because this cell
# overwrites `Y_train`: re-running it would otherwise pair the full-length
# texts with an already-split label vector and fail on mismatched lengths.
X_train, X_val, Y_train, Y_val = train_test_split(
    X_train, df_train['defaulted'], random_state=42)



In [None]:
# Display the cleaned training texts that remain after the train/validation split.
X_train

6131    mary was born 41 years ago in empuyiankat kaji...
5814    gladys wangari maina is 48 years old she is ma...
5703    catherine is 33 years old and married with 3 c...
2805    michael gaturi is a 30yearold young man who is...
733     jane is married with grown up children she is ...
                              ...                        
3772    wangui is the chairlady of this group she is a...
5191    alice wairimu kimani is 37 years old married a...
5226    nicholas is 29 years old married and has one c...
5390    hudson is a married man with nine children all...
860     nancy nzisa is 38 and is a single mother of on...
Name: en_clean, Length: 4603, dtype: object

In [None]:
# Summarize the training frame: columns, non-null counts and dtypes.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6138 entries, 0 to 6137
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   loan_id    6138 non-null   int64 
 1   en_clean   6138 non-null   object
 2   defaulted  6138 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 144.0+ KB


# Tokenize & Pad Input Data

In [None]:
from keras.preprocessing.text import Tokenizer

# Fit the vocabulary on the training texts only; the validation and test
# texts are encoded with that same vocabulary.
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X_train)

X_train = tokenizer.texts_to_sequences(X_train)
X_val = tokenizer.texts_to_sequences(X_val)

X_test = tokenizer.texts_to_sequences(X_test)

# Size of the index space the embedding layer must cover. With `num_words`
# set, texts_to_sequences only emits indices below `num_words`, so sizing the
# embedding by the full word_index would allocate rows that are never looked
# up. The +1 accounts for index 0, which is used for padding.
vocab_size = min(tokenizer.num_words, len(tokenizer.word_index) + 1)

In [None]:
from keras.preprocessing.sequence import pad_sequences

maxlen = 2000


def _pad_post(seqs):
    """Bring every sequence to exactly `maxlen` entries, padding at the end."""
    return pad_sequences(seqs, padding='post', maxlen=maxlen)


# Apply the same padding to the training, validation and test sequences.
X_train, X_val, X_test = [_pad_post(seqs) for seqs in (X_train, X_val, X_test)]

# Define F1 Score as a custom performance metric

In [None]:
from keras import backend as K

def recall_m(y_true, y_pred):
    """Recall of the rounded predictions against the rounded labels.

    Computed as (true positives) / (actual positives + epsilon); the epsilon
    keeps the division defined when there are no positive labels.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    actual = K.round(K.clip(y_true, 0, 1))
    return K.sum(hits) / (K.sum(actual) + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision of the rounded predictions against the labels.

    Computed as (true positives) / (predicted positives + epsilon); the
    epsilon keeps the division defined when nothing is predicted positive.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    flagged = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hits) / (K.sum(flagged) + K.epsilon())

def f1_m(y_true, y_pred):
    """F1 score: harmonic mean of `precision_m` and `recall_m`.

    An epsilon in the denominator avoids dividing by zero when both
    precision and recall are zero.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    harmonic_half = (p * r) / (p + r + K.epsilon())
    return 2 * harmonic_half

# Hyperparameter Tuning using Keras Tuner

In [None]:
from keras.models import Sequential
from keras import layers

def build_model(hp):
    """Build and compile one candidate CNN text classifier for Keras Tuner.

    Architecture: Embedding -> Conv1D -> GlobalMaxPool1D -> Dense -> Dropout
    -> single sigmoid output. Reads the module-level `vocab_size` and `maxlen`.

    Tuned hyperparameters (registered on `hp`):
        filters:       Conv1D filter count, 64..320 in steps of 32.
        kernel_size:   Conv1D kernel width, 2..8.
        units:         width of the hidden Dense layer, 10..300 in steps of 10.
        learning_rate: Adam learning rate, one of 1e-3 or 2e-3.

    Args:
        hp: the hyperparameter container supplied by the tuner.

    Returns:
        The compiled model, using binary cross-entropy loss and reporting accuracy.
    """
    embedding_dim = 200

    model = Sequential()
    model.add(layers.Embedding(input_dim=vocab_size,
                               output_dim=embedding_dim, trainable=True,
                               input_length=maxlen))

    hp_filters = hp.Int('filters', min_value=64, max_value=320, step=32)
    hp_kernel = hp.Int('kernel_size', min_value=2, max_value=8, step=1)
    model.add(layers.Conv1D(filters=hp_filters, kernel_size=hp_kernel, activation='relu'))
    model.add(layers.GlobalMaxPool1D())

    hp_units = hp.Int('units', min_value=10, max_value=300, step=10)
    model.add(layers.Dense(units=hp_units, activation='relu'))
    model.add(layers.Dropout(0.4))
    model.add(layers.Dense(1, activation='sigmoid'))

    hp_learning_rate = hp.Choice('learning_rate', values=[1e-3, 2e-3])

    # `metrics` is documented as a list of metrics, so pass one rather than a bare string.
    model.compile(optimizer=keras.optimizers.Adam(learning_rate=hp_learning_rate),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    # Prints the layer table every time the tuner builds a candidate.
    model.summary()

    return model

In [None]:
# Hyperband search over `build_model`, ranking trials by validation accuracy.
# A fixed seed makes the tuner's hyperparameter sampling repeatable between
# runs (it does not seed TensorFlow's weight initialization).
# NOTE(review): results are stored under my_dir/intro_to_kt; if that directory
# already holds an earlier search the tuner presumably resumes from it —
# confirm, and clear it when `build_model` changes.
tuner = kt.Hyperband(build_model,
                     objective='val_accuracy',
                     max_epochs=10,
                     factor=3,
                     seed=42,
                     directory='my_dir',
                     project_name='intro_to_kt'
                    )

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 2000, 100)         1649500   
_________________________________________________________________
conv1d (Conv1D)              (None, 1999, 64)          12864     
_________________________________________________________________
global_max_pooling1d (Global (None, 64)                0         
_________________________________________________________________
dense (Dense)                (None, 10)                650       
_________________________________________________________________
dropout (Dropout)            (None, 10)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 11        
Total params: 1,663,025
Trainable params: 1,663,025
Non-trainable params: 0
______________________________________________

# Define Callbacks, EarlyStopping and LR Reducing Functions

In [None]:
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler, ReduceLROnPlateau

def warmup(epoch, lr):
    """Learning-rate schedule: return the incoming rate raised by 1e-6.

    `epoch` is accepted to match the (epoch, lr) signature the scheduler
    callback calls with, but it is not used.
    """
    lr_step = 1e-6
    return lr + lr_step

# Callbacks for the hyperparameter search. Both monitors watch `val_loss`,
# which improves as it DECREASES, so they must use mode='min'; with mode='max'
# a falling loss is treated as "no improvement" and training is cut short
# while the model is still getting better.
callback = [
    EarlyStopping(monitor='val_loss', patience=3, mode='min'),
    LearningRateScheduler(warmup, verbose=0),
    # NOTE(review): factor=1e-6 combined with min_lr=1e-6 drops the learning
    # rate straight to the floor on the first plateau — confirm this is intended.
    ReduceLROnPlateau(monitor='val_loss',
                      factor=1e-6,
                      patience=2,
                      mode='min',
                      min_lr=1e-6),
]

# Callbacks for training the final model. Everything is keyed on `val_f1_m`
# (higher is better, hence mode='max').
_stop_on_f1 = EarlyStopping(monitor='val_f1_m', mode='max', patience=3)
_lr_warmup = LearningRateScheduler(warmup, verbose=0)
_reduce_on_f1 = ReduceLROnPlateau(monitor='val_f1_m',
                                  mode='max',
                                  factor=0.01,
                                  patience=3,
                                  min_lr=0.0005)

callback_new = [_stop_on_f1, _lr_warmup, _reduce_on_f1]


# Search for the Best Hyperparameters

In [None]:
# `callback` is already a list of callbacks, so pass it as-is instead of
# nesting it inside another list.
# NOTE(review): the tuner was created with max_epochs=10 — confirm whether
# epochs=50 has any effect on a Hyperband search.
tuner.search(X_train, Y_train, epochs=50, validation_split=0.2, callbacks=callback)

Trial 30 Complete [00h 00m 23s]
val_accuracy: 0.8653637170791626

Best val_accuracy So Far: 0.8881650567054749
Total elapsed time: 00h 08m 41s
INFO:tensorflow:Oracle triggered exit


In [None]:
# Take the single best hyperparameter set found by the search and report it.
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

# The original sentence ended with an unlabeled "and <value>"; name the
# learning rate explicitly and call the Dense layer what it is.
print(f"""The hyperparameter search is complete. The optimal number of filters in the first Conv
layer is {best_hps.get('filters')}, kernel size is {best_hps.get('kernel_size')}, the optimal number of units in the hidden Dense layer is {best_hps.get('units')} and the optimal learning rate is {best_hps.get('learning_rate')}.
""")

The hyperparameter search is complete. The optimal number of filters in the first Conv
layer is 192, kernel size is 3, the optimal units in first layer is 50 and 0.001.



# Implement configuration of the best-tuned model

In [None]:
# The layer sizes below were adjusted by hand, informed by earlier tuning
# runs, rather than copied verbatim from the tuner's best result.

embedding_dim = 200

model_new = Sequential([
    # Trainable word embeddings over the padded token sequences.
    layers.Embedding(input_dim=vocab_size,
                     output_dim=embedding_dim,
                     trainable=True,
                     input_length=maxlen),
    # 160 convolution filters of width 5, then keep each filter's maximum.
    layers.Conv1D(160, 5, activation='relu'),
    layers.GlobalMaxPool1D(),
    layers.Dense(50, activation='relu'),
    # Single sigmoid unit for the binary target.
    layers.Dense(1, activation='sigmoid'),
])

# Track the custom F1 metric during training instead of accuracy.
model_new.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=[f1_m],
)
model_new.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 2000, 200)         3299000   
_________________________________________________________________
conv1d (Conv1D)              (None, 1996, 160)         160160    
_________________________________________________________________
global_max_pooling1d (Global (None, 160)               0         
_________________________________________________________________
dense (Dense)                (None, 50)                8050      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 51        
Total params: 3,467,261
Trainable params: 3,467,261
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train the final model for at most 10 epochs in batches of 32, validating on
# the held-out (X_val, Y_val) split after each epoch. The callbacks in
# `callback_new` monitor val_f1_m and may end training before epoch 10.
history = model_new.fit(X_train, Y_train,
                    epochs=10,
                    verbose=True,
                    callbacks =callback_new,
                    validation_data=(X_val, Y_val),
                    batch_size=32)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10


In [None]:
# Returns [loss, f1_m] on the validation split: model_new was compiled with
# metrics=[f1_m], so the second value is the custom F1 metric, not accuracy.
model_new.evaluate(X_val, Y_val)



[0.47793343663215637, 0.8456417918205261]

# Predict results on test set and write them to CSV file for submission to Kaggle

In [None]:
# `Sequential.predict_classes` is deprecated and absent from newer TensorFlow
# releases. For a single sigmoid output the equivalent is to threshold the
# predicted probability at 0.5 and cast to int.
df_result = pd.DataFrame((model_new.predict(X_test) > 0.5).astype("int32"))
# NOTE(review): the file is written with the default row index and a single
# column named 0 — confirm this matches the submission format expected.
df_result.to_csv("result_cnn.csv")




# Classification Report on the Validation Set

In [None]:
# Predicted class labels for the validation split. `predict_classes` is
# deprecated and absent from newer TensorFlow releases, so threshold the
# sigmoid probabilities at 0.5 instead (the same rule it applied).
df_pred = pd.DataFrame((model_new.predict(X_val) > 0.5).astype("int32"))



In [None]:
from sklearn.metrics import classification_report

# Per-class precision, recall, F1 and support of the validation predictions.
print(classification_report(Y_val, df_pred))

              precision    recall  f1-score   support

           0       0.82      0.92      0.87       776
           1       0.90      0.80      0.85       759

    accuracy                           0.86      1535
   macro avg       0.86      0.86      0.86      1535
weighted avg       0.86      0.86      0.86      1535

