# Task for Today  

***

## Kiva Loan Type Prediction  

Given *data about Kiva crowdfunding loans*, let's try to predict the **type** of a given loan, i.e. its repayment interval (bullet, weekly, monthly, or irregular).  
  
We will use a TensorFlow neural network to make our predictions.

# Getting Started

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import tensorflow as tf

from sklearn.metrics import confusion_matrix, classification_report

In [None]:
# Load the Kiva loans CSV from the notebook's input directory into a DataFrame.
data = pd.read_csv('../input/data-science-for-good-kiva-crowdfunding/kiva_loans.csv')

In [None]:
# Bare expression: the notebook renders the loaded DataFrame as the cell output.
data

In [None]:
# Print a per-column summary (dtype and non-null count) to spot missing values.
data.info()

# Helper Functions

In [None]:
def encode_dates(df, column):
    """Replace a timestamp column with numeric year/month/day/hour/minute/second columns.

    The input frame is not modified; a copy is returned in which `column` has
    been parsed with `pd.to_datetime`, expanded into six new columns named
    `<column>_year`, `<column>_month`, `<column>_day`, `<column>_hour`,
    `<column>_minute` and `<column>_second` (appended in that order), and then
    dropped.

    Missing timestamps (NaT after parsing) produce NaN in every derived column.

    Args:
        df: DataFrame containing `column`.
        column: Name of the column holding values parseable by `pd.to_datetime`.

    Returns:
        A new DataFrame with the derived columns and without `column`.
    """
    df = df.copy()

    timestamps = pd.to_datetime(df[column])

    # The vectorized `.dt` accessor extracts each component in one pass,
    # instead of calling a Python lambda once per row and per component.
    for part in ("year", "month", "day", "hour", "minute", "second"):
        df[column + "_" + part] = getattr(timestamps.dt, part)

    df = df.drop(column, axis=1)

    return df

In [None]:
def get_male_count(x):
    """Return how many entries of a ', '-separated gender string equal 'male'.

    `x` is converted with `str()` first, so non-string values (e.g. NaN)
    simply yield 0. Matching is on whole entries, so 'female' is not counted.
    """
    entries = str(x).split(', ')
    return entries.count('male')

def get_female_count(x):
    """Return how many entries of a ', '-separated gender string equal 'female'.

    `x` is converted with `str()` first, so non-string values (e.g. NaN)
    simply yield 0.
    """
    entries = str(x).split(', ')
    return entries.count('female')

In [None]:
def onehot_encode(df, columns_with_prefixes):
    """One-hot encode several columns, returning a new DataFrame.

    Args:
        df: Source DataFrame (left unmodified).
        columns_with_prefixes: Iterable of `(column_name, prefix)` pairs. Each
            named column is replaced by indicator columns produced by
            `pd.get_dummies` with the given prefix; the indicators are appended
            at the right edge of the frame in the order the pairs are given.

    Returns:
        A copy of `df` with every listed column swapped for its indicators.
    """
    encoded = df.copy()

    for source_column, dummy_prefix in columns_with_prefixes:
        indicators = pd.get_dummies(encoded[source_column], prefix=dummy_prefix)
        # Append the indicators first, then remove the source column.
        encoded = pd.concat([encoded, indicators], axis=1).drop(source_column, axis=1)

    return encoded

In [None]:
def preprocess_inputs(df):
    """Turn the raw loans table into scaled train/test features and integer labels.

    Pipeline:
      1. Drop the id, free-text, redundant and high-cardinality columns.
      2. Expand the three timestamp columns into numeric date parts.
      3. Replace `borrower_genders` with `male_count` / `female_count`.
      4. One-hot encode the nominal columns.
      5. Encode `repayment_interval` as an integer class label.
      6. Split 70/30 (shuffled, `random_state=1`).
      7. Fill missing feature values with *training-set* column means and
         standardize with a scaler fitted on the training set only, so no
         statistic of the test rows influences the training data.

    Args:
        df: DataFrame with the columns referenced below. It is not modified.

    Returns:
        `(X_train, X_test, y_train, y_test)`. The X frames are scaled
        DataFrames that keep the original row index, so they stay aligned
        with the corresponding y Series.

    Raises:
        ValueError: If `repayment_interval` contains a missing value or a
            label outside the known mapping.
    """
    df = df.copy()

    # Drop the id column, the use/tags columns (avoiding NLP), the country/date
    # columns (redundant information) and the region column (high-cardinality).
    df = df.drop(['id', 'use', 'tags', 'country', 'date', 'region'], axis=1)

    # Extract date features
    for time_column in ('posted_time', 'disbursed_time', 'funded_time'):
        df = encode_dates(df, column=time_column)

    # Engineer gender count features
    df['male_count'] = df['borrower_genders'].apply(get_male_count)
    df['female_count'] = df['borrower_genders'].apply(get_female_count)
    df = df.drop('borrower_genders', axis=1)

    # One-hot encode nominal features
    nominal_features = [
        ('activity', "act"),
        ('sector', "sec"),
        ('country_code', "ctc"),
        ('currency', "cur"),
        ('partner_id', "pid")
    ]
    df = onehot_encode(df, columns_with_prefixes=nominal_features)

    # Split df into X and y
    raw_labels = df['repayment_interval']
    X = df.drop('repayment_interval', axis=1)

    # Encode labels. `.map` yields a numeric Series directly; `.replace` with a
    # str -> int dict depends on object-dtype downcasting that pandas deprecates.
    label_mapping = {
        'bullet': 0,
        'weekly': 1,
        'monthly': 2,
        'irregular': 3
    }
    y = raw_labels.map(label_mapping)
    unmapped = y.isna()
    if unmapped.any():
        unexpected = sorted(raw_labels[unmapped].astype(str).unique())
        raise ValueError(
            "Unexpected repayment_interval values: {}".format(unexpected)
        )
    y = y.astype('int64')

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.7, shuffle=True, random_state=1
    )

    # Fill in remaining missing values with column means computed on the
    # training rows only; using full-dataset means would leak test information.
    missing_value_columns = X.columns[X.isna().any()]
    train_means = X_train[missing_value_columns].mean()
    X_train = X_train.fillna(train_means)
    X_test = X_test.fillna(train_means)

    # Scale X with a standard scaler fitted on the training set
    scaler = StandardScaler()
    scaler.fit(X_train)

    # Keep the original index so each X frame lines up with its y Series.
    X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X.columns)

    return X_train, X_test, y_train, y_test

In [None]:
def evaluate_model(model, X_test, y_test):
    """Print test loss/accuracy, plot a confusion matrix and print a classification report.

    Args:
        model: Object exposing `evaluate(X, y, verbose=0)` and `predict(X)`.
            `evaluate` is assumed to return `[loss, accuracy]` (i.e. the model
            was compiled with accuracy as its first metric), and `predict` to
            return one score per class for each row.
        X_test: Test features passed to the model.
        y_test: True integer class labels, using 0=bullet, 1=weekly,
            2=monthly, 3=irregular.

    Returns:
        None. Output is printed and shown as a matplotlib figure.
    """
    # Index in this list == integer class label.
    class_names = ['bullet', 'weekly', 'monthly', 'irregular']
    class_ids = np.arange(len(class_names))

    results = model.evaluate(X_test, y_test, verbose=0)
    print("    Test Loss: {:.4f}".format(results[0]))
    print("Test Accuracy: {:.2f}%".format(results[1] * 100))

    # Predicted class = index of the highest score in each row.
    y_pred = np.argmax(model.predict(X_test), axis=1)

    # Passing `labels` pins the matrix/report to all four classes even if one
    # of them never appears in y_test or y_pred; otherwise the matrix would
    # shrink and the tick labels and target names would no longer line up.
    cm = confusion_matrix(y_test, y_pred, labels=class_ids)
    clr = classification_report(y_test, y_pred, labels=class_ids, target_names=class_names)

    plt.figure(figsize=(10, 10))
    sns.heatmap(cm, annot=True, fmt='g', cmap='Blues', cbar=False)
    # Heatmap cells are centered at integer + 0.5.
    plt.xticks(class_ids + 0.5, class_names)
    plt.yticks(class_ids + 0.5, class_names)
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title("Confusion Matrix")
    plt.show()

    print("Classification Report:\n----------------------\n", clr)

# Preprocessing

In [None]:
# Run the full preprocessing pipeline on the raw loans table to obtain the train/test splits.
X_train, X_test, y_train, y_test = preprocess_inputs(data)

In [None]:
# Bare expression: the notebook renders the training feature matrix as the cell output.
X_train

In [None]:
# Bare expression: the notebook renders the training labels as the cell output.
y_train

# Training

In [None]:
# Network: one input per feature column, two 128-unit ReLU hidden layers,
# and a 4-unit softmax output (one unit per repayment-interval class).
hidden_layer_sizes = (128, 128)

inputs = tf.keras.Input(shape=(X_train.shape[1],))
x = inputs
for units in hidden_layer_sizes:
    x = tf.keras.layers.Dense(units, activation='relu')(x)
outputs = tf.keras.layers.Dense(4, activation='softmax')(x)

model = tf.keras.Model(inputs=inputs, outputs=outputs)

# Labels are integer class ids, hence the sparse categorical loss.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

# Stop once validation loss has not improved for 3 epochs and roll back to
# the best weights seen so far.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True
)

# 20% of the training data is held out for validation.
model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    batch_size=32,
    epochs=100,
    callbacks=[early_stopping]
)

# Results

In [None]:
# Report loss/accuracy, the confusion matrix and the classification report on the held-out test set.
evaluate_model(model, X_test, y_test)

# Data Every Day  

This notebook is featured on Data Every Day, a YouTube series where I train models on a new dataset each day.  

***

Check it out!  
https://youtu.be/Mi-MF1p40h8