# Industry Accelerators - Utilities Payment Risk Prediction Model

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import json
import datetime
import time
import joblib

from sklearn.model_selection import train_test_split

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from sklearn import metrics


import random
import string
# imbalanced-learn (used for SMOTE) must be installed at the pinned version below; the unpinned variant is left commented out
#!pip install imbalanced-learn
#from imblearn.over_sampling import SMOTE

!pip install imbalanced-learn==0.7.0
from imblearn.over_sampling import SMOTE


In [None]:
# get the ratio of the data passed in for this cycle vs the previous cycle and vs the average of the lookback window
# eg how does this cycle's bill compare to the average for the previous cycle?
def cur_month_vs_historical_summary(df, col, customer_id_col, lookback_window):
    """Add ratio features comparing ``col`` to each customer's recent history.

    Two columns are added to ``df`` in place:

    * ``RATIO_THIS_MONTH_<col>_VS_LAST_MONTH``: ``col`` divided by the same
      customer's value on the preceding row.
    * ``RATIO_THIS_MONTH_<col>_VS_AVG_LOOKBACK_WINDOW``: ``col`` divided by
      the mean of the same customer's ``lookback_window`` preceding rows.

    "Preceding" refers to the existing row order within each customer, so the
    caller is expected to have sorted ``df`` by date beforehand. Rows without
    enough history get NaN. Division follows pandas semantics, so a zero
    denominator yields ``inf`` (or NaN for 0/0).

    Args:
        df: DataFrame holding ``col`` and ``customer_id_col``; modified in place.
        col: Name of the numeric column to summarise.
        customer_id_col: Name of the column identifying the customer.
        lookback_window: Number of preceding rows averaged for the second ratio.

    Returns:
        The same ``df`` object, with the two ratio columns appended.
    """
    per_customer = df.groupby(customer_id_col)[col]

    # Value of the previous row for the same customer (NaN on a customer's first row).
    previous_value = per_customer.shift(1)

    # The rolling mean must be evaluated inside each customer's group. Rolling
    # over the flat shifted series would only be correct when every customer's
    # rows are contiguous, and would mix customers otherwise.
    lookback_average = per_customer.transform(
        lambda values: values.shift(1).rolling(lookback_window).mean()
    )

    df['RATIO_THIS_MONTH_' + col + '_VS_LAST_MONTH'] = df[col] / previous_value
    df['RATIO_THIS_MONTH_' + col + '_VS_AVG_LOOKBACK_WINDOW'] = df[col] / lookback_average

    return df

In [3]:
def plot_feature_importance(feature_list, feature_importances, title='Feature Importance Plot', top_n=10):
    """Draw a horizontal bar chart of the most important features.

    The ``top_n`` largest importances are plotted on a new matplotlib figure,
    ordered so that the largest value is the top bar.

    Args:
        feature_list: Sequence of feature names, positionally aligned with
            ``feature_importances``.
        feature_importances: Importance score per feature; any array-like
            (list, tuple, numpy array) is accepted.
        title: Title shown above the chart.
        top_n: How many of the highest-scoring features to show (default 10).

    Raises:
        ValueError: If ``top_n`` is smaller than 1.

    Returns:
        None. The chart is drawn on the current matplotlib figure.
    """
    if top_n < 1:
        raise ValueError('top_n must be a positive integer')

    # Coerce to an array so that fancy indexing below also works for plain lists.
    importances = np.asarray(feature_importances)

    # argsort is ascending, so the tail holds the indices of the largest scores.
    top_indices = np.argsort(importances)[-top_n:]
    bar_positions = range(len(top_indices))

    plt.figure(figsize=(12, 7))
    plt.title(title, fontsize=16, fontweight='bold')
    plt.barh(bar_positions, importances[top_indices], color='b', align='center')
    plt.yticks(bar_positions, [feature_list[i] for i in top_indices])
    plt.xlabel('Relative Importance')

In [5]:
# @hidden_cell
# The project token is an authorization token that is used to access project resources like data sources, connections, and used by platform APIs.
# NOTE(review): only a hard-coded project_id is passed to Project() here; no token argument is visible in this cell - confirm how authorization is supplied.
from project_lib import Project
# Project handle; used below to fetch data files via project.get_file().
project = Project(project_id='133e0829-3321-420c-a25e-e7d551a02ece')
# Keep a reference to the project context exposed by the handle.
pc = project.project_context

In [None]:
# Fetch the 'Bill Payment View.csv' data file through the project handle.
my_file = project.get_file('Bill Payment View.csv')
# Rewind to the beginning so that read_csv parses the file from its first byte.
my_file.seek(0)
df_prep = pd.read_csv(my_file)

# Preview the first rows (shown as the notebook cell's output).
df_prep.head()

In [None]:
# ---- User-configurable inputs for data preparation ----

# Name of the target column that the preparation step creates
target_col = 'MISSED_PAYMENT'
# Column used to create the target variable
overdue_balance_col = 'OVERDUE_BALANCE'
# Column holding the billing date of each record
billing_date_col = 'BILLING_DATE'
# Column identifying the customer each record belongs to
customer_id_col = 'CUSTOMER_ID'
# Size of the historical look-back window, in months
lookback_window = 3
# Columns for which "this cycle vs. history" ratio features are derived
l_cols_to_summarise = ['BASE_USAGE', 'ALTERNATE_USAGE', 'TOTAL_TO_PAY']

# Categorical columns (presumably one-hot / dummy encoded later - confirm against the modelling cells)
l_categorical_for_dummy_vars = [
    'SMART_METER_COMMENTS', 'CITY', 'MARITAL_STATUS', 'EDUCATION',
    'SEGMENT', 'EMPLOYMENT', 'CREDIT_HISTORY', 'BUILDING_TYPE',
]

# Numerical columns. Each column is listed exactly once: 'TOTAL_TO_PAY' used to
# appear twice, which would select the same column twice downstream.
l_numerical_features = [
    'TOTAL_TO_PAY', 'BASE_USAGE', 'ALTERNATE_USAGE', 'STANDING_CHARGE',
    'BASE_CHARGE', 'ALTERNATE_CHARGE', 'LEVY',
    'AGE', 'IS_REGISTERED_FOR_ALERTS', 'OWNS_HOME', 'COMPLAINTS',
    'HAS_THERMOSTAT', 'HAS_HOME_AUTOMATION', 'PV_ZONING', 'WIND_ZONING',
    'IS_CAR_OWNER', 'HAS_EV', 'HAS_PV', 'HAS_WIND', 'EBILL', 'IN_WARRANTY',
]

# Bundle of the preparation settings above, keyed by their variable names
user_inputs_dict = {
    'target_col': target_col,
    'overdue_balance_col': overdue_balance_col,
    'billing_date_col': billing_date_col,
    'customer_id_col': customer_id_col,
    'lookback_window': lookback_window,
    'l_cols_to_summarise': l_cols_to_summarise,
}

In [None]:
# Order the records by customer and, within each customer, by billing date,
# so that "the following row" means "the customer's following bill".
df_prep = df_prep.sort_values(by=[customer_id_col, billing_date_col])

# Build the target: pull the overdue balance of each customer's following
# record onto the current one. On the billing date we want to know whether
# the customer goes on to miss the payment. A customer's last record has no
# following record and therefore stays NaN.
df_prep[target_col] = df_prep.groupby(customer_id_col)[overdue_balance_col].shift(periods=-1)

# Any known, non-zero overdue balance is flagged as 1; zeros and NaN are left untouched.
df_prep.loc[df_prep[target_col].notna() & df_prep[target_col].ne(0), target_col] = 1