## Production Inference
- In this notebook the trained model will be used to do inference and find the target clients.

### Import Libraries
- Import all essential libraries

In [1]:
import xgboost as xgb
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import warnings
warnings.filterwarnings('ignore')

### Load Data
- Load the complete data for inference

In [2]:
# Parse the workbook once: passing a list of sheet names returns a dict of
# DataFrames keyed by sheet name, instead of re-reading the file per sheet.
DATASET_PATH = 'data/DataScientist_CaseStudy_Dataset.xlsx'
dataset_sheets = pd.read_excel(
    DATASET_PATH,
    sheet_name=['Soc_Dem', 'Products_ActBalance', 'Inflow_Outflow', 'Sales_Revenues'],
)
soc_dem_data = dataset_sheets['Soc_Dem']
products_actBalance_data = dataset_sheets['Products_ActBalance']
inflow_outflow_data = dataset_sheets['Inflow_Outflow']
sales_revenues_data = dataset_sheets['Sales_Revenues']

# Join the three feature sheets side by side on the Client id.
combined_data = pd.concat(
    [
        soc_dem_data.set_index('Client'),
        products_actBalance_data.set_index('Client'),
        inflow_outflow_data.set_index('Client'),
    ],
    axis=1,
    sort=False,
).reset_index()
# rename() returns a new frame, so the result has to be assigned to take effect.
combined_data = combined_data.rename(columns={'index': 'Client'})

#Fill the Nan values.
sex_mod = combined_data.Sex.mode()[0]
combined_data['Sex'] = combined_data['Sex'].fillna(sex_mod)
process_col_list = ["Count_SA","Count_MF","Count_OVD","Count_CC","Count_CL","ActBal_SA","ActBal_MF","ActBal_OVD","ActBal_CC","ActBal_CL"]
combined_data[process_col_list] = combined_data[process_col_list].fillna(0)

# Drop the rows that still contain NaN and rebuild a contiguous 0..n-1 index,
# so that row labels keep matching the row positions of the prediction arrays
# computed from this frame later on.
combined_data = combined_data.dropna().reset_index(drop=True)

print(combined_data.head(2))
print("Combined Shape : ", combined_data.shape)

   Client Sex  Age  Tenure  Count_CA  Count_SA  Count_MF  Count_OVD  Count_CC  \
0     909   M   21      27         1       0.0       0.0        1.0       0.0   
1    1217   M   38     165         1       0.0       0.0        0.0       0.0   

   Count_CL  ...   VolumeDeb  VolumeDeb_CA  VolumeDebCash_Card  \
0       1.0  ...  450.678571    448.892857          178.571429   
1       0.0  ...  714.285714    714.285714            0.000000   

   VolumeDebCashless_Card  VolumeDeb_PaymentOrder  TransactionsDeb  \
0                     0.0              166.571429              8.0   
1                     0.0              714.285714              1.0   

   TransactionsDeb_CA  TransactionsDebCash_Card  TransactionsDebCashless_Card  \
0                 7.0                       1.0                           0.0   
1                 1.0                       0.0                           0.0   

   TransactionsDeb_PaymentOrder  
0                           4.0  
1                           1.0  


### Load Model and Metadata from data processing and training for Inference
- Load the model and metadata for inference.

In [3]:
#Load StandardScaler metadata.
# Each name is bound directly to the unpickled object; no placeholder
# StandardScaler() instance is created first because it would be overwritten.
# NOTE: pickle.load can execute arbitrary code - only load trusted artifacts.
with open('inference_metadata/consumer_loan_scaler.pkl','rb') as f:
    consumer_loan_scaler = pickle.load(f)

with open('inference_metadata/credit_card_scaler.pkl','rb') as f:
    credit_card_scaler = pickle.load(f)

with open('inference_metadata/mutual_fund_scaler.pkl','rb') as f:
    mutual_fund_scaler = pickle.load(f)

In [4]:
#Load PCA metadata
# Each name is bound directly to the unpickled object; no placeholder PCA()
# instance is created first because it would be overwritten.
# NOTE: pickle.load can execute arbitrary code - only load trusted artifacts.
with open('inference_metadata/consumer_loan_pca.pkl','rb') as f:
    consumer_loan_pca = pickle.load(f)

with open('inference_metadata/credit_card_pca.pkl','rb') as f:
    credit_card_pca = pickle.load(f)

with open('inference_metadata/mutual_fund_pca.pkl','rb') as f:
    mutual_fund_pca = pickle.load(f)


In [5]:
#Selected columns for each models metadata
# The names are bound directly to the unpickled objects; the empty-list
# placeholders were overwritten immediately and have been removed.
# NOTE: pickle.load can execute arbitrary code - only load trusted artifacts.
with open('inference_metadata/CL_Selected_Cols.pkl','rb') as f:
    CL_Selected_Cols = pickle.load(f)

with open('inference_metadata/CC_Selected_Cols.pkl','rb') as f:
    CC_Selected_Cols = pickle.load(f)

with open('inference_metadata/MF_Selected_Cols.pkl','rb') as f:
    MF_Selected_Cols = pickle.load(f)

In [6]:
#Load XGB Models for inference.
# Each name is bound directly to the unpickled object; no placeholder
# XGBClassifier()/XGBRegressor() instance is created first because it would
# be overwritten by the loaded one.
# NOTE: pickle.load can execute arbitrary code - only load trusted artifacts.

#Consumer Loan Model
with open('model/xgb_classifier_CL.pkl','rb') as f:
    xgb_classifier_CL = pickle.load(f)

with open('model/xgb_regressor_CL.pkl','rb') as f:
    xgb_regressor_CL = pickle.load(f)

#Credit Card Model
with open('model/xgb_classifier_CC.pkl','rb') as f:
    xgb_classifier_CC = pickle.load(f)

with open('model/xgb_regressor_CC.pkl','rb') as f:
    xgb_regressor_CC = pickle.load(f)

#Mutual Fund Model
with open('model/xgb_classifier_MF.pkl','rb') as f:
    xgb_classifier_MF = pickle.load(f)

with open('model/xgb_regressor_MF.pkl','rb') as f:
    xgb_regressor_MF = pickle.load(f)

print("Completed model load")

Completed model load


#### Data Preprocessing Pipeline
- In this section the data preprocessing will be done before the model inference 

In [7]:
def data_preprocessing_pipeline(data,standard_scalar,pca,selected_cols):
    """Turn the combined client table into the matrix returned by ``pca``.

    Steps: drop the ``Client`` id and the four ``*_CA`` volume/transaction
    columns, encode ``Sex`` ('M' -> 0, 'F' -> 1), scale the feature columns
    with ``standard_scalar.transform``, keep ``selected_cols`` and pass them
    through ``pca.transform``.

    The input frame is not modified, so the same ``data`` can be passed to
    several calls (one per product) and still be used afterwards.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``Client``, ``Sex``, the four dropped ``*_CA`` columns
        and every column listed in ``columns_to_standardize``.
    standard_scalar : object with ``transform``
        Called with the feature columns in the order listed below.
    pca : object with ``transform``
        Called with the scaled ``selected_cols`` columns.
    selected_cols : list of str
        Columns kept after scaling.

    Returns
    -------
    Whatever ``pca.transform`` returns.

    Raises
    ------
    KeyError
        If a required column is missing from ``data``.
    """
    #Remove correlated columns and Client column.
    # drop() returns a new frame, so every assignment below touches only this
    # local object and never the caller's DataFrame.
    features = data.drop(columns=['Client','VolumeCred_CA','TransactionsCred_CA','VolumeDeb_CA','TransactionsDeb_CA'])

    # Convert Sex from string to numerical value.
    features['Sex'] = features['Sex'].replace({'M': 0, 'F': 1})

    columns_to_standardize = ['Sex', 'Age', 'Tenure', 'Count_CA', 'Count_SA', 'Count_MF',
       'Count_OVD', 'Count_CC', 'Count_CL', 'ActBal_CA', 'ActBal_SA',
       'ActBal_MF', 'ActBal_OVD', 'ActBal_CC', 'ActBal_CL', 'VolumeCred',
       'TransactionsCred', 'VolumeDeb', 'VolumeDebCash_Card',
       'VolumeDebCashless_Card', 'VolumeDeb_PaymentOrder', 'TransactionsDeb',
       'TransactionsDebCash_Card', 'TransactionsDebCashless_Card',
       'TransactionsDeb_PaymentOrder']

    #Standardize the columns data
    features[columns_to_standardize] = standard_scalar.transform(features[columns_to_standardize])

    #Keep the selected columns and project them with the PCA.
    return pca.transform(features[selected_cols])

In [8]:
#Test the data processing pipeline
#data_preprocessing_pipeline(combined_data,consumer_loan_scaler,consumer_loan_pca,CL_Selected_Cols)
#data_preprocessing_pipeline(combined_data,credit_card_scaler,credit_card_pca,CC_Selected_Cols)
#data_preprocessing_pipeline(combined_data,mutual_fund_scaler,mutual_fund_pca,MF_Selected_Cols)

#### Model Prediction Pipeline
- In this section the classification and regression pipeline will be coded

In [9]:
def classification_model_prediction(model,data):
    """Return ``model.predict(data)`` for a classification model.

    Only the predicted labels are returned; class probabilities
    (``predict_proba``) are not computed.
    """
    return model.predict(data)

In [10]:
#Test Classification
#predict= classification_model_prediction(xgb_classifier_CL,data_preprocessing_pipeline(combined_data,consumer_loan_scaler,consumer_loan_pca,CL_Selected_Cols))
#predict = classification_model_prediction(xgb_classifier_CC,data_preprocessing_pipeline(combined_data,credit_card_scaler,credit_card_pca,CC_Selected_Cols))
#predict= classification_model_prediction(xgb_classifier_MF,data_preprocessing_pipeline(combined_data,mutual_fund_scaler,mutual_fund_pca,MF_Selected_Cols))

In [11]:
def regression_model_prediction(model,data):
    """Return ``model.predict(data)`` for a regression model."""
    return model.predict(data)

In [12]:
#Test Regression
#predict = regression_model_prediction(xgb_regressor_CL,data_preprocessing_pipeline(combined_data,consumer_loan_scaler,consumer_loan_pca,CL_Selected_Cols))
#predict = regression_model_prediction(xgb_regressor_CC,data_preprocessing_pipeline(combined_data,credit_card_scaler,credit_card_pca,CC_Selected_Cols))
#predict = regression_model_prediction(xgb_regressor_MF,data_preprocessing_pipeline(combined_data,mutual_fund_scaler,mutual_fund_pca,MF_Selected_Cols))
#predict

#### Postprocessing Pipeline
- This section handles the post-processing of the classification and regression model outputs.

In [13]:
#Get possible candidates by checking classification model.
def get_possible_client_candidates(clientID,classification_prediction,revenue_prediction,product_key):
    """Build the candidate table for one product.

    The three inputs are matched by position: element ``i`` of ``clientID``,
    ``classification_prediction`` and ``revenue_prediction`` describe the same
    client. Any pandas index carried by ``clientID`` is ignored on purpose;
    aligning on index labels would pair predictions with the wrong client
    whenever that index is not 0..n-1 (for example after rows were dropped).

    Parameters
    ----------
    clientID : array-like
        Client identifiers, one per prediction.
    classification_prediction : array-like
        Predicted labels; rows whose label equals 0 are discarded.
    revenue_prediction : array-like
        Predicted revenue per client.
    product_key : str
        Value stored in the ``product_key`` column of every returned row.

    Returns
    -------
    pandas.DataFrame
        Columns ``revenue_prediction``, ``Client`` and ``product_key``,
        sorted by ``revenue_prediction`` in descending order. The index holds
        the original row positions of the kept clients.

    Raises
    ------
    ValueError
        If the three inputs do not have the same length.
    """
    labels = np.asarray(classification_prediction).ravel()
    revenues = np.asarray(revenue_prediction).ravel()
    clients = np.asarray(clientID).ravel()
    if not (len(labels) == len(revenues) == len(clients)):
        raise ValueError(
            "clientID, classification_prediction and revenue_prediction must have the same length"
        )

    candidates = pd.DataFrame({'revenue_prediction': revenues, 'Client': clients})
    # Copy the filtered rows so that adding a column below does not write to a slice.
    candidates = candidates[labels != 0].copy()
    candidates['product_key'] = product_key
    return candidates.sort_values(by=['revenue_prediction'],ascending=False)

In [14]:
#test client candidates
#classification_prediction = classification_model_prediction(xgb_classifier_CL,data_preprocessing_pipeline(combined_data,consumer_loan_scaler,consumer_loan_pca,CL_Selected_Cols))
#revenue_predict = regression_model_prediction(xgb_regressor_CL,data_preprocessing_pipeline(combined_data,consumer_loan_scaler,consumer_loan_pca,CL_Selected_Cols))
#get_possible_client_candidates(combined_data['Client'],classification_prediction,revenue_predict,'CL')

In [15]:
def combine_and_select_highest_revenue(df_list):
    """Stack the candidate frames and keep one row per client.

    The frames in ``df_list`` are concatenated and ordered by
    ``revenue_prediction`` (highest first); for each ``Client`` only the first
    row in that order, i.e. the one with the highest predicted revenue, is kept.
    """
    ranked = pd.concat(df_list).sort_values(by=['revenue_prediction'],ascending=False)
    return ranked.drop_duplicates(subset='Client', keep="first")

In [16]:
def get_top_n_rows_product(df,product_key,top):
    """Return the first ``top`` rows of ``df`` whose ``product_key`` matches.

    Row order is taken from ``df`` as given; no sorting happens here.
    """
    is_requested_product = df['product_key'] == product_key
    return df[is_requested_product].head(top)

#### Run the Full Inference Pipeline
- Run the full pipeline to generate the results.

In [17]:
# Number of clients to target for each product.
TOP_N_CLIENTS = 100

#Consumer Loan
cl_processed_data = data_preprocessing_pipeline(combined_data,consumer_loan_scaler,consumer_loan_pca,CL_Selected_Cols)
cl_client_predict = classification_model_prediction(xgb_classifier_CL,cl_processed_data)
cl_revenue_predict = regression_model_prediction(xgb_regressor_CL,cl_processed_data)
cl_client_candidate = get_possible_client_candidates(combined_data['Client'],cl_client_predict,cl_revenue_predict,'CL')

#Credit Card
cc_processed_data = data_preprocessing_pipeline(combined_data,credit_card_scaler,credit_card_pca,CC_Selected_Cols)
cc_client_predict = classification_model_prediction(xgb_classifier_CC,cc_processed_data)
cc_revenue_predict = regression_model_prediction(xgb_regressor_CC,cc_processed_data)
cc_client_candidate = get_possible_client_candidates(combined_data['Client'],cc_client_predict,cc_revenue_predict,'CC')


#Mutual Fund
mf_processed_data = data_preprocessing_pipeline(combined_data,mutual_fund_scaler,mutual_fund_pca,MF_Selected_Cols)
mf_client_predict = classification_model_prediction(xgb_classifier_MF,mf_processed_data)
mf_revenue_predict = regression_model_prediction(xgb_regressor_MF,mf_processed_data)
mf_client_candidate = get_possible_client_candidates(combined_data['Client'],mf_client_predict,mf_revenue_predict,'MF')


#Above section generated all the client candidates with predicted revenues.

#1.Combine the data, 2. sort data 3. Keep top revenue of a client to avoid duplicate marketing.
# The result gets its own name: rebinding combined_data here would replace the
# feature table with the candidate table, and re-running this cell would then fail.
client_candidates = combine_and_select_highest_revenue([cl_client_candidate,cc_client_candidate,mf_client_candidate])

#Get the top clients per product
top_cl = get_top_n_rows_product(client_candidates,'CL',TOP_N_CLIENTS)
top_cc = get_top_n_rows_product(client_candidates,'CC',TOP_N_CLIENTS)
top_mf = get_top_n_rows_product(client_candidates,'MF',TOP_N_CLIENTS)

#Get the total revenue.
total_cl_predicted_revenue = top_cl['revenue_prediction'].sum()
total_cc_predicted_revenue = top_cc['revenue_prediction'].sum()
total_mf_predicted_revenue = top_mf['revenue_prediction'].sum()
total_predicted_revenue = total_cl_predicted_revenue + total_cc_predicted_revenue + total_mf_predicted_revenue

predicted_revenue_details = ['Consumer Loan Predicted Revenue : ' + str(total_cl_predicted_revenue),
                             'Credit Card Predicted Revenue : ' + str(total_cc_predicted_revenue),
                             'Mutual Fund Predicted Revenue : ' + str(total_mf_predicted_revenue),
                             'Total Predicted Revenue : ' + str(total_predicted_revenue)]

In [20]:
#Save the prediction client results.
# One CSV per product, holding only the Client ids in ascending order.
client_exports = [
    (top_cl, 'prediction_result/ConsumerLoanClient.csv'),
    (top_cc, 'prediction_result/CreditCardClient.csv'),
    (top_mf, 'prediction_result/MutualFundClient.csv'),
]
for top_clients, csv_path in client_exports:
    top_clients.sort_values(by=['Client'],ascending=True)['Client'].to_csv(csv_path,index=False)

# One summary line per entry of predicted_revenue_details.
with open("prediction_result/PredictedRevenue.txt", 'w') as revenue_txt:
    revenue_txt.writelines(revenue_details + '\n' for revenue_details in predicted_revenue_details)
