### Importing relevant packages

In [3]:
import numpy as np
import pandas as pd
import pomegranate
from pomegranate import *
import matplotlib.pyplot as plt
import pickle
import joblib
import warnings
warnings.filterwarnings('ignore')

### Loading our data

In [8]:
# Load the cleaned loan data and display it.
df = pd.read_csv('clean_data.csv')
df

Unnamed: 0,grade,sub_grade,home_ownership,verification_status,purpose,addr_state,initial_list_status,good_bad,term,emp_length_int,mths_since_issue_d,int_rate,annual_inc,mths_since_last_delinq
0,B,B2,RENT,Verified,credit_card,AZ,f,good,3 years,7,85,fair,30k,unknown
1,C,C4,RENT,Source Verified,car,GA,f,bad,5 years,1,85,modrate,30k,unknown
2,C,C5,RENT,Not Verified,small_business,IL,f,good,3 years,7,85,high,20k,unknown
3,C,C1,RENT,Source Verified,other,CA,f,good,3 years,7,85,modrate,50k,less than 56
4,B,B5,RENT,Source Verified,other,OR,f,good,5 years,1,85,modrate,80k,less than 56
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
466047,C,C2,MORTGAGE,Source Verified,debt_consolidation,TX,w,good,5 years,3,49,modrate,120k,unknown
466048,D,D5,MORTGAGE,Verified,debt_consolidation,TN,f,bad,5 years,7,49,high,80k,unknown
466049,D,D1,MORTGAGE,Verified,debt_consolidation,OH,f,good,5 years,5,49,high,50k,+56
466050,A,A4,OWN,Verified,credit_card,CA,w,good,3 years,1,49,low,90k,less than 30


In [9]:
# Remove the two columns that are not used as nodes in the network built below.
# Rebinding `df` (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it no longer raises KeyError once the columns are gone.
df = df.drop(columns=['sub_grade','addr_state'], errors='ignore')
df

Unnamed: 0,grade,home_ownership,verification_status,purpose,initial_list_status,good_bad,term,emp_length_int,mths_since_issue_d,int_rate,annual_inc,mths_since_last_delinq
0,B,RENT,Verified,credit_card,f,good,3 years,7,85,fair,30k,unknown
1,C,RENT,Source Verified,car,f,bad,5 years,1,85,modrate,30k,unknown
2,C,RENT,Not Verified,small_business,f,good,3 years,7,85,high,20k,unknown
3,C,RENT,Source Verified,other,f,good,3 years,7,85,modrate,50k,less than 56
4,B,RENT,Source Verified,other,f,good,5 years,1,85,modrate,80k,less than 56
...,...,...,...,...,...,...,...,...,...,...,...,...
466047,C,MORTGAGE,Source Verified,debt_consolidation,w,good,5 years,3,49,modrate,120k,unknown
466048,D,MORTGAGE,Verified,debt_consolidation,f,bad,5 years,7,49,high,80k,unknown
466049,D,MORTGAGE,Verified,debt_consolidation,f,good,5 years,5,49,high,50k,+56
466050,A,OWN,Verified,credit_card,w,good,3 years,1,49,low,90k,less than 30


In [10]:
# Cast the two integer-coded columns to strings so their values act as category labels.
# The builtin `str` replaces `np.str`, an alias deprecated in NumPy 1.20 and removed in
# NumPy 1.24 (where `np.str` raises AttributeError).
df['emp_length_int'] = df['emp_length_int'].astype(str)
df['mths_since_issue_d'] = df['mths_since_issue_d'].astype(str)

In [11]:
# Check the dtype and non-null count of every column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 466052 entries, 0 to 466051
Data columns (total 12 columns):
 #   Column                  Non-Null Count   Dtype 
---  ------                  --------------   ----- 
 0   grade                   466052 non-null  object
 1   home_ownership          466052 non-null  object
 2   verification_status     466052 non-null  object
 3   purpose                 466052 non-null  object
 4   initial_list_status     466052 non-null  object
 5   good_bad                466052 non-null  object
 6   term                    466052 non-null  object
 7   emp_length_int          466052 non-null  object
 8   mths_since_issue_d      466052 non-null  object
 9   int_rate                466052 non-null  object
 10  annual_inc              466052 non-null  object
 11  mths_since_last_delinq  466052 non-null  object
dtypes: object(12)
memory usage: 42.7+ MB


In [13]:
# List the remaining column names.
df.columns

Index(['grade', 'home_ownership', 'verification_status', 'purpose',
       'initial_list_status', 'good_bad', 'term', 'emp_length_int',
       'mths_since_issue_d', 'int_rate', 'annual_inc',
       'mths_since_last_delinq'],
      dtype='object')

In [14]:
# Write the reduced frame to CSV; index=False keeps the row index out of the file.
df.to_csv('bayes_demo.csv',index=False)

In [17]:
# Count how many rows fall into each emp_length_int category.
df['emp_length_int'].value_counts()

7    190261
1    164751
3     58770
5     52270
Name: emp_length_int, dtype: int64

### Automating the pipeline

#### Getting conditional probability distribution

In [7]:
def get_conditional_prob_distribution(df,child,parents):
    """Build the rows of a conditional probability table P(child | parents).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the child and parent columns.
    child : list[str] or str
        Name of the child column, either as a one-element list (the calling
        convention used throughout this notebook) or as a plain string.
    parents : list[str]
        Names of the parent columns. Their order fixes the order of the
        leading entries in every returned row.

    Returns
    -------
    list[list]
        One row per (child value, observed parent combination), laid out as
        ``[parent_1, ..., parent_k, child_value, probability]``. Rows are
        emitted child value by child value, and within a child value in the
        column order of the cross-tabulation. Parent combinations that never
        occur in ``df`` produce no rows.
    """
    # Accept a bare column name too; indexing a string with [0] would
    # silently pick its first character.
    child_name = child if isinstance(child, str) else child[0]
    parent_columns = [df[name] for name in parents]

    # Cross-tabulate once. normalize='columns' divides every column (one
    # parent combination) by its total, so each column sums to 1.
    cpd = pd.crosstab(df[child_name], parent_columns, normalize='columns')
    child_values = cpd.index.values
    parent_labels = cpd.columns.values
    probabilities = cpd.to_numpy()

    rows = []
    for i, child_value in enumerate(child_values):
        for j, parent_values in enumerate(parent_labels):
            # Several parents give tuple labels (MultiIndex); a single parent
            # gives a scalar label of any type, not only str.
            if isinstance(parent_values, tuple):
                row = list(parent_values)
            else:
                row = [parent_values]
            row.append(child_value)
            row.append(probabilities[i, j])
            rows.append(row)

    return rows


In [8]:
# Example: rows of P(term | annual_inc, purpose), each laid out as [annual_inc, purpose, term, probability].
get_conditional_prob_distribution(df,['term'],['annual_inc','purpose'])

[['+140k', 'car', ' 3 years', 0.7620967741935484],
 ['+140k', 'credit_card', ' 3 years', 0.7482856664994146],
 ['+140k', 'debt_consolidation', ' 3 years', 0.6661630382528673],
 ['+140k', 'educational', ' 3 years', 1.0],
 ['+140k', 'home_improvement', ' 3 years', 0.6591136287904032],
 ['+140k', 'house', ' 3 years', 0.6548223350253807],
 ['+140k', 'major_purchase', ' 3 years', 0.719640179910045],
 ['+140k', 'medical', ' 3 years', 0.7674418604651163],
 ['+140k', 'moving', ' 3 years', 0.7633136094674556],
 ['+140k', 'other', ' 3 years', 0.7122247532270312],
 ['+140k', 'renewable_energy', ' 3 years', 0.76],
 ['+140k', 'small_business', ' 3 years', 0.720677146311971],
 ['+140k', 'vacation', ' 3 years', 0.875],
 ['+140k', 'wedding', ' 3 years', 0.7387387387387387],
 ['100k', 'car', ' 3 years', 0.6616541353383458],
 ['100k', 'credit_card', ' 3 years', 0.7166936790923825],
 ['100k', 'debt_consolidation', ' 3 years', 0.631484794275492],
 ['100k', 'educational', ' 3 years', 1.0],
 ['100k', 'home_

#### Getting marginal probability distribution

In [9]:
def get_marginal_prob_distribution(df,variable):
    """Return the empirical distribution of one column.

    The result maps every distinct value of ``df[variable]`` to its relative
    frequency in the column.
    """
    frequencies = df[variable].value_counts(normalize=True)
    return {value: share for value, share in zip(frequencies.index, frequencies.values)}

In [10]:
# Example: relative frequency of each annual_inc category.
get_marginal_prob_distribution(df,'annual_inc')

{'50k': 0.1472067494614335,
 '60k': 0.14266004651841424,
 '70k': 0.11617373168659291,
 '40k': 0.11570168135744509,
 '80k': 0.1005338460085999,
 '90k': 0.07513968398376147,
 '120k': 0.0708161321054303,
 '100k': 0.06071210937835263,
 '30k': 0.06008986121720323,
 '+140k': 0.059055641859706645,
 '140k': 0.03671478719112889,
 '20k': 0.015195729231931202}

### Defining the distributions and conditional probability tables (CPDs)

In [11]:
# Variables without parents in the network: each gets a marginal distribution
# estimated from the relative frequencies in df.
annual_inc = DiscreteDistribution(get_marginal_prob_distribution(df,'annual_inc'))

emp_length_int = DiscreteDistribution(get_marginal_prob_distribution(df,'emp_length_int'))

purpose = DiscreteDistribution(get_marginal_prob_distribution(df,'purpose'))


In [12]:
# P(home_ownership | annual_inc, emp_length_int). The parent distributions are listed in
# the same order as the parent columns passed to get_conditional_prob_distribution.
home_ownership= ConditionalProbabilityTable(
    get_conditional_prob_distribution(df,['home_ownership'],['annual_inc','emp_length_int']), 
    [annual_inc, emp_length_int]

) 

In [13]:
# P(term | annual_inc, purpose); parent distributions follow the parent column order.
term= ConditionalProbabilityTable(
    get_conditional_prob_distribution(df,['term'],['annual_inc','purpose']), 
    [annual_inc, purpose]

) 

In [14]:
# P(int_rate | term, purpose). `term` is itself a conditional table defined earlier,
# so this table chains onto it.
int_rate= ConditionalProbabilityTable(
    get_conditional_prob_distribution(df,['int_rate'],['term','purpose']), 
    [term,purpose]

) 

In [15]:
# P(grade | home_ownership, emp_length_int, int_rate); the only table with three parents.
grade= ConditionalProbabilityTable(
    get_conditional_prob_distribution(df,['grade'],['home_ownership','emp_length_int','int_rate']), 
    [home_ownership,emp_length_int,int_rate]

) 

In [16]:
# P(mths_since_last_delinq | purpose, emp_length_int); both parents are root distributions.
mths_since_last_delinq= ConditionalProbabilityTable(
    get_conditional_prob_distribution(df,['mths_since_last_delinq'],['purpose','emp_length_int']), 
    [purpose,emp_length_int]

) 

In [17]:
# P(good_bad | grade, mths_since_last_delinq). This is the target variable: the data
# column is called 'good_bad', while the network variable is named creditworthiness.
creditworthiness= ConditionalProbabilityTable(
    get_conditional_prob_distribution(df,['good_bad'],['grade','mths_since_last_delinq']), 
    [grade,mths_since_last_delinq]

) 

### Setting Nodes

In [18]:
# Wrap every distribution in a named Node. The names are the keys used in evidence
# dictionaries passed to model.predict_proba.
# NOTE: s6 is grade and s7 is int_rate; the edge list that builds the model relies on
# this numbering.
s1 = Node(annual_inc, name="annual_inc")
s2 = Node(emp_length_int, name="emp_length_int")
s3 = Node(purpose, name="purpose")
s4 = Node(home_ownership, name="home_ownership")
s5 = Node(term, name="term")
s6 = Node(grade, name="grade")
s7 = Node(int_rate, name="int_rate")
s8 = Node(mths_since_last_delinq, name="mths_since_last_delinq")
s9 = Node(creditworthiness, name="creditworthiness")

### Constructing connections

In [19]:
# Assemble the network. Every edge goes parent -> child and mirrors the parent list
# given to the child's ConditionalProbabilityTable.
model = BayesianNetwork("Credit Worthiness")

# predict() addresses states by position, so it presumably relies on model.states
# keeping this insertion order (creditworthiness last, at index 8).
model.add_nodes(s1, s2, s3, s4, s5, s6, s7, s8,s9)
model.add_edge(s1, s4)  # annual_inc -> home_ownership
model.add_edge(s1, s5)  # annual_inc -> term
model.add_edge(s2, s4)  # emp_length_int -> home_ownership
model.add_edge(s2, s6)  # emp_length_int -> grade
model.add_edge(s2, s8)  # emp_length_int -> mths_since_last_delinq
model.add_edge(s3, s5)  # purpose -> term
model.add_edge(s3, s7)  # purpose -> int_rate
model.add_edge(s3, s8)  # purpose -> mths_since_last_delinq
model.add_edge(s4, s6)  # home_ownership -> grade
model.add_edge(s5, s7)  # term -> int_rate
model.add_edge(s6, s9)  # grade -> creditworthiness
model.add_edge(s7, s6)  # int_rate -> grade
model.add_edge(s8, s9)  # mths_since_last_delinq -> creditworthiness

# Finalise the structure; done here before any predict_proba call.
model.bake()

In [33]:
# Query two evidence sets at once; the result holds one entry per evidence dictionary.
a=model.predict_proba([{'int_rate':'high',"purpose":'house'},{"purpose":'car'}])
len(a)

2

In [21]:
def predict(x, target='creditworthiness', network=None):
    """Report the probability of ``target`` given partial evidence.

    Parameters
    ----------
    x : sequence
        Observed values, aligned by position with the network's states. The
        string ``'None'`` (or ``None``) marks an unobserved variable. ``x`` may
        be shorter than the number of states; the remaining states are then
        treated as unobserved.
    target : str, optional
        Name of the state whose probabilities are reported. It is looked up by
        name, so the result does not depend on the state sitting at a fixed
        position.
    network : optional
        Network to query. Defaults to the notebook-level ``model``.

    Returns
    -------
    str
        A sentence giving the rounded probabilities of the 'good' (yes) and
        'bad' (no) outcomes.

    Raises
    ------
    ValueError
        If ``target`` is not the name of a state in the network.
    """
    network = model if network is None else network

    keys = [state.to_dict()['name'] for state in network.states]
    # zip() stops at the shorter sequence, so a short `x` only sets the leading states.
    evidence = {
        name: value
        for name, value in zip(keys, x)
        if value is not None and value != 'None'
    }

    # Look the target up by name instead of relying on a hard-coded position.
    target_index = keys.index(target)
    posterior = network.predict_proba([evidence])[0][target_index]

    if hasattr(posterior, 'to_dict'):
        prediction = posterior.to_dict()['parameters'][0]
    else:
        # The target was itself part of the evidence, so the network handed back the
        # observed value rather than a distribution; report it as a certainty.
        prediction = {
            'bad': float(posterior == 'bad'),
            'good': float(posterior == 'good'),
        }

    no = np.round(prediction['bad'],2)
    yes = np.round(prediction['good'],2)
    return f'Probability of {target} given your observations equals: yes:{yes} no:{no}'

    

In [22]:
# Evidence: grade == 'G' (sixth position). Only eight values are passed; predict() zips
# them against the state names, so the last state is simply left unobserved.
predict(['None','None','None','None','None','G','None','None'])

'Probability of creditworthiness given your observations equals: yes:0.72 no:0.28'

In [23]:
# Evidence: annual_inc == '50k' (first position); every other variable is unobserved.
predict(['50k', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None'])

'Probability of creditworthiness given your observations equals: yes:0.89 no:0.11'

### Saving our model

In [None]:
# Serialise the baked network to JSON.
my_model=model.to_json()

In [None]:
# Persist the serialised model. The context manager closes (and flushes) the file
# handle, which the bare open(...) call left to the garbage collector.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(my_model, model_file)

In [6]:
# Read the serialised model back. The context manager closes the file handle.
# NOTE(review): pickle.load can execute arbitrary code; only load files you created yourself.
with open('model.pkl', 'rb') as model_file:
    model_json = pickle.load(model_file)

In [7]:
# Rebuild the network from its JSON form; this rebinds `model`.
model=BayesianNetwork.from_json(model_json)

### Requirements

In [40]:
# Record the installed pomegranate version for reproducibility.
pomegranate.__version__

'0.14.5'

### Testing our API

In [1]:
import requests

### Defining our URL

In [2]:
# Endpoint of the deployed prediction service; used for both the GET and POST calls below.
base_site="https://credit-model-iti.herokuapp.com/predict_api"

### Sending GET request

In [3]:
# Health check. requests never times out by default, so set an explicit timeout to
# keep the cell from hanging forever if the service does not answer.
r = requests.get(base_site, timeout=30)
r.json()

'Credit Risk APP is working'

In [6]:
# Show the response object to see its HTTP status code.
r

<Response [200]>

### Sending POST request

In [5]:
# Send two evidence dictionaries in one request; the response has one result per
# dictionary. The explicit timeout keeps the cell from hanging if the service stalls.
r = requests.post(url=base_site, json=[{"int_rate":"low"},{"purpose":'car'}], timeout=30)
r.json()

[{'bad': 0.04300280246781625, 'good': 0.9569971975321837},
 {'bad': 0.0937635425417053, 'good': 0.9062364574582947}]

### Batch predictions

In [160]:
# Each dictionary may hold several observed variables. The explicit timeout keeps the
# cell from hanging if the service stalls.
r = requests.post(
    url=base_site,
    json=[{"grade":'A',"int_rate":"low"},
          {"grade":"G","int_rate":"sharp"}],
    timeout=30,
)
r.json()

[{'bad': 0.038342358255419706, 'good': 0.9616576417445802},
 {'bad': 0.27913852054406485, 'good': 0.720861479455935}]