In [1]:
import pandas as pd
import numpy as np
import dateutil.tz
import datetime as dt
import json

from io import StringIO
import urllib3
import logging
import sys
from json import dumps
import time

def get_ymd(datetime):
    year = datetime.year
    month = datetime.month
    day = datetime.day
            
    if month < 10:
        month = '0' + str(month)
    if day < 10:
        day = '0' + str(day)
    return year, month, day

def first_day_next_month(date):
    return (date.replace(day=1) + dt.timedelta(days=32)).replace(day=1)

def last_second_of_month(date: str) -> str:
    return str((pd.event_timestamp(date) + pd.offsets.MonthEnd(0)).date()) + " 23:59:59"

def first_second_of_month(date: str) -> str:
    return str((pd.event_timestamp(date) + pd.offsets.MonthBegin(0)).date()) + " 00:00:00"

streamer = StringIO()

def setup_logging():
    logger = logging.getLogger()
    for h in logger.handlers:
        logger.removeHandler(h)
     
    h = logging.StreamHandler(stream = streamer)
    h.setFormatter(logging.Formatter("%(asctime)s %(levelname)s: %(message)s",
                              "%Y-%m-%d %H:%M:%S"))
    logger.addHandler(h)
    logger.setLevel(logging.INFO)
    return logger

def query_log(query_id, table, logger):
    status = wr.athena.get_query_execution(query_id)['Status']['State']
    if wr.athena.get_query_execution(query_id)['Status']['State'] in ['FAILED', 'CANCELLED']:
        logger.critical(table + ': query is in ' + status + ' State. ' + 'QueryID: ' + query_id)
    else:
        logger.info(table + ': query is in ' + status + ' State. ' + 'QueryID: ' + query_id)
    return None


In [2]:
# !python -m pip install amazon-textract-caller --upgrade
# !python -m pip install amazon-textract-response-parser --upgrade

In [3]:
# !pip install 

In [4]:
import re
bool(re.match('^[a-zA-Z]', ''))

False

# Static Raw Feature 

In [5]:
def get_cibil_score(cibil_info):
    for item in cibil_info:
        if item.isnumeric():
            return item
    return ''

In [6]:
def get_static_raw_cibil_features(df):
    dfs=[]
    for idx,row in df.iterrows():
        
        user_id = row.name
        gender = row['gender']
        name = row['name']
        total_email = len(row['email'])
        total_phone_nos = len(row['phone_no'])
        dob = row['dob']
        user_identifier = row['user_identifier']
        # remove from startic in final version
#         age = get_age(date(int(dob_list[2]),int(dob_list[1]),int(dob_list[0])))
        cibil_score = get_cibil_score(row['cibil_info_with_factors'])
        total_address = len(row['address'])
        total_loans = len(row['account_info'])

        df_static = pd.DataFrame({ 'user_id' : [user_id], 'name':[name],'gender':[gender], 'total_email':[total_email],'dob':[dob],'cibil_score':[cibil_score],
                       'total_address':[total_address],'total_loans':[total_loans],'total_phone_nos': [total_phone_nos], 'user_identifier':[user_identifier] })
        dfs.append(df_static)
    return pd.concat(dfs)
    

In [7]:
! ls ../cibil_data/train/

cibil_out  cibil_pdf  errors.csv  new_cibil_out  new_cibil_pdf


In [8]:
# !ls ../cibil_data/parsed_data/

In [9]:
import glob 
all_csv = glob.glob('../cibil_data/train/cibil_out/*.csv') + glob.glob('../cibil_data/train/new_cibil_out/*.csv')

In [10]:
# all_csv

In [11]:
dfs = []
for csv in all_csv:
    dfs.append(pd.read_csv(csv))
dfs = pd.concat(dfs)

In [12]:
dfs.reset_index(inplace=True)

In [13]:
dfs.columns

Index(['index', 'name', 'gender', 'cibil_info_with_factors', 'cibil_name',
       'user_identifier', 'dob', 'address', 'phone_no', 'email', 'acc_summary',
       'account_info', 'enquiry', 'account_info_new'],
      dtype='object')

In [14]:
change_cols = ['cibil_info_with_factors', 'address', 'phone_no', 'email', 'acc_summary', 'account_info','enquiry','account_info_new']

In [15]:
for col in change_cols:
    dfs[col] = dfs[col].apply(lambda x:eval(x))

In [16]:
# dfs['enquiry'][10]

In [17]:
df_static = get_static_raw_cibil_features(dfs)

In [18]:
df_static

Unnamed: 0,user_id,name,gender,total_email,dob,cibil_score,total_address,total_loans,total_phone_nos,user_identifier
0,0,SHAIK HUSAN PASHA S/O SHAIK,MALE,4,02/02/1980,645,4,11,4,"{'PAN': 'HQPPS0603K', 'VOTER ID': 'YAV1651389'..."
0,1,GINAJALA VEERAJU POTHU RAJU,MALE,0,01/01/1966,,1,0,1,{'VOTER ID': 'IMZ1676123'}
0,2,KORRA RAVI,MALE,0,02/04/1980,712,4,1,2,"{'PAN': 'BUKPK3225N', 'RATION CARD': '12357056..."
0,3,SANAGAPALLI SARADHA,FEMALE,1,21/07/1993,757,3,7,4,"{'PAN': 'FMEPS5341P', 'VOTER ID': 'ZAF0660556'..."
0,4,MEKA SRINU,MALE,0,01/01/1987,747,2,1,1,"{'PAN': 'HPOPM6548R', 'VOTER ID': 'UDD0141804'}"
...,...,...,...,...,...,...,...,...,...,...
0,1311,VINOREXLINE AR,FEMALE,4,05/06/1991,738,4,17,4,"{'PAN': 'CIJPV7471D', 'VOTER ID': 'TRQ0365726'..."
0,1312,SAGAR VASANTRAO LANDGE LANDGE,MALE,0,01/06/1986,694,4,7,4,"{'PAN': 'ADMPL4711E', 'DRIVING LICENSE': 'MH12..."
0,1313,RANKIREDDY NAGESWARA RAO,MALE,2,02/04/1976,654,2,8,4,"{'PAN': 'DUUPR3358M', 'UNIVERSAL ID': '3674329..."
0,1314,PANTHAM KRISHNA,MALE,0,18/05/1990,521,2,1,3,"{'PAN': 'DPWPP4740C', 'UNIVERSAL ID': '4037596..."


## DPD Table

In [19]:
# df_nested_list['AccountInformation']

   - proper case/form for overdue calculation for all type of loans 
   - STD ~0 or not for v0 ? 
   - skip overdue for v0 
           -- schedule pmt 

In [20]:
# (df_nested_list['AccountInformation'][0][0]['AMOUNTS']['SanctionedAmount'])

In [21]:
def get_timestamp(dpd_list):
    final_json = {}
    if (len(dpd_list)) > 0:
        
        yrs = list(dpd_list.keys())[::-1]
        
        for item in yrs:
            dpd_elem = dpd_list[item]
            for k,v in dpd_elem.items():
    #             print(k,v)
                final_json[k+'-'+item]=v
    return final_json
    

In [22]:
df_static.columns

Index(['user_id', 'name', 'gender', 'total_email', 'dob', 'cibil_score',
       'total_address', 'total_loans', 'total_phone_nos', 'user_identifier'],
      dtype='object')

In [23]:
# dfs

In [24]:
for k,v in (dfs['account_info'][0][0]['DPD_INFO']).items():
    print(pd.to_datetime(k,format='%m-%y'))

2022-07-01 00:00:00
2022-06-01 00:00:00
2022-05-01 00:00:00
2022-04-01 00:00:00
2022-03-01 00:00:00
2022-02-01 00:00:00
2022-01-01 00:00:00
2021-12-01 00:00:00
2021-11-01 00:00:00
2021-10-01 00:00:00
2021-09-01 00:00:00
2021-08-01 00:00:00
2021-07-01 00:00:00


In [25]:
def get_dpd_raw_table(df):
    dfs = [] 
    
    for idx,row in df.iterrows():
        user_id = row.name 
        acc_info_list = row['account_info']
        dpd_info_json = row['account_info_new']
        for i in range(len(acc_info_list)):
            loan_id = i 
            loan_info = acc_info_list[i]
            sanc_amount =''
            loan_type = loan_info['ACCOUNT']['TYPE']
            ownership = loan_info['ACCOUNT']['ownership']
            if 'sanctioned' in loan_info['AMOUNTS']:
                sanc_amount=loan_info['AMOUNTS']['sanctioned']
            interest_rate = loan_info['AMOUNTS']['interest_rate']
            repayment_tenure = loan_info['AMOUNTS']['repay_tenure']
            emi_amount = loan_info['AMOUNTS']['emi']
            pmt_freq = loan_info['AMOUNTS']['pmt_freq']
            open_date = loan_info['DATES']['opened']
            closed_date = loan_info['DATES']['closed']
#             dpd_list = loan_info['PaymentHistory']['dayPayDue']
            timestamp_json = dpd_info_json[i]
            for k,v in timestamp_json.items():
                is_open=1
                is_closed=0
                ## dd-mm-yyyy
                
                
#                     print(closed_date)
                    

                curr_timestamp = pd.to_datetime(k,format='%m-%y')
                closed_date_m_y = pd.to_datetime(closed_date,dayfirst=True)
                try:
                    if len(closed_date.split('-'))>=3:

                        if curr_timestamp.year==closed_date_m_y.year and curr_timestamp.month==closed_date_m_y.month:
                            is_open=0
                            is_closed=1
                except:
                    is_open=1
                    is_closed=0

                df_dpd = pd.DataFrame({'user_id':[user_id],'loan_id':[loan_id],'timestamp':[k],'dpd':[v],'loan_type':[loan_type],'ownership':[ownership],'sanc_amount':[sanc_amount], 'interest_rate':[interest_rate], 
                              'repayment_tenure':[repayment_tenure],'emi_amount':[emi_amount],'pmt_freq':[pmt_freq],'open_date':[open_date],
                              'closed_date':[closed_date],'is_open':[is_open],'is_closed':[is_closed]})
                dfs.append(df_dpd)
                

                    
                        
    return pd.concat(dfs)
            

In [26]:
df_raw_dpd = get_dpd_raw_table(dfs)

In [27]:
# df_static[df_static.user_id==0]

In [28]:
set(df_raw_dpd[df_raw_dpd['user_id']==0].loan_id.tolist())

{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}

In [29]:
 df_raw_dpd['timestamp_new'] = df_raw_dpd['timestamp'].apply(lambda x:pd.to_datetime(x,format='%m-%y'))


In [30]:
# df_raw_dpd['dpd'].value_counts()

In [31]:
def post_process_raw_dpd(df):
#     df['next_loan_id'] = df['loan_id'].shift(-1)

    df['next_user_id'] = df['user_id'].shift(-1)
#     df['next_loan_id'] = df['next_loan_id'].fillna(0)
    df['next_user_id'] = df['next_user_id'].fillna(0)
#     df['next_loan_id'] = df['next_loan_id'].apply(lambda x: int(x)  )
    df['next_user_id'] = df['next_user_id'].apply(lambda x: int(x) )

    return df

In [32]:
df_raw_dpd.shape

(148684, 16)

In [33]:
df_raw_dpd_new = post_process_raw_dpd(df_raw_dpd)

In [34]:
df_raw_dpd_new.shape

(148684, 17)

In [35]:
# df_raw_dpd_new[:35]

In [36]:
df_raw_dpd_new['timestamp_new'].min()

Timestamp('2003-08-01 00:00:00')

In [37]:
df_raw_dpd_new.sort_values(['user_id','timestamp_new'],inplace=True)

In [38]:
df_raw_dpd_new.columns

Index(['user_id', 'loan_id', 'timestamp', 'dpd', 'loan_type', 'ownership',
       'sanc_amount', 'interest_rate', 'repayment_tenure', 'emi_amount',
       'pmt_freq', 'open_date', 'closed_date', 'is_open', 'is_closed',
       'timestamp_new', 'next_user_id'],
      dtype='object')

In [39]:
# df_raw_dpd_new.reset_index(inplace=True)

In [40]:
df_raw_dpd_new[['timestamp', 'dpd', 'loan_type', 'ownership',
       'sanc_amount', 'interest_rate', 'repayment_tenure', 'emi_amount','open_date', 'closed_date']][37:48]

Unnamed: 0,timestamp,dpd,loan_type,ownership,sanc_amount,interest_rate,repayment_tenure,emi_amount,open_date,closed_date
0,4-19,0,Two-wheeler Loan,INDIVIDUAL,,,30.0,3500.0,29-10-2018,12-05-2021
0,5-19,0,Two-wheeler Loan,INDIVIDUAL,,,30.0,3500.0,29-10-2018,12-05-2021
0,6-19,0,Two-wheeler Loan,INDIVIDUAL,,,30.0,3500.0,29-10-2018,12-05-2021
0,7-19,0,Two-wheeler Loan,INDIVIDUAL,,,30.0,3500.0,29-10-2018,12-05-2021
0,8-19,0,Personal Loan,INDIVIDUAL,137896.0,,36.0,6072.0,09-07-2019,
0,8-19,0,Commercial Vehicle Loan,JOINT,1214322.0,,45.0,39000.0,22-05-2019,
0,8-19,0,Two-wheeler Loan,INDIVIDUAL,,,30.0,3500.0,29-10-2018,12-05-2021
0,9-19,0,Personal Loan,INDIVIDUAL,137896.0,,36.0,6072.0,09-07-2019,
0,9-19,0,Commercial Vehicle Loan,JOINT,1214322.0,,45.0,39000.0,22-05-2019,
0,9-19,0,Two-wheeler Loan,INDIVIDUAL,,,30.0,3500.0,29-10-2018,12-05-2021


In [41]:
# count=0
# for i,row in df_raw_dpd_new.iterrows():
#     try:
#         a = int(row['sanc_amount'])
#     except:
#         print(i)
#         count+=1
# print(count)

In [42]:
## Steps
## fill NA with mean values corresponding to the feature columns 
## preapre dummy overdue for each u,l pair 
##  double check the is_open and is_closed ! seems wrong for now 
## -> prepare overdue feature based on other attributes 

In [43]:
# float('3013353')

In [44]:
def get_sanc_amt(sanc_string):
    if sanc_string=='' or sanc_string is None:
        return 0
    sanc_string = sanc_string.replace('.',',')
    sanc_elems =sanc_string.split(',')
    final_amt=''
    for item in sanc_elems:
        final_amt+=item
    return eval(final_amt)

In [45]:
get_sanc_amt('3,123,234')

3123234

In [46]:
 df_raw_dpd_new['sanc_amount_temp'] =  df_raw_dpd_new['sanc_amount'].apply(lambda x: get_sanc_amt(x))

In [47]:
# df_raw_dpd_new['interest_rate_temp'] = df_raw_dpd_new['interest_rate'].apply(lambda x: eval(x) if (x!='-' and  x!= None) else 0)


In [48]:
# df_raw_dpd_new['emi_temp'] = df_raw_dpd_new['emi_amount'].apply(lambda x: eval(x) if (x!='-' and  x!= None) else 0)


In [49]:
sanc_amt_loan_type_mean_df = df_raw_dpd_new.groupby('loan_type').agg({'sanc_amount_temp':'mean'}).reset_index()

In [50]:
# interest_rate_loan_type_mean_df = df_raw_dpd_new.groupby('loan_type').agg({'interest_rate_temp':'mean'}).reset_index()

In [51]:
# emi_loan_type_mean_df = df_raw_dpd_new.groupby('loan_type').agg({'emi_temp':'mean'}).reset_index()

In [52]:
# set(df_raw_dpd_new[df_raw_dpd_new['repayment_tenure']=='-']['sanc_amount'].tolist()) #[['open_date','closed_date','emi_amount','interest_rate']]

In [53]:
# df_raw_dpd_new['closed_date'].max()

##  Enquiry Table 

In [54]:
# dfs['enquiry'][0]

    {'member': 'NOT DISCLOSED',
      'enquiry_date': '29-09-2022',
      'enquiry_purpose': 'Two-wheeler Loan',
      'enquiry_amt': '500,000'},

In [55]:
def get_enquiry_table(df): 
    dfs = []
    for idx,row in df.iterrows():
        user_id = row.name
        enq_info = row['enquiry']
        for elem in enq_info:
            date = elem['enquiry_date']
            enq_purpose = elem['enquiry_purpose']
            try:
                
                enq_amt = get_sanc_amt(elem['enquiry_amt'])
            except:
                enq_amt=0
#                 print(elem)
#                 break
            df_enq = pd.DataFrame({'user_id':[user_id],'date':[date],'enq_purpose':[enq_purpose],'enq_amount':[enq_amt]})
            dfs.append(df_enq)
    return pd.concat(dfs)

In [56]:
df_enquiry = get_enquiry_table(dfs)

In [57]:
df_enquiry.head(2)

Unnamed: 0,user_id,date,enq_purpose,enq_amount
0,0,08-07-2021,Personal Loan,0
0,0,08-07-2021,Personal Loan,0


In [58]:
df_enquiry['date'].value_counts()

NOT DISCLOSED    41
                 39
13-07-2022       26
20-09-2022       26
29-09-2021       25
                 ..
10-09-2016        1
10-09-2015        1
02-07-2015        1
08-09-2012        1
03-11-2013        1
Name: date, Length: 3017, dtype: int64

In [61]:
def get_timestamp(date):
    t=date
    try:
        t=pd.to_datetime(date)
    except:
        t=''
#         print(date)
#         continue
    return t

In [62]:
df_enquiry['timestamp'] = df_enquiry['date'].apply(get_timestamp)

  df_enquiry['timestamp'] = df_enquiry['date'].apply(get_timestamp)


In [63]:
df_enquiry =  df_enquiry[df_enquiry['timestamp']!='']

In [65]:
df_enquiry.shape

(12310, 5)

# Feature Table 

In [66]:
df_static.columns

Index(['user_id', 'name', 'gender', 'total_email', 'dob', 'cibil_score',
       'total_address', 'total_loans', 'total_phone_nos', 'user_identifier'],
      dtype='object')

In [67]:
df_static.head(2)

Unnamed: 0,user_id,name,gender,total_email,dob,cibil_score,total_address,total_loans,total_phone_nos,user_identifier
0,0,SHAIK HUSAN PASHA S/O SHAIK,MALE,4,02/02/1980,645.0,4,11,4,"{'PAN': 'HQPPS0603K', 'VOTER ID': 'YAV1651389'..."
0,1,GINAJALA VEERAJU POTHU RAJU,MALE,0,01/01/1966,,1,0,1,{'VOTER ID': 'IMZ1676123'}


In [68]:
df_static[df_static['user_id']==0]['total_email'].tolist()[0]

4

In [69]:
df_raw_dpd.columns

Index(['user_id', 'loan_id', 'timestamp', 'dpd', 'loan_type', 'ownership',
       'sanc_amount', 'interest_rate', 'repayment_tenure', 'emi_amount',
       'pmt_freq', 'open_date', 'closed_date', 'is_open', 'is_closed',
       'timestamp_new', 'next_user_id', 'sanc_amount_temp'],
      dtype='object')

    user_id/key	timestamp	total_email	open_loans	closed_loans	total_loans 	overdue	gender	age	enquiry_till_date	tot_enquiry_purpose	tot_enquiry_amount	DPD	interest_rate	tenure	sanctioned_amount	loan_type	ownership	score

In [70]:
def get_flags(acc_info):
    for acc in acc_info:
        dpd_list = acc['PaymentHistory']['dayPayDue']
        timestamp_json = get_timestamp(dpd_list)
        for k,v in timestamp_json.items():
            if v=='900':
                return True
    return False

In [71]:
# df_nested_list[df_nested_list['AccountInformation'].apply(lambda x: get_flags(x))]

In [72]:
# df_raw_dpd_new[df_raw_dpd_new['dpd']=='900']

In [73]:
df_raw_dpd_new['dpd'].value_counts()

000    84679
STD    19344
XXX    18240
900     2339
030     1183
       ...  
719        1
596        1
627        1
535        1
863        1
Name: dpd, Length: 825, dtype: int64

In [74]:
def fill_zero_val_from_loan_type(sanc_amount,loan_type):
    sanc_list =sanc_amt_loan_type_mean_df[sanc_amt_loan_type_mean_df['loan_type']==loan_type]['sanc_amount_temp'].tolist()
    if len(sanc_list)>0:  
        sanc_amt_loan_type = sanc_list[0]
        if int(sanc_amount)==0:
            return sanc_amt_loan_type
    return sanc_amount
    

In [75]:
df_raw_dpd_new['sanc_amt_final'] = df_raw_dpd_new.apply(lambda x:fill_zero_val_from_loan_type(x['sanc_amount_temp'],x['loan_type']),axis=1)

In [76]:
# df_raw_dpd_new[df_raw_dpd_new['sanc_amt_final']==0]

In [77]:
## use this for feature only 
def bucket_dpd(dpd_val):
    final_val=dpd_val
    if dpd_val=='STD' or dpd_val=='XXX':
        final_val=0  
    if dpd_val.isnumeric():
        dpd_val=int(dpd_val)
        if dpd_val<10:
            final_val=0
        elif dpd_val>=10 and dpd_val<20:
            final_val=1
        elif dpd_val>=20 and dpd_val<30:
            final_val=2
        elif dpd_val>=30 and dpd_val<40:
            final_val=3
        elif dpd_val>=40 and dpd_val<50:
            final_val=4
        elif dpd_val>=50 and dpd_val<60:
            final_val=5
        elif dpd_val>=60 and dpd_val<70:
            final_val=6
        elif dpd_val>=70 and dpd_val<80:
            final_val=7
        elif dpd_val>=80 and dpd_val<90:
            final_val=8
        else:
            final_val=9
    else:
        final_val=0
    return final_val
    
        

In [78]:
def modified_dpd(dpd_val):
    final_val=dpd_val
    if dpd_val=='STD' or dpd_val=='XXX':
        final_val=0  
    if dpd_val.isnumeric():
        dpd_val=int(dpd_val)
    else:
        final_val=0
    return int(final_val)

In [79]:
df_raw_dpd_new['modified_dpd'] = df_raw_dpd_new['dpd'].apply(modified_dpd)

In [80]:
# df_raw_dpd_new['modified_dpd'].dtype

In [81]:
df_raw_dpd_new['dpd_bucket'] = df_raw_dpd_new['dpd'].apply(bucket_dpd)

In [82]:
# df_raw_dpd_new['dpd_y'].value_counts()

In [83]:
# df_enquiry['enq_purpose'].nunique()

In [84]:
# df_enquiry[df_enquiry['timestamp']<'2014-01-01']

In [85]:
((pd.to_datetime('2022-10-1')- pd.to_datetime('1-2-2002')).days)/365

20.75890410958904

In [86]:
 (pd.date_range("2022-10-1", periods=3, freq="-1MS")).tolist()[1]

Timestamp('2022-09-01 00:00:00', freq='-1MS')

In [87]:
now = pd.to_datetime("2022-10-1")-    pd.Timedelta('1D')
last2 = now - pd.DateOffset(months=2)
last3 = now - pd.DateOffset(months=3)

In [88]:
# df_raw_dpd_new.query("(@now >= timestamp_new >= @last2) | (@now >= timestamp_new >= @last3)")

In [89]:
## 01-06-2022  (02-06-2022)
## 01-07-2022
## 01-08-2022
## 31-08-2022
## 01-09-2022
## month level date time range 

In [90]:
from datetime import date
 
def get_age(birthdate,curr_date):
    today = date.today()
    age = curr_date.year - birthdate.year - ((curr_date.month, curr_date.day) < (birthdate.month, birthdate.day))
    return age

In [91]:
a  = df_raw_dpd_new['timestamp_new'].nlargest(2).tolist()[1]

In [92]:
a

Timestamp('2022-11-01 00:00:00')

In [93]:
from datetime import date
feat_dict = {'Gold Loan' : 'gl', 
             'Personal Loan' : 'personal',
             'Commercial Vehicle Loan' : 'cvl',
             'Credit Card' : 'cc'
            }
def get_all_loan_type_feats(df,loan_type):
    new_df = df[df['loan_type']==loan_type]
    max_date = df_raw_dpd['timestamp_new'].max()
    user_id = df_raw_dpd['user_id'].tolist()[0]
    curr_date =  date.today()

    timestamp_new = max_date
    print((new_df.dpd.tolist()))
    try:
        
        last_3_months_dpd= sum(new_df[(new_df['timestamp_new'].apply(lambda x: x <timestamp_new and x>=(timestamp_new-pd.DateOffset(months=3))))].dpd.tolist())

        #last_3_months_dpd =  sum(new_df[(new_df['timestamp_new'].apply(lambda x: x in (pd.date_range(prev_day, periods=95, freq="-1D"))))]['modified_dpd'].tolist())
        last_6_months_dpd = sum(new_df[(new_df['timestamp_new'].apply(lambda x: x <timestamp_new and x>=(timestamp_new-pd.DateOffset(months=6))))].dpd.tolist())
        last_12_months_dpd = sum(new_df[(new_df['timestamp_new'].apply(lambda x: x <timestamp_new and x>=(timestamp_new-pd.DateOffset(months=12))))].dpd.tolist())
        last_36_months_dpd = sum(new_df[(new_df['timestamp_new'].apply(lambda x: x <timestamp_new and x>=(timestamp_new-pd.DateOffset(months=36))))].dpd.tolist())
    except:
        last_3_months_dpd=last_6_months_dpd=last_12_months_dpd=last_36_months_dpd=0
    df_l = pd.DataFrame({'user_id':[user_id], 'datetime_formatted':[max_date], 'last_3_months_dpd' :[last_3_months_dpd],'last_6_months_dpd' : [last_6_months_dpd],'last_12_months_dpd':[last_12_months_dpd],'last_36_months_dpd':[last_36_months_dpd]
                        })
#     df_l= df[df['loan_type']==loan_type] #'Gold Loan'    
    final_cols = ['user_id','datetime_formatted']
    col_list = ['last_3_months_dpd', 'last_6_months_dpd', 'last_12_months_dpd',
       'last_36_months_dpd']
    df_grp = df_l[['user_id','datetime_formatted', 'last_3_months_dpd', 'last_6_months_dpd', 'last_12_months_dpd',
       'last_36_months_dpd']]
    key_to_add = feat_dict[loan_type]
    for col in col_list:
        temp = col + '_' +key_to_add
        final_cols.append(temp)
    df_grp.columns = final_cols
    
    return df_grp

In [94]:
df_raw_dpd_new.columns

Index(['user_id', 'loan_id', 'timestamp', 'dpd', 'loan_type', 'ownership',
       'sanc_amount', 'interest_rate', 'repayment_tenure', 'emi_amount',
       'pmt_freq', 'open_date', 'closed_date', 'is_open', 'is_closed',
       'timestamp_new', 'next_user_id', 'sanc_amount_temp', 'sanc_amt_final',
       'modified_dpd', 'dpd_bucket'],
      dtype='object')

In [95]:
list(feat_dict.keys())

['Gold Loan', 'Personal Loan', 'Commercial Vehicle Loan', 'Credit Card']

In [96]:
## steps to eval all fileds : 
## make raw_dpd as central table as this contains all the info regarding user_id and loan_id
## user_id , key --> get from the raw_dpd_table
## for this pair eval all the fields  | for static directly query from static_raw_df | for enq perform all ops from enq_table 
## eval timestamp based on mm-yyyy from the dpd table to eval the dpd related features 
## static -> 'gender', 'total_email', 'dob', 'age', 'cibil_score',
##       'total_address', 'total_loans', 'total_phone_nos'

def get_cibil_feature_table(df_raw_static,df_raw_dpd,df_raw_enquiry):
    dfs_all_feat = []
    count=0
    all_users = list(set(df_raw_dpd['user_id'].tolist()))
    curr_date=date.today()
    print(len(all_users))
    for i in range(len(all_users)):
        user_id = all_users[i]
#         key = user_id
        user_dpd_df = df_raw_dpd[df_raw_dpd['user_id']==user_id]
#         all_loans = list(set(user_dpd_df['loan_id'].tolist()))
        
#         print(len(all_loans))
#         for j in range(len(all_loans)):            
#             loan_id=all_loans[j]
        key =str(user_id)
        new_df = df_raw_dpd[(df_raw_dpd['user_id']==user_id)]
        if new_df.shape[0]>1:
            count+=1

#                     print(new_df['timestamp_new'].nlargest(2).tolist()[1])
            time_list = new_df.sort_values('timestamp_new',ascending=False)['timestamp_new'].tolist()
            timestamp = time_list[0]
            timestamp_new = time_list[1]

            total_email = df_raw_static[df_raw_static['user_id']==user_id]['total_email'].tolist()[0]
            gender = df_raw_static[df_raw_static['user_id']==user_id]['gender'].tolist()[0]
            dob = df_raw_static[df_raw_static['user_id']==user_id]['dob'].tolist()[0]
            age = get_age(pd.to_datetime(dob),curr_date)
        #         age = df_raw_static[df_raw_static['user_id']==row['user_id']]['age'].tolist()[0]
            cibil_score = df_raw_static[df_raw_static['user_id']==user_id]['cibil_score'].tolist()[0]
            total_address = df_raw_static[df_raw_static['user_id']==user_id]['total_address'].tolist()[0]
        #         total_loans = df_raw_static[df_raw_static['user_id']==row['user_id']]['total_loans'].tolist()[0]
            total_phone_nos = df_raw_static[df_raw_static['user_id']==user_id]['total_phone_nos'].tolist()[0]

            total_loans = len(set(new_df[(new_df['timestamp_new']<=timestamp_new)
                                   &(new_df['is_open']==1)]['loan_id'].tolist()))
            closed_loans = len(set(new_df[(new_df['timestamp_new']<=timestamp_new)
                                   &(new_df['is_closed']==1)]['loan_id'].tolist()))

            open_loans = total_loans - closed_loans
            dpd = new_df[new_df['timestamp_new']==timestamp]['modified_dpd'].tolist()[0]
            dpd_bucket = new_df['dpd_bucket'].mean()
    #         dpd_provided = row['dpd']
            sanc_amount = new_df['sanc_amt_final'].tolist()[0]
            ## mm-yyyy 

            last_3_months_dpd= sum(new_df[(new_df['timestamp_new'].apply(lambda x: x <=timestamp_new and x>=(timestamp_new-pd.DateOffset(months=3))))]['modified_dpd'].tolist())

            #last_3_months_dpd =  sum(new_df[(new_df['timestamp_new'].apply(lambda x: x in (pd.date_range(prev_day, periods=95, freq="-1D"))))]['modified_dpd'].tolist())
            last_6_months_dpd = sum(new_df[(new_df['timestamp_new'].apply(lambda x: x <=timestamp_new and x>=(timestamp_new-pd.DateOffset(months=6))))]['modified_dpd'].tolist())
            last_12_months_dpd = sum(new_df[(new_df['timestamp_new'].apply(lambda x: x <=timestamp_new and x>=(timestamp_new-pd.DateOffset(months=12))))]['modified_dpd'].tolist())
            last_36_months_dpd = sum(new_df[(new_df['timestamp_new'].apply(lambda x: x <=timestamp_new and x>=(timestamp_new-pd.DateOffset(months=36))))]['modified_dpd'].tolist())

                    ## enquiry level features -> total_enq_till_date  | unique_enquiry_purpose  | total_enq_amt 
            total_enq_till_date = df_raw_enquiry[(df_raw_enquiry['user_id']==user_id) & (df_raw_enquiry['timestamp']<=timestamp_new) & (df_raw_enquiry['timestamp']>timestamp_new-pd.DateOffset(months=3))].shape[0]
#                 unique_enquiry_purpose = df_raw_enquiry[(df_raw_enquiry['user_id']==row['user_id']) & (df_raw_enquiry['timestamp']<timestamp_new)]['enq_purpose'].nunique()
#                 total_enq_amt = df_raw_enquiry[(df_raw_enquiry['user_id']==row['user_id']) & (df_raw_enquiry['timestamp']<timestamp_new)]['enq_amount'].sum()
            final_json={
                'key':[key] , 'timestamp':[timestamp] ,'datetime_formatted' : [timestamp_new], 'cibil_score' : [cibil_score], 
                'total_email' : [total_email], 'gender' : [gender], 'age' : [age] ,'open_loans':[open_loans],'closed_loans':[closed_loans] ,  'total_address' : [total_address],
                'sanc_amount':[sanc_amount],'total_loans' : [total_loans], 'total_phone_nos' : [total_phone_nos], 'dpd':[dpd],'dpd_bucket':[dpd_bucket],
                'last_3_months_dpd' :[last_3_months_dpd],'last_6_months_dpd' : [last_6_months_dpd],'last_12_months_dpd':[last_12_months_dpd],'last_36_months_dpd':[last_36_months_dpd],
#                     'next_3_months_dpd' : [next_3_months_dpd],'next_6_months_dpd':[next_6_months_dpd] , 'next_12_months_dpd':[next_12_months_dpd], 'next_36_months_dpd':[next_36_months_dpd],
                'total_enq_till_date':[total_enq_till_date]
#                     ,'unique_enquiry_purpose':[unique_enquiry_purpose],'total_enq_amt':[total_enq_amt]

            }
            col_dict = {'last_3_months_dpd':3, 'last_6_months_dpd':6, 'last_12_months_dpd':12,
       'last_36_months_dpd':36}
            for loan_type in list(feat_dict.keys()):
                key_to_add = feat_dict[loan_type]
                for col in list(col_dict.keys()):
                    temp = col + '_' +key_to_add
                    
                    final_json[temp] = sum(new_df[(new_df['loan_type']==loan_type)&(new_df['timestamp_new'].apply(lambda x: x <=timestamp_new and x>=(timestamp_new-pd.DateOffset(months=col_dict[col]))))]['modified_dpd'].tolist())
        
            df_feature = pd.DataFrame(final_json)
            dfs_all_feat.append(df_feature)
        else:
            continue
#                 print(user_id,loan_id)
           #         for item in list(feat_dict.keys()):
#             feat_df= get_all_loan_type_feats(user_dpd_df,item)
# #             print(feat_df)
#             u_df = u_df.merge(feat_df,on=['user_id','datetime_formatted'],how='left')
    
#         dfs.append(u_df)
    print(len(dfs_all_feat))
    df_all = pd.concat(dfs_all_feat)
    print(count)
    
    return df_all
        
        
    

In [97]:
# 

In [98]:
# set(df_raw_dpd_new['loan_id'].tolist())

In [99]:
# df_raw_dpd_new[]

In [100]:
df_raw_dpd_new[(df_raw_dpd_new['user_id']==0) & (df_raw_dpd_new['loan_id']==2)].shape[0]

19

In [101]:
df_raw_dpd_new[(df_raw_dpd_new['user_id']==0)& (df_raw_dpd_new['loan_id']==0)].sort_values('timestamp_new',ascending=False)['timestamp_new'].tolist()[2]

Timestamp('2022-05-01 00:00:00')

In [102]:
df_cibil_feature =  get_cibil_feature_table(df_static,df_raw_dpd_new,df_enquiry)

1035
1021
1021


In [103]:
df_cibil_feature

Unnamed: 0,key,timestamp,datetime_formatted,cibil_score,total_email,gender,age,open_loans,closed_loans,total_address,...,last_12_months_dpd_personal,last_36_months_dpd_personal,last_3_months_dpd_cvl,last_6_months_dpd_cvl,last_12_months_dpd_cvl,last_36_months_dpd_cvl,last_3_months_dpd_cc,last_6_months_dpd_cc,last_12_months_dpd_cc,last_36_months_dpd_cc
0,0,2022-07-01,2022-07-01,645,4,MALE,42,7,4,4,...,658,686,185,491,925,1134,0,0,0,0
0,2,2022-07-01,2022-06-01,712,0,MALE,42,1,0,4,...,0,0,0,0,0,0,0,0,0,0
0,3,2022-07-01,2022-06-01,757,1,FEMALE,29,1,6,3,...,0,0,0,0,0,0,0,0,0,0
0,6,2018-01-01,2017-12-01,,0,FEMALE,38,1,0,1,...,0,0,0,0,0,0,0,0,0,0
0,7,2022-09-01,2022-08-01,617,3,MALE,46,5,8,4,...,11700,32400,308,396,407,479,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,1311,2022-10-01,2022-10-01,738,4,FEMALE,31,11,6,4,...,0,0,0,0,0,0,0,0,0,0
0,1312,2022-08-01,2022-07-01,694,0,MALE,36,5,2,4,...,0,0,107,114,142,164,0,0,0,0
0,1313,2022-09-01,2022-09-01,654,2,MALE,46,3,5,2,...,170,170,65,130,253,395,0,0,0,0
0,1314,2022-09-01,2022-08-01,521,0,MALE,32,1,0,2,...,0,0,3600,6300,11700,31456,0,0,0,0


In [104]:
# 

In [105]:
df_cibil_feature.to_csv('cibil_features_pdf_v2.csv',index=False)

In [106]:
df_cibil_feature = pd.read_csv('cibil_features_pdf_v2.csv')

# Next Steps 
       for a given user loan type and timestamp_new  
  -- groupby user and loan_type and timestamp_new 
  
  -- replicate same for enquiry / enquiry_type 
   --join these two at uid level
   

In [107]:
df_cibil_feature['user_id'] = df_cibil_feature['key']#.apply(lambda x:int(x.split('-')[0]))
# df_cibil_feature['loan_id'] = df_cibil_feature['key'].apply(lambda x:int(x.split('-')[1]))


In [108]:
# just for curr use case | not to use this method for prod 
# next step change the dpd level feats 
# df

In [109]:
df_cibil_feature.tail(20)

Unnamed: 0,key,timestamp,datetime_formatted,cibil_score,total_email,gender,age,open_loans,closed_loans,total_address,...,last_36_months_dpd_personal,last_3_months_dpd_cvl,last_6_months_dpd_cvl,last_12_months_dpd_cvl,last_36_months_dpd_cvl,last_3_months_dpd_cc,last_6_months_dpd_cc,last_12_months_dpd_cc,last_36_months_dpd_cc,user_id
1001,1294,2022-09-01,2022-08-01,764.0,0,FEMALE,29,1,0,1,...,0,0,0,0,0,0,0,0,0,1294
1002,1296,2022-08-01,2022-07-01,705.0,0,MALE,47,2,0,1,...,0,0,0,0,0,0,0,0,0,1296
1003,1297,2022-09-01,2022-08-01,709.0,0,MALE,51,3,1,4,...,0,0,0,0,0,0,0,0,0,1297
1004,1298,2020-04-01,2020-03-01,726.0,0,MALE,28,0,1,1,...,0,0,0,0,0,0,0,0,0,1298
1005,1299,2022-07-01,2022-07-01,655.0,1,MALE,39,3,7,4,...,0,2367,3827,5928,7380,0,0,0,0,1299
1006,1300,2022-08-01,2022-08-01,686.0,0,FEMALE,51,2,2,3,...,0,0,0,0,0,0,0,0,0,1300
1007,1302,2022-08-01,2022-08-01,706.0,3,MALE,44,9,15,4,...,0,596,1258,2120,2833,0,0,0,0,1302
1008,1303,2022-09-01,2022-09-01,617.0,3,MALE,27,9,5,4,...,0,86,86,86,162,0,0,0,0,1303
1009,1304,2022-07-01,2022-06-01,729.0,0,FEMALE,26,1,0,1,...,0,0,0,0,0,0,0,0,0,1304
1010,1305,2022-07-01,2022-06-01,758.0,1,FEMALE,46,1,1,4,...,0,0,0,0,0,0,0,0,0,1305


In [110]:
def get_loan_type_ownership(row):
    row['loan_type'] = df_raw_dpd_new[(df_raw_dpd_new['user_id']==row['user_id']) & (df_raw_dpd_new['timestamp_new']==row['datetime_formatted'])]['loan_type'].tolist()[0]
    row['ownership'] = df_raw_dpd_new[(df_raw_dpd_new['user_id']==row['user_id']) & (df_raw_dpd_new['timestamp_new']==row['datetime_formatted'])]['ownership'].tolist()[0]
    return row

In [111]:
df_cibil_feature = df_cibil_feature.apply(get_loan_type_ownership,axis=1)

In [112]:
def get_enquiry_type(row):
    enq_list = df_enquiry[df_enquiry['user_id']==row['user_id']]['enq_purpose'].tolist()
    if len(enq_list)>0:
        
        row['enquiry_purpose'] = df_enquiry[df_enquiry['user_id']==row['user_id']]['enq_purpose'].tolist()[0]
    else:
        row['enquiry_purpose']=''
    return row

In [113]:
# def get_tot_enq_amt

In [114]:
# 

In [115]:
# df_cibil_feature = df_cibil_feature.apply(get_enquiry_type,axis=1)

In [116]:
## l1 9-11 
## l1 10-11
## l1 

In [117]:
df_cibil_feature.columns

Index(['key', 'timestamp', 'datetime_formatted', 'cibil_score', 'total_email',
       'gender', 'age', 'open_loans', 'closed_loans', 'total_address',
       'sanc_amount', 'total_loans', 'total_phone_nos', 'dpd', 'dpd_bucket',
       'last_3_months_dpd', 'last_6_months_dpd', 'last_12_months_dpd',
       'last_36_months_dpd', 'total_enq_till_date', 'last_3_months_dpd_gl',
       'last_6_months_dpd_gl', 'last_12_months_dpd_gl',
       'last_36_months_dpd_gl', 'last_3_months_dpd_personal',
       'last_6_months_dpd_personal', 'last_12_months_dpd_personal',
       'last_36_months_dpd_personal', 'last_3_months_dpd_cvl',
       'last_6_months_dpd_cvl', 'last_12_months_dpd_cvl',
       'last_36_months_dpd_cvl', 'last_3_months_dpd_cc',
       'last_6_months_dpd_cc', 'last_12_months_dpd_cc',
       'last_36_months_dpd_cc', 'user_id', 'loan_type', 'ownership'],
      dtype='object')

In [118]:
df_cibil_feature.dtypes

key                              int64
timestamp                       object
datetime_formatted              object
cibil_score                    float64
total_email                      int64
gender                          object
age                              int64
open_loans                       int64
closed_loans                     int64
total_address                    int64
sanc_amount                    float64
total_loans                      int64
total_phone_nos                  int64
dpd                              int64
dpd_bucket                     float64
last_3_months_dpd                int64
last_6_months_dpd                int64
last_12_months_dpd               int64
last_36_months_dpd               int64
total_enq_till_date              int64
last_3_months_dpd_gl             int64
last_6_months_dpd_gl             int64
last_12_months_dpd_gl            int64
last_36_months_dpd_gl            int64
last_3_months_dpd_personal       int64
last_6_months_dpd_persona

In [119]:
obj_cols = ['last_3_months_dpd', 'last_6_months_dpd', 'last_12_months_dpd',
       'last_36_months_dpd']

In [120]:
for col in obj_cols:
    df_cibil_feature[col] = df_cibil_feature[col].apply(lambda x:float(x) if x is not None else 0)

In [1252]:
test_cibil_feats = df_cibil_feature[['user_id','loan_type','datetime_formatted', 'last_3_months_dpd', 'last_6_months_dpd', 'last_12_months_dpd',
       'last_36_months_dpd']].groupby(['user_id','loan_type','datetime_formatted']).sum().reset_index()

In [1253]:
# test_cibil_feats

In [121]:
f_all = df_cibil_feature[['user_id','datetime_formatted', 'last_3_months_dpd', 'last_6_months_dpd', 'last_12_months_dpd',
       'last_36_months_dpd']]

In [122]:
# f_all

In [123]:
f_all.columns =['user_id','datetime_formatted', 'last_3_months_dpd_all', 'last_6_months_dpd_all', 'last_12_months_dpd_all',
       'last_36_months_dpd_all']

In [124]:
df_cibil_feature = df_cibil_feature.merge(f_all,on=['user_id','datetime_formatted'],how='left')

In [125]:
# created f_all | f_gold_loan | f_cv | f_personal | f_credit_card 

In [126]:
# test_cibil_feats[test_cibil_feats['last_3_months_dpd']!=0]

In [127]:
# test_cibil_feats.columns = ['user_id','loan_type','datetime_formatted', 'last_3_months_dpd_on_user_id_loan_type', 'last_6_months_dpd_on_user_id_loan_type', 'last_12_months_dpd_on_user_id_loan_type',
#        'last_36_months_dpd_on_user_id_loan_type','next_3_months_dpd_on_user_id_loan_type', 'next_6_months_dpd_on_user_id_loan_type', 'next_12_months_dpd_on_user_id_loan_type',
#        'next_36_months_dpd_on_user_id_loan_type']

In [129]:
# test_cibil_feats['loan_type'].value_counts()

In [1222]:
# df_cibil_feature[(df_cibil_feature['loan_type']=='Gold Loan') & ( df_cibil_feature['dpd_bucket']>0)][['key','dpd']]

In [1219]:
test_cibil_feats.shape

(422, 7)

In [1220]:
# df_cibil_feature[df_cibil_feature['key']=='91-4']

In [1221]:
df_cibil_feature.columns

Index(['key', 'timestamp', 'datetime_formatted', 'cibil_score', 'total_email',
       'gender', 'age', 'open_loans', 'closed_loans', 'total_address',
       'sanc_amount', 'total_loans', 'total_phone_nos', 'dpd', 'dpd_bucket',
       'last_3_months_dpd', 'last_6_months_dpd', 'last_12_months_dpd',
       'last_36_months_dpd', 'total_enq_till_date', 'user_id', 'loan_type',
       'ownership', 'enquiry_purpose', 'last_3_months_dpd_all',
       'last_6_months_dpd_all', 'last_12_months_dpd_all',
       'last_36_months_dpd_all'],
      dtype='object')

In [1064]:
# df_cibil_feature = df_cibil_feature.merge(test_cibil_feats,on=['user_id','loan_type','datetime_formatted'],how='left')

In [1065]:
# test_enq_feats = df_cibil_feature[['user_id','enquiry_purpose','datetime_formatted', 'total_enq_till_date', 'unique_enquiry_purpose',
#        'total_enq_amt']].groupby(['user_id','enquiry_purpose','datetime_formatted']).sum().reset_index()

In [1066]:
# test_enq_feats.head(1)

In [1067]:
# test_enq_feats.columns = ['user_id','enquiry_purpose','datetime_formatted', 'total_enq_till_date_on_user_id_enq_purpose', 'unique_enquiry_purpose_on_user_id_enq_purpose',
#        'total_enq_amt_on_user_id_enq_purpose']

In [1068]:
# df_cibil_feature = df_cibil_feature.merge(test_enq_feats,on=['user_id','enquiry_purpose','datetime_formatted'],how='left')

In [1069]:
# df_cibil_feature.shape

In [1070]:
feat_dict = {'Gold Loan' : 'gl', 
             'Personal Loan' : 'personal',
             'Commercial Vehicle Loan' : 'cvl',
             'Credit Card' : 'cc'
            }

In [1071]:
def get_all_loan_type_feats(df,loan_type):
    df_l= df[df['loan_type']==loan_type] #'Gold Loan'
    
    final_cols = ['user_id','datetime_formatted']
    col_list = ['last_3_months_dpd', 'last_6_months_dpd', 'last_12_months_dpd',
       'last_36_months_dpd']
    df_grp = df_l[['user_id','datetime_formatted', 'last_3_months_dpd', 'last_6_months_dpd', 'last_12_months_dpd',
       'last_36_months_dpd']].groupby(['user_id','datetime_formatted']).sum().reset_index()
    key_to_add = feat_dict[loan_type]
    for col in col_list:
        temp = col + '_' +key_to_add
        final_cols.append(temp)
    df_grp.columns = final_cols
    df = df.merge(df_grp,on=['user_id','datetime_formatted'],how='left')
    return df

In [1259]:
# for item in list(feat_dict.keys()):
#     df_cibil_feature= get_all_loan_type_feats(df_cibil_feature,item)

In [130]:
# df_cibil_feature

In [131]:
df_cibil_feature.to_csv('all_feat_cibil_pdf_v2.csv',index=False)

In [132]:
df_cibil_feature = pd.read_csv('all_feat_cibil_pdf_v2.csv')

# V1 model - XGBoost | Loan Type Feats

In [133]:
df_cibil_feat = pd.read_csv('all_feat_cibil_pdf_v2.csv')

In [134]:
df_cibil_feat.shape

(1021, 43)

In [135]:
df_cibil_feat.user_id.nunique()

1021

In [136]:
# df_cibil_feat[['user_id','loan_id']].drop_duplicates()

In [137]:
# 2436-337

In [138]:
df_cibil_feat.columns

Index(['key', 'timestamp', 'datetime_formatted', 'cibil_score', 'total_email',
       'gender', 'age', 'open_loans', 'closed_loans', 'total_address',
       'sanc_amount', 'total_loans', 'total_phone_nos', 'dpd', 'dpd_bucket',
       'last_3_months_dpd', 'last_6_months_dpd', 'last_12_months_dpd',
       'last_36_months_dpd', 'total_enq_till_date', 'last_3_months_dpd_gl',
       'last_6_months_dpd_gl', 'last_12_months_dpd_gl',
       'last_36_months_dpd_gl', 'last_3_months_dpd_personal',
       'last_6_months_dpd_personal', 'last_12_months_dpd_personal',
       'last_36_months_dpd_personal', 'last_3_months_dpd_cvl',
       'last_6_months_dpd_cvl', 'last_12_months_dpd_cvl',
       'last_36_months_dpd_cvl', 'last_3_months_dpd_cc',
       'last_6_months_dpd_cc', 'last_12_months_dpd_cc',
       'last_36_months_dpd_cc', 'user_id', 'loan_type', 'ownership',
       'last_3_months_dpd_all', 'last_6_months_dpd_all',
       'last_12_months_dpd_all', 'last_36_months_dpd_all'],
      dtype='object'

In [139]:
# df_cibil_feat.sort_values(['user_id','loan_id','datetime_formatted'])[:15]

In [140]:
# df_cibil_feat.sort_values(['user_id','loan_id','datetime_formatted'],inplace=True)

In [141]:
# df_cibil_feat['open_loans_prev'] = df_cibil_feat['open_loans'].shift(1)

In [142]:
# df_cibil_feat['open_loans_prev'].fillna(0,inplace=True)

In [143]:
# df_cibil_feat['user_id_prev'] = df_cibil_feat['user_id'].shift(1)

In [144]:
# df_cibil_feat[['user_id','loan_id','closed_date']][:20]

In [145]:
df_cibil_feat['key'].nunique()

1021

In [146]:
# df_cibil_feat[df_cibil_feat['key']=='0-1']

In [147]:
# 0-1 , 0-11 > 0-2

In [148]:
# df_cibil_feat1 =  df_cibil_feat[(df_cibil_feat['open_loans_prev']!=df_cibil_feat['open_loans']) & (df_cibil_feat['total_loans']>1)]

In [149]:
# df_cibil_feat.groupby('key').agg(lambda x: x.tolist())


In [150]:
# df_cibil_feat1[['key','open_loans','closed_loans','total_loans']][:20]

In [151]:
# df_cibil_feat.sort_values('key',inplace=True)

In [152]:
## user X loan -> remove first loan based on timestamp (loan_open_date)

In [153]:
df_cibil_feat

Unnamed: 0,key,timestamp,datetime_formatted,cibil_score,total_email,gender,age,open_loans,closed_loans,total_address,...,last_6_months_dpd_cc,last_12_months_dpd_cc,last_36_months_dpd_cc,user_id,loan_type,ownership,last_3_months_dpd_all,last_6_months_dpd_all,last_12_months_dpd_all,last_36_months_dpd_all
0,0,2022-07-01,2022-07-01,645.0,4,MALE,42,7,4,4,...,0,0,0,0,Personal Loan,JOINT,414.0,979.0,1612.0,1877.0
1,2,2022-07-01,2022-06-01,712.0,0,MALE,42,1,0,4,...,0,0,0,2,Business Loan Priority Sector,INDIVIDUAL,0.0,0.0,0.0,0.0
2,3,2022-07-01,2022-06-01,757.0,1,FEMALE,29,1,6,3,...,0,0,0,3,Gold Loan,INDIVIDUAL,50.0,50.0,50.0,750.0
3,6,2018-01-01,2017-12-01,,0,FEMALE,38,1,0,1,...,0,0,0,6,Gold Loan,INDIVIDUAL,0.0,0.0,0.0,0.0
4,7,2022-09-01,2022-08-01,617.0,3,MALE,46,5,8,4,...,0,0,0,7,Commercial Vehicle Loan,INDIVIDUAL,4393.0,7452.0,13233.0,34131.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1016,1311,2022-10-01,2022-10-01,738.0,4,FEMALE,31,11,6,4,...,0,0,0,1311,Commercial Vehicle Loan,JOINT,0.0,0.0,0.0,0.0
1017,1312,2022-08-01,2022-07-01,694.0,0,MALE,36,5,2,4,...,0,0,0,1312,Commercial Vehicle Loan,JOINT,188.0,195.0,223.0,245.0
1018,1313,2022-09-01,2022-09-01,654.0,2,MALE,46,3,5,2,...,0,0,0,1313,,INDIVIDUAL,282.0,530.0,889.0,1086.0
1019,1314,2022-09-01,2022-08-01,521.0,0,MALE,32,1,0,2,...,0,0,0,1314,Commercial Vehicle Loan,INDIVIDUAL,3600.0,6300.0,11700.0,31456.0


In [154]:
# df_cibil_feat[['user_id','loan_id']].drop_duplicates()

In [155]:
df_cibil_feat['datetime_formatted'] = pd.to_datetime(df_cibil_feat.datetime_formatted, errors = 'coerce')

# first_loan_df = df_cibil_feat.groupby(['user_id', 'loan_id']).agg({'datetime_formatted': 'min'}).reset_index()
# first_loan_df['time_rank'] = df_cibil_feat.groupby(['user_id'])['datetime_formatted'].rank('dense', ascending=True)
# first_loan_df = first_loan_df[first_loan_df.time_rank !=1]

# df_cibil_feat = pd.merge(df_cibil_feat, first_loan_df, how = 'inner') 

In [156]:
# df_cibil_feat[df_cibil_feat[]]

In [157]:
# first_loan_df = df_cibil_feat.sort_values(['user_id','datetime_formatted']).groupby('user_id').agg({'loan_id': lambda x: list(x)[0]
#                                  }).reset_index()

In [158]:
df_cibil_feat

Unnamed: 0,key,timestamp,datetime_formatted,cibil_score,total_email,gender,age,open_loans,closed_loans,total_address,...,last_6_months_dpd_cc,last_12_months_dpd_cc,last_36_months_dpd_cc,user_id,loan_type,ownership,last_3_months_dpd_all,last_6_months_dpd_all,last_12_months_dpd_all,last_36_months_dpd_all
0,0,2022-07-01,2022-07-01,645.0,4,MALE,42,7,4,4,...,0,0,0,0,Personal Loan,JOINT,414.0,979.0,1612.0,1877.0
1,2,2022-07-01,2022-06-01,712.0,0,MALE,42,1,0,4,...,0,0,0,2,Business Loan Priority Sector,INDIVIDUAL,0.0,0.0,0.0,0.0
2,3,2022-07-01,2022-06-01,757.0,1,FEMALE,29,1,6,3,...,0,0,0,3,Gold Loan,INDIVIDUAL,50.0,50.0,50.0,750.0
3,6,2018-01-01,2017-12-01,,0,FEMALE,38,1,0,1,...,0,0,0,6,Gold Loan,INDIVIDUAL,0.0,0.0,0.0,0.0
4,7,2022-09-01,2022-08-01,617.0,3,MALE,46,5,8,4,...,0,0,0,7,Commercial Vehicle Loan,INDIVIDUAL,4393.0,7452.0,13233.0,34131.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1016,1311,2022-10-01,2022-10-01,738.0,4,FEMALE,31,11,6,4,...,0,0,0,1311,Commercial Vehicle Loan,JOINT,0.0,0.0,0.0,0.0
1017,1312,2022-08-01,2022-07-01,694.0,0,MALE,36,5,2,4,...,0,0,0,1312,Commercial Vehicle Loan,JOINT,188.0,195.0,223.0,245.0
1018,1313,2022-09-01,2022-09-01,654.0,2,MALE,46,3,5,2,...,0,0,0,1313,,INDIVIDUAL,282.0,530.0,889.0,1086.0
1019,1314,2022-09-01,2022-08-01,521.0,0,MALE,32,1,0,2,...,0,0,0,1314,Commercial Vehicle Loan,INDIVIDUAL,3600.0,6300.0,11700.0,31456.0


In [159]:
def grouper_df(df,key='key'):
    
    ## idea is to remove first loan  based on timestamp 
    
    
#     df = df[df['total_loans']>1]
#     dfs =[]
#     for i in range(92):
#         new_df = df[df['user_id']==i]
#         remove_loan_id = fist_loan_df[fist_loan_df['user_id']==i]['loan_id'].tolist()[0]
#         new_df = new_df[new_df['loan_id'] !=remove_loan_id]
#         dfs.append(new_df)
#     df=pd.concat(dfs)
# #     df = df[df['loan_id']!=0]
# #     df.reset_index(inplace=True)
    col_list = list(df.columns)
    col_list.remove('key')
    
    print(col_list)
    obj = df.groupby('key')
    new_df = pd.DataFrame()
    new_df['key'] = obj['key'].first()
    for col in col_list:
        new_df[col] = obj[col].apply(lambda x:list(x)[0])
    new_df.index = range(len(new_df))
    new_df.sort_values(['user_id','loan_id'],inplace=True)
    return new_df

In [160]:
# df_new = grouper_df(df_cibil_feat)

In [161]:
# df_new[df_new['user_id']==90]

In [162]:
# df_cibil_feat.sort_values(['user_id','datetime_formatted'])[:20]

In [163]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

## convert feats to numeric

In [164]:
# df_cibil_feat[df_cibil_feat['key']=='0-5']
# u0-0
# u0- 0 kth 
# u0 -1 (1st change point)


In [165]:
# df_new

In [166]:
df_cibil_feat

Unnamed: 0,key,timestamp,datetime_formatted,cibil_score,total_email,gender,age,open_loans,closed_loans,total_address,...,last_6_months_dpd_cc,last_12_months_dpd_cc,last_36_months_dpd_cc,user_id,loan_type,ownership,last_3_months_dpd_all,last_6_months_dpd_all,last_12_months_dpd_all,last_36_months_dpd_all
0,0,2022-07-01,2022-07-01,645.0,4,MALE,42,7,4,4,...,0,0,0,0,Personal Loan,JOINT,414.0,979.0,1612.0,1877.0
1,2,2022-07-01,2022-06-01,712.0,0,MALE,42,1,0,4,...,0,0,0,2,Business Loan Priority Sector,INDIVIDUAL,0.0,0.0,0.0,0.0
2,3,2022-07-01,2022-06-01,757.0,1,FEMALE,29,1,6,3,...,0,0,0,3,Gold Loan,INDIVIDUAL,50.0,50.0,50.0,750.0
3,6,2018-01-01,2017-12-01,,0,FEMALE,38,1,0,1,...,0,0,0,6,Gold Loan,INDIVIDUAL,0.0,0.0,0.0,0.0
4,7,2022-09-01,2022-08-01,617.0,3,MALE,46,5,8,4,...,0,0,0,7,Commercial Vehicle Loan,INDIVIDUAL,4393.0,7452.0,13233.0,34131.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1016,1311,2022-10-01,2022-10-01,738.0,4,FEMALE,31,11,6,4,...,0,0,0,1311,Commercial Vehicle Loan,JOINT,0.0,0.0,0.0,0.0
1017,1312,2022-08-01,2022-07-01,694.0,0,MALE,36,5,2,4,...,0,0,0,1312,Commercial Vehicle Loan,JOINT,188.0,195.0,223.0,245.0
1018,1313,2022-09-01,2022-09-01,654.0,2,MALE,46,3,5,2,...,0,0,0,1313,,INDIVIDUAL,282.0,530.0,889.0,1086.0
1019,1314,2022-09-01,2022-08-01,521.0,0,MALE,32,1,0,2,...,0,0,0,1314,Commercial Vehicle Loan,INDIVIDUAL,3600.0,6300.0,11700.0,31456.0


In [167]:
df_cibil_feat.dtypes

key                                     int64
timestamp                              object
datetime_formatted             datetime64[ns]
cibil_score                           float64
total_email                             int64
gender                                 object
age                                     int64
open_loans                              int64
closed_loans                            int64
total_address                           int64
sanc_amount                           float64
total_loans                             int64
total_phone_nos                         int64
dpd                                     int64
dpd_bucket                            float64
last_3_months_dpd                     float64
last_6_months_dpd                     float64
last_12_months_dpd                    float64
last_36_months_dpd                    float64
total_enq_till_date                     int64
last_3_months_dpd_gl                    int64
last_6_months_dpd_gl              

In [168]:
# df_cibil_feat['gender'].describe()

## change points 

## avg time series of dpd on user level 

In [169]:
df_cibil_feat['gender'] = df_cibil_feat['gender'].apply(lambda x: 1 if x=='Male' else 0)

In [170]:
def get_loan_type_encoder(loan_type):
    if loan_type=='Commercial Vehicle Loan':
        return 5
    elif loan_type=='Consumer Loan':
        return 4
    elif loan_type=='Gold Loan':
        return 3
    elif loan_type=='Personal Loan':
        return 2
    elif loan_type=='Credit Card':
        return 1
    else:
        return 0

In [171]:
list_loan_types = list(df_cibil_feat['loan_type'].unique())

In [172]:
for item in list_loan_types:
    print(item,df_cibil_feat[df_cibil_feat['loan_type']==item].shape[0]*100/df_cibil_feat.shape[0])

Personal Loan 8.81488736532811
Business Loan Priority Sector 2.546523016650343
Gold Loan 14.005876591576886
Commercial Vehicle Loan 25.367286973555338
 Auto Loan 1.860920666013712
Consumer Loan 14.789422135161606
nan 0.0
Two-wheeler Loan 6.170421155729676
Auto Loan 1.762977473065622
Other 3.1341821743388834
 Gold Loan 0.6856023506366308
 Housing Loan 0.1958863858961802
 Personal Loan 0.2938295788442703
  0.3917727717923604
 Commercial Vehicle Loan 0.6856023506366308
 Loan Against Bank Deposits 0.0979431929480901
 Business Loan Priority Sector 0.48971596474045054
Aut Loan 0.0979431929480901
 Two-wheeler Loan 0.1958863858961802
Microfinance-Business Loan 0.881488736532811
Credit Card 2.742409402546523
Loan to Professional 0.0979431929480901
Business Loan General 0.6856023506366308
Property Loan 1.0773751224289911
 Consumer Loan 0.881488736532811
Housing Loan 1.3712047012732616
Overdraft 0.0979431929480901
 E 0.1958863858961802
 Property Loan 0.0979431929480901
Loan Against Shares / Secur

In [173]:
df_cibil_feat['loan_type'] = df_cibil_feat['loan_type'].apply(get_loan_type_encoder)

In [174]:
df_cibil_feat['ownership'].unique()

array([' JOINT', ' INDIVIDUAL', ' GUARANTOR', '- JOINT'], dtype=object)

In [175]:
for item in list(df_cibil_feat['ownership'].unique()):
    print(item,df_cibil_feat[df_cibil_feat['ownership']==item].shape[0]*100/df_cibil_feat.shape[0])

 JOINT 18.609206660137122
 INDIVIDUAL 75.02448579823702
 GUARANTOR 6.268364348677767
- JOINT 0.0979431929480901


In [176]:
def get_ownership_encoder(ownership):
    if ownership=='Individual':
        return 3
    elif ownership=='Guarantor':
        return 2
    else:
        return 1

In [177]:
df_cibil_feat['ownership'] = df_cibil_feat['ownership'].apply(get_ownership_encoder)

In [179]:
# df_cibil_feat['enquiry_purpose'].unique()

In [181]:
# for item in list(df_cibil_feat['enquiry_purpose'].unique()):
#     print(item,df_cibil_feat[df_cibil_feat['enquiry_purpose']==item].shape[0]*100/df_cibil_feat.shape[0])

In [1298]:
def get_enquiry_purpose_encoder(enquiry_purpose):
    if enquiry_purpose=='Commercial Vehicle Loan':
        return 5
    elif enquiry_purpose=='Auto Loan (Personal)':
        return 4
    elif enquiry_purpose=='Tractor Loan':
        return 3
    elif enquiry_purpose=='Business Loan – General':
        return 2
    else:
        return 1

In [182]:
# df_cibil_feat['enquiry_purpose'] = df_cibil_feat['enquiry_purpose'].apply(get_enquiry_purpose_encoder)

## training and test split 

In [183]:
## 14 pdfs 
## 


In [184]:
## 619 pdf | ~ 1week   
## some 569 training pdf | (38 k user * loan )
##(2.4k change points)
## -> 

In [185]:
# df_cibil_feat = df_cibil_feat[df_cibil_feat['loan_type']>4]

In [186]:
df_cibil_feat

Unnamed: 0,key,timestamp,datetime_formatted,cibil_score,total_email,gender,age,open_loans,closed_loans,total_address,...,last_6_months_dpd_cc,last_12_months_dpd_cc,last_36_months_dpd_cc,user_id,loan_type,ownership,last_3_months_dpd_all,last_6_months_dpd_all,last_12_months_dpd_all,last_36_months_dpd_all
0,0,2022-07-01,2022-07-01,645.0,4,0,42,7,4,4,...,0,0,0,0,2,1,414.0,979.0,1612.0,1877.0
1,2,2022-07-01,2022-06-01,712.0,0,0,42,1,0,4,...,0,0,0,2,0,1,0.0,0.0,0.0,0.0
2,3,2022-07-01,2022-06-01,757.0,1,0,29,1,6,3,...,0,0,0,3,3,1,50.0,50.0,50.0,750.0
3,6,2018-01-01,2017-12-01,,0,0,38,1,0,1,...,0,0,0,6,3,1,0.0,0.0,0.0,0.0
4,7,2022-09-01,2022-08-01,617.0,3,0,46,5,8,4,...,0,0,0,7,5,1,4393.0,7452.0,13233.0,34131.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1016,1311,2022-10-01,2022-10-01,738.0,4,0,31,11,6,4,...,0,0,0,1311,5,1,0.0,0.0,0.0,0.0
1017,1312,2022-08-01,2022-07-01,694.0,0,0,36,5,2,4,...,0,0,0,1312,5,1,188.0,195.0,223.0,245.0
1018,1313,2022-09-01,2022-09-01,654.0,2,0,46,3,5,2,...,0,0,0,1313,0,1,282.0,530.0,889.0,1086.0
1019,1314,2022-09-01,2022-08-01,521.0,0,0,32,1,0,2,...,0,0,0,1314,5,1,3600.0,6300.0,11700.0,31456.0


In [187]:
df_cibil_feat['loan_type'].value_counts()

0    350
5    259
4    151
3    143
2     90
1     28
Name: loan_type, dtype: int64

In [188]:
# df_nested_list

In [189]:
# len(df_nested_list['AccountInformation'][0])

In [190]:
## remove age and total from feats | relative age from date 
## 

In [191]:
# df_ci

In [192]:
df_cibil_feat.iloc[8]

key                                             13
timestamp                               2022-07-01
datetime_formatted             2022-06-01 00:00:00
cibil_score                                  731.0
total_email                                      1
gender                                           0
age                                             33
open_loans                                       3
closed_loans                                    12
total_address                                    4
sanc_amount                                50000.0
total_loans                                     15
total_phone_nos                                  4
dpd                                              0
dpd_bucket                                0.586854
last_3_months_dpd                             21.0
last_6_months_dpd                             29.0
last_12_months_dpd                           129.0
last_36_months_dpd                           891.0
total_enq_till_date            

In [193]:
def next_3_month_bucet(next_3_months_dpd_on_user_id_loan_type):
    if next_3_months_dpd_on_user_id_loan_type==0:
        return 0
#     elif next_3_months_dpd_on_user_id_loan_type>0 and next_3_months_dpd_on_user_id_loan_type <=30:
#         return 1
#     elif next_3_months_dpd_on_user_id_loan_type>30 and next_3_months_dpd_on_user_id_loan_type<=90:
#         return 2
    else:
        return 1

In [194]:
df_cibil_feat['output'] = df_cibil_feat['dpd'].apply(next_3_month_bucet)

In [195]:
# df_cibil_feat['next_6_months_dpd'].describe()

In [196]:
df_cibil_feat['output'].value_counts()

0    850
1    171
Name: output, dtype: int64

In [197]:
df_cibil_feat.columns

Index(['key', 'timestamp', 'datetime_formatted', 'cibil_score', 'total_email',
       'gender', 'age', 'open_loans', 'closed_loans', 'total_address',
       'sanc_amount', 'total_loans', 'total_phone_nos', 'dpd', 'dpd_bucket',
       'last_3_months_dpd', 'last_6_months_dpd', 'last_12_months_dpd',
       'last_36_months_dpd', 'total_enq_till_date', 'last_3_months_dpd_gl',
       'last_6_months_dpd_gl', 'last_12_months_dpd_gl',
       'last_36_months_dpd_gl', 'last_3_months_dpd_personal',
       'last_6_months_dpd_personal', 'last_12_months_dpd_personal',
       'last_36_months_dpd_personal', 'last_3_months_dpd_cvl',
       'last_6_months_dpd_cvl', 'last_12_months_dpd_cvl',
       'last_36_months_dpd_cvl', 'last_3_months_dpd_cc',
       'last_6_months_dpd_cc', 'last_12_months_dpd_cc',
       'last_36_months_dpd_cc', 'user_id', 'loan_type', 'ownership',
       'last_3_months_dpd_all', 'last_6_months_dpd_all',
       'last_12_months_dpd_all', 'last_36_months_dpd_all', 'output'],
      dtyp

In [487]:
X = df_cibil_feat[[ 'total_email',
       'gender', 'age', 'open_loans', 'closed_loans', 'total_address','sanc_amount',
        'total_phone_nos', 'total_enq_till_date', 'loan_type', 'ownership',
#        'last_3_months_dpd_gl',
#        'last_6_months_dpd_gl', 'last_12_months_dpd_gl',
#        'last_36_months_dpd_gl',
        'last_3_months_dpd_personal', 'last_6_months_dpd_personal',
       'last_12_months_dpd_personal', 'last_36_months_dpd_personal',
        'last_3_months_dpd_cvl', 'last_6_months_dpd_cvl',
       'last_12_months_dpd_cvl', 'last_36_months_dpd_cvl',
#         'last_3_months_dpd_cc', 'last_6_months_dpd_cc', 'last_12_months_dpd_cc',
#        'last_36_months_dpd_cc',
        'output','last_3_months_dpd_all', 'last_6_months_dpd_all',
       'last_12_months_dpd_all','last_36_months_dpd_all'
       ]]

In [519]:
# !mkdir ../cibil_data/parsed_data_v2


In [518]:
# !cp -R ../cibil_data/parsed_data/ ../cibil_data/parsed_data_v2/

In [490]:
# df

In [491]:
y= df_cibil_feat[['output']]

In [492]:
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold, RandomizedSearchCV, train_test_split


In [493]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1,random_state=42)


In [494]:
from sklearn.metrics import DistanceMetric


In [495]:
# import imblearn
# print(imblearn.__version__)

In [496]:
# X_train['output'].value_counts()

In [497]:
df_train_0 = X_train[X_train['output']==0]
df_train_1 = X_train[X_train['output']==1]
df_train_0 = df_train_0.sample(n=171*2, random_state=42)
df_train_sample = df_train_0.append(df_train_1)
df_train_sample.shape 

(499, 24)

In [498]:
y= df_train_sample[['output']]

In [499]:
X_train=df_train_sample

In [500]:
X_train.shape, X_test.shape

((499, 24), (103, 24))

In [501]:
y_train=X_train[['output']]

In [502]:
X_train = X_train.drop('output',axis=1)

In [503]:
y_train['output'].value_counts()

0    342
1    157
Name: output, dtype: int64

In [504]:
y_test['output'].value_counts()

0    89
1    14
Name: output, dtype: int64

In [505]:
xgb_classifier =XGBClassifier(n_estimators=1000)


In [506]:
from sklearn.utils.class_weight import compute_sample_weight
sample_weights = compute_sample_weight(
    class_weight='balanced',
    y=y_train['output'] #provide your own target name
)
# 
# xgb_classifier.fit(X, y, sample_weight=sample_weights)

In [507]:
xgb_classifier.fit(X_train,y_train)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)




XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=1000, n_jobs=4,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [508]:
X_test = X_test.drop('output',axis=1)

In [509]:
predictions = xgb_classifier.predict(X_test)


In [510]:
y_proba = xgb_classifier.predict_proba(X_test)

In [511]:
print("Accuracy of Model::",accuracy_score(y_test,predictions))


Accuracy of Model:: 0.8349514563106796


In [512]:
y_test['output'].value_counts()

0    89
1    14
Name: output, dtype: int64

In [513]:
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, roc_curve,classification_report

def plot_roc_curve(true_y, y_prob):
    """
    plots the roc curve based of the probabilities
    """

    fpr, tpr, thresholds = roc_curve(true_y, y_prob)
    plt.plot(fpr, tpr)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')

In [514]:
print(confusion_matrix(y_test,predictions))

[[78 11]
 [ 6  8]]


In [515]:
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.93      0.88      0.90        89
           1       0.42      0.57      0.48        14

    accuracy                           0.83       103
   macro avg       0.67      0.72      0.69       103
weighted avg       0.86      0.83      0.85       103



In [516]:
import pickle
file_name = "xgb_cibil_classifier_v2.pkl"

# save
pickle.dump(xgb_classifier, open(file_name, "wb"))


In [220]:
# load
xgb_model_loaded = pickle.load(open(file_name, "rb"))


In [222]:
# !cp xgb_cibil_classifier.pkl ../model_dir/

In [237]:
# xgb_model_loaded.predict(X_test)

### modify csvs

In [243]:
import glob 

In [292]:
# !ls /home/ubuntu/fin_services/cibil_data/test/cibil_pdf/*

In [316]:

(pd.read_csv(glob.glob('/home/ubuntu/fin_services/cibil_data/parsed_data/MADARI SRINIVAS CIBIL_Report - Shaik Wassi.csv')[0])['enquiry'])

0    [{'member_name': 'TVS CREDIT', 'enquiry_date':...
Name: enquiry, dtype: object

In [308]:
# pd.read_csv(glob.glob('/home/ubuntu/fin_services/cibil_data/parsed_data/*.csv')[0])

In [304]:
len(test_data)

29

In [248]:
df_t = pd.read_csv(test_data[0])

In [264]:
df_t.columns

Index(['name', 'gender', 'cibil_name', 'user_identifier', 'dob', 'address',
       'phone_no', 'email', 'acc_summary', 'enquiry', 'cibil_score', 'ACCOUNT',
       'DATES', 'AMOUNTS', 'DPD_INFO'],
      dtype='object')

In [266]:
df_t.head()

Unnamed: 0,name,gender,cibil_name,user_identifier,dob,address,phone_no,email,acc_summary,enquiry,cibil_score,ACCOUNT,DATES,AMOUNTS,DPD_INFO
0,NAVNATH HANUMANT PHADKE S/O HANUMANT,MALE,CIBILTUSC3,"{'PAN': 'BHEPP5637G', 'VOTER ID': 'KXH1109412'...",01/06/1977,"{'address': ' BURDE VASTI, MALI PETHA, CHARHOL...",919822600000.0,NOEMAIL@YESBANK.IN,"{'total': ' 62', 'overdue': ' 2', 'zero_balanc...","{'member_name': 'TVS CREDIT', 'enquiry_date': ...",739.0,"{'member_name': 'NOT DISCLOSED', 'account_numb...","{'opened': '22-08-2022', 'last_payment': '', '...","{'sanctioned': ' 380,000', 'current_balance': ...",{'8-22': '000'}
1,,,,,,{'address': ' AP H NO 756 KALUS TAL KHED DIST ...,9822597000.0,NAVNATH57@GMAIL.COM,,"{'member_name': 'NOT DISCLOSED', 'enquiry_date...",,"{'member_name': 'NOT DISCLOSED', 'account_numb...","{'opened': '13-07-2022', 'last_payment': '', '...","{'sanctioned': ' 160,000', 'current_balance': ...","{'8-22': '000', '7-22': '000'}"
2,,,,,,{'address': ' A/P KALUS TAL KHED DIST PUNE NEA...,9822597000.0,NULL@GMAIL.COM,,"{'member_name': 'NOT DISCLOSED', 'enquiry_date...",,"{'member_name': 'NOT DISCLOSED', 'account_numb...","{'opened': '17-06-2022', 'last_payment': '', '...","{'sanctioned': ' 3,892,000', 'current_balance'...","{'7-22': '000', '6-22': '000'}"
3,,,,,,{'address': ' 616 MALAVADI RASTACHYA KHALIL FA...,9822597000.0,NA@MAIL.COM,,"{'member_name': 'NOT DISCLOSED', 'enquiry_date...",,"{'member_name': 'NOT DISCLOSED', 'account_numb...","{'opened': '24-05-2022', 'last_payment': ' 22-...","{'sanctioned': ' 4,340,231', 'current_balance'...","{'8-22': '000', '7-22': '000', '6-22': '000', ..."
4,,,,,,,,,,"{'member_name': 'NOT DISCLOSED', 'enquiry_date...",,"{'member_name': 'NOT DISCLOSED', 'account_numb...","{'opened': '27-04-2022', 'last_payment': ' 15-...","{'sanctioned': ' 83,873', 'current_balance': '...","{'8-22': '000', '7-22': '000', '6-22': '000', ..."


In [289]:
def df_aligner(df):
    col_list = list(df.columns)
#     print(col_list)
    static_cols = ['name', 'gender', 'cibil_name', 'user_identifier', 'dob','cibil_score']
    col_dict = {}
    for col in col_list:
        list_val = df[col].isnull().tolist()
        col_val = df[col].tolist()
        if True in list_val:
            idx = list_val.index(True)
            
            col_dict[col]= [col_val[:idx]]
        else:
            col_dict[col]= [col_val]
#     print(col_dict)
    return pd.DataFrame(col_dict)

In [290]:
df_aligner(df_t)

Unnamed: 0,name,gender,cibil_name,user_identifier,dob,address,phone_no,email,acc_summary,enquiry,cibil_score,ACCOUNT,DATES,AMOUNTS,DPD_INFO
0,[ NAVNATH HANUMANT PHADKE S/O HANUMANT],[ MALE],[CIBILTUSC3],"[{'PAN': 'BHEPP5637G', 'VOTER ID': 'KXH1109412...",[ 01/06/1977],"[{'address': ' BURDE VASTI, MALI PETHA, CHARHO...","[919822596539.0, 9822596539.0, 9822596539.0, 9...","[NOEMAIL@YESBANK.IN, NAVNATH57@GMAIL.COM, NULL...","[{'total': ' 62', 'overdue': ' 2', 'zero_balan...","[{'member_name': 'TVS CREDIT', 'enquiry_date':...",[739.0],"[{'member_name': 'NOT DISCLOSED', 'account_num...","[{'opened': '22-08-2022', 'last_payment': '', ...","[{'sanctioned': ' 380,000', 'current_balance':...","[{'8-22': '000'}, {'8-22': '000', '7-22': '000..."


In [520]:
all_files = glob.glob('../cibil_data/parsed_data_v2/parsed_data/*.csv')

In [521]:
df_val = []
for file in all_files:
    df_val.append(pd.read_csv(file))

In [522]:
df_val = pd.concat(df_val)

In [523]:
df_val = df_val[df_val['output'].notna()]

In [401]:
df_val.shape

(44, 16)

In [525]:
def get_cibil_score(cibil_info):
    t=0
    for item in eval(cibil_info):
        if item.isnumeric():
            t=int(item)
            return int(item)
    if t==0:
        print(cibil_info)
    return -1

In [526]:
df_val['cibil_score'] = df_val['cibil_info_with_factors'].apply(lambda x:get_cibil_score(x))

['CIBILTUSC3', '-1']


In [414]:
df_val.dtypes

name                        object
gender                      object
cibil_info_with_factors     object
cibil_name                  object
user_identifier             object
dob                         object
address                     object
phone_no                    object
email                       object
acc_summary                 object
account_info                object
enquiry                     object
account_info_new            object
output_dict                 object
output                     float64
cibil_score                  int64
dtype: object

In [527]:
df_val = df_val[df_val['cibil_score'].notna()]

In [528]:
df_val

Unnamed: 0,name,gender,cibil_info_with_factors,cibil_name,user_identifier,dob,address,phone_no,email,acc_summary,account_info,enquiry,account_info_new,output_dict,output,cibil_score
0,NAVNATH HANUMANT PHADKE S/O HANUMANT,MALE,"['CIBILTUSC3', 'PRESENCE OF SEVERE DELINQUENCY...",CIBILTUSC3,"{'PAN': 'BHEPP5637G', 'VOTER ID': 'KXH1109412'...",01/06/1977,"[{'address': ' BURDE VASTI, MALI PETHA, CHARHO...","['919822596539', '9822596539', '9822596539', '...","['NOEMAIL@YESBANK.IN', 'NAVNATH57@GMAIL.COM', ...","{'total': ' 62', 'overdue': ' 2', 'zero_balanc...","[{'ACCOUNT': {'member_name': 'NOT DISCLOSED', ...","[{'member_name': 'TVS CREDIT', 'enquiry_date':...","[{'8-22': '000'}, {'8-22': '000', '7-22': '000...","{0: 0.36550528, 1: 0.6344947}",1.0,739
0,TEJAVATHU ARUNA,FEMALE,"['CIBILTUSC3', 'INCREASE IN NON-MORTGAGE INDEB...",CIBILTUSC3,"{'VOTER ID': 'TAC0122887', 'RATION CARD': 'WAP...",01/01/1986,[{'address': ' MANGAPURAM NELAKONDAPALLI CHERU...,"['8179280881', '9618105941']",[],"{'total': ' 5', 'overdue': ' 0', 'zero_balance...","[{'ACCOUNT': {'member_name': 'NOT DISCLOSED', ...","[{'member': 'NOT DISCLOSED', 'enquiry_date': '...","[{'9-22': 'STD'}, {'9-22': 'STD', '8-22': 'STD...","{0: 0.99934155, 1: 0.00065845955}",0.0,709
0,MA DHARI SRINIVAS,MALE,"['CIBILTUSC3', '741']",CIBILTUSC3,"{'PAN': 'BEIPM2300E', 'RATION CARD': 'WAP15844...",06/04/1981,"[{'address': ' HYD HYDERABAD 500037', 'categor...","['9848312117', '9849842096']",[],"{'total': ' 9', 'overdue': ' 0', 'zero_balance...","[{'ACCOUNT': {'member_name': 'NOT DISCLOSED', ...","[{'member_name': 'TVS CREDIT', 'enquiry_date':...","[{'1-20': '000', '12-19': '000', '11-19': '000...","{0: 0.99970436, 1: 0.0002956314}",0.0,741
0,ALUGUNURI CHINNA UPPALAIAH A YELLAIAH,MALE,"['CIBILTUSC3', 'INCREASE IN NON-MORTGAGE INDEB...",CIBILTUSC3,"{'PAN': 'ASRPA3107F', 'UNIVERSAL ID': '2638872...",24/04/1982,[{'address': ' HNO 15 957 3 SRI SAI RESDY PHAN...,"['919959010019', '9959010019']",[],"{'total': ' 12', 'overdue': ' 0', 'zero_balanc...","[{'ACCOUNT': {'member_name': 'NOT DISCLOSED', ...","[{'member': 'NOT DISCLOSED', 'enquiry_date': '...","[{'7-22': 'STD'}, {'3-22': 'STD', '2-22': 'STD...","{0: 0.9998512, 1: 0.00014877407}",0.0,755
0,RAMANJANEYULU BOYA,MALE,"['CIBILTUSC3', 'HIGH BALANCE BUILD-UP ON NON-M...",CIBILTUSC3,"{'PAN': 'ERIPR8243B', 'UNIVERSAL ID': '7887841...",24/05/1995,"[{'address': ' S/O CHANDRA SEKHAR, D NO HANUMA...","['8309140705', '8247679163', '9160658705', '82...",['PRAMANJINEYULUBOYA1995@GMAIL.COM'],"{'total': ' 4', 'overdue': ' 0', 'zero_balance...","[{'ACCOUNT': {'member_name': 'NOT DISCLOSED', ...","[{'member': 'NOT DISCLOSED', 'enquiry_date': '...","[{'8-22': '000', '7-22': '000'}, {'8-22': '000...","{0: 0.9993405, 1: 0.0006595379}",0.0,745
0,RAMESH KUMAR,MALE,"['CIBILTUSC3', 'HIGH BALANCE BUILD-UP ON NON-M...",CIBILTUSC3,"{'PAN': 'JCAPK0774E', 'VOTER ID': 'SOY0593129'...",15/07/1995,[{'address': ' WARD 01 GUMANA KA WAS AJARI BAR...,"['8441904646', '8595621027', '9828440763', '84...",['RAMESHMAHALA62@GMAIL.COM'],"{'total': ' 4', 'overdue': ' 0', 'zero_balance...","[{'ACCOUNT': {'member_name': 'NOT DISCLOSED', ...","[{'member': 'NOT DISCLOSED', 'enquiry_date': '...","[{'10-22': '000', '9-22': '000', '8-22': '000'...","{0: 0.9989606, 1: 0.0010393843}",0.0,758
0,JAI SINGH S/O SWAROOP SINGH,MALE,"['CIBILTUSC3', 'PRESENCE OF DELINQUENCY AS OF ...",CIBILTUSC3,"{'PAN': 'FAYPS5323H', 'VOTER ID': 'SFK/0557462...",12/05/1991,[{'address': ' NAMAN ENTERPRISES IC 13 GUJAR B...,"['09785257344', '9785257344', '01419785257344']","['JAISINGH3357@GMAIL.COM', 'NOMAIL@NOMAIL.COM']","{'total': ' 6', 'overdue': ' 1', 'zero_balance...","[{'ACCOUNT': {'member_name': 'NOT DISCLOSED', ...","[{'member': 'NOT DISCLOSED', 'enquiry_date': '...","[{'8-22': '011', '7-22': '000', '6-22': '000',...","{0: 0.97579336, 1: 0.02420664}",0.0,705
0,BORRACHINNAVENKATARAO NARESH,MALE,"['CIBILTUSC3', 'PRESENCE OF DELINQUENCY', '701...",CIBILTUSC3,"{'PAN': 'CDVPN7402B', 'VOTER ID': 'WQN1143229'...",10/09/1994,[{'address': ' TXT_CUSTADR_ADD1_8494502 533124...,"['919550076291', '9550076291']",[],"{'total': ' 5', 'overdue': ' 1', 'zero_balance...","[{'ACCOUNT': {'member_name': 'NOT DISCLOSED', ...","[{'member': 'NOT DISCLOSED', 'enquiry_date': '...","[{'7-22': '047', '6-22': '046', '5-22': '016',...","{0: 0.13821691, 1: 0.8617831}",1.0,701
0,VENKATESH VENKATA SWAMY TH,MALE,"['CIBILTUSC3', 'PRESENCE OF SEVERE DELINQUENCY...",CIBILTUSC3,"{'PAN': 'BHCPT4233Q', 'DRIVING LICENSE': 'DLFA...",04/07/1980,[{'address': ' SOT VENKATA SWAMI H NO 46 GARLA...,['919959024709'],[],"{'total': ' 12', 'overdue': ' 2', 'zero_balanc...","[{'ACCOUNT': {'member_name': 'NOT DISCLOSED', ...","[{'member_name': 'TVS CREDIT', 'enquiry_date':...","[{'8-21': '000'}, {'2-22': 'STD', '1-22': '000...","{0: 0.8311287, 1: 0.16887127}",0.0,724
0,KAMMARI ANAND KUMAR,FEMALE,"['CIBILTUSC3', 'PRESENCE OF DELINQUENCY AS OF ...",CIBILTUSC3,"{'PAN': 'AUWPA7167Q', 'DRIVING LICENSE': 'DLFA...",15/05/1984,[{'address': ' KAMMARI IHNO13THUNKIMETLA.VILL....,['8106109317'],[],"{'total': ' 3', 'overdue': ' 0', 'zero_balance...","[{'ACCOUNT': {'member_name': 'NOT DISCLOSED', ...","[{'member': 'NOT DISCLOSED', 'enquiry_date': '...","[{'6-18': '000'}, {'6-21': 'STD', '5-21': 'STD...","{0: 0.8536272, 1: 0.14637278}",0.0,687


In [529]:
df_val['cibil_score'].describe()

count     44.000000
mean     703.500000
std      118.375536
min       -1.000000
25%      690.000000
50%      727.000000
75%      752.750000
max      790.000000
Name: cibil_score, dtype: float64

In [530]:
df_val[df_val['cibil_score']>727].shape

(22, 16)

In [531]:
df_val[df_val['cibil_score']>727]['output'].tolist().count(0)

20

In [534]:
df_val['zero_prob'] = df_val['output_dict'].apply(lambda x:eval(x)[0])

In [535]:
new_df_val = df_val.sort_values('zero_prob',ascending=False)

In [547]:
new_df_val['name'] = new_df_val['name'].apply(lambda x:x.strip())

In [536]:
new_df_val[new_df_val['cibil_score']>600][['cibil_score','zero_prob']].corr()

Unnamed: 0,cibil_score,zero_prob
cibil_score,1.0,0.425129
zero_prob,0.425129,1.0


In [559]:
new_df_val

Unnamed: 0,name,gender,cibil_info_with_factors,cibil_name,user_identifier,dob,address,phone_no,email,acc_summary,account_info,enquiry,account_info_new,output_dict,output,cibil_score,zero_prob
0,RATHNAKAR REDDY KARRA,MALE,"['CIBILTUSC3', 'PRESENCE OF SEVERE DELINQUENCY...",CIBILTUSC3,"{'PAN': 'BRYPK9541E', 'UNIVERSAL ID': '9874053...",14/10/1985,"[{'address': ' THIMMAPUR 506316', 'category': ...","['9963774880', '9963774880', '9100492211', '99...",[],"{'total': ' 28', 'overdue': ' 0', 'zero_balanc...","[{'ACCOUNT': {'member_name': 'NOT DISCLOSED', ...","[{'member_name': 'NOT DISCLOSED', 'enquiry_dat...","[{'8-22': '000'}, {'7-22': '000'}, {'6-22': '0...","{0: 0.99998754, 1: 1.2454441e-05}",0.0,714,0.999988
0,PONNANA APPA NNA,MALE,"['CIBILTUSC3', '785']",CIBILTUSC3,"{'VOTER ID': 'IQV0325118', 'DRIVING LICENSE': ...",18/07/1990,"[{'address': '', 'category': ' Residence Addre...","['917093763494', '7093763494']","['PONNANAAPPANNA363@GMAIL.COM', 'NULL@EMAIL.COM']","{'total': ' 4', 'overdue': ' 0', 'zero_balance...","[{'ACCOUNT': {'member_name': 'NOT DISCLOSED', ...","[{'member': 'NOT DISCLOSED', 'enquiry_date': '...","[{'7-22': '000', '6-22': '000', '5-22': '000',...","{0: 0.9999851, 1: 1.4872751e-05}",0.0,785,0.999985
0,MEKA SYAMUNDESWARI,FEMALE,"['CIBILTUSC3', 'CREDIT AGE LESS THAN SIX MONTH...",CIBILTUSC3,"{'PAN': 'HPOPM6549Q', 'VOTER ID': 'UDD1082767'}",01/01/1991,[{'address': ' 2-5/A CHINA BHIMPALLI DEVIPATNA...,['9908844706'],[],"{'total': ' 1', 'overdue': ' 0', 'zero_balance...","[{'ACCOUNT': {'member_name': 'NOT DISCLOSED', ...","[{'member_name': 'NOT DISCLOSED', 'enquiry_dat...",[{'9-22': 'STD'}],"{0: 0.9999237, 1: 3.939138e-06, 2: 6.906948e-0...",0.0,747,0.999924
0,DUNDAPPA WALI SHIVAPUTRAPPA,MALE,"['CIBILTUSC3', 'HIGH BALANCE BUILD-UP ON NON-M...",CIBILTUSC3,"{'PAN': 'AVNPV0964A', 'VOTER ID': 'XXB3154721'...",04/06/1991,[{'address': ' GALAGALI GALAGALI GALAGALI 5873...,"['919632737145', '7411477145', '7411477145', '...",['NOMAIL@GMAIL.COM'],"{'total': ' 8', 'overdue': ' 0', 'zero_balance...","[{'ACCOUNT': {'member_name': 'NOT DISCLOSED', ...","[{'member_name': 'NOT DISCLOSED', 'enquiry_dat...","[{'10-22': '000', '9-22': '000'}, {'9-22': '00...","{0: 0.999881, 1: 0.0001189453}",0.0,746,0.999881
0,BHUVANAIKAVA RAO CHITTUKURI,MALE,"['CIBILTUSC3', 'PRESENCE OF DELINQUENCY', '744...",CIBILTUSC3,"{'PAN': 'ASKPC5326K', 'VOTER ID': 'IMZ1631846'...",04/06/1970,"[{'address': '', 'category': '', 'residential_...","['9347973292', '9494526598', '9494526598', '99...",['CHARANKOUSHIK38@GMAIL.COM'],"{'total': ' 16', 'overdue': ' 0', 'zero_balanc...","[{'ACCOUNT': {'member_name': 'NOT DISCLOSED', ...","[{'member': 'NOT DISCLOSED', 'enquiry_date': '...","[{'7-22': '000', '6-22': '000', '5-22': '000',...","{0: 0.9998757, 1: 0.00012425639}",0.0,744,0.999876
0,BUDDAPATHINI SARASWATHI,FEMALE,"['CIBILTUSC3', '790']",CIBILTUSC3,"{'PAN': 'CYKPB8432K', 'UNIVERSAL ID': '3920050...",05/10/1986,[{'address': ' SAME HYDERABAD HYDERABAD 500039...,"['6300927920', '6300927920', '6300927920', '73...",['SRIDHARBODABATHINI@GMAIL.COM'],"{'total': ' 4', 'overdue': ' 0', 'zero_balance...","[{'ACCOUNT': {'member_name': 'NOT DISCLOSED', ...","[{'member': 'NOT DISCLOSED', 'enquiry_date': '...","[{'8-22': '000'}, {'8-22': 'STD', '7-22': 'STD...","{0: 0.999863, 1: 0.0001369534}",0.0,790,0.999863
0,VAHIL MALANG SHAIKH,MALE,"['CIBILTUSC3', '752']",CIBILTUSC3,"{'PAN': 'CJKPS0305M', 'PASSPORT': 'E7237221', ...",05/06/1984,"[{'address': ' SOLAPUR ,SOLAPUR ,MAHARASHTRA 4...","['9552902243', '7058660123', '7058660123', '70...",[],"{'total': ' 5', 'overdue': ' 0', 'zero_balance...","[{'ACCOUNT': {'member_name': 'NOT DISCLOSED', ...","[{'member': 'NOT DISCLOSED', 'enquiry_date': '...","[{'8-19': 'STD', '7-19': 'STD', '6-19': 'STD',...","{0: 0.99985737, 1: 0.00014260989}",0.0,752,0.999857
0,ALUGUNURI CHINNA UPPALAIAH A YELLAIAH,MALE,"['CIBILTUSC3', 'INCREASE IN NON-MORTGAGE INDEB...",CIBILTUSC3,"{'PAN': 'ASRPA3107F', 'UNIVERSAL ID': '2638872...",24/04/1982,[{'address': ' HNO 15 957 3 SRI SAI RESDY PHAN...,"['919959010019', '9959010019']",[],"{'total': ' 12', 'overdue': ' 0', 'zero_balanc...","[{'ACCOUNT': {'member_name': 'NOT DISCLOSED', ...","[{'member': 'NOT DISCLOSED', 'enquiry_date': '...","[{'7-22': 'STD'}, {'3-22': 'STD', '2-22': 'STD...","{0: 0.9998512, 1: 0.00014877407}",0.0,755,0.999851
0,SARGUNARAJ J,MALE,"['CIBILTUSC3', '764']",CIBILTUSC3,"{'VOTER ID': 'KPT1604024', 'DRIVING LICENSE': ...",19/07/1985,[{'address': ' 108 NADAR NORTH STREET THALAIKU...,"['7373373726', '9791986944']",[],"{'total': ' 7', 'overdue': ' 0', 'zero_balance...","[{'ACCOUNT': {'member_name': 'NOT DISCLOSED', ...","[{'member': 'NOT DISCLOSED', 'enquiry_date': '...","[{'7-22': '000', '6-22': '000'}, {'7-22': '000...","{0: 0.9997729, 1: 0.0002270897}",0.0,764,0.999773
0,MA DHARI SRINIVAS,MALE,"['CIBILTUSC3', '741']",CIBILTUSC3,"{'PAN': 'BEIPM2300E', 'RATION CARD': 'WAP15844...",06/04/1981,"[{'address': ' HYD HYDERABAD 500037', 'categor...","['9848312117', '9849842096']",[],"{'total': ' 9', 'overdue': ' 0', 'zero_balance...","[{'ACCOUNT': {'member_name': 'NOT DISCLOSED', ...","[{'member_name': 'TVS CREDIT', 'enquiry_date':...","[{'1-20': '000', '12-19': '000', '11-19': '000...","{0: 0.99970436, 1: 0.0002956314}",0.0,741,0.999704


In [537]:
df_bb = pd.read_csv('../cibil_data/bb_val_sample/FS OwnBook - Loan Status Tracker Nov\'22 (Confidential) - 3.Cibil Check Master.csv')

In [543]:
df_bb.columns = df_bb.iloc[0]

In [550]:
df_bb.columns

Index(['TIMESTAMP', 'Month', 'Date', 'Zone', 'State', 'Branch Code',
       'Branch Name', 'CRO/BM Emp Code', 'CRO/BM Name', 'Borrower Name',
       'Borrower PAN', 'Applicant Mob #', 'Fasttrack ID',
       'Applicant \nCibil Score', 'Applicant\nCibil Report',
       'Applicant \nOK/NOT OK', 'Co-Applicant \nCibil Score',
       'Co-Applicant\nCibil Report', 'Co-Applicant \nOK/NOT OK',
       'Guarantor\nCibil Score', 'Guarantor\nCibil Report',
       'Guarantor\nOK/NOT OK', 'Soft Sanction\nY/N\n(MH, TS)',
       'MS1: Sanction Requested (Y/N)'],
      dtype='object', name=0)

In [553]:
df_bb['Borrower Name'] = df_bb['Borrower Name'].apply(lambda x:x.strip() if type(x)=='str' else x)

In [558]:
df_bb.merge(new_df_val,left_on='Borrower Name',right_on='name')[['name','cibil_score','output','Applicant \nOK/NOT OK']]

Unnamed: 0,name,cibil_score,output,Applicant \nOK/NOT OK
0,KAMMARI ANAND KUMAR,687,0.0,OK
1,UNNAMATLA SUDHA DEVI,687,0.0,NOT OK
2,BONTHI RANGA MANI,696,0.0,OK
3,RAMESH KUMAR,758,0.0,OK


In [562]:
new_df_val.reset_index(inplace=True)

In [566]:
new_df_val['user_identifier'][0]

"{'PAN': 'BRYPK9541E', 'UNIVERSAL ID': '987405362319'}"

In [570]:
def map_bb_files_with_val(df_bb,df_val):
    
    dfs=[]
    for i,r in df_val.iterrows():
        name=phone=cibil_score=output=is_ok=''
        name=r['name']
        uid = eval(r['user_identifier'])
        pan_id = ''
        if 'PAN' in uid:
            pan_id=uid['PAN']
        phone_nos = eval(r['phone_no'])
        
        if pan_id !='':
            is_ok_l = df_bb[df_bb['Borrower PAN']==pan_id]['Applicant \nOK/NOT OK'].tolist()
            if len(is_ok_l)>0:
                is_ok=is_ok_l[0]
            
        temp_df = pd.DataFrame({'name':[name],'cibil_score':[r['cibil_score']],'output':[r['output']], 'is_ok_bb':[is_ok]}
                              )
        dfs.append(temp_df)
    return pd.concat(dfs)
        
        
        
    

In [572]:
v1 = map_bb_files_with_val(df_bb,new_df_val)

In [576]:
v1['is_ok_bb'] = v1.apply(lambda x: 'NOT OK' if (x['cibil_score'])<600 else x['is_ok_bb'],axis=1)

In [577]:
v1[v1['is_ok_bb']!='']

Unnamed: 0,name,cibil_score,output,is_ok_bb
0,DUNDAPPA WALI SHIVAPUTRAPPA,746,0.0,OK
0,VAHIL MALANG SHAIKH,752,0.0,OK
0,MA DHARI SRINIVAS,741,0.0,OK
0,DINESH CHANGARA CHANGARA,763,0.0,OK
0,VILAS RAMRAO KANJALE,757,0.0,OK
0,KOPPISETTI SRINIVASU,670,0.0,OK
0,B SRIDHAR,728,0.0,OK
0,MR. CHAGANRAO EKNATH DIGHE,617,0.0,OK
0,VIPPARLA PRAKASAM,760,0.0,NOT OK
0,JAI SINGH S/O SWAROOP SINGH,705,0.0,OK


In [367]:
t = (pd.read_csv('../cibil_data/parsed_data/SALEEM SAYYAD CIBIL_Report - Shaik Wassi.csv'))
change_cols = ['cibil_info_with_factors', 'address', 'phone_no', 'email', 'acc_summary', 'account_info','enquiry','account_info_new']
for col in change_cols:
    t[col] = t[col].apply(lambda x:eval(x))

In [371]:
t

Unnamed: 0,name,gender,cibil_info_with_factors,cibil_name,user_identifier,dob,address,phone_no,email,acc_summary,account_info,enquiry,account_info_new
0,SAYYADSALEEM SAYYAD JALAL,MALE,"[CIBILTUSC3, 747]",CIBILTUSC3,"{'PAN': 'FTIPS5017M', 'VOTER ID': 'HLS2642262'...",01/01/1966,"[{'address': ' 155 WAGHI RAOD ,HASSAPUR TQ DIS...",[8888307285],[NULL@GMAIL.COM],"{'total': ' 1', 'overdue': ' 0', 'zero_balance...","[{'ACCOUNT': {'member_name': 'NOT DISCLOSED', ...","[{'member': 'NOT DISCLOSED', 'enquiry_date': '...","[{'7-22': '000', '6-22': '000', '5-22': '000',..."


In [368]:
def get_enquiry_table(df): 
    df_n = []
#     print(df.enquiry)
    for idx,row in df.iterrows():
        user_id = row.name
        enq_info = row['enquiry']
        for elem in enq_info:
            date = elem['enquiry_date']
            enq_purpose = elem['enquiry_purpose']
            try:
                enq_amt = get_sanc_amt(elem['enquiry_amount'])
            except:
                print(elem)
                break
            df_enq = pd.DataFrame({'user_id':[user_id],'date':[date],'enq_purpose':[enq_purpose],'enq_amount':[enq_amt]})
            df_n.append(df_enq)
         
    if len(df_n)<1:
        return pd.DataFrame({'user_id':[user_id],'date':[''],'enq_purpose':[''],'enq_amount':['']})
    return pd.concat(df_n)

In [369]:
get_enquiry_table(t)

Unnamed: 0,user_id,date,enq_purpose,enq_amount
0,0,07-09-2020,Commercial Vehicle Loan,484000
0,0,07-09-2020,Commercial Vehicle Loan,484000
0,0,05-09-2020,Commercial Vehicle Loan,495000


In [384]:
def get_dpd_raw_table(df):
    dfs = [] 
#     print()
    for idx,row in df.iterrows():
        user_id = row.name 
        acc_info_list = row['account_info']
        dpd_info_json = row['account_info_new']
        print(len(dpd_info_json))
#         print()
        for i in range(len(acc_info_list)):
            print(acc_info_list[i])
            loan_id = i 
            loan_info = acc_info_list[i]
            sanc_amount =''
            loan_type = loan_info['ACCOUNT']['TYPE']
            ownership = loan_info['ACCOUNT']['ownership']
            if 'sanctioned' in loan_info['AMOUNTS']:
                sanc_amount=loan_info['AMOUNTS']['sanctioned']
            interest_rate = loan_info['AMOUNTS']['interest_rate']
            repayment_tenure = loan_info['AMOUNTS']['repay_tenure']
            emi_amount = loan_info['AMOUNTS']['emi']
            pmt_freq = loan_info['AMOUNTS']['pmt_freq']
            open_date = loan_info['DATES']['opened']
            closed_date = loan_info['DATES']['closed']
#             dpd_list = loan_info['PaymentHistory']['dayPayDue']
            timestamp_json = dpd_info_json[i]
            for k,v in timestamp_json.items():
                is_open=1
                is_closed=0
                ## dd-mm-yyyy
                print(closed_date)
                
                
#                     print(closed_date)
                try:

                    curr_timestamp = pd.to_datetime(k,format='%m-%y')
                    closed_date_m_y = pd.to_datetime(closed_date,dayfirst=True)
                    if len(closed_date.split('-'))>=3:
                        
                        if curr_timestamp.year==closed_date_m_y.year and curr_timestamp.month==closed_date_m_y.month:
                            is_open=0
                            is_closed=1
                    df_dpd = pd.DataFrame({'user_id':[user_id],'loan_id':[loan_id],'timestamp':[k],'dpd':[v],'loan_type':[loan_type],'ownership':[ownership],'sanc_amount':[sanc_amount], 'interest_rate':[interest_rate], 
                                  'repayment_tenure':[repayment_tenure],'emi_amount':[emi_amount],'pmt_freq':[pmt_freq],'open_date':[open_date],
                                  'closed_date':[closed_date],'is_open':[is_open],'is_closed':[is_closed]})
                    dfs.append(df_dpd)
                except:
                    break
                

                    
                        
    return pd.concat(dfs)

In [388]:
pd.DataFrame({}).shape

(0, 0)