### In this notebook, I tried to generate features that correspond to user's payement behaviour and creditworthiness
### KEY STEPS:
#### *1. generate features like total number of non-closed loans for a particular ID total number of closed-loans for each ACCOUNT_TYPE*
#### *2. total loan sanctioned amount and total amount paid till date reported and certified for each ACCOUNT_TYPE*


In [None]:
import pandas as pd

In [3]:
df_all=pd.read_csv("train_all_loan.csv")
pd.set_option('display.max_columns', None)
df_main=pd.read_csv("missing_handled_main_train.csv")

In [4]:
account_type_df = pd.DataFrame(df_all['ACCOUNT_TYPE'].unique(), columns=['ACCOUNT_TYPE'])

In [5]:
# account_type_df

In [6]:
import pandas as pd
# Define loan categories by name, including "Corporate Credit Card" as a Personal Loan
loan_categories_by_name = {
    "Personal Loan": ["Personal Loan", "Short Term Personal Loan", "P2P Personal Loan", "Microfinance - Personal Loan", "Loan to Professional", "Temporary Overdraft", "Consumer Loan", "Two-wheeler Loan", "Auto Loan (Personal)"],
  "Property Loan": ["Property Loan", "Loan Against Bank Deposits", "Overdraft", "Loan Against Shares/Securities", "Used Car Loan", "Secured Credit Card"],
  "Business Loan": [
      "Business Loan - General", "Business Loan - Unsecured", "Business Loan - Secured",
      "Business Loan - Priority Sector - Agriculture", "Business Loan - Priority Sector - Small Business", 
      "Business Loan - Priority Sector - Others", "Microfinance - Business Loan", "GECL Loan Secured", 
      "Business Non-Funded Credit Facility - Priority Sector", "Construction Equipment Loan", 
      "GECL Loan Unsecured", "Business Loan Against Bank Deposits", "Business Non-Funded Credit Facility - General", 
      "Microfinance - Other", "Non-Funded Credit Facility", "Leasing", 
      "Business Non-Funded Credit Facility - Priority Sector - Others", "Fleet Card", 
      "Mudra Loans - Shishu / Kishor / Tarun", "Tractor Loan"
  ],
  "Housing Loan": ["Housing Loan", "Microfinance - Housing Loan", "Pradhan Mantri Awas Yojana - Credit Link Subsidy Scheme"],
  "Gold Loan": ["Gold Loan", "Priority Sector - Gold Loan"],
  "Kisan Credit Card": ["Kisan Credit Card"],
  "Education Loan": ["Education Loan"]
}

In [7]:
# Create a new column 'LOAN_CATEGORY' in df_all based on loan categories, including "Kisan Credit Card" and "Education Loan"
df_all['LOAN_CATEGORY'] = df_all['ACCOUNT_TYPE'].apply(lambda x: next((k for k, v in loan_categories_by_name.items() if x in v), None))

# Filter df_all for personal, property, business, gold, housing, kisan credit card, and education loans
# Count the occurrences for each loan category
personal_loans_count = df_all[df_all['LOAN_CATEGORY'] == 'Personal Loan'].groupby('ID').size().reset_index(name='PERSONAL_LOANS')
property_loans_count = df_all[df_all['LOAN_CATEGORY'] == 'Property Loan'].groupby('ID').size().reset_index(name='PROPERTY_LOANS')
business_loans_count = df_all[df_all['LOAN_CATEGORY'] == 'Business Loan'].groupby('ID').size().reset_index(name='BUSINESS_LOANS')
gold_loans_count = df_all[df_all['LOAN_CATEGORY'] == 'Gold Loan'].groupby('ID').size().reset_index(name='GOLD_LOANS')
housing_loans_count = df_all[df_all['LOAN_CATEGORY'] == 'Housing Loan'].groupby('ID').size().reset_index(name='HOUSING_LOANS')
kisan_credit_card_count = df_all[df_all['LOAN_CATEGORY'] == 'Kisan Credit Card'].groupby('ID').size().reset_index(name='KISAN_CREDIT_CARD_LOANS')
education_loans_count = df_all[df_all['LOAN_CATEGORY'] == 'Education Loan'].groupby('ID').size().reset_index(name='EDUCATION_LOANS')

# Merge the loan counts with the occurrences dataframe
occurrences = df_main.merge(personal_loans_count, on='ID', how='left')
occurrences = occurrences.merge(property_loans_count, on='ID', how='left')
occurrences = occurrences.merge(business_loans_count, on='ID', how='left')
occurrences = occurrences.merge(gold_loans_count, on='ID', how='left')
occurrences = occurrences.merge(housing_loans_count, on='ID', how='left')
occurrences = occurrences.merge(kisan_credit_card_count, on='ID', how='left')
occurrences = occurrences.merge(education_loans_count, on='ID', how='left')


In [8]:
# Count closed loans in each category, including "Kisan Credit Card" and "Education Loan"
closed_personal_loans = df_all[(df_all['LOAN_CATEGORY'] == 'Personal Loan') & (df_all['DATE_CLOSED'].notna())].groupby('ID').size().reset_index(name='CLOSED_PERSONAL_LOANS')
closed_property_loans = df_all[(df_all['LOAN_CATEGORY'] == 'Property Loan') & (df_all['DATE_CLOSED'].notna())].groupby('ID').size().reset_index(name='CLOSED_PROPERTY_LOANS')
closed_business_loans = df_all[(df_all['LOAN_CATEGORY'] == 'Business Loan') & (df_all['DATE_CLOSED'].notna())].groupby('ID').size().reset_index(name='CLOSED_BUSINESS_LOANS')
closed_gold_loans = df_all[(df_all['LOAN_CATEGORY'] == 'Gold Loan') & (df_all['DATE_CLOSED'].notna())].groupby('ID').size().reset_index(name='CLOSED_GOLD_LOANS')
closed_housing_loans = df_all[(df_all['LOAN_CATEGORY'] == 'Housing Loan') & (df_all['DATE_CLOSED'].notna())].groupby('ID').size().reset_index(name='CLOSED_HOUSING_LOANS')
closed_kisan_credit_card_loans = df_all[(df_all['LOAN_CATEGORY'] == 'Kisan Credit Card') & (df_all['DATE_CLOSED'].notna())].groupby('ID').size().reset_index(name='CLOSED_KISAN_CREDIT_CARD_LOANS')
closed_education_loans = df_all[(df_all['LOAN_CATEGORY'] == 'Education Loan') & (df_all['DATE_CLOSED'].notna())].groupby('ID').size().reset_index(name='CLOSED_EDUCATION_LOANS')

# Merge closed loan counts with the occurrences dataframe
occurrences = occurrences.merge(closed_personal_loans, on='ID', how='left')
occurrences = occurrences.merge(closed_property_loans, on='ID', how='left')
occurrences = occurrences.merge(closed_business_loans, on='ID', how='left')
occurrences = occurrences.merge(closed_gold_loans, on='ID', how='left')
occurrences = occurrences.merge(closed_housing_loans, on='ID', how='left')
occurrences = occurrences.merge(closed_kisan_credit_card_loans, on='ID', how='left')
occurrences = occurrences.merge(closed_education_loans, on='ID', how='left')


In [9]:
# Calculate the paid sums for each loan category, including "Kisan Credit Card" and "Education Loan"
housing_loans_paid_sum = df_all[df_all['LOAN_CATEGORY'] == 'Housing Loan'][df_all['DATE_CLOSED'].notna()].groupby('ID')['HIGH_CREDIT_OR_SANCTIONED_AMOUNT'].sum().reset_index(name='HOUSING_LOANS_PAID_SUM')
personal_loans_paid_sum = df_all[df_all['LOAN_CATEGORY'] == 'Personal Loan'][df_all['DATE_CLOSED'].notna()].groupby('ID')['HIGH_CREDIT_OR_SANCTIONED_AMOUNT'].sum().reset_index(name='PERSONAL_LOANS_PAID_SUM')
property_loans_paid_sum = df_all[df_all['LOAN_CATEGORY'] == 'Property Loan'][df_all['DATE_CLOSED'].notna()].groupby('ID')['HIGH_CREDIT_OR_SANCTIONED_AMOUNT'].sum().reset_index(name='PROPERTY_LOANS_PAID_SUM')
business_loans_paid_sum = df_all[df_all['LOAN_CATEGORY'] == 'Business Loan'][df_all['DATE_CLOSED'].notna()].groupby('ID')['HIGH_CREDIT_OR_SANCTIONED_AMOUNT'].sum().reset_index(name='BUSINESS_LOANS_PAID_SUM')
gold_loans_paid_sum = df_all[df_all['LOAN_CATEGORY'] == 'Gold Loan'][df_all['DATE_CLOSED'].notna()].groupby('ID')['HIGH_CREDIT_OR_SANCTIONED_AMOUNT'].sum().reset_index(name='GOLD_LOANS_PAID_SUM')
kisan_credit_card_paid_sum = df_all[df_all['LOAN_CATEGORY'] == 'Kisan Credit Card'][df_all['DATE_CLOSED'].notna()].groupby('ID')['HIGH_CREDIT_OR_SANCTIONED_AMOUNT'].sum().reset_index(name='KISAN_CREDIT_CARD_LOANS_PAID_SUM')
education_loans_paid_sum = df_all[df_all['LOAN_CATEGORY'] == 'Education Loan'][df_all['DATE_CLOSED'].notna()].groupby('ID')['HIGH_CREDIT_OR_SANCTIONED_AMOUNT'].sum().reset_index(name='EDUCATION_LOANS_PAID_SUM')

# Merge the paid sums with the occurrences dataframe
occurrences = occurrences.merge(property_loans_paid_sum, on='ID', how='left')
occurrences = occurrences.merge(business_loans_paid_sum, on='ID', how='left')
occurrences = occurrences.merge(gold_loans_paid_sum, on='ID', how='left')
occurrences = occurrences.merge(housing_loans_paid_sum, on='ID', how='left')
occurrences = occurrences.merge(personal_loans_paid_sum, on='ID', how='left')
occurrences = occurrences.merge(kisan_credit_card_paid_sum, on='ID', how='left')
occurrences = occurrences.merge(education_loans_paid_sum, on='ID', how='left')


  housing_loans_paid_sum = df_all[df_all['LOAN_CATEGORY'] == 'Housing Loan'][df_all['DATE_CLOSED'].notna()].groupby('ID')['HIGH_CREDIT_OR_SANCTIONED_AMOUNT'].sum().reset_index(name='HOUSING_LOANS_PAID_SUM')
  personal_loans_paid_sum = df_all[df_all['LOAN_CATEGORY'] == 'Personal Loan'][df_all['DATE_CLOSED'].notna()].groupby('ID')['HIGH_CREDIT_OR_SANCTIONED_AMOUNT'].sum().reset_index(name='PERSONAL_LOANS_PAID_SUM')
  property_loans_paid_sum = df_all[df_all['LOAN_CATEGORY'] == 'Property Loan'][df_all['DATE_CLOSED'].notna()].groupby('ID')['HIGH_CREDIT_OR_SANCTIONED_AMOUNT'].sum().reset_index(name='PROPERTY_LOANS_PAID_SUM')
  business_loans_paid_sum = df_all[df_all['LOAN_CATEGORY'] == 'Business Loan'][df_all['DATE_CLOSED'].notna()].groupby('ID')['HIGH_CREDIT_OR_SANCTIONED_AMOUNT'].sum().reset_index(name='BUSINESS_LOANS_PAID_SUM')
  gold_loans_paid_sum = df_all[df_all['LOAN_CATEGORY'] == 'Gold Loan'][df_all['DATE_CLOSED'].notna()].groupby('ID')['HIGH_CREDIT_OR_SANCTIONED_AMOUNT'].sum().reset

In [10]:
# Calculate the total sums for each loan category, including "Kisan Credit Card" and "Education Loan"
personal_loans_sum = df_all[df_all['LOAN_CATEGORY'] == 'Personal Loan'].groupby('ID')['HIGH_CREDIT_OR_SANCTIONED_AMOUNT'].sum().reset_index(name='PERSONAL_LOANS_SUM')
property_loans_sum = df_all[df_all['LOAN_CATEGORY'] == 'Property Loan'].groupby('ID')['HIGH_CREDIT_OR_SANCTIONED_AMOUNT'].sum().reset_index(name='PROPERTY_LOANS_SUM')
business_loans_sum = df_all[df_all['LOAN_CATEGORY'] == 'Business Loan'].groupby('ID')['HIGH_CREDIT_OR_SANCTIONED_AMOUNT'].sum().reset_index(name='BUSINESS_LOANS_SUM')
gold_loans_sum = df_all[df_all['LOAN_CATEGORY'] == 'Gold Loan'].groupby('ID')['HIGH_CREDIT_OR_SANCTIONED_AMOUNT'].sum().reset_index(name='GOLD_LOANS_SUM')
housing_loans_sum = df_all[df_all['LOAN_CATEGORY'] == 'Housing Loan'].groupby('ID')['HIGH_CREDIT_OR_SANCTIONED_AMOUNT'].sum().reset_index(name='HOUSING_LOANS_SUM')
kisan_credit_card_sum = df_all[df_all['LOAN_CATEGORY'] == 'Kisan Credit Card'].groupby('ID')['HIGH_CREDIT_OR_SANCTIONED_AMOUNT'].sum().reset_index(name='KISAN_CREDIT_CARD_LOANS_SUM')
education_loans_sum = df_all[df_all['LOAN_CATEGORY'] == 'Education Loan'].groupby('ID')['HIGH_CREDIT_OR_SANCTIONED_AMOUNT'].sum().reset_index(name='EDUCATION_LOANS_SUM')

# Merge the total sums with the occurrences dataframe
occurrences = occurrences.merge(personal_loans_sum, on='ID', how='left')
occurrences = occurrences.merge(property_loans_sum, on='ID', how='left')
occurrences = occurrences.merge(business_loans_sum, on='ID', how='left')
occurrences = occurrences.merge(gold_loans_sum, on='ID', how='left')
occurrences = occurrences.merge(housing_loans_sum, on='ID', how='left')
occurrences = occurrences.merge(kisan_credit_card_sum, on='ID', how='left')
occurrences = occurrences.merge(education_loans_sum, on='ID', how='left')


In [11]:
# Calculate the total unclosed sums for each loan category, including "Kisan Credit Card" and "Education Loan"
personal_loans_unclosed_sum = df_all[df_all['LOAN_CATEGORY'] == 'Personal Loan'].groupby('ID')['CURRENT_BALANCE'].sum().reset_index(name='PERSONAL_LOANS_UNCLOSED_SUM')
property_loans_unclosed_sum = df_all[df_all['LOAN_CATEGORY'] == 'Property Loan'].groupby('ID')['CURRENT_BALANCE'].sum().reset_index(name='PROPERTY_LOANS_UNCLOSED_SUM')
business_loans_unclosed_sum = df_all[df_all['LOAN_CATEGORY'] == 'Business Loan'].groupby('ID')['CURRENT_BALANCE'].sum().reset_index(name='BUSINESS_LOANS_UNCLOSED_SUM')
gold_loans_unclosed_sum = df_all[df_all['LOAN_CATEGORY'] == 'Gold Loan'].groupby('ID')['CURRENT_BALANCE'].sum().reset_index(name='GOLD_LOANS_UNCLOSED_SUM')
housing_loans_unclosed_sum = df_all[df_all['LOAN_CATEGORY'] == 'Housing Loan'].groupby('ID')['CURRENT_BALANCE'].sum().reset_index(name='HOUSING_LOANS_UNCLOSED_SUM')
kisan_credit_card_unclosed_sum = df_all[df_all['LOAN_CATEGORY'] == 'Kisan Credit Card'].groupby('ID')['CURRENT_BALANCE'].sum().reset_index(name='KISAN_CREDIT_CARD_UNCLOSED_SUM')
education_loans_unclosed_sum = df_all[df_all['LOAN_CATEGORY'] == 'Education Loan'].groupby('ID')['CURRENT_BALANCE'].sum().reset_index(name='EDUCATION_LOANS_UNCLOSED_SUM')

# Merge the total unclosed sums with the occurrences dataframe
occurrences = occurrences.merge(personal_loans_unclosed_sum, on='ID', how='left')
occurrences = occurrences.merge(property_loans_unclosed_sum, on='ID', how='left')
occurrences = occurrences.merge(business_loans_unclosed_sum, on='ID', how='left')
occurrences = occurrences.merge(gold_loans_unclosed_sum, on='ID', how='left')
occurrences = occurrences.merge(housing_loans_unclosed_sum, on='ID', how='left')
occurrences = occurrences.merge(kisan_credit_card_unclosed_sum, on='ID', how='left')
occurrences = occurrences.merge(education_loans_unclosed_sum, on='ID', how='left')


In [12]:
occurrences.head(10)

Unnamed: 0,ID,ACCOUNT_TYPE,HIGH_CREDIT_OR_SANCTIONED_AMOUNT,MONTH_OPENED,CURRENT_BALANCE,ACTUAL_PAYMT_AMT,REPAYMENT_TENURE,AMOUNT_OVERDUE,PAYMENT_HISTORY_1,OWNERSHIP_TYPE,COLLATERALVALUE,TU_SCORE,MONTH_PAYMENT_HISTORY_START,MONTH_PAYMENT_HISTORY_END,MONTH_CERTIFIED,MONTH_LAST_PAYMENT,AGE,OCCUPATION_TYPE,GENDER,ACTUAL_ROI,COUNT_PAID,COUNT_LATE,COUNT_MORE_THAN_90,COUNT_0_TO_60,AVG_DAYS,COUNT_MISSING_DAYS,PERSONAL_LOANS,PROPERTY_LOANS,BUSINESS_LOANS,GOLD_LOANS,HOUSING_LOANS,KISAN_CREDIT_CARD_LOANS,EDUCATION_LOANS,CLOSED_PERSONAL_LOANS,CLOSED_PROPERTY_LOANS,CLOSED_BUSINESS_LOANS,CLOSED_GOLD_LOANS,CLOSED_HOUSING_LOANS,CLOSED_KISAN_CREDIT_CARD_LOANS,CLOSED_EDUCATION_LOANS,PROPERTY_LOANS_PAID_SUM,BUSINESS_LOANS_PAID_SUM,GOLD_LOANS_PAID_SUM,HOUSING_LOANS_PAID_SUM,PERSONAL_LOANS_PAID_SUM,KISAN_CREDIT_CARD_LOANS_PAID_SUM,EDUCATION_LOANS_PAID_SUM,PERSONAL_LOANS_SUM,PROPERTY_LOANS_SUM,BUSINESS_LOANS_SUM,GOLD_LOANS_SUM,HOUSING_LOANS_SUM,KISAN_CREDIT_CARD_LOANS_SUM,EDUCATION_LOANS_SUM,PERSONAL_LOANS_UNCLOSED_SUM,PROPERTY_LOANS_UNCLOSED_SUM,BUSINESS_LOANS_UNCLOSED_SUM,GOLD_LOANS_UNCLOSED_SUM,HOUSING_LOANS_UNCLOSED_SUM,KISAN_CREDIT_CARD_UNCLOSED_SUM,EDUCATION_LOANS_UNCLOSED_SUM
0,A002338349,Housing Loan,818517,115,640742,8627.0,142.0,0.0,0000000000000000000000000000000000000520520520...,Joint,2086000.0,681,0,36,0,0.0,52,SALARIED,Male,12.32,13.0,4.0,0.0,4.0,44.25,1.0,1.0,2.0,,14.0,2.0,,,1.0,1.0,,13.0,1.0,,,97.0,,524426.0,825000.0,129000.0,,,129000.0,381739.0,,547026.0,1643517.0,,,0.0,298244.0,,22600.0,640742.0,,
1,A002000537,Housing Loan,1243755,133,949446,500.0,156.0,0.0,0000000000000000000000000000000000000000000000...,Joint,1536000.0,784,0,36,0,0.0,39,SALARIED,Female,12.42,17.0,0.0,0.0,0.0,0.0,1.0,1.0,,,,1.0,,,1.0,,,,,,,,,,,19000.0,,,19000.0,,,,1243755.0,,,0.0,,,,949446.0,,
2,A002421579,Housing Loan,1826422,81,1296849,16036.0,148.0,0.0,0000000000000000000000000000000000000000000000...,Joint,3587700.0,748,0,36,0,0.0,37,SALARIED,Male,10.87,17.0,0.0,0.0,0.0,0.0,1.0,9.0,,,,1.0,1.0,,8.0,,,,,,,,,,,3846792.0,,,4566792.0,,,,1826422.0,415000.0,,455004.0,,,,1296849.0,414994.0,
3,A002152345,Housing Loan,1847916,11,1724256,26996.0,174.0,0.0,0000000000000000000000000000000000000000000000...,Joint,3049600.0,786,0,12,0,0.0,54,SALARIED,Female,10.0,18.0,0.0,0.0,0.0,0.0,0.0,5.0,1.0,1.0,5.0,3.0,,,3.0,1.0,1.0,5.0,1.0,,,150000.0,30000.0,520000.0,2424978.0,520000.0,,,1620000.0,150000.0,30000.0,520000.0,4287894.0,,,485042.0,0.0,0.0,0.0,1737702.0,,
4,A001952834,Housing Loan,2318386,110,1790937,25131.0,116.0,431551.0,5395705705405095065075075075075095095094794784...,Joint,2785400.0,664,0,36,0,0.0,49,SALARIED,Male,11.97,0.0,17.0,17.0,0.0,510.235294,1.0,,,,,2.0,,,,,,,1.0,,,,,,1919444.0,,,,,,,,4237830.0,,,,,,,1790937.0,,
5,A002239370,Property Loan,1163870,80,959370,12544.0,158.0,12533.0,0210220520530520520520520500500000000000000002...,Joint,2369812.0,710,0,36,0,0.0,57,SALARIED,Male,12.97,5.0,12.0,2.0,10.0,77.5,1.0,7.0,1.0,,2.0,1.0,,,7.0,,,2.0,1.0,,,,,30650.0,365000.0,97244.0,,,97244.0,1163870.0,,30650.0,365000.0,,,0.0,959370.0,,0.0,0.0,,
6,A000936177,Housing Loan,2400318,9,2385404,500.0,174.0,0.0,0000000000000000000000000000000000000000000000...,Joint,3068676.0,774,0,9,0,0.0,33,OTHERS,Male,14.25,18.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,3.0,,1.0,,,,,2.0,,,,,,486000.0,,,,,,734210.0,348000.0,806000.0,,2400318.0,,,435651.0,226670.0,163509.0,,2385404.0,,
7,A001137499,Personal Loan,138000,9,96869,6756.0,36.0,0.0,0000000000000000000000000000000000000000000000...,Individual,0.0,714,0,10,0,0.0,36,SALARIED,Male,15.99,18.0,0.0,0.0,0.0,0.0,0.0,10.0,,,11.0,,,,8.0,,,11.0,,,,,,292650.0,,1142509.0,,,1367499.0,,,292650.0,,,,109524.0,,,0.0,,,
8,A002421104,Housing Loan,1436063,108,1302519,10374.0,97.0,1223863.0,9009009009009009009009009009009009009009009009...,Joint,1844612.0,599,0,36,0,17.0,41,SENP,Male,13.82,0.0,17.0,17.0,0.0,900.0,1.0,2.0,,4.0,2.0,1.0,1.0,,1.0,,2.0,2.0,,,,,615000.0,104000.0,,619000.0,,,631000.0,,3975000.0,104000.0,1436063.0,0.0,,0.0,,3588980.0,0.0,1302519.0,310315.0,
9,A001132923,Housing Loan,889992,28,604148,7421.0,240.0,0.0,0000000000000000000000000000000000000000000000...,Joint,1176000.0,772,0,29,0,0.0,36,SALARIED,Male,14.5,16.0,0.0,0.0,0.0,0.0,2.0,5.0,,,,1.0,,,3.0,,,,,,,,,,,61988.0,,,103052.0,,,,889992.0,,,21114.0,,,,604148.0,,


In [13]:
columns_to_fill_zero = ['PERSONAL_LOANS', 'PROPERTY_LOANS', 'BUSINESS_LOANS', 'GOLD_LOANS', 'HOUSING_LOANS',
                         'CLOSED_PERSONAL_LOANS', 'CLOSED_PROPERTY_LOANS', 'CLOSED_BUSINESS_LOANS',
                         'CLOSED_GOLD_LOANS', 'CLOSED_HOUSING_LOANS','PERSONAL_LOANS_SUM',
    'PROPERTY_LOANS_SUM',
    'BUSINESS_LOANS_SUM',
    'GOLD_LOANS_SUM',
    'HOUSING_LOANS_SUM','HOUSING_LOANS_PAID_SUM','PERSONAL_LOANS_PAID_SUM','PROPERTY_LOANS_PAID_SUM','GOLD_LOANS_PAID_SUM','BUSINESS_LOANS_PAID_SUM','PERSONAL_LOANS_UNCLOSED_SUM',
    'PROPERTY_LOANS_UNCLOSED_SUM',
    'BUSINESS_LOANS_UNCLOSED_SUM',
    'GOLD_LOANS_UNCLOSED_SUM',
    'HOUSING_LOANS_UNCLOSED_SUM','KISAN_CREDIT_CARD_UNCLOSED_SUM','EDUCATION_LOANS_UNCLOSED_SUM','KISAN_CREDIT_CARD_LOANS_SUM','EDUCATION_LOANS_SUM','EDUCATION_LOANS_PAID_SUM','KISAN_CREDIT_CARD_LOANS_PAID_SUM','CLOSED_EDUCATION_LOANS','CLOSED_KISAN_CREDIT_CARD_LOANS','EDUCATION_LOANS','KISAN_CREDIT_CARD_LOANS']

occurrences[columns_to_fill_zero] = occurrences[columns_to_fill_zero].fillna(0)

In [14]:
occurrences.head(10)

Unnamed: 0,ID,ACCOUNT_TYPE,HIGH_CREDIT_OR_SANCTIONED_AMOUNT,MONTH_OPENED,CURRENT_BALANCE,ACTUAL_PAYMT_AMT,REPAYMENT_TENURE,AMOUNT_OVERDUE,PAYMENT_HISTORY_1,OWNERSHIP_TYPE,COLLATERALVALUE,TU_SCORE,MONTH_PAYMENT_HISTORY_START,MONTH_PAYMENT_HISTORY_END,MONTH_CERTIFIED,MONTH_LAST_PAYMENT,AGE,OCCUPATION_TYPE,GENDER,ACTUAL_ROI,COUNT_PAID,COUNT_LATE,COUNT_MORE_THAN_90,COUNT_0_TO_60,AVG_DAYS,COUNT_MISSING_DAYS,PERSONAL_LOANS,PROPERTY_LOANS,BUSINESS_LOANS,GOLD_LOANS,HOUSING_LOANS,KISAN_CREDIT_CARD_LOANS,EDUCATION_LOANS,CLOSED_PERSONAL_LOANS,CLOSED_PROPERTY_LOANS,CLOSED_BUSINESS_LOANS,CLOSED_GOLD_LOANS,CLOSED_HOUSING_LOANS,CLOSED_KISAN_CREDIT_CARD_LOANS,CLOSED_EDUCATION_LOANS,PROPERTY_LOANS_PAID_SUM,BUSINESS_LOANS_PAID_SUM,GOLD_LOANS_PAID_SUM,HOUSING_LOANS_PAID_SUM,PERSONAL_LOANS_PAID_SUM,KISAN_CREDIT_CARD_LOANS_PAID_SUM,EDUCATION_LOANS_PAID_SUM,PERSONAL_LOANS_SUM,PROPERTY_LOANS_SUM,BUSINESS_LOANS_SUM,GOLD_LOANS_SUM,HOUSING_LOANS_SUM,KISAN_CREDIT_CARD_LOANS_SUM,EDUCATION_LOANS_SUM,PERSONAL_LOANS_UNCLOSED_SUM,PROPERTY_LOANS_UNCLOSED_SUM,BUSINESS_LOANS_UNCLOSED_SUM,GOLD_LOANS_UNCLOSED_SUM,HOUSING_LOANS_UNCLOSED_SUM,KISAN_CREDIT_CARD_UNCLOSED_SUM,EDUCATION_LOANS_UNCLOSED_SUM
0,A002338349,Housing Loan,818517,115,640742,8627.0,142.0,0.0,0000000000000000000000000000000000000520520520...,Joint,2086000.0,681,0,36,0,0.0,52,SALARIED,Male,12.32,13.0,4.0,0.0,4.0,44.25,1.0,1.0,2.0,0.0,14.0,2.0,0.0,0.0,1.0,1.0,0.0,13.0,1.0,0.0,0.0,97.0,0.0,524426.0,825000.0,129000.0,0.0,0.0,129000.0,381739.0,0.0,547026.0,1643517.0,0.0,0.0,0.0,298244.0,0.0,22600.0,640742.0,0.0,0.0
1,A002000537,Housing Loan,1243755,133,949446,500.0,156.0,0.0,0000000000000000000000000000000000000000000000...,Joint,1536000.0,784,0,36,0,0.0,39,SALARIED,Female,12.42,17.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19000.0,0.0,0.0,19000.0,0.0,0.0,0.0,1243755.0,0.0,0.0,0.0,0.0,0.0,0.0,949446.0,0.0,0.0
2,A002421579,Housing Loan,1826422,81,1296849,16036.0,148.0,0.0,0000000000000000000000000000000000000000000000...,Joint,3587700.0,748,0,36,0,0.0,37,SALARIED,Male,10.87,17.0,0.0,0.0,0.0,0.0,1.0,9.0,0.0,0.0,0.0,1.0,1.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3846792.0,0.0,0.0,4566792.0,0.0,0.0,0.0,1826422.0,415000.0,0.0,455004.0,0.0,0.0,0.0,1296849.0,414994.0,0.0
3,A002152345,Housing Loan,1847916,11,1724256,26996.0,174.0,0.0,0000000000000000000000000000000000000000000000...,Joint,3049600.0,786,0,12,0,0.0,54,SALARIED,Female,10.0,18.0,0.0,0.0,0.0,0.0,0.0,5.0,1.0,1.0,5.0,3.0,0.0,0.0,3.0,1.0,1.0,5.0,1.0,0.0,0.0,150000.0,30000.0,520000.0,2424978.0,520000.0,0.0,0.0,1620000.0,150000.0,30000.0,520000.0,4287894.0,0.0,0.0,485042.0,0.0,0.0,0.0,1737702.0,0.0,0.0
4,A001952834,Housing Loan,2318386,110,1790937,25131.0,116.0,431551.0,5395705705405095065075075075075095095094794784...,Joint,2785400.0,664,0,36,0,0.0,49,SALARIED,Male,11.97,0.0,17.0,17.0,0.0,510.235294,1.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1919444.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4237830.0,0.0,0.0,0.0,0.0,0.0,0.0,1790937.0,0.0,0.0
5,A002239370,Property Loan,1163870,80,959370,12544.0,158.0,12533.0,0210220520530520520520520500500000000000000002...,Joint,2369812.0,710,0,36,0,0.0,57,SALARIED,Male,12.97,5.0,12.0,2.0,10.0,77.5,1.0,7.0,1.0,0.0,2.0,1.0,0.0,0.0,7.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,30650.0,365000.0,97244.0,0.0,0.0,97244.0,1163870.0,0.0,30650.0,365000.0,0.0,0.0,0.0,959370.0,0.0,0.0,0.0,0.0,0.0
6,A000936177,Housing Loan,2400318,9,2385404,500.0,174.0,0.0,0000000000000000000000000000000000000000000000...,Joint,3068676.0,774,0,9,0,0.0,33,OTHERS,Male,14.25,18.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,486000.0,0.0,0.0,0.0,0.0,0.0,734210.0,348000.0,806000.0,0.0,2400318.0,0.0,0.0,435651.0,226670.0,163509.0,0.0,2385404.0,0.0,0.0
7,A001137499,Personal Loan,138000,9,96869,6756.0,36.0,0.0,0000000000000000000000000000000000000000000000...,Individual,0.0,714,0,10,0,0.0,36,SALARIED,Male,15.99,18.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,11.0,0.0,0.0,0.0,8.0,0.0,0.0,11.0,0.0,0.0,0.0,0.0,0.0,292650.0,0.0,1142509.0,0.0,0.0,1367499.0,0.0,0.0,292650.0,0.0,0.0,0.0,109524.0,0.0,0.0,0.0,0.0,0.0,0.0
8,A002421104,Housing Loan,1436063,108,1302519,10374.0,97.0,1223863.0,9009009009009009009009009009009009009009009009...,Joint,1844612.0,599,0,36,0,17.0,41,SENP,Male,13.82,0.0,17.0,17.0,0.0,900.0,1.0,2.0,0.0,4.0,2.0,1.0,1.0,0.0,1.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0,615000.0,104000.0,0.0,619000.0,0.0,0.0,631000.0,0.0,3975000.0,104000.0,1436063.0,0.0,0.0,0.0,0.0,3588980.0,0.0,1302519.0,310315.0,0.0
9,A001132923,Housing Loan,889992,28,604148,7421.0,240.0,0.0,0000000000000000000000000000000000000000000000...,Joint,1176000.0,772,0,29,0,0.0,36,SALARIED,Male,14.5,16.0,0.0,0.0,0.0,0.0,2.0,5.0,0.0,0.0,0.0,1.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,61988.0,0.0,0.0,103052.0,0.0,0.0,0.0,889992.0,0.0,0.0,21114.0,0.0,0.0,0.0,604148.0,0.0,0.0


In [15]:
# Check for null values in the newly added columns
null_values = occurrences[['PERSONAL_LOANS', 'PROPERTY_LOANS', 'BUSINESS_LOANS', 'GOLD_LOANS', 'HOUSING_LOANS',
                         'CLOSED_PERSONAL_LOANS', 'CLOSED_PROPERTY_LOANS', 'CLOSED_BUSINESS_LOANS',
                         'CLOSED_GOLD_LOANS', 'CLOSED_HOUSING_LOANS','PERSONAL_LOANS_SUM',
    'PROPERTY_LOANS_SUM',
    'BUSINESS_LOANS_SUM',
    'GOLD_LOANS_SUM',
    'HOUSING_LOANS_SUM','HOUSING_LOANS_PAID_SUM','PERSONAL_LOANS_PAID_SUM','PROPERTY_LOANS_PAID_SUM','GOLD_LOANS_PAID_SUM','BUSINESS_LOANS_PAID_SUM','PERSONAL_LOANS_UNCLOSED_SUM',
    'PROPERTY_LOANS_UNCLOSED_SUM',
    'BUSINESS_LOANS_UNCLOSED_SUM',
    'GOLD_LOANS_UNCLOSED_SUM',
    'HOUSING_LOANS_UNCLOSED_SUM']].isnull().sum()

print("Null values in the newly added columns:")
print(null_values)

Null values in the newly added columns:
PERSONAL_LOANS                 0
PROPERTY_LOANS                 0
BUSINESS_LOANS                 0
GOLD_LOANS                     0
HOUSING_LOANS                  0
CLOSED_PERSONAL_LOANS          0
CLOSED_PROPERTY_LOANS          0
CLOSED_BUSINESS_LOANS          0
CLOSED_GOLD_LOANS              0
CLOSED_HOUSING_LOANS           0
PERSONAL_LOANS_SUM             0
PROPERTY_LOANS_SUM             0
BUSINESS_LOANS_SUM             0
GOLD_LOANS_SUM                 0
HOUSING_LOANS_SUM              0
HOUSING_LOANS_PAID_SUM         0
PERSONAL_LOANS_PAID_SUM        0
PROPERTY_LOANS_PAID_SUM        0
GOLD_LOANS_PAID_SUM            0
BUSINESS_LOANS_PAID_SUM        0
PERSONAL_LOANS_UNCLOSED_SUM    0
PROPERTY_LOANS_UNCLOSED_SUM    0
BUSINESS_LOANS_UNCLOSED_SUM    0
GOLD_LOANS_UNCLOSED_SUM        0
HOUSING_LOANS_UNCLOSED_SUM     0
dtype: int64


In [16]:
# Calculate the count of all loans for the same 'ID'
all_loans_count = df_all.groupby('ID').size().reset_index(name='ALL_LOANS_COUNT')
# Merge the total loan count with the existing occurrences dataframe
occurrences = occurrences.merge(all_loans_count, on='ID', how='left')


In [17]:
occurrences.head()

Unnamed: 0,ID,ACCOUNT_TYPE,HIGH_CREDIT_OR_SANCTIONED_AMOUNT,MONTH_OPENED,CURRENT_BALANCE,ACTUAL_PAYMT_AMT,REPAYMENT_TENURE,AMOUNT_OVERDUE,PAYMENT_HISTORY_1,OWNERSHIP_TYPE,COLLATERALVALUE,TU_SCORE,MONTH_PAYMENT_HISTORY_START,MONTH_PAYMENT_HISTORY_END,MONTH_CERTIFIED,MONTH_LAST_PAYMENT,AGE,OCCUPATION_TYPE,GENDER,ACTUAL_ROI,COUNT_PAID,COUNT_LATE,COUNT_MORE_THAN_90,COUNT_0_TO_60,AVG_DAYS,COUNT_MISSING_DAYS,PERSONAL_LOANS,PROPERTY_LOANS,BUSINESS_LOANS,GOLD_LOANS,HOUSING_LOANS,KISAN_CREDIT_CARD_LOANS,EDUCATION_LOANS,CLOSED_PERSONAL_LOANS,CLOSED_PROPERTY_LOANS,CLOSED_BUSINESS_LOANS,CLOSED_GOLD_LOANS,CLOSED_HOUSING_LOANS,CLOSED_KISAN_CREDIT_CARD_LOANS,CLOSED_EDUCATION_LOANS,PROPERTY_LOANS_PAID_SUM,BUSINESS_LOANS_PAID_SUM,GOLD_LOANS_PAID_SUM,HOUSING_LOANS_PAID_SUM,PERSONAL_LOANS_PAID_SUM,KISAN_CREDIT_CARD_LOANS_PAID_SUM,EDUCATION_LOANS_PAID_SUM,PERSONAL_LOANS_SUM,PROPERTY_LOANS_SUM,BUSINESS_LOANS_SUM,GOLD_LOANS_SUM,HOUSING_LOANS_SUM,KISAN_CREDIT_CARD_LOANS_SUM,EDUCATION_LOANS_SUM,PERSONAL_LOANS_UNCLOSED_SUM,PROPERTY_LOANS_UNCLOSED_SUM,BUSINESS_LOANS_UNCLOSED_SUM,GOLD_LOANS_UNCLOSED_SUM,HOUSING_LOANS_UNCLOSED_SUM,KISAN_CREDIT_CARD_UNCLOSED_SUM,EDUCATION_LOANS_UNCLOSED_SUM,ALL_LOANS_COUNT
0,A002338349,Housing Loan,818517,115,640742,8627.0,142.0,0.0,0000000000000000000000000000000000000520520520...,Joint,2086000.0,681,0,36,0,0.0,52,SALARIED,Male,12.32,13.0,4.0,0.0,4.0,44.25,1.0,1.0,2.0,0.0,14.0,2.0,0.0,0.0,1.0,1.0,0.0,13.0,1.0,0.0,0.0,97.0,0.0,524426.0,825000.0,129000.0,0.0,0.0,129000.0,381739.0,0.0,547026.0,1643517.0,0.0,0.0,0.0,298244.0,0.0,22600.0,640742.0,0.0,0.0,20
1,A002000537,Housing Loan,1243755,133,949446,500.0,156.0,0.0,0000000000000000000000000000000000000000000000...,Joint,1536000.0,784,0,36,0,0.0,39,SALARIED,Female,12.42,17.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19000.0,0.0,0.0,19000.0,0.0,0.0,0.0,1243755.0,0.0,0.0,0.0,0.0,0.0,0.0,949446.0,0.0,0.0,3
2,A002421579,Housing Loan,1826422,81,1296849,16036.0,148.0,0.0,0000000000000000000000000000000000000000000000...,Joint,3587700.0,748,0,36,0,0.0,37,SALARIED,Male,10.87,17.0,0.0,0.0,0.0,0.0,1.0,9.0,0.0,0.0,0.0,1.0,1.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3846792.0,0.0,0.0,4566792.0,0.0,0.0,0.0,1826422.0,415000.0,0.0,455004.0,0.0,0.0,0.0,1296849.0,414994.0,0.0,12
3,A002152345,Housing Loan,1847916,11,1724256,26996.0,174.0,0.0,0000000000000000000000000000000000000000000000...,Joint,3049600.0,786,0,12,0,0.0,54,SALARIED,Female,10.0,18.0,0.0,0.0,0.0,0.0,0.0,5.0,1.0,1.0,5.0,3.0,0.0,0.0,3.0,1.0,1.0,5.0,1.0,0.0,0.0,150000.0,30000.0,520000.0,2424978.0,520000.0,0.0,0.0,1620000.0,150000.0,30000.0,520000.0,4287894.0,0.0,0.0,485042.0,0.0,0.0,0.0,1737702.0,0.0,0.0,15
4,A001952834,Housing Loan,2318386,110,1790937,25131.0,116.0,431551.0,5395705705405095065075075075075095095094794784...,Joint,2785400.0,664,0,36,0,0.0,49,SALARIED,Male,11.97,0.0,17.0,17.0,0.0,510.235294,1.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1919444.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4237830.0,0.0,0.0,0.0,0.0,0.0,0.0,1790937.0,0.0,0.0,5


In [18]:
df=occurrences.copy()


In [19]:
df_all.shape

(1066009, 20)

In [20]:
df_all.isna().sum()

ID                                        0
ACCOUNT_TYPE                              0
HIGH_CREDIT_OR_SANCTIONED_AMOUNT      25044
DATE_OPENED                             470
DATE_CLOSED                          376940
CURRENT_BALANCE                           0
ACTUAL_PAYMT_AMT                     794091
EMI_AMOUNT                           667217
REPAYMENT_TENURE                     605305
AMOUNT_OVERDUE                      1020818
PAYMENT_HISTORY_1                         0
PAYMENT_HISTORY_2                    695020
OWNERSHIP_TYPE                            0
COLLATERALVALUE                      959898
TU_SCORE                                  0
PAYMENT_HISTORY_START_DATE                0
PAYMENT_HISTORY_END_DATE                  0
DATE_REPORTED_AND_CERTIFIED               0
DATE_OF_LAST_PAYMENT                 164747
LOAN_CATEGORY                        158959
dtype: int64

### now extract the payment history -1 of each category 

In [21]:
df_all['LOAN_CATEGORY'].unique()

array(['Housing Loan', 'Personal Loan', 'Property Loan', None,
       'Gold Loan', 'Kisan Credit Card', 'Education Loan',
       'Business Loan'], dtype=object)

In [22]:
df.to_csv('feature_generated.csv',index=False)