In [1]:
import numpy as np
import pandas as pd
import pickle
import os
import warnings
warnings.filterwarnings('ignore')

In [2]:
def fill_gender(cols):
    """Impute a missing gender for one row from its marital status.

    Parameters
    ----------
    cols : pandas.Series or sequence
        Row values in the order (Gender, Married). Only the first two
        values are read.

    Returns
    -------
    The existing gender when it is not null. Otherwise 'Male' when Married
    is null or 'Yes', and 'Female' for any other Married value.
    """
    # Unpack by position through iteration instead of `cols[0]`: integer keys
    # on a label-indexed Series are deprecated positional access in pandas and
    # are treated as labels in newer releases.
    gender, married = tuple(cols)[:2]
    if not pd.isnull(gender):
        return gender
    if pd.isnull(married) or married == 'Yes':
        return 'Male'
    return 'Female'

def fill_married(cols):
    """Impute a missing marital status for one row from its gender.

    Parameters
    ----------
    cols : pandas.Series or sequence
        Row values in the order (Gender, Married). Only the first two
        values are read.

    Returns
    -------
    The existing Married value when it is not null. Otherwise 'Yes' when
    Gender is null or 'Male', and 'No' for any other Gender value.
    """
    # Unpack by position through iteration instead of `cols[0]`: integer keys
    # on a label-indexed Series are deprecated positional access in pandas and
    # are treated as labels in newer releases.
    gender, married = tuple(cols)[:2]
    if not pd.isnull(married):
        return married
    if pd.isnull(gender) or gender == 'Male':
        return 'Yes'
    return 'No'

def fill_loan_amount(cols, medians=None):
    """Impute a missing LoanAmount for one row with its group median.

    Parameters
    ----------
    cols : pandas.Series or sequence
        Row values in the order (Property_Area, Education, Self_Employed,
        LoanAmount). Only the first four values are read.
    medians : dict, optional
        Mapping ``(Property_Area, Education, Self_Employed) -> median
        LoanAmount``. When omitted, the table is built from the module-level
        ``df`` (the original behaviour). Pass a precomputed table to avoid
        both the global dependency and a full groupby per missing row.

    Returns
    -------
    The existing LoanAmount when it is not null. Otherwise the median for
    the row's group, or the unchanged (null) value when the group has no
    entry in the table.
    """
    # Unpack by position through iteration instead of `cols[0]`: integer keys
    # on a label-indexed Series are deprecated positional access in pandas.
    pa, edu, se, la = tuple(cols)[:4]

    # Nothing to impute: return before touching the median table at all.
    if not pd.isna(la):
        return la

    if medians is None:
        # Fallback keeps the original contract: read the global frame `df`.
        medians = (
            df.groupby(['Property_Area', 'Education', 'Self_Employed'])['LoanAmount']
            .median()
            .to_dict()
        )
    return medians.get((pa, edu, se), la)

def coaplicant_share(cols):
    """Label the co-applicant's income relative to the applicant's income.

    Parameters
    ----------
    cols : pandas.Series or sequence
        Row values in the order (ApplicantIncome, CoapplicantIncome). Only
        the first two values are read.

    Returns
    -------
    str
        One of 'No Share', 'Same Share', 'Lesser But Similar Share',
        'More But Similar Share', 'Lesser Share' or 'More Share'.
        "Similar" means the co-applicant income is within 85%..115% of the
        applicant income.
    """
    # Unpack by position through iteration instead of `cols[0]`: integer keys
    # on a label-indexed Series are deprecated positional access in pandas.
    x, y = tuple(cols)[:2]

    # The zero check comes first, so (0, 0) is 'No Share', not 'Same Share'.
    if y == 0:
        return 'No Share'
    if x == y:
        return 'Same Share'
    if x > y and y >= x * 0.85:
        return 'Lesser But Similar Share'
    if y > x and y <= x * 1.15:
        return 'More But Similar Share'
    if x > y:
        return 'Lesser Share'
    return 'More Share'

def coaplicant_share_similar(cols):
    """Flag whether applicant and co-applicant incomes are close.

    Parameters
    ----------
    cols : pandas.Series or sequence
        Row values in the order (ApplicantIncome, CoapplicantIncome). Only
        the first two values are read.

    Returns
    -------
    str
        'Similar' when the incomes are equal or the co-applicant income lies
        within 85%..115% of the applicant income, otherwise 'Not Similar'.
    """
    # Unpack by position through iteration instead of `cols[0]`: integer keys
    # on a label-indexed Series are deprecated positional access in pandas.
    x, y = tuple(cols)[:2]

    slightly_less = x > y and y >= x * 0.85
    slightly_more = y > x and y <= x * 1.15
    if x == y or slightly_less or slightly_more:
        return 'Similar'
    return 'Not Similar'

def encode(df):
    """One-hot encode the categorical features and drop helper columns.

    Married, Self_Employed and Has_Dependents are first relabelled from
    'Yes'/other into descriptive strings so the dummy columns get readable
    names. The columns 'Loan_ID', 'Coapplicant_Share', 'Has_Coapplicant' and
    'Dependents' are dropped, then every column in ``cols`` is replaced by
    its dummy columns (first category dropped).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns listed above. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The remaining original columns followed by the dummy columns, in
        the order of ``cols``.

    Notes
    -----
    The dummy columns depend on which categories occur in ``df``: a category
    that is absent from the data produces no column, so the output layout
    can differ between datasets.
    """
    cols = ['Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area',
            'Coapplicant_Share_Similarity', 'Has_Dependents']

    # Work on a copy: relabelling the caller's frame in place would make a
    # second call see 'Married' != 'Yes' and label every row 'Not Married'.
    df = df.copy()
    df['Married'] = df['Married'].apply(
        lambda x: 'Married' if x == 'Yes' else 'Not Married')
    df['Self_Employed'] = df['Self_Employed'].apply(
        lambda x: 'Self Employed' if x == 'Yes' else 'Not Self Employed')
    df['Has_Dependents'] = df['Has_Dependents'].apply(
        lambda x: 'Has Dependents' if x == 'Yes' else 'No Dependents')
    df = df.drop(['Loan_ID', 'Coapplicant_Share', 'Has_Coapplicant', 'Dependents'], axis=1)

    # Pin the dummy dtype to an integer type: newer pandas defaults to bool,
    # which turns `df.values` into an object array once mixed with numbers.
    dummies = [pd.get_dummies(df[col], drop_first=True, dtype='uint8') for col in cols]

    # Single concat instead of growing the frame once per encoded column.
    return pd.concat([df.drop(cols, axis=1)] + dummies, axis=1)

def pre_process_test(df):
    """Impute, feature-engineer and encode a raw loan frame for prediction.

    Steps, in order:
      1. Fill Gender and Married from each other (``fill_gender``,
         ``fill_married``).
      2. Fill Dependents, Self_Employed, Loan_Amount_Term and Credit_History
         with the most frequent value found in ``df`` itself.
      3. Fill LoanAmount with the median of its
         (Property_Area, Education, Self_Employed) group within ``df``.
      4. Derive Has_Coapplicant, Coapplicant_Share,
         Coapplicant_Share_Similarity and Has_Dependents.
      5. One-hot encode through ``encode``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame with the loan columns used above. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The encoded feature frame returned by ``encode``.
    """
    # Copy so the caller's frame is left untouched and the call is repeatable.
    df = df.copy()

    df['Gender'] = df[['Gender', 'Married']].apply(fill_gender, axis=1)
    df['Married'] = df[['Gender', 'Married']].apply(fill_married, axis=1)

    # Mode imputation; each column is filled independently of the others.
    for col in ('Dependents', 'Self_Employed', 'Loan_Amount_Term', 'Credit_History'):
        df[col] = df[col].fillna(df[col].mode()[0])

    # Group-median imputation computed once on the frame actually passed in.
    # (The row-wise helper re-ran the groupby per row against a global `df`.)
    # Rows whose group has no median keep their missing value.
    group_median = (
        df.groupby(['Property_Area', 'Education', 'Self_Employed'])['LoanAmount']
        .transform('median')
    )
    df['LoanAmount'] = df['LoanAmount'].fillna(group_median)

    income = df[['ApplicantIncome', 'CoapplicantIncome']]
    df['Has_Coapplicant'] = df.CoapplicantIncome.map(lambda x: 'Yes' if x != 0 else 'No')
    df['Coapplicant_Share'] = income.apply(coaplicant_share, axis=1)
    df['Coapplicant_Share_Similarity'] = income.apply(coaplicant_share_similar, axis=1)
    # str() so both the string '0' and a numeric 0 count as "no dependents".
    df['Has_Dependents'] = df['Dependents'].map(lambda x: 'No' if str(x) == '0' else 'Yes')

    return encode(df)

In [5]:
# Load the unlabeled test set and the sample submission from the working directory.
test = pd.read_csv(filepath_or_buffer='test.csv')
sample = pd.read_csv(filepath_or_buffer='sample.csv')

In [6]:
# Preview the first rows of the raw test set.
test.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban
3,LP001035,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,,Urban
4,LP001051,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban


In [7]:
# Display the sample submission frame (columns Loan_ID and Loan_Status).
sample

Unnamed: 0,Loan_ID,Loan_Status
0,LP001015,N
1,LP001022,N
2,LP001031,N
3,LP001035,N
4,LP001051,N
...,...,...
362,LP002971,N
363,LP002975,N
364,LP002980,N
365,LP002986,N


In [11]:
# Work on a copy so the raw `test` frame stays intact.
# NOTE: the name `df` is load-bearing — fill_loan_amount reads the module-level
# `df` to compute its group medians, so this global must be the frame being
# processed at the time pre_process_test runs.
df = test.copy()
# Keep the IDs aside: encode() drops Loan_ID, and the submission needs them.
loan_id = df.Loan_ID.copy()
# Rebinds `df` to the encoded feature frame returned by pre_process_test.
df = pre_process_test(df)
df.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Male,Not Married,Not Graduate,Self Employed,Semiurban,Urban,Similar,No Dependents
0,5720,0,110.0,360.0,1.0,1,0,0,0,0,1,0,1
1,3076,1500,126.0,360.0,1.0,1,0,0,0,0,1,0,0
2,5000,1800,208.0,360.0,1.0,1,0,0,0,0,1,0,0
3,2340,2546,100.0,360.0,1.0,1,0,0,0,0,1,1,0
4,3276,0,78.0,360.0,1.0,1,1,1,0,0,1,0,1


In [12]:
# Location of the pickled model on the mounted Google Drive.
# NOTE(review): absolute, environment-specific path — adjust when running elsewhere.
path = '/content/drive/MyDrive/Loan Prediction/Models2/model_xgb_bgg.pkl'

# NOTE(security): pickle.load can execute arbitrary code; only load model
# files from a trusted source.
with open(path, 'rb') as file:
    model = pickle.load(file)

# Cast explicitly to a float matrix so bool/object dummy columns (newer pandas)
# cannot leak an object-dtype array into the model.
test_data = df.values.astype(float)
predictions = model.predict(test_data)

# Pair IDs and predictions by position rather than by index alignment, so a
# non-default index on `loan_id` cannot silently misalign or drop labels.
submission = pd.DataFrame({
    'Loan_ID': loan_id.values,
    'Loan_Status': ['Y' if p == 1 else 'N' for p in predictions],
})
submission.to_csv('sub.csv', index=False)

In [13]:
# Display the submission frame that was written to sub.csv.
submission

Unnamed: 0,Loan_ID,Loan_Status
0,LP001015,Y
1,LP001022,Y
2,LP001031,Y
3,LP001035,Y
4,LP001051,Y
...,...,...
362,LP002971,Y
363,LP002975,Y
364,LP002980,Y
365,LP002986,Y
