In [152]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier

from imblearn.over_sampling import SMOTE

import seaborn as sns
import matplotlib.pyplot as plt

In [289]:
train_df = pd.read_csv('credit_train.csv')
train_df = train_df.rename(columns={
    'Loan ID': 'loan_id',
    'Customer ID': 'customer_id',
    'Loan Status': 'loan_status',
    'Current Loan Amount': 'current_loan_amount',
    'Term': 'term',
    'Credit Score': 'credit_score',
    'Annual Income': 'annual_income',
    'Years in current job': 'years_in_current_job',
    'Home Ownership': 'ownership',
    'Purpose': 'purpose',
    'Monthly Debt': 'monthly_debt',
    'Years of Credit History': 'years_of_credit_history',
    'Months since last delinquent': 'months',
    'Number of Open Accounts': 'open_accounts',
    'Number of Credit Problems': 'credit_problems',
    'Current Credit Balance': 'credit_balance',
    'Maximum Open Credit': 'max_credit',
    'Bankruptcies': 'bankruptcy',
    'Tax Liens': 'tax_liens'
})

# train_df = train_df.drop_duplicates()
# train_df = train_df.query('current_loan_amount != 99999999')

train_df

Unnamed: 0,loan_id,customer_id,loan_status,current_loan_amount,term,credit_score,annual_income,years_in_current_job,ownership,purpose,monthly_debt,years_of_credit_history,months,open_accounts,credit_problems,credit_balance,max_credit,bankruptcy,tax_liens
0,14dd8831-6af5-400b-83ec-68e61888a048,981165ec-3274-42f5-a3b4-d104041a9ca9,Fully Paid,445412,Short Term,709.0,1167493.0,8 years,Home Mortgage,Home Improvements,5214.74,17.2,,6,1,228190,416746.0,1.0,0.0
1,4771cc26-131a-45db-b5aa-537ea4ba5342,2de017a3-2e01-49cb-a581-08169e83be29,Fully Paid,262328,Short Term,,,10+ years,Home Mortgage,Debt Consolidation,33295.98,21.1,8.0,35,0,229976,850784.0,0.0,0.0
2,4eed4e6a-aa2f-4c91-8651-ce984ee8fb26,5efb2b2b-bf11-4dfd-a572-3761a2694725,Fully Paid,99999999,Short Term,741.0,2231892.0,8 years,Own Home,Debt Consolidation,29200.53,14.9,29.0,18,1,297996,750090.0,0.0,0.0
3,77598f7b-32e7-4e3b-a6e5-06ba0d98fe8a,e777faab-98ae-45af-9a86-7ce5b33b1011,Fully Paid,347666,Long Term,721.0,806949.0,3 years,Own Home,Debt Consolidation,8741.90,12.0,,9,0,256329,386958.0,0.0,0.0
4,d4062e70-befa-4995-8643-a0de73938182,81536ad9-5ccf-4eb8-befb-47a4d608658e,Fully Paid,176220,Short Term,,,5 years,Rent,Debt Consolidation,20639.70,6.1,,15,0,253460,427174.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,3f94c18c-ba8f-45d0-8610-88a684a410a9,2da51983-cfef-4b8f-a733-5dfaf69e9281,Fully Paid,147070,Short Term,725.0,475437.0,7 years,Own Home,other,2202.86,22.3,,5,0,47766,658548.0,0.0,0.0
99996,06eba04f-58fc-424a-b666-ed72aa008900,77f2252a-b7d1-4b07-a746-1202a8304290,Fully Paid,99999999,Short Term,732.0,1289416.0,1 year,Rent,Debt Consolidation,13109.05,9.4,21.0,22,0,153045,509234.0,0.0,0.0
99997,e1cb4050-eff5-4bdb-a1b0-aabd3f7eaac7,2ced5f10-bd60-4a11-9134-cadce4e7b0a3,Fully Paid,103136,Short Term,742.0,1150545.0,6 years,Rent,Debt Consolidation,7315.57,18.8,18.0,12,1,109554,537548.0,1.0,0.0
99998,81ab928b-d1a5-4523-9a3c-271ebb01b4fb,3e45ffda-99fd-4cfc-b8b8-446f4a505f36,Fully Paid,530332,Short Term,746.0,1717524.0,9 years,Rent,Debt Consolidation,9890.07,15.0,,8,0,404225,738254.0,0.0,0.0


In [290]:
# for column in train_df.filter(items=['loan_status', 'term', 'years_in_current_job', 'ownership', 'purpose']).columns:
#     print(f'{column}: {train_df[column].unique()}\n')
    
def ownership_resolver(ownership):
    if ownership in ['Home Mortgage', 'HaveMortgage']:
        return 'mortgage'
    else:
        return ownership
    
def purpose_resolver(purpose):
    if purpose in ['other', 'Other']:
        return 'other'
    elif purpose in ['Take a Trip', 'vacation']:
        return 'vacation'
    elif purpose in ['Buy House', 'Buy a Car', 'major_purchase']:
        return 'major_purchase'
    elif purpose in ['Home Improvements', 'moving']:
        return 'moving'
    else:
        return purpose
    
train_df.purpose = train_df.purpose.apply(purpose_resolver)
train_df.ownership = train_df.ownership.apply(ownership_resolver)

In [291]:
train_df = train_df.drop(labels=['loan_id', 'customer_id'], axis=1)

labels = ['loan_status', 'term', 'years_in_current_job', 'ownership', 'purpose']

for label in labels:
    replaced = list(train_df[label].unique())
    train_df[label] = train_df[label].apply(lambda x: replaced.index(x))

train_df.fillna(value=0, inplace=True)
# train_df.dropna(inplace=True)

train_df

Unnamed: 0,loan_status,current_loan_amount,term,credit_score,annual_income,years_in_current_job,ownership,purpose,monthly_debt,years_of_credit_history,months,open_accounts,credit_problems,credit_balance,max_credit,bankruptcy,tax_liens
0,0,445412,0,709.0,1167493.0,0,0,0,5214.74,17.2,0.0,6,1,228190,416746.0,1.0,0.0
1,0,262328,0,0.0,0.0,1,0,1,33295.98,21.1,8.0,35,0,229976,850784.0,0.0,0.0
2,0,99999999,0,741.0,2231892.0,0,1,1,29200.53,14.9,29.0,18,1,297996,750090.0,0.0,0.0
3,0,347666,1,721.0,806949.0,2,1,1,8741.90,12.0,0.0,9,0,256329,386958.0,0.0,0.0
4,0,176220,0,0.0,0.0,3,2,1,20639.70,6.1,0.0,15,0,253460,427174.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,0,147070,0,725.0,475437.0,8,1,3,2202.86,22.3,0.0,5,0,47766,658548.0,0.0,0.0
99996,0,99999999,0,732.0,1289416.0,9,2,1,13109.05,9.4,21.0,22,0,153045,509234.0,0.0,0.0
99997,0,103136,0,742.0,1150545.0,11,2,1,7315.57,18.8,18.0,12,1,109554,537548.0,1.0,0.0
99998,0,530332,0,746.0,1717524.0,7,2,1,9890.07,15.0,0.0,8,0,404225,738254.0,0.0,0.0


In [292]:
X = train_df.drop(labels=['loan_status'], axis=1)
y = train_df.loan_status

In [293]:
lr = LogisticRegression(penalty='l2', solver='liblinear', max_iter=100000, C=0.001)
# lr.fit(X, y)

# lr.score(X, y)

In [294]:
smote = SMOTE(random_state=0)

train_X, test_X, train_y, test_y = train_test_split(X, y, random_state=42)

smote_data_X, smote_data_y = smote.fit_sample(train_X, train_y)
smote_data_X = pd.DataFrame(data=smote_data_X, columns=X.columns)
smote_data_y = pd.DataFrame(data=smote_data_y, columns=['loan_status'])

In [295]:
# lr.fit(smote_data_X, smote_data_y.loan_status)
lr.fit(train_X, train_y)
lr.score(test_X, test_y)

0.81712

In [296]:
tree = DecisionTreeClassifier(random_state=42, max_depth=7, splitter='best', min_samples_leaf=50)
tree.fit(train_X, train_y)

tree.score(test_X, test_y)

0.82076

In [231]:
tree.score(smote_data_X, smote_data_y)

0.6077001327609096

In [267]:
train_df.current_loan_amount.mode().values[0]

99999999

In [232]:
test_df = pd.read_csv('credit_test.csv')
test_df = test_df.rename(columns={
    'Loan ID': 'loan_id',
    'Customer ID': 'customer_id',
    'Current Loan Amount': 'current_loan_amount',
    'Term': 'term',
    'Credit Score': 'credit_score',
    'Annual Income': 'annual_income',
    'Years in current job': 'years_in_current_job',
    'Home Ownership': 'ownership',
    'Purpose': 'purpose',
    'Monthly Debt': 'monthly_debt',
    'Years of Credit History': 'years_of_credit_history',
    'Months since last delinquent': 'months',
    'Number of Open Accounts': 'open_accounts',
    'Number of Credit Problems': 'credit_problems',
    'Current Credit Balance': 'credit_balance',
    'Maximum Open Credit': 'max_credit',
    'Bankruptcies': 'bankruptcy',
    'Tax Liens': 'tax_liens'
})

# for column in test_df.filter(items=['loan_status', 'term', 'years_in_current_job', 'ownership', 'purpose']).columns:
#     print(f'{column}: {test_df[column].unique()}\n')

# test_df = test_df.drop(labels=['loan_id', 'customer_id'], axis=1)

test_df.purpose = test_df.purpose.apply(purpose_resolver)
test_df.ownership = test_df.ownership.apply(ownership_resolver)

labels = ['term', 'years_in_current_job', 'ownership', 'purpose']
for label in labels:
    replaced = list(test_df[label].unique())
    test_df[label] = test_df[label].apply(lambda x: replaced.index(x))

test_df.fillna(value=0, inplace=True)

# predicted = lr.predict(test_df.drop(labels=['loan_id', 'customer_id'], axis=1))
predicted = tree.predict(test_df.drop(labels=['loan_id', 'customer_id'], axis=1))

In [233]:
import csv

with open('result9.csv', 'w', newline='\n') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    writer.writerow(['Loan ID', 'Loan Status'])
    
    for loan_id, res in zip(test_df['loan_id'], predicted):
        s_res = 'Fully Paid' if res == 0 else 'Charged Off'
            
        writer.writerow([loan_id, s_res])

In [101]:
print(np.unique(predicted, return_counts=True))
train_df.loan_status.value_counts()

(array([0, 1]), array([9544,  456]))


0    24722
1     7891
Name: loan_status, dtype: int64

In [102]:
df = pd.read_csv('credit_train.csv')
df['Loan Status'].value_counts()

Fully Paid     77361
Charged Off    22639
Name: Loan Status, dtype: int64