In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

from sklearn.model_selection import train_test_split
pd.set_option('display.max_column', 500)

In [2]:
test = pd.read_csv('data/cs-test.csv', index_col=0)
test.shape

(101503, 11)

In [34]:
with open('saved/variables.obj', 'rb') as f:
    variables = pickle.load(f)
    
with open('saved/scaler.obj', 'rb') as f:
    scaler = pickle.load(f)
    
with open('saved/logistic_model.obj', 'rb') as f:
    lr = pickle.load(f)

In [4]:
test.drop('SeriousDlqin2yrs', axis=1, inplace=True)

In [5]:
test.isnull().sum()

RevolvingUtilizationOfUnsecuredLines        0
age                                         0
NumberOfTime30-59DaysPastDueNotWorse        0
DebtRatio                                   0
MonthlyIncome                           20103
NumberOfOpenCreditLinesAndLoans             0
NumberOfTimes90DaysLate                     0
NumberRealEstateLoansOrLines                0
NumberOfTime60-89DaysPastDueNotWorse        0
NumberOfDependents                       2626
dtype: int64

In [6]:
test['RevolvingUtilizationOfUnsecuredLines'] = np.minimum(test['RevolvingUtilizationOfUnsecuredLines'], 1)

In [7]:
def RevolvingUtilizationOfUnsecuredLines(data):
    conditions = [
        (data['RevolvingUtilizationOfUnsecuredLines'] <= 0.3),
        (data['RevolvingUtilizationOfUnsecuredLines']> 0.3) & (data['RevolvingUtilizationOfUnsecuredLines'] <=0.7),
        (data['RevolvingUtilizationOfUnsecuredLines'] > 0.7)
    ]
    bin_labels = ['Good', 'Average', 'Bad']

    data['CreditUtility'] = np.select(conditions, bin_labels, default='Unknown')
    return None

RevolvingUtilizationOfUnsecuredLines(test)

In [8]:
def age(data):
    conditions = [
        (data['age'] <= 30),
        (data['age'] >30) & (data['age'] <=40),
        (data['age'] >40) & (data['age'] <=50),
        (data['age'] >50) & (data['age'] <=60),
        (data['age'] >60) & (data['age'] <=70),
        (data['age'] >70)
    ]
    bin_labels = ['Below30', '30-40', '40-50', '50-60', '60-70', 'Above70']

    data['Age'] = np.select(conditions, bin_labels, default='Unknown')
    return None

age(test)

In [9]:
def NumberOfTime3059DaysPastDueNotWorse(data):
    conditions = [
        (data['NumberOfTime30-59DaysPastDueNotWorse'] == 0),
        (data['NumberOfTime30-59DaysPastDueNotWorse'] >=1) & (data['NumberOfTime30-59DaysPastDueNotWorse'] <=2),
        (data['NumberOfTime30-59DaysPastDueNotWorse'] >2) & (data['NumberOfTime30-59DaysPastDueNotWorse'] <=5),
        (data['NumberOfTime30-59DaysPastDueNotWorse'] >5) & (data['NumberOfTime30-59DaysPastDueNotWorse'] <=10),
        (data['NumberOfTime30-59DaysPastDueNotWorse'] >10)
    ]
    bin_labels = ['No', 'Rare', 'Ocassional', 'Frequent', 'Very-Frequent']

    data['30-59DaysLatePayment'] = np.select(conditions, bin_labels, default='Unknown')
    return None

NumberOfTime3059DaysPastDueNotWorse(test)

In [10]:
def DebtRatio(data):
    conditions = [
        (data['DebtRatio'] <= 0.3),
        (data['DebtRatio'] >0.3) & (data['DebtRatio'] <=1),
        (data['DebtRatio'] >1)
    ]
    bin_labels = ['Low-Debt', 'Moderate-Debt', 'High-Debt']

    data['Debt'] = np.select(conditions, bin_labels, default='Unknown')
    return None

DebtRatio(test)

In [14]:
test['MonthlyIncome'].fillna(variables['MonthlyIncomeMedian'], inplace=True)

def MonthlyIncome(data):
    conditions = [
        (data['MonthlyIncome'] <= 1000),
        (data['MonthlyIncome'] >1000) & (data['MonthlyIncome'] <=4000),
        (data['MonthlyIncome'] >4000) & (data['MonthlyIncome'] <=7000),
        (data['MonthlyIncome'] >7000) & (data['MonthlyIncome'] <=10000),
        (data['MonthlyIncome'] >10000) & (data['MonthlyIncome'] <=15000),
        (data['MonthlyIncome'] >15000)
    ]
    bin_labels = ['Low-Income', 'Moderate-Income', 'Middle-Income', 'Upper-Middle-Income', 'High-Income', 'Rich']

    data['Monthly-Income'] = np.select(conditions, bin_labels, default='Unknown')
    return None

MonthlyIncome(test)

In [15]:
def NumberOfOpenCreditLinesAndLoans(data):
    conditions = [
        (data['NumberOfOpenCreditLinesAndLoans'] <= 2),
        (data['NumberOfOpenCreditLinesAndLoans'] >2) & (data['NumberOfOpenCreditLinesAndLoans'] <=5),
        (data['NumberOfOpenCreditLinesAndLoans'] >5) & (data['NumberOfOpenCreditLinesAndLoans'] <=10),
        (data['NumberOfOpenCreditLinesAndLoans'] >10) & (data['NumberOfOpenCreditLinesAndLoans'] <=15),
        (data['NumberOfOpenCreditLinesAndLoans'] >15)
    ]
    bin_labels = ['Few', 'Moderate', 'Many', 'Numerous', 'Extensive']

    data['OpenCreditLines'] = np.select(conditions, bin_labels, default='Unknown')
    return None

NumberOfOpenCreditLinesAndLoans(test)

In [16]:
def NumberOfTimes90DaysLate(data):
    conditions = [
        (data['NumberOfTimes90DaysLate'] == 0),
        (data['NumberOfTimes90DaysLate'] >=1) & (data['NumberOfTimes90DaysLate'] <=2),
        (data['NumberOfTimes90DaysLate'] >2) & (data['NumberOfTimes90DaysLate'] <=5),
        (data['NumberOfTimes90DaysLate'] >5) & (data['NumberOfTimes90DaysLate'] <=10),
        (data['NumberOfTimes90DaysLate'] >10)
    ]
    bin_labels = ['No', 'Rare', 'Ocassional', 'Frequent', 'Very-Frequent']

    data['90DaysLatePayment'] = np.select(conditions, bin_labels, default='Unknown')
    return None

NumberOfTimes90DaysLate(test)

In [17]:
def NumberRealEstateLoansOrLines(data):
    conditions = [
        (data['NumberRealEstateLoansOrLines'] == 0),
        (data['NumberRealEstateLoansOrLines'] >=1) & (data['NumberRealEstateLoansOrLines'] <=2),
        (data['NumberRealEstateLoansOrLines'] >2) & (data['NumberRealEstateLoansOrLines'] <=4),
        (data['NumberRealEstateLoansOrLines'] >4) & (data['NumberRealEstateLoansOrLines'] <=7),
        (data['NumberRealEstateLoansOrLines'] >7)
    ]
    bin_labels = ['No-Estate-Loans', 'Low-Estate-Loans', 'Moderate-Estate-Loans', 'High-Estate-Loans', 'Very-High-Estate-Loans']

    data['RealEstateLoans'] = np.select(conditions, bin_labels, default='Unknown')
    return None

NumberRealEstateLoansOrLines(test)

In [18]:
def NumberOfTime6089DaysPastDueNotWorse(data):
    conditions = [
        (data['NumberOfTime60-89DaysPastDueNotWorse'] == 0),
        (data['NumberOfTime60-89DaysPastDueNotWorse'] >=1) & (data['NumberOfTime60-89DaysPastDueNotWorse'] <=2),
        (data['NumberOfTime60-89DaysPastDueNotWorse'] >2) & (data['NumberOfTime60-89DaysPastDueNotWorse'] <=5),
        (data['NumberOfTime60-89DaysPastDueNotWorse'] >5)
    ]
    bin_labels = ['No', 'Rare', 'Ocassional', 'Frequent']

    data['60-89DaysLatePayment'] = np.select(conditions, bin_labels, default='Unknown')
    return None

NumberOfTime6089DaysPastDueNotWorse(test)

In [21]:
test['NumberOfDependents'].fillna(variables['DependentsMedian'], inplace=True)

def NumberOfDependents(data):
    conditions = [
        (data['NumberOfDependents'] == 0),
        (data['NumberOfDependents'] >0) & (data['NumberOfDependents'] <=2),
        (data['NumberOfDependents'] >2) & (data['NumberOfDependents'] <=5),
        (data['NumberOfDependents'] >5)
    ]
    bin_labels = ['No', 'Low', 'Moderate', 'High']

    data['Dependents'] = np.select(conditions, bin_labels, default='Unknown')
    return None

NumberOfDependents(test)

In [25]:
category_columns = ['CreditUtility', 'Age', '30-59DaysLatePayment', 'Debt', 'Monthly-Income', 
                    'OpenCreditLines', '90DaysLatePayment', 'RealEstateLoans', '60-89DaysLatePayment', 'Dependents']

In [26]:
test = test[category_columns]

In [30]:
for feature in category_columns:
    test[feature] = test[feature].map(variables['WoEDict'][feature])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [32]:
test[category_columns] = scaler.transform(test[category_columns])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)


In [33]:
test

Unnamed: 0,CreditUtility,Age,30-59DaysLatePayment,Debt,Monthly-Income,OpenCreditLines,90DaysLatePayment,RealEstateLoans,60-89DaysLatePayment,Dependents
1,0.000000,0.209404,1.000000,1.000000,0.491599,0.787668,1.0,0.658441,1.000000,1.000000
2,0.477796,0.398567,1.000000,0.000000,0.705646,0.862336,1.0,0.775963,1.000000,0.576829
3,1.000000,0.398567,1.000000,0.000000,0.491599,0.862336,1.0,1.000000,1.000000,0.576829
4,1.000000,0.101443,0.518855,0.000000,0.000000,1.000000,1.0,1.000000,1.000000,1.000000
5,0.000000,0.000000,1.000000,1.000000,0.000000,0.787668,1.0,0.658441,1.000000,0.576829
...,...,...,...,...,...,...,...,...,...,...
101499,1.000000,0.000000,1.000000,1.000000,0.000000,0.787668,1.0,0.658441,1.000000,1.000000
101500,0.000000,0.101443,0.207411,0.000000,0.705646,1.000000,1.0,1.000000,1.000000,0.321516
101501,1.000000,0.724462,1.000000,0.612473,0.491599,0.787668,1.0,0.658441,1.000000,1.000000
101502,0.477796,0.398567,1.000000,0.612473,0.491599,1.000000,1.0,1.000000,0.287214,0.321516


In [37]:
preds = lr.predict_proba(test)[:, 1]

In [54]:
columns = ["Id", "Probability"]
final = pd.DataFrame(columns=columns)

final['Id'] = np.arange(len(preds))+1
final['Probability'] = pd.Series(preds)

final.to_csv('result/submission.csv', index=False)

In [59]:
probability = lr.predict_proba(test)

In [68]:
columns = ["Id", "Good-Customer", "Bad-Customer"]
proba = pd.DataFrame(probability).rename(columns={0: 'Good-Customer',1: 'Bad-Customer'})
proba['Id'] = np.arange(len(preds))+1

proba = proba[columns]

proba.to_csv('result/probability.csv', index=False)

In [69]:
proba

Unnamed: 0,Id,Good-Customer,Bad-Customer
0,1,0.475070,0.524930
1,2,0.603482,0.396518
2,3,0.799876,0.200124
3,4,0.515135,0.484865
4,5,0.371959,0.628041
...,...,...,...
101498,101499,0.748373,0.251627
101499,101500,0.110352,0.889648
101500,101501,0.840893,0.159107
101501,101502,0.399656,0.600344
