In [1]:
import os
import pandas as pd
import numpy as np

In [124]:
def create_data(N, id_col, corrupt_features=False, corrupt_labels=False, seed=None):
    """Generate a synthetic loan-repayment dataset.

    Each row gets a boolean ``college_degree`` (true with probability 0.7), a
    ``loan_amount`` drawn from Normal(6000, 1500) and a ``loan_repaid`` label.
    The label is positive when the standardised education term minus the
    standardised loan amount plus Normal(0, 0.5) noise is non-negative, i.e.
    a degree makes repayment more likely and a larger loan makes it less likely.

    Parameters
    ----------
    N : int
        Number of rows to generate.
    id_col : sequence
        Values for the ``loan_id`` column; expected to hold ``N`` entries.
    corrupt_features : bool, default False
        If true, add 2000 to roughly 15% of the loan amounts *after* the labels
        have been computed, so labels still reflect the clean amounts.
    corrupt_labels : bool, default False
        If true, force the label to 0 on a random 25% of the rows (rows that
        were already 0 stay 0).
    seed : int or None, default None
        If given, all draws come from a private ``np.random.RandomState(seed)``
        and the result is reproducible. If ``None``, NumPy's global random
        state is used, exactly as before this parameter existed.

    Returns
    -------
    pandas.DataFrame
        Columns ``loan_id``, ``college_degree``, ``loan_amount``,
        ``loan_repaid``. Note that ``loan_repaid`` is boolean unless
        ``corrupt_labels`` is set, in which case it is ``int64`` (0/1).
    """
    # The np.random module and RandomState expose the same uniform/normal/choice
    # API, so one code path serves both the seeded and the global-state case.
    rng = np.random if seed is None else np.random.RandomState(seed)

    p = 0.7  # proportion that are educated (education=1)
    p_std = np.sqrt(p * (1 - p))  # standard deviation of a Bernoulli(p) variable
    education = rng.uniform(low=0, high=1, size=N) <= p

    loan_mean, loan_std = 6000, 1500
    loan_amount = rng.normal(loan_mean, loan_std, size=N)
    noise_term = rng.normal(0, 0.5, size=N)

    # Education enters with a positive sign (degree -> more likely repaid);
    # loan amount enters with a negative sign (bigger loan -> less likely repaid).
    score = (education - p) / p_std - (loan_amount - loan_mean) / loan_std + noise_term
    loan_status = score >= 0

    if corrupt_features:
        # Shift a random ~15% of the amounts upward by 2000.
        loan_amount += (rng.uniform(low=0, high=1, size=N) <= 0.15) * 2000
    if corrupt_labels:
        n_corrupt = int(0.25 * N)
        idx = rng.choice(N, n_corrupt, replace=False)
        loan_status = loan_status.astype(np.int64)
        # Selected rows become 0 regardless of their previous value.
        loan_status[idx] = 0

    d = {
        'loan_id': id_col,
        'college_degree': education,
        'loan_amount': loan_amount,
        'loan_repaid': loan_status
    }
    return pd.DataFrame(data=d)
    

In [128]:
from sklearn.linear_model import LogisticRegression

def compute_regression_score(res):
    """Fit a logistic regression on ``res`` and score it on the same rows.

    ``res`` must provide the columns ``college_degree`` and ``loan_amount``
    (used as features) and ``loan_repaid`` (used as the target). The returned
    value is whatever ``LogisticRegression.score`` reports for the data the
    model was trained on; no held-out split is made here.
    """
    features = res[['college_degree', 'loan_amount']]
    target = res['loan_repaid']
    classifier = LogisticRegression().fit(features, target)
    return classifier.score(features, target)

In [129]:
# Quick sanity-check sample: 800 rows with zero-padded IDs "ID0000".."ID0799",
# clean features and corrupted labels.
id_col = [f"ID{x:04d}" for x in range(800)]
res = create_data(N=800, id_col=id_col, corrupt_features=False, corrupt_labels=True)

In [130]:
# Preview the first rows of the generated sample.
res.head()

Unnamed: 0,loan_id,college_degree,loan_amount,loan_repaid
0,ID0000,True,5194.043915,1
1,ID0001,True,8940.178925,0
2,ID0002,False,6166.882851,0
3,ID0003,True,8104.82804,1
4,ID0004,True,8709.169478,0


In [131]:
# Summary statistics; include='all' also reports the non-numeric columns
# (loan_id, college_degree) alongside the numeric ones.
res.describe(include='all')

Unnamed: 0,loan_id,college_degree,loan_amount,loan_repaid
count,800,800,800.0,800.0
unique,800,2,,
top,ID0241,True,,
freq,1,576,,
mean,,,5927.698154,0.4375
std,,,1426.292842,0.496389
min,,,1319.872451,0.0
25%,,,4950.598817,0.0
50%,,,5861.938229,0.0
75%,,,6921.020864,1.0


In [132]:
# Score of a logistic regression that is fit and evaluated on `res` itself
# (the label-corrupted sample); there is no held-out split.
compute_regression_score(res)

0.825

In [133]:
# Nominal row counts for each split.
# NOTE(review): the export loop further down hard-codes its own sizes
# (853 / 428 / 613 / 275) and does not read these names -- confirm which
# set of numbers is intended.
train_data_size = 800
test_data_size = 400
# The two sizes above are the reference (uncorrupted) splits.
bad_test_data_size = 400  # features are bad, labels are good
label_shift_data_size = 400  # features are good, labels are bad

In [135]:
# Seed NumPy's global RNG so that re-running this cell regenerates identical
# CSV files; create_data draws from the global state when called this way.
np.random.seed(0)

# Running offset into the ID space so loan IDs are unique across all files.
total = 0
# Each entry: (file stem, number of rows, corrupt_features, corrupt_labels).
for name, N, corrupt_features, corrupt_labels in [
    ('train', 853, False, False), ('test', 428, False, False),
    ('feature_shift', 613, True, False), ('label_shift', 275, False, True)]:
    id_col = ["ID%04d" % x for x in range(total, total + N)]
    total += N
    df = create_data(N, id_col, corrupt_features, corrupt_labels)
    # Written to the current working directory as <name>.csv, without the index.
    save_path = f'{name}.csv'
    df.to_csv(save_path, index=False)