In [1]:
# Column names, in file order, for the two kinds of raw data file. concatenate()
# passes these to pd.read_csv as `names=` (the files are read with header=None).
_ACQUISITION_COLUMNS = [
    "id", "channel", "seller", "interest_rate", "balance", "loan_term",
    "origination_date", "first_payment_date", "ltv", "cltv",
    "borrower_count", "dti", "borrower_credit_score", "first_time_homebuyer",
    "loan_purpose", "property_type", "unit_count", "occupancy_status",
    "property_state", "zip", "insurance_percentage", "product_type",
    "co_borrower_credit_score",
]

_PERFORMANCE_COLUMNS = [
    "id", "reporting_period", "servicer_name", "interest_rate", "balance",
    "loan_age", "months_to_maturity", "maturity_date", "msa",
    "delinquency_status", "modification_flag", "zero_balance_code",
    "zero_balance_date", "last_paid_installment_date", "foreclosure_date",
    "disposition_date", "foreclosure_costs", "property_repair_costs",
    "recovery_costs", "misc_costs", "tax_costs", "sale_proceeds",
    "credit_enhancement_proceeds", "repurchase_proceeds",
    "other_foreclosure_proceeds", "non_interest_bearing_balance",
    "principal_forgiveness_balance",
]

# Full column layout of each raw file type, keyed by file-name prefix.
HEADERS = {
    "Acquisition": _ACQUISITION_COLUMNS,
    "Performance": _PERFORMANCE_COLUMNS,
}

# Subset of columns kept when the raw files are combined: every acquisition
# column, but only the loan id and foreclosure date from the performance data.
SELECT = {
    "Acquisition": HEADERS["Acquisition"],
    "Performance": ["id", "foreclosure_date"],
}

In [2]:
import os

import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict

import settings


In [3]:
def concatenate(prefix='Acquisition'):
    """Combine every raw file whose name starts with `prefix` into one file.

    Each matching file in settings.DATA_DIR is read as headerless,
    pipe-delimited text using the column names in HEADERS[prefix], reduced to
    the columns in SELECT[prefix], and stacked row-wise. The result is written
    to "<prefix>.txt" in settings.PROCESSED_DIR, pipe-delimited, with a header
    row and no index column.

    Args:
        prefix: File-name prefix to match; must be a key of HEADERS and SELECT.

    Raises:
        KeyError: If `prefix` is not a key of SELECT / HEADERS.
        ValueError: If no file in settings.DATA_DIR starts with `prefix`.
    """
    columns = SELECT[prefix]

    # os.listdir returns entries in arbitrary order; sort so that the row
    # order of the combined file is the same on every run and platform.
    matching = sorted(
        name for name in os.listdir(settings.DATA_DIR) if name.startswith(prefix)
    )
    if not matching:
        # Fail with an actionable message instead of letting pd.concat choke
        # on an empty list.
        raise ValueError(
            "No files starting with {!r} found in {}".format(prefix, settings.DATA_DIR)
        )

    frames = []
    for name in matching:
        data = pd.read_csv(
            os.path.join(settings.DATA_DIR, name),
            sep="|",
            header=None,
            names=HEADERS[prefix],
            index_col=False,
        )
        frames.append(data[columns])

    combined = pd.concat(frames, axis=0)
    combined.to_csv(
        os.path.join(settings.PROCESSED_DIR, "{}.txt".format(prefix)),
        sep="|",
        header=columns,
        index=False,
    )

In [8]:
# Load the combined acquisition file and preview its first rows. The path is
# built from settings.PROCESSED_DIR, the same directory concatenate() writes to,
# so the read location cannot drift from the write location.
acquisition = pd.read_csv(os.path.join(settings.PROCESSED_DIR, "Acquisition.txt"), sep="|")
acquisition.head()

Unnamed: 0,id,channel,seller,interest_rate,balance,loan_term,origination_date,first_payment_date,ltv,cltv,...,loan_purpose,property_type,unit_count,occupancy_status,property_state,zip,insurance_percentage,product_type,co_borrower_credit_scoreinsurance_type,relocation_indicator
0,100000127831,R,OTHER,4.75,415000,360,09/2013,11/2013,80,80.0,...,P,PU,1,P,CO,801,,FRM,,
1,100000614072,C,FRANKLIN AMERICAN MORTGAGE COMPANY,4.0,92000,180,09/2013,11/2013,71,71.0,...,R,SF,1,P,TX,750,,FRM,,
2,100002626362,B,OTHER,4.75,156000,360,10/2013,12/2013,70,70.0,...,R,SF,1,P,CO,800,,FRM,710.0,
3,100005817875,R,OTHER,3.5,113000,180,09/2013,11/2013,44,44.0,...,R,PU,1,P,MO,640,,FRM,787.0,
4,100006410152,R,OTHER,5.0,135000,360,11/2013,01/2014,75,75.0,...,P,PU,1,I,WA,983,,FRM,816.0,


In [9]:
# Print a summary of the loaded frame: row count plus each column's dtype.
acquisition.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5916667 entries, 0 to 5916666
Data columns (total 24 columns):
id                                        int64
channel                                   object
seller                                    object
interest_rate                             float64
balance                                   int64
loan_term                                 int64
origination_date                          object
first_payment_date                        object
ltv                                       int64
cltv                                      float64
borrower_count                            int64
dti                                       float64
borrower_credit_score                     float64
first_time_homebuyer                      object
loan_purpose                              object
property_type                             object
unit_count                                int64
occupancy_status                          object
propert

In [11]:
import sklearn
from sklearn import metrics


In [12]:
def compute_error(target, predictions):
    """Return the accuracy of `predictions` measured against `target`.

    Delegates to scikit-learn's accuracy_score, i.e. the fraction of
    predictions that match the actual foreclosure_status values. Despite the
    function's name, a higher value means a better fit.
    """
    accuracy = metrics.accuracy_score(target, predictions)
    return accuracy

In [13]:
def cross_validate(train):
    """Return cross-validated logistic-regression predictions for `train`.

    Args:
        train: DataFrame containing the settings.TARGET column plus the
            feature columns.

    Returns:
        The out-of-fold predictions produced by cross_val_predict, one per
        row of `train`.
    """
    # class_weight="balanced" compensates for the imbalanced target classes;
    # random_state is pinned so repeated runs give the same model.
    clf = LogisticRegression(random_state=1, class_weight="balanced")

    # Train on every column except those listed in settings.NON_PREDICTORS
    # (presumably the id and the target itself -- confirm in settings).
    predictors = [col for col in train.columns.tolist() if col not in settings.NON_PREDICTORS]

    # cross_val_predict lives in sklearn.model_selection; the old
    # sklearn.cross_validation module has been removed from scikit-learn.
    return cross_val_predict(
        clf, train[predictors], train[settings.TARGET], cv=settings.CV_FOLDS
    )

In [18]:
def compute_false_negatives(target, predictions):
    """Return the share of actual positives that were predicted as negative.

    Args:
        target: Actual labels (1 = positive, 0 = negative).
        predictions: Predicted labels, same length as `target`.

    Returns:
        float: false negatives / (actual positives + 1). The "+ 1" in the
        denominator keeps the division defined when `target` contains no
        positives, at the cost of slightly understating the true rate.
    """
    # pd.DataFrame must be *called* with the dict; subscripting the class
    # with square brackets raises TypeError.
    df = pd.DataFrame({"target": target, "predictions": predictions})

    is_positive = df["target"] == 1
    false_negatives = df[is_positive & (df["predictions"] == 0)].shape[0]
    positives = df[is_positive].shape[0]
    return false_negatives / (positives + 1)


In [4]:
if __name__ == "__main__":
    # Build one combined file per raw data set, acquisition data first.
    for dataset in ("Acquisition", "Performance"):
        concatenate(dataset)

IndexError: list index out of range