In [None]:
import os
import pandas as pd
import numpy as np
from pathlib import Path
import re


In [None]:
# Directory that holds the raw download and every CSV written by this notebook.
dataset_dir = '../datasets/prosper-loans'

# exist_ok=True keeps the cell re-runnable and avoids the check-then-create
# race of a separate os.path.exists() test; intermediate directories are
# still created as needed.
os.makedirs(dataset_dir, exist_ok=True)


In [None]:
%%bash
# Stop immediately if the dataset directory is missing, so the file is never
# downloaded into whatever directory the cell happened to start in.
cd ../datasets/prosper-loans || exit 1

# Download only when the CSV is not there yet: re-running a bare wget would
# fetch the file again and save it under a numbered name instead.
# The data is written to a temporary name first and renamed on success, so an
# interrupted transfer is never mistaken for the complete file.
if [ ! -f prosperLoanData.csv ]; then
    wget -O prosperLoanData.csv.part https://s3.amazonaws.com/udacity-hosted-downloads/ud651/prosperLoanData.csv \
        && mv prosperLoanData.csv.part prosperLoanData.csv
fi


In [None]:
# Read the raw Prosper loan export from the dataset directory into a DataFrame.
df = pd.read_csv(Path(dataset_dir) / 'prosperLoanData.csv')

In [None]:
# Columns retained from the raw export; everything else is discarded.
# ClosedDate is later used only to order the rows, and LoanStatus is later
# converted to a boolean and moved to the last column.
features = [
    'BorrowerAPR',
    'BorrowerRate',
    'EstimatedEffectiveYield',
    'EstimatedReturn',
    'ProsperScore',
    'CreditScoreRangeUpper',
    'OpenCreditLines',
    'AmountDelinquent',
    'StatedMonthlyIncome',
    'MonthlyLoanPayment',
    'ClosedDate',
    'LoanStatus',
]
# Label-based selection of exactly these columns, in this order.
df = df.loc[:, features]


In [None]:
# Keep only rows that have a ProsperScore and whose loan reached one of the
# three listed end states, then turn LoanStatus into a boolean:
# True for 'Completed', False for 'Defaulted' / 'Chargedoff'.
#
# The whole step is one chain ending in .assign(), which returns a fresh
# frame. Assigning a column directly on the boolean-mask slice (as a separate
# statement) is the chained-assignment pattern that pandas reports with
# SettingWithCopyWarning.
df = (
    df.dropna(subset=['ProsperScore'])
      .loc[lambda frame: frame['LoanStatus'].isin(['Completed', 'Defaulted', 'Chargedoff'])]
      .assign(LoanStatus=lambda frame: frame['LoanStatus'] == 'Completed')
)



In [None]:
# Balance the two LoanStatus classes by down-sampling each one to the size of
# the smaller class.
g = df.groupby('LoanStatus')

# Size of the smaller class, computed once instead of once per group.
n_per_class = g.size().min()

# Sample each group explicitly rather than through groupby.apply():
# pandas 2.2 deprecates passing the grouping column into apply(), and if later
# releases exclude it (TODO confirm), the reset_index(drop=True) below would
# silently discard LoanStatus. Iterating the groups yields full sub-frames
# (LoanStatus included) and keeps the same per-group seed and group order as before.
df = pd.concat(
    [group.sample(n_per_class, random_state=1) for _, group in g]
).reset_index(drop=True)


In [None]:
# Order the rows by the date each loan was closed. ClosedDate is parsed to
# datetimes only to drive the sort and is removed afterwards.
df = (
    df.assign(ClosedDate=pd.to_datetime(df['ClosedDate']))
      .sort_values(by='ClosedDate')
      .drop(columns='ClosedDate')
)


In [None]:
# Reorder the columns so that LoanStatus comes last; all other columns keep
# their relative order.
leading_cols = [col for col in df.columns if col != 'LoanStatus']
df = df[leading_cols + ['LoanStatus']]

In [None]:
# Write the prepared frame to disk without the index column.
sorted_csv_path = os.path.join(dataset_dir, 'prosper-loans.csv')
df.to_csv(sorted_csv_path, index=False)

In [None]:
# Stack the frame on top of itself so that every row appears exactly twice
# (index labels are not renumbered, so each label also appears twice).
# An earlier, now disabled, alternative drew a bootstrap sample instead:
#   df.sample(frac=2, replace=True, random_state=df.shape[0])
twice_df = pd.concat([df, df])

In [None]:
# Shuffle the doubled frame: frac=1 draws every row once, without replacement
# (the default). The number of rows is used as the seed.
row_count = len(twice_df)
shuffled_df = twice_df.sample(frac=1, random_state=row_count)

# Write the shuffled rows to disk without the index column.
shuffled_csv_path = os.path.join(dataset_dir, 'prosper-loans-shuffled.csv')
shuffled_df.to_csv(shuffled_csv_path, index=False)