In [45]:
import pandas as pd
import datetime

from sklearn.linear_model import Lasso

In [46]:
df1 = pd.read_csv("data/sample/sampleAccepted.csv")
df1 = df1.drop(columns=["member_id", "url"])
df1.head().T

Unnamed: 0,0,1,2,3,4
id,145181203,102630556,135925949,59212770,74554116
loan_amnt,9100,8000,25000,7500,12000
funded_amnt,9100,8000,25000,7500,12000
funded_amnt_inv,9100,8000,25000,7500,12000
term,36 months,36 months,36 months,36 months,36 months
...,...,...,...,...,...
settlement_status,,,,,
settlement_date,,,,,
settlement_amount,,,,,
settlement_percentage,,,,,


In [47]:
def formatTerm(line):
    line = line.strip()
    line = line.split(" ")

    if line[-1] == "months":
        line = int(line[0]) * 30
    
    else:
        raise NotImplementedError(f"{line[-1]} não implementado ainda")

    return line

df1["term"] = df1["term"].apply(formatTerm)
df1.head()

Unnamed: 0,id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,...,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
0,145181203,9100.0,9100.0,9100.0,1080,26.31,368.15,E,E4,Rural carrier,...,,,Cash,N,,,,,,
1,102630556,8000.0,8000.0,8000.0,1080,16.99,285.19,D,D1,Gas Turbine Mechanic,...,,,Cash,N,,,,,,
2,135925949,25000.0,25000.0,25000.0,1080,8.08,784.34,A,A5,HR Business Partner,...,,,DirectPay,N,,,,,,
3,59212770,7500.0,7500.0,7500.0,1080,7.89,234.65,A,A5,manager,...,,,Cash,N,,,,,,
4,74554116,12000.0,12000.0,12000.0,1080,8.39,378.2,B,B1,Pharmacy Technician,...,,,Cash,N,,,,,,


In [48]:
df1.info(verbose=True, null_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 149 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   id                                          10000 non-null  int64  
 1   loan_amnt                                   10000 non-null  float64
 2   funded_amnt                                 10000 non-null  float64
 3   funded_amnt_inv                             10000 non-null  float64
 4   term                                        10000 non-null  int64  
 5   int_rate                                    10000 non-null  float64
 6   installment                                 10000 non-null  float64
 7   grade                                       10000 non-null  object 
 8   sub_grade                                   10000 non-null  object 
 9   emp_title                                   9257 non-null   object 
 10  emp_length

In [49]:
target = df1[["int_rate", "grade", "loan_status"]]
target.head()

Unnamed: 0,int_rate,grade,loan_status
0,26.31,E,Late (31-120 days)
1,16.99,D,Charged Off
2,8.08,A,Current
3,7.89,A,Fully Paid
4,8.39,B,Fully Paid


In [50]:
df1 = df1.join(pd.get_dummies(df1["home_ownership"]))
df1 = df1.drop(columns=["home_ownership", "OWN"])
ownership = ["ANY", "MORTGAGE", "NONE", "OTHER", "RENT"]
df1

Unnamed: 0,id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,...,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term,ANY,MORTGAGE,NONE,OTHER,RENT
0,145181203,9100.0,9100.0,9100.0,1080,26.31,368.15,E,E4,Rural carrier,...,,,,,,0,0,0,0,0
1,102630556,8000.0,8000.0,8000.0,1080,16.99,285.19,D,D1,Gas Turbine Mechanic,...,,,,,,0,0,0,0,1
2,135925949,25000.0,25000.0,25000.0,1080,8.08,784.34,A,A5,HR Business Partner,...,,,,,,0,0,0,0,1
3,59212770,7500.0,7500.0,7500.0,1080,7.89,234.65,A,A5,manager,...,,,,,,0,1,0,0,0
4,74554116,12000.0,12000.0,12000.0,1080,8.39,378.20,B,B1,Pharmacy Technician,...,,,,,,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,137815471,10000.0,10000.0,10000.0,1080,23.40,389.19,E,E1,Service Manager,...,,,,,,0,0,0,0,1
9996,5524600,9500.0,9500.0,9500.0,1080,15.80,333.06,C,C3,BLUWORLD,...,,,,,,0,1,0,0,0
9997,7677306,6000.0,6000.0,6000.0,1080,9.99,193.58,B,B1,supervisor,...,,,,,,0,1,0,0,0
9998,59251152,6125.0,6125.0,6125.0,1080,16.99,218.35,D,D3,Porter,...,,,,,,0,0,0,0,1


In [58]:
ownership = ["ANY", "MORTGAGE", "NONE", "OTHER", "RENT"]

features = df1[[
    "annual_inc", # Renda Anual
    "emp_length", # Tempo no emprego
    "issue_d", # data do empréstimo, para comparar com a taxa de juros básica
    "purpose", # objetivo
    "dti" # Debt-to-income <https://www.investopedia.com/terms/d/dti.asp>
    "delinq_2yrs"
     ]]



features = features.join(df1[
    ownership # Estado da residência(hipotecada, aluguel, etc.)
    ])

features.head()

Unnamed: 0,annual_inc,emp_length,issue_d,ANY,MORTGAGE,NONE,OTHER,RENT
0,62000.0,3 years,Dec-2018,0,0,0,0,0
1,69000.0,8 years,Mar-2017,0,0,0,0,1
2,102000.0,2 years,Jul-2018,0,0,0,0,1
3,33000.0,10+ years,Sep-2015,0,1,0,0,0
4,32201.0,10+ years,Mar-2016,0,0,0,0,1


In [72]:
df1.iloc[:5, 15:30]

Unnamed: 0,pymnt_plan,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc
0,n,,other,Other,465xx,IN,13.38,0.0,Nov-2003,725.0,729.0,1.0,41.0,,8.0
1,n,,debt_consolidation,Debt consolidation,968xx,HI,30.9,0.0,Feb-2007,665.0,669.0,1.0,37.0,,12.0
2,n,,credit_card,Credit card refinancing,986xx,WA,18.49,0.0,Oct-1997,745.0,749.0,0.0,,,12.0
3,n,,debt_consolidation,Debt consolidation,734xx,OK,24.62,0.0,Aug-1988,735.0,739.0,0.0,,84.0,7.0
4,n,,debt_consolidation,Debt consolidation,109xx,NY,28.14,0.0,Jan-2001,760.0,764.0,0.0,,,11.0


In [74]:
df1["delinq_2yrs"].unique()

array([ 0.,  1.,  5.,  2.,  6.,  3.,  4.,  9.,  7.,  8., 14., 10., 12.])