# LER DADOS

In [1]:
import pandas as pd

df = pd.read_csv('./source.csv')

df

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


# CÉLULAS NULAS

In [2]:
df.apply(lambda x : sum(x.isnull()), axis=0)

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

Alternativas para o cliente:
- Deletar linhas de base pequena -> enviesar (underfitting)
- Colocar a moda / a média / um valor -> arrisca punir ou beneficiar mais um dado do que outro
- Considerar o nulo

# COLOCANDO A MODA

In [3]:
df['Gender'].fillna(df['Gender'].mode()[0], inplace=True) # moda, para favorecer o que mais aparecer
df['Married'].fillna(df['Married'].mode()[0], inplace=True) # moda, para favorecer o que mais aparecer
df['Dependents'].fillna(df['Dependents'].mode()[0], inplace=True) # moda, para favorecer o que mais aparecer
df['Loan_Amount_Term'].fillna(df['Loan_Amount_Term'].mode()[0], inplace=True) # moda, para favorecer o que mais aparecer
df['Credit_History'].fillna(df['Credit_History'].mode()[0], inplace=True) # moda, para favorecer o que mais aparecer
df['LoanAmount'].fillna(df['LoanAmount'].mean(), inplace=True) # não dá pra usar moda (contínuo) ou mediana (o meio termo)
df['Self_Employed'].fillna('No', inplace=True) # MEI é mais arriscado assegurar, então pune

df.apply(lambda x : sum(x.isnull()), axis=0)

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

# TRANSFORMAR CATEGORIAS EM NÚMEROS

In [4]:
from sklearn.preprocessing import LabelEncoder

df = df.drop(axis=1, columns=['Loan_ID']) # remover o loan id das colunas

var_mod = df.columns

le = LabelEncoder()

# for i in var_mod:
    # df[i] = le.fit_transform(df[i])

for i in var_mod:
    df[i] = le.fit_transform(df[i])

df

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,1,0,0,0,0,376,0,100,8,1,2,1
1,1,1,1,0,0,306,60,81,8,1,0,0
2,1,1,0,0,1,139,0,26,8,1,2,1
3,1,1,0,1,0,90,160,73,8,1,2,1
4,1,0,0,0,0,381,0,94,8,1,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...
609,0,0,0,0,0,125,0,29,8,1,0,1
610,1,1,3,0,0,275,0,7,5,1,0,1
611,1,1,1,0,0,431,3,163,8,1,2,1
612,1,1,2,0,0,422,0,133,8,1,2,1


In [5]:
# data frame de variáveis para prever o resultado
df_vars = df.drop(axis=1, columns=['LoanAmount'])

df_vars

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,1,0,0,0,0,376,0,8,1,2,1
1,1,1,1,0,0,306,60,8,1,0,0
2,1,1,0,0,1,139,0,8,1,2,1
3,1,1,0,1,0,90,160,8,1,2,1
4,1,0,0,0,0,381,0,8,1,2,1
...,...,...,...,...,...,...,...,...,...,...,...
609,0,0,0,0,0,125,0,8,1,0,1
610,1,1,3,0,0,275,0,5,1,0,1
611,1,1,1,0,0,431,3,8,1,2,1
612,1,1,2,0,0,422,0,8,1,2,1


In [6]:
df_res = df[['LoanAmount']]

df_res

Unnamed: 0,LoanAmount
0,100
1,81
2,26
3,73
4,94
...,...
609,29
610,7
611,163
612,133


# SEPARANDO BASE DE TESTE E DE TREINO

In [7]:
from sklearn.model_selection import train_test_split

# 30% dos df serão base de teste
# random_state -> seed do modelo para criar modelos próximos usando a mesma seed na mesma máquina para variar variáveis
vars_train, vars_test, res_train, res_test = train_test_split(df_vars, df_res, test_size=0.3, random_state=777)

# TREINANDO O MODELO

In [8]:
from sklearn.linear_model import LinearRegression

model = LinearRegression().fit(vars_train, res_train)

#print("intercept", model.intercept_) # termo independente da regressão
#print("coeficiente", model.coef_) # coef de cada variável da regressão

res_prev = model.predict(vars_train) # previsões geradas

res_prev

array([[ 85.91871131],
       [ 73.99696735],
       [ 93.02222197],
       [102.39016225],
       [ 52.66108875],
       [ 81.34539951],
       [ 59.36989076],
       [ 48.72521309],
       [113.22156261],
       [ 88.84157633],
       [ 85.84945188],
       [105.05536828],
       [160.04094095],
       [102.92302417],
       [ 79.25987264],
       [ 88.65176242],
       [154.39172967],
       [110.21356328],
       [ 71.05439589],
       [ 51.45447297],
       [113.84701665],
       [ 52.5612483 ],
       [116.47531863],
       [ 99.5804138 ],
       [114.56687986],
       [ 96.06589753],
       [ 63.74400694],
       [ 56.63791989],
       [ 65.84324499],
       [ 40.96883726],
       [ 83.68263277],
       [ 99.6932839 ],
       [147.59590025],
       [ 97.94770426],
       [ 56.61021845],
       [ 84.75713981],
       [ 19.22742693],
       [ 74.52362118],
       [ 55.51758958],
       [ 46.76410407],
       [ 96.72685418],
       [107.07694608],
       [ 94.98886956],
       [ 59

# MEDINDO O MODELO

## MEDINDO O TREINO

In [9]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

predict_train = model.predict(vars_train)

print(mean_absolute_error(res_train, predict_train)) # medir a média de erro de variável contínua 
print(mean_squared_error(res_train, predict_train)) # grau de intensidade do erro
print(mean_absolute_error(res_train, predict_train) / res_train.mean()) # percent de diferença da média do real com o da previsão

24.36958827543718
1016.1797439484237
LoanAmount    0.282662
dtype: float64


## MEDINDO COM DADOS QUE NÃO CONHECE

In [10]:
predict_test = model.predict(vars_test)

print(mean_absolute_error(res_test, predict_test))
print(mean_squared_error(res_test, predict_test))
print(mean_absolute_error(res_test, predict_test) / res_test.mean())

26.42331331092049
1361.6018919367023
LoanAmount    0.289695
dtype: float64
