In [61]:
import pandas as pd

In [62]:
# Load the credit dataset from a CSV file (path relative to the notebook) into a DataFrame.
df = pd.read_csv('../data/credit-data.csv')

## Verificando dados

In [63]:
# Summary statistics for every numeric column; used to spot out-of-range values
# and columns whose count is lower than the number of rows (missing data).
df.describe()

Unnamed: 0,clientid,income,age,loan,default
count,2000.0,2000.0,1997.0,2000.0,2000.0
mean,1000.5,45331.600018,40.807559,4444.369695,0.1415
std,577.494589,14326.327119,13.624469,3045.410024,0.348624
min,1.0,20014.48947,-52.42328,1.37763,0.0
25%,500.75,32796.459717,28.990415,1939.708847,0.0
50%,1000.5,45789.117313,41.317159,3974.719419,0.0
75%,1500.25,57791.281668,52.58704,6432.410625,0.0
max,2000.0,69995.685578,63.971796,13766.051239,1.0


In [64]:
# Show the rows whose age is below zero (the summary above reports a negative minimum age).
idade_abaixo_de_zero = df['age'] < 0
df[idade_abaixo_de_zero]

Unnamed: 0,clientid,income,age,loan,default
15,16,50501.726689,-28.218361,3977.287432,0
21,22,32197.620701,-52.42328,4244.057136,0
26,27,63287.038908,-36.496976,9595.286289,0


In [65]:
# Compute the mean of the strictly positive ages and use it to overwrite
# every negative age. Missing (NaN) ages match neither mask and are left as-is.
idade_positiva = df['age'] > 0
idade_negativa = df['age'] < 0
media_idade = df.loc[idade_positiva, 'age'].mean()
df.loc[idade_negativa, 'age'] = media_idade

### Separando previsores e classe

In [73]:
# Feature matrix: the columns at positions 1, 2 and 3 of the DataFrame, as a NumPy array.
colunas_previsoras = slice(1, 4)
previsores = df.iloc[:, colunas_previsoras].values
previsores[:5]

array([[6.61559251e+04, 5.90170151e+01, 8.10653213e+03],
       [3.44151540e+04, 4.81171531e+01, 6.56474502e+03],
       [5.73171701e+04, 6.31080495e+01, 8.02095330e+03],
       [4.27095342e+04, 4.57519724e+01, 6.10364226e+03],
       [6.69526888e+04, 1.85843359e+01, 8.77009924e+03]])

In [74]:
# Target vector: the column at position 4 of the DataFrame, as a NumPy array.
indice_classe = 4
classe = df.iloc[:, indice_classe].values
classe[:5]

array([0, 0, 0, 0, 1])

### Valores faltantes

In [75]:
# Report whether the dataset contains any missing value, then print the name
# of each column that has at least one.
print('Existe falores faltantes?', df.isna().values.any())
for coluna in df.columns:
    # pd.isnull is an alias of pd.isna, so one check per column is sufficient.
    if df[coluna].isna().any():
        print(coluna)

Existe falores faltantes? True
age


In [76]:
from sklearn.impute import SimpleImputer

try:
    # Legacy class: deprecated in scikit-learn 0.20 and removed in 0.22.
    # Kept only so older environments that still ship it import unchanged.
    from sklearn.preprocessing import Imputer
except ImportError:
    Imputer = None

In [80]:
# Replace missing values with the mean of their column.
# SimpleImputer supersedes the removed sklearn.preprocessing.Imputer: it always
# works column-wise (the old axis=0) and treats NaN as missing by default.
imputer = SimpleImputer(strategy='mean')
# fit_transform already fits the imputer, so no separate fit() call is needed.
previsores = imputer.fit_transform(previsores)
previsores[:5]

array([[6.61559251e+04, 5.90170151e+01, 8.10653213e+03],
       [3.44151540e+04, 4.81171531e+01, 6.56474502e+03],
       [5.73171701e+04, 6.31080495e+01, 8.02095330e+03],
       [4.27095342e+04, 4.57519724e+01, 6.10364226e+03],
       [6.69526888e+04, 1.85843359e+01, 8.77009924e+03]])

### Escalonamento dos valores

In [81]:
from sklearn.preprocessing import StandardScaler

In [83]:
# Standardize every feature column (zero mean, unit variance).
# NOTE(review): the scaler is fitted on the full feature matrix here, before the
# train/test split done later in the notebook, so test rows influence the
# scaling statistics. Consider fitting on the training split only.
scaler = StandardScaler()
previsores = scaler.fit(previsores).transform(previsores)
previsores[:5]

array([[ 1.45393393,  1.36538005,  1.20281942],
       [-0.76217555,  0.54265932,  0.69642695],
       [ 0.83682073,  1.67417101,  1.17471147],
       [-0.18307006,  0.36413567,  0.54497999],
       [ 1.50956319, -1.68647541,  1.4207648 ]])

## Separando dados de teste e treinamento

In [85]:
from sklearn.model_selection import train_test_split

In [87]:
# Hold out 25% of the rows for testing; the fixed random_state makes the split reproducible.
(previsores_treinamento,
 previsores_teste,
 classe_treinamento,
 classe_teste) = train_test_split(
    previsores,
    classe,
    test_size=0.25,
    random_state=0,
)

In [88]:
# Print the number of rows in each of the four arrays produced by the split.
for rotulo, conjunto in (
    ('Previsores treinamento:', previsores_treinamento),
    ('Previsores teste:', previsores_teste),
    ('Classe treinamento:', classe_treinamento),
    ('Classe teste:', classe_teste),
):
    print(rotulo, len(conjunto))

Previsores treinamento: 1500
Previsores teste: 500
Classe treinamento: 1500
Classe teste: 500
