In [1]:
import pandas as pd

In [73]:
# Load the credit dataset from its relative CSV path into a DataFrame.
df = pd.read_csv(filepath_or_buffer='../data/credit-data.csv')

In [3]:
# Preview the first five rows of the frame.
df.head(n=5)

Unnamed: 0,clientid,income,age,loan,default
0,1,66155.925095,59.017015,8106.532131,0
1,2,34415.153966,48.117153,6564.745018,0
2,3,57317.170063,63.108049,8020.953296,0
3,4,42709.534201,45.751972,6103.64226,0
4,5,66952.688845,18.584336,8770.099235,1


In [13]:
# Column dtypes first, then the summary statistics, each under its own heading.
print('Tipo variáveis:')
print(df.dtypes)
print('\nInformações:')
print(df.describe())

Tipo variáveis:
clientid      int64
income      float64
age         float64
loan        float64
default       int64
dtype: object

Informações:
          clientid        income          age          loan      default
count  2000.000000   2000.000000  1997.000000   2000.000000  2000.000000
mean   1000.500000  45331.600018    40.807559   4444.369695     0.141500
std     577.494589  14326.327119    13.624469   3045.410024     0.348624
min       1.000000  20014.489470   -52.423280      1.377630     0.000000
25%     500.750000  32796.459717    28.990415   1939.708847     0.000000
50%    1000.500000  45789.117313    41.317159   3974.719419     0.000000
75%    1500.250000  57791.281668    52.587040   6432.410625     0.000000
max    2000.000000  69995.685578    63.971796  13766.051239     1.000000


## Tratando valores inconsistentes

In [14]:
# Locate the records whose age is negative (missing ages compare as False).
df[df['age'].lt(0)]

Unnamed: 0,clientid,income,age,loan,default
15,16,50501.726689,-28.218361,3977.287432,0
21,22,32197.620701,-52.42328,4244.057136,0
26,27,63287.038908,-36.496976,9595.286289,0


### Apagando coluna

In [15]:
# Strategy 1 (illustration only): discard the whole 'age' column.
# The result is stored in a new frame so that `df` keeps 'age' for the
# cells further down, which still read it. `columns=` replaces the
# positional axis argument, which pandas 2.0 and later no longer accept.
df_sem_idade = df.drop(columns='age')
df_sem_idade.head()

Unnamed: 0,clientid,income,loan,default
0,1,66155.925095,8106.532131,0
1,2,34415.153966,6564.745018,0
2,3,57317.170063,8020.953296,0
3,4,42709.534201,6103.64226,0
4,5,66952.688845,8770.099235,1


### Apagando registros com problemas

In [24]:
# Strategy 2 (illustration only): discard the rows whose age is negative.
# A new frame is produced instead of dropping in place, so the original
# `df` still contains those rows for the mean-fill strategy that follows.
df_sem_negativos = df.drop(df[df.age < 0].index)
df_sem_negativos.describe()

Unnamed: 0,clientid,income,age,loan,default
count,1997.0,1997.0,1994.0,1997.0,1997.0
mean,1001.970456,45326.59672,40.9277,4442.124566,0.141713
std,576.679293,14327.97155,13.271802,3045.494192,0.348842
min,1.0,20014.48947,18.055189,1.37763,0.0
25%,503.0,32804.904487,29.043284,1936.813257,0.0
50%,1002.0,45788.7471,41.382673,3971.155479,0.0
75%,1501.0,57787.565659,52.6169,6429.593688,0.0
max,2000.0,69995.685578,63.971796,13766.051239,1.0


### Preencher os valores manualmente
A opção mais viável é preencher os valores inconsistentes com a média das idades válidas.

In [31]:
# Mean age computed only over strictly positive ages; negative entries are
# filtered out by the condition and missing ones never satisfy it.
media_idade = df.loc[df['age'] > 0, 'age'].mean()
print('Média idade:', media_idade)

Média idade: 40.92770044906149


In [39]:
# Replace every negative age with the mean computed earlier; all other
# values (including missing ones) are left as they are.
df['age'] = df['age'].mask(df['age'] < 0, media_idade)
df.describe()

Unnamed: 0,clientid,income,age,loan,default
count,2000.0,2000.0,1997.0,2000.0,2000.0
mean,1000.5,45331.600018,40.9277,4444.369695,0.1415
std,577.494589,14326.327119,13.261825,3045.410024,0.348624
min,1.0,20014.48947,18.055189,1.37763,0.0
25%,500.75,32796.459717,29.072097,1939.708847,0.0
50%,1000.5,45789.117313,41.317159,3974.719419,0.0
75%,1500.25,57791.281668,52.58704,6432.410625,0.0
max,2000.0,69995.685578,63.971796,13766.051239,1.0


## Valores faltantes

In [43]:
# Show only the rows whose 'age' is missing.
# (A bare `pd.isnull(df['age'])` placed before this line would be evaluated
# and thrown away: a cell renders only its last expression.)
df.loc[df['age'].isnull()]

Unnamed: 0,clientid,income,age,loan,default
28,29,59417.805406,,2082.625938,0
30,31,48528.852796,,6155.78467,0
31,32,23526.302555,,2862.010139,0


### Separando base

In [122]:
# Feature matrix: every row, positional columns 1 through 3, as a NumPy array.
previsores = df.iloc[:, 1:4].values
# The NaN check uses pandas because numpy is only imported in a later cell;
# calling np.isnan here raises NameError on a fresh kernel.
print('Existe valores NaN?', pd.isnull(previsores).any())
previsores[:5, :]

Existe valores NaN? True


array([[6.61559251e+04, 5.90170151e+01, 8.10653213e+03],
       [3.44151540e+04, 4.81171531e+01, 6.56474502e+03],
       [5.73171701e+04, 6.31080495e+01, 8.02095330e+03],
       [4.27095342e+04, 4.57519724e+01, 6.10364226e+03],
       [6.69526888e+04, 1.85843359e+01, 8.77009924e+03]])

In [123]:
# Target vector: every row of positional column 4, as a NumPy array.
classe = df.iloc[:, 4].to_numpy()
classe[0:5]

array([0, 0, 0, 0, 1])

### Tratando os valores faltantes

In [124]:
# `sklearn.preprocessing.Imputer` was removed in scikit-learn 0.22;
# `sklearn.impute.SimpleImputer` is its replacement.
from sklearn.impute import SimpleImputer
import numpy as np

# SimpleImputer always imputes column by column (what axis=0 used to select)
# and expects the np.nan sentinel itself rather than the string 'NaN'.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

In [126]:
# Fit the imputer on the feature matrix and keep the returned estimator.
imputer = imputer.fit(X=previsores)

In [128]:
# Apply the fitted imputer, then confirm that no NaN is left in the matrix.
previsores = imputer.transform(X=previsores)
print('Existe valores NaN?', np.any(np.isnan(previsores)))

Existe valores NaN? False


## Escalonamento dos atributos

In [129]:
from sklearn.preprocessing import StandardScaler

In [130]:
# Standardising scaler with its default centring and scaling spelled out.
scaler = StandardScaler(with_mean=True, with_std=True)

In [132]:
# Fit the scaler on the features and rescale them with the fitted estimator.
previsores = scaler.fit(previsores).transform(previsores)
previsores[:5]  # first five rows; all features now share the same scale

array([[ 1.45393393,  1.33786439,  1.20281942],
       [-0.76217555,  0.53704215,  0.69642695],
       [ 0.83682073,  1.63843621,  1.17471147],
       [-0.18307006,  0.36327028,  0.54497999],
       [ 1.50956319, -1.63275936,  1.4207648 ]])