In [1]:
import pandas as pd

In [2]:
# Load the credit data set from a CSV file; the path is relative to the
# notebook's working directory.
df = pd.read_csv('../data/credit-data.csv')

## Pré-processamento

In [3]:
# Inspect the dtype pandas inferred for each column.
df.dtypes

clientid      int64
income      float64
age         float64
loan        float64
default       int64
dtype: object

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric
# column. A count below the number of rows or an implausible min/max hints at
# missing or invalid values.
df.describe()

Unnamed: 0,clientid,income,age,loan,default
count,2000.0,2000.0,1997.0,2000.0,2000.0
mean,1000.5,45331.600018,40.807559,4444.369695,0.1415
std,577.494589,14326.327119,13.624469,3045.410024,0.348624
min,1.0,20014.48947,-52.42328,1.37763,0.0
25%,500.75,32796.459717,28.990415,1939.708847,0.0
50%,1000.5,45789.117313,41.317159,3974.719419,0.0
75%,1500.25,57791.281668,52.58704,6432.410625,0.0
max,2000.0,69995.685578,63.971796,13766.051239,1.0


In [5]:
# Show the rows whose age is negative.
df[df['age'] < 0]

Unnamed: 0,clientid,income,age,loan,default
15,16,50501.726689,-28.218361,3977.287432,0
21,22,32197.620701,-52.42328,4244.057136,0
26,27,63287.038908,-36.496976,9595.286289,0


In [6]:
# Overwrite negative ages with the mean of the strictly positive ages.
# NaN ages fail both comparisons, so they are neither included in the mean
# nor overwritten here.
media = df.loc[df['age'] > 0, 'age'].mean()
df.loc[df['age'] < 0, 'age'] = media

In [7]:
# Re-check the summary statistics after replacing the negative ages.
df.describe()

Unnamed: 0,clientid,income,age,loan,default
count,2000.0,2000.0,1997.0,2000.0,2000.0
mean,1000.5,45331.600018,40.9277,4444.369695,0.1415
std,577.494589,14326.327119,13.261825,3045.410024,0.348624
min,1.0,20014.48947,18.055189,1.37763,0.0
25%,500.75,32796.459717,29.072097,1939.708847,0.0
50%,1000.5,45789.117313,41.317159,3974.719419,0.0
75%,1500.25,57791.281668,52.58704,6432.410625,0.0
max,2000.0,69995.685578,63.971796,13766.051239,1.0


### Valores faltantes

In [8]:
# Print the name of every column that holds at least one missing value.
# Iterating a DataFrame yields its column labels.
for coluna in df:
    if df[coluna].isna().values.any():
        print(coluna)

age


In [9]:
# Show the rows whose age is missing.
df[df['age'].isna()]

Unnamed: 0,clientid,income,age,loan,default
28,29,59417.805406,,2082.625938,0
30,31,48528.852796,,6155.78467,0
31,32,23526.302555,,2862.010139,0


In [10]:
# Feature matrix: income, age and loan (column positions 1-3).
# Target vector: default (column position 4). clientid is left out.
previsores = df[['income', 'age', 'loan']].values
classe = df['default'].values
print(previsores[:5])
print(classe[:5])

[[6.61559251e+04 5.90170151e+01 8.10653213e+03]
 [3.44151540e+04 4.81171531e+01 6.56474502e+03]
 [5.73171701e+04 6.31080495e+01 8.02095330e+03]
 [4.27095342e+04 4.57519724e+01 6.10364226e+03]
 [6.69526888e+04 1.85843359e+01 8.77009924e+03]]
[0 0 0 0 1]


#### Tratando 'NaN'

In [11]:
# from sklearn.preprocessing import Imputer
from sklearn.impute import SimpleImputer

In [12]:
# Fill missing feature values with the column mean. SimpleImputer takes the
# place of the legacy Imputer(missing_values='NaN', strategy='mean', axis=0)
# call; 'mean' is also SimpleImputer's default strategy.
# NOTE(review): the imputer is fitted on the full feature matrix, before the
# train/test split further down, so test rows contribute to the imputed mean.
imputer = SimpleImputer(strategy='mean')
previsores = imputer.fit_transform(previsores)

In [13]:
# Sanity check: no NaN should remain in the feature matrix after imputation.
print('Valores faltantes?', pd.isna(previsores).any())

Valores faltantes? False


## Escalonamento dos dados

In [14]:
from sklearn.preprocessing import StandardScaler

In [15]:
# Standardize every feature column (StandardScaler removes the mean and
# scales to unit variance by default), then preview the first five rows.
# NOTE(review): the scaler is fitted on the full feature matrix, before the
# train/test split further down, so test rows influence the scaling statistics.
scaler = StandardScaler().fit(previsores)
previsores = scaler.transform(previsores)
previsores[:5]

array([[ 1.45393393,  1.36538005,  1.20281942],
       [-0.76217555,  0.54265932,  0.69642695],
       [ 0.83682073,  1.67417101,  1.17471147],
       [-0.18307006,  0.36413567,  0.54497999],
       [ 1.50956319, -1.68647541,  1.4207648 ]])

## Separação dos dados

In [16]:
from sklearn.model_selection import train_test_split

In [17]:
# Hold out 25% of the samples for testing; the fixed random_state makes the
# shuffle (and therefore the split) reproducible.
(previsores_treinamento, previsores_teste,
 classe_treinamento, classe_teste) = train_test_split(
    previsores,
    classe,
    test_size=0.25,
    random_state=0,
)

In [18]:
# Display the held-out feature matrix produced by the split.
previsores_teste

array([[ 1.59301567, -1.35435933,  2.58262733],
       [ 0.99769755,  0.99806485,  0.84418709],
       [-0.42485257,  0.55812535, -1.15785286],
       ...,
       [ 1.37445674, -1.05746369, -1.12564819],
       [-1.57087737, -0.6348826 , -0.36981671],
       [-1.03572293, -0.93978209,  0.04244312]])

## Floresta aleatória (Random Forest)

In [19]:
from sklearn.ensemble import RandomForestClassifier

In [38]:
# Train a random forest of 40 trees using the entropy split criterion (seed
# fixed for reproducibility), then predict the classes of the test samples.
classificador = RandomForestClassifier(
    n_estimators=40,
    criterion='entropy',
    random_state=0,
)
classificador.fit(previsores_treinamento, classe_treinamento)
previsoes = classificador.predict(previsores_teste)

## Métricas

In [39]:
from sklearn.metrics import confusion_matrix, accuracy_score

In [40]:
# Evaluate the predictions against the true test labels.
# Note: `precisao` stores the value returned by accuracy_score (overall
# accuracy), not the precision metric, despite its name and printed label.
matriz = confusion_matrix(classe_teste, previsoes)
precisao = accuracy_score(classe_teste, previsoes)
print('precisão', precisao)
print('confusão\n', matriz)

precisão 0.984
confusão
 [[433   3]
 [  5  59]]
