# Minicurso Introdução à ciência de dados e aprendizado de máquina usando Python
## Parte 5 - Introdução ao Aprendizado de Máquina usando Python

In [1]:
import numpy as np

### Carregando os dados (features e labels)

In [2]:
X = np.load('../workshop-data-science/X.npy')
y = np.load('../workshop-data-science/y.npy')

### Treinando e avaliando o modelo

In [3]:
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [4]:
import sklearn
sklearn.__version__

'0.18.1'

In [5]:
num_folds = 10
seed = 7

i = 0
kfold = model_selection.KFold(n_splits=num_folds, random_state=seed)
accuracy = np.zeros(num_folds)
for train_idx, test_idx in kfold.split(X):
    print(len(train_idx), len(test_idx))
    
    # TREINO
    model = LogisticRegression()
    model.fit(X[train_idx], y[train_idx])    
    
    # FAZER PREDIÇÃO A PARTIR DO MODELO TREINADO
    y_pred = model.predict(X[test_idx])
    
    # AVALIAR A TÉCNICA ATRAVÉS DA COMPARAÇÃO DO DADO REAL COM O DADO PREVISTO 
    accuracy[i] = accuracy_score(y[test_idx], y_pred)
    i += 1

print(accuracy)
print(np.average(accuracy), np.std(accuracy))

691 77
691 77
691 77
691 77
691 77
691 77
691 77
691 77
692 76
692 76
[ 0.7012987   0.81818182  0.74025974  0.71428571  0.77922078  0.75324675
  0.85714286  0.80519481  0.72368421  0.80263158]
0.76951469583 0.0484105192457


In [6]:
# 0.76951469583 0.0484105192457

#### De forma mais automatizada e compacta...

In [8]:
results = model_selection.cross_val_score(model, X, y, cv=kfold)
print(results)
print("Accuracy: {} {}".format(results.mean(), results.std()) )

[ 0.7012987   0.81818182  0.74025974  0.71428571  0.77922078  0.75324675
  0.85714286  0.80519481  0.72368421  0.80263158]
Accuracy: 0.7695146958304853 0.04841051924567195


In [9]:
results = model_selection.cross_val_score(model, X, y, cv=10)
print(results)
print("Accuracy: {} {}".format(results.mean(), results.std()) )

[ 0.74025974  0.75324675  0.79220779  0.72727273  0.74025974  0.74025974
  0.81818182  0.79220779  0.73684211  0.82894737]
Accuracy: 0.7669685577580314 0.03542589693856446


In [10]:
# Accuracy: 0.7669685577580314 0.03542589693856446

### Usando outra técnica

In [12]:
from sklearn.ensemble import GradientBoostingClassifier

In [13]:
model2 = GradientBoostingClassifier()

In [14]:
results = model_selection.cross_val_score(model2, X, y, cv=kfold)
print(results)
print("Accuracy: {} {}".format(results.mean(), results.std()) )

[ 0.74025974  0.81818182  0.74025974  0.63636364  0.80519481  0.79220779
  0.80519481  0.83116883  0.72368421  0.78947368]
Accuracy: 0.7681989063568012 0.055815077983448785


### Mudando os Hiper-parâmetros (Tuning do Algoritmo)

In [17]:
model3 = GradientBoostingClassifier(n_estimators=50, max_depth=3, learning_rate=0.05)
results = model_selection.cross_val_score(model3, X, y, cv=kfold)
print(results)
print("Accuracy: {} {}".format(results.mean(), results.std()) )

[ 0.71428571  0.85714286  0.72727273  0.63636364  0.79220779  0.81818182
  0.81818182  0.85714286  0.73684211  0.78947368]
Accuracy: 0.7747095010252905 0.06661018830310401


### Grid Search

In [19]:
from itertools import product

n_estimators_ = [20, 40, 60, 80, 100]
max_depth_ = [2, 3, 5]
learning_rate_ = [0.05, 0.1]

results = []
for ne, md, lr in product(n_estimators_, max_depth_, learning_rate_):
    print(ne, md, lr)
    model_ = GradientBoostingClassifier(n_estimators=ne, max_depth=md, learning_rate=lr)
    result = model_selection.cross_val_score(model3, X, y, cv=kfold)
    result_ = { 'n_estimators': ne, 'max_depth': md, 'learning_rate': lr, 
                'accuracy': result.mean(), 'std': result.std() } 

    results.append(result_)

20 2 0.05
20 2 0.1
20 3 0.05
20 3 0.1
20 5 0.05
20 5 0.1
40 2 0.05
40 2 0.1
40 3 0.05
40 3 0.1
40 5 0.05
40 5 0.1
60 2 0.05
60 2 0.1
60 3 0.05
60 3 0.1
60 5 0.05
60 5 0.1
80 2 0.05
80 2 0.1
80 3 0.05
80 3 0.1
80 5 0.05
80 5 0.1
100 2 0.05
100 2 0.1
100 3 0.05
100 3 0.1
100 5 0.05
100 5 0.1


In [20]:
import pandas as pd
df = pd.DataFrame(results, columns=['n_estimators', 'max_depth', 'learning_rate', 'accuracy', 'std'])
df.sort_values(by='accuracy', ascending=False)

Unnamed: 0,n_estimators,max_depth,learning_rate,accuracy,std
15,60,3,0.1,0.77471,0.06661
1,20,2,0.1,0.77471,0.06661
28,100,5,0.05,0.77471,0.06661
27,100,3,0.1,0.77471,0.06661
26,100,3,0.05,0.77471,0.06661
25,100,2,0.1,0.77471,0.06661
24,100,2,0.05,0.77471,0.06661
23,80,5,0.1,0.77471,0.06661
21,80,3,0.1,0.77471,0.06661
20,80,3,0.05,0.77471,0.06661
