# Scikit Learn

In [1]:
from sklearn import datasets
import pandas as pd
from sklearn.externals import joblib
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import train_test_split
from sklearn import grid_search
from sklearn import svm

## 1) Import de dataset

In [3]:
boston = datasets.load_boston()
X = boston["data"]
y = boston["target"]

## 2) separer le jeu d'apprentissage du jeu de test

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

## 3) Standardiser les valeurs

In [9]:
scaler = StandardScaler()
scaler.fit(X_train)                 # Il ne faut fiter que sur les data d'entrainement
X_train = scaler.transform(X_train)
X_test  = scaler.transform(X_test)  # apply same transformation to test data

## 4) choisir,  exemple : Support Vector Machine

## 4.1) phase d'apprentissage

In [10]:
parameters = {  'C'       : [190, 200, 250, 260]   ,
                'kernel'  : ['rbf'],
                'gamma'   : [ 0.03],
                  "epsilon" : [1.5, 2, 2.5],
                 "degree" : [1,2]}
grid = grid_search.GridSearchCV(svm.SVR(), parameters, n_jobs=-1)
grid.fit(X_train, y_train)
print grid.best_score_, grid.best_estimator_

0.837444600113 SVR(C=250, cache_size=200, coef0=0.0, degree=1, epsilon=1.5, gamma=0.03,
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)


## 4.2) prédiction sur le jeu de test (qu'on se sert comme jeu de validation)

In [21]:
model = grid.best_estimator_
model.score(X_test, y_test)

0.89695786776384812

# 4.3) pour s'amuser, on regarde dans une dataframe

In [20]:
df = pd.DataFrame()
df["prediction"] = model.predict(X_test)
df["reel"] = y_test
df["diff"] = df.reel-df.prediction
df["pct_diff"] = (df.reel-df.prediction)/df.reel
df.head()

Unnamed: 0,prediction,reel,diff,pct_diff
0,19.283085,19.4,0.116915,0.006027
1,18.941528,17.2,-1.741528,-0.101252
2,13.134473,15.3,2.165527,0.141538
3,14.572814,13.6,-0.972814,-0.07153
4,19.954283,19.2,-0.754283,-0.039286


In [14]:
df.pct_diff.mean()

-0.04010719189796854

# 5) On sauve le modèle

In [15]:
path_where_to_save =  'model_svr.pkl'
joblib.dump(model, path_where_to_save) 

['model_svr.pkl',
 'model_svr.pkl_01.npy',
 'model_svr.pkl_02.npy',
 'model_svr.pkl_03.npy',
 'model_svr.pkl_04.npy',
 'model_svr.pkl_05.npy',
 'model_svr.pkl_06.npy',
 'model_svr.pkl_07.npy',
 'model_svr.pkl_08.npy',
 'model_svr.pkl_09.npy',
 'model_svr.pkl_10.npy']

## 5.1) le dump crée plusieurs fichiers sur le disque

In [16]:
!ls

MANIFEST.in
README.md
[1m[34m__pycache__[m[m
[1m[34mbuild[m[m
[1m[34mdist[m[m
[1m[34mjr_data_science[m[m
[1m[34mjr_data_science.egg-info[m[m
[1m[32mmaj.sh[m[m
model_svr.pkl
model_svr.pkl_01.npy
model_svr.pkl_02.npy
model_svr.pkl_03.npy
model_svr.pkl_04.npy
model_svr.pkl_05.npy
model_svr.pkl_06.npy
model_svr.pkl_07.npy
model_svr.pkl_08.npy
model_svr.pkl_09.npy
model_svr.pkl_10.npy
scikit_learn_examples.ipynb
scikit_learn_examples_2-Copy1.ipynb
scikit_learn_examples_2.ipynb
scikit_learn_examples_for_rest_api.ipynb
setup.py


# 6) on load le modèle

In [17]:
clf = joblib.load(path_where_to_save) 

In [19]:
clf.score(X_test, y_test)

0.89695786776384812