# Scikit Learn

In [2]:
from sklearn import datasets
import pandas as pd
from sklearn.externals import joblib
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import train_test_split
from sklearn import grid_search
from sklearn import svm

## 1) Import de dataset

In [3]:
boston = datasets.load_boston()
X = boston["data"]
y = boston["target"]

## 2) separer le jeu d'apprentissage du jeu de test

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

## 3) Standardiser les valeurs

In [5]:
scaler = StandardScaler()
scaler.fit(X_train)                 # Il ne faut fiter que sur les data d'entrainement
X_train = scaler.transform(X_train)
X_test  = scaler.transform(X_test)  # apply same transformation to test data

## 4) choisir,  exemple : Support Vector Machine

## 4.1) phase d'apprentissage

In [6]:
parameters = {  'C'       : [190, 200, 250, 260]   ,
                'kernel'  : ['rbf'],
                'gamma'   : [ 0.03],
                  "epsilon" : [1.5, 2, 2.5],
                 "degree" : [1,2]}
grid = grid_search.GridSearchCV(svm.SVR(), parameters, n_jobs=-1)
grid.fit(X_train, y_train)
print grid.best_score_, grid.best_estimator_

0.861868927906 SVR(C=190, cache_size=200, coef0=0.0, degree=1, epsilon=1.5, gamma=0.03,
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)


## 4.2) prédiction sur le jeu de test (qu'on se sert comme jeu de validation)

In [7]:
model = grid.best_estimator_
model.score(X_test, y_test)

0.81576011004740978

# 4.3) pour s'amuser, on regarde dans une dataframe

In [8]:
df = pd.DataFrame()
df["prediction"] = model.predict(X_test)
df["reel"] = y_test
df["diff"] = df.reel-df.prediction
df["pct_diff"] = (df.reel-df.prediction)/df.reel
df.head()

Unnamed: 0,prediction,reel,diff,pct_diff
0,32.882426,29.0,-3.882426,-0.133877
1,23.89316,20.8,-3.09316,-0.14871
2,20.496659,19.3,-1.196659,-0.062003
3,31.599891,34.9,3.300109,0.094559
4,21.935959,23.2,1.264041,0.054485


In [9]:
df.pct_diff.mean()

-0.022069662707047504

# 5) On sauve le modèle

In [10]:
path_where_to_save =  'model_svr.pkl'
joblib.dump(model, path_where_to_save) 

['model_svr.pkl',
 'model_svr.pkl_01.npy',
 'model_svr.pkl_02.npy',
 'model_svr.pkl_03.npy',
 'model_svr.pkl_04.npy',
 'model_svr.pkl_05.npy',
 'model_svr.pkl_06.npy',
 'model_svr.pkl_07.npy',
 'model_svr.pkl_08.npy',
 'model_svr.pkl_09.npy',
 'model_svr.pkl_10.npy']

## 5.1) le dump crée plusieurs fichiers sur le disque

In [15]:
!pwd

/Users/romain/Informatique/PycharmProjects/data-science/boston_house_price_prediction_with_svm


In [11]:
!ls

boston_house_price_prediction_with_svm.ipynb
model_svr.pkl
model_svr.pkl_01.npy
model_svr.pkl_02.npy
model_svr.pkl_03.npy
model_svr.pkl_04.npy
model_svr.pkl_05.npy
model_svr.pkl_06.npy
model_svr.pkl_07.npy
model_svr.pkl_08.npy
model_svr.pkl_09.npy
model_svr.pkl_10.npy


# 6) on load le modèle

In [12]:
clf = joblib.load(path_where_to_save) 

In [13]:
clf.score(X_test, y_test)

0.81576011004740978

# 7) pour utiliser l'API on récupère des valeurs de maison : 

In [14]:
for i in pd.DataFrame(X).loc[1]:
    print(i)

0.02731
0.0
7.07
0.0
0.469
6.421
78.9
4.9671
2.0
242.0
17.8
396.9
9.14


In [17]:
pd.DataFrame(X).loc[1]

0       0.02731
1       0.00000
2       7.07000
3       0.00000
4       0.46900
5       6.42100
6      78.90000
7       4.96710
8       2.00000
9     242.00000
10     17.80000
11    396.90000
12      9.14000
Name: 1, dtype: float64

In [16]:
path_to_model = "./model_svr.pkl"
model = joblib.load(path_to_model)

In [18]:
model.predict(pd.DataFrame(X).loc[1])



array([ 25.56665728])

In [19]:
pd.DataFrame(y).loc[1]

0    21.6
Name: 1, dtype: float64

In [23]:
print(boston["DESCR"])

Boston House Prices dataset

Notes
------
Data Set Characteristics:  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive
    
    :Median Value (attribute 14) is usually the target

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by town
      