# PD 4 - WUM
## SVM
### Wojciech Kretowicz

### Importy

In [7]:
import pandas as pd
import numpy as np
import rdata

from sklearn.model_selection import cross_val_score, train_test_split, RandomizedSearchCV
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.ensemble.partial_dependence import plot_partial_dependence

import warnings
warnings.filterwarnings('ignore')

### Wczytanie danych

In [8]:
data = rdata.conversion.convert(rdata.parser.parse_file("apartments.rda"))['apartments']

In [9]:
data.head()

Unnamed: 0,m2.price,construction.year,surface,floor,no.rooms,district
0,5897.0,1953.0,25.0,3,1.0,Srodmiescie
1,1818.0,1992.0,143.0,9,5.0,Bielany
2,3643.0,1937.0,56.0,1,2.0,Praga
3,3517.0,1995.0,93.0,7,3.0,Ochota
4,3013.0,1992.0,144.0,6,5.0,Mokotow


In [10]:
data.shape

(1000, 6)

In [11]:
X = np.array(data.drop(columns='m2.price'))
y = np.array(data.loc[:,'m2.price'])

### Podział danych

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X,y)

### Encoding

In [13]:
from category_encoders.target_encoder import TargetEncoder

# SVM
## bez standaryzacji i hiperparametryzacji

In [14]:
from sklearn.svm import SVR

In [15]:
svm_pipe = Pipeline([('te', TargetEncoder(cols = [4])),
                    ('svr', SVR())])

scores = cross_val_score(svm_pipe, X_train, y_train, scoring='neg_mean_squared_error',cv=5,n_jobs=-1)

print(str.format('{0:.2} +- {1:.2}',np.mean(scores), np.std(scores)))

-8.4e+05 +- 9e+04


### Test

In [16]:
svm_pipe.fit(X_train, y_train)
-metrics.mean_squared_error(y_test, svm_pipe.predict(X_test))

-814058.4161905991

In [26]:
plot_partial_dependence(svm_pipe, X_train, ['construction.year'])

ValueError: gbrt has to be an instance of BaseGradientBoosting

## Ze standaryzacją

In [17]:
from sklearn.preprocessing import StandardScaler

In [18]:
svm_pipe2 = Pipeline([
    ('te', TargetEncoder(cols=[4])),
    ('stdsc', StandardScaler()),
    ('svr', SVR())
])


scores2 = cross_val_score(svm_pipe2, X_train, y_train, scoring='neg_mean_squared_error',cv=5,n_jobs=-1)

print(str.format('{0:.2} +- {1:.2}',np.mean(scores2), np.std(scores2)))

-7.8e+05 +- 8.6e+04


### Test

In [19]:
svm_pipe2.fit(X_train, y_train)
-metrics.mean_squared_error(y_test, svm_pipe2.predict(X_test))

-742191.7069173526

In [None]:
plot_

# Wniosek
Jest znaczna poprawa po przeprowadzeniu standaryzacji. SVM jest wyjątkowo na to wrażliwy z uwagi na bazowanie na metryce przy uczeniu i predykcji.

# SVM
## Strojenie hiperparametryczne

In [20]:
param_dist = {
    'svr__gamma': np.random.uniform(0, 10, 100),
    'svr__C': np.random.uniform(0,100,100),
    'svr__epsilon': np.random.uniform(0,2,100)
}

rscv = RandomizedSearchCV(svm_pipe2, param_dist, n_iter=100, scoring='neg_mean_squared_error', cv=2)

rscv.fit(X_train, y_train)

RandomizedSearchCV(cv=2, error_score='raise-deprecating',
          estimator=Pipeline(memory=None,
     steps=[('te', TargetEncoder(cols=[4], drop_invariant=False, handle_unknown='impute',
       impute_missing=True, min_samples_leaf=1, return_df=True,
       smoothing=1.0, verbose=0)), ('stdsc', StandardScaler(copy=True, with_mean=True, with_std=True)), ('svr', SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
  gamma='auto_deprecated', kernel='rbf', max_iter=-1, shrinking=True,
  tol=0.001, verbose=False))]),
          fit_params=None, iid='warn', n_iter=100, n_jobs=None,
          param_distributions={'svr__gamma': array([1.94903, 2.41557, ..., 0.452  , 8.53776]), 'svr__C': array([36.41265,  0.5749 , ..., 63.33712, 84.83428]), 'svr__epsilon': array([1.56072, 0.83499, ..., 0.46949, 1.54841])},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring='neg_mean_squared_error',
          verbose=0)

In [21]:
rscv.best_score_

-192925.81173929613

In [22]:
rscv.best_params_

{'svr__gamma': 0.45199790619964486,
 'svr__epsilon': 0.6910684636573576,
 'svr__C': 89.55702212408687}

### Test

In [23]:
svm_pipe2.set_params(**rscv.best_params_)

Pipeline(memory=None,
     steps=[('te', TargetEncoder(cols=[4], drop_invariant=False, handle_unknown='impute',
       impute_missing=True, min_samples_leaf=1, return_df=True,
       smoothing=1.0, verbose=0)), ('stdsc', StandardScaler(copy=True, with_mean=True, with_std=True)), ('svr', SVR(C=89.55702212408687, cache_size=200, coef0=0.0, degree=3,
  epsilon=0.6910684636573576, gamma=0.45199790619964486, kernel='rbf',
  max_iter=-1, shrinking=True, tol=0.001, verbose=False))])

In [None]:
svm_pipe2.fit(X_train, y_train)
-metrics.mean_squared_error(y_test, svm_pipe2.predict(X_test))

# Wnioski
Po standaryzacji i przeszukaniu siatki hiperparametrów błąd zmniejszył się o cały rząd wielkości. SVM jest dość wrażliwe na dwa hiperparametry: gamma i C.

# Partial Dependency Plot

Przperaszam, ale nie starczyło mi czasu.