# Carga de datos

In [1]:
import pandas as pd
# Read the cleaned apartments CSV into a DataFrame and show its (rows, columns) shape.
csv_path = 'data/departamentos_clean.csv'
df = pd.read_csv(csv_path)
df.shape

(2349, 13)

In [2]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,preferencia,comuna,estacion_cercana,distancia_estacion,dormitorios,baños,estacionamientos,bodegas,superficie_total,superficie_util,precio,gastos_comunes,total
0,0,La Reina,Príncipe de Gales,412.0,3.0,2.0,1.0,1.0,118.0,98.0,820000.0,110000.0,930000.0
1,0,La Reina,Príncipe de Gales,425.0,4.0,3.0,3.0,2.0,135.0,128.0,1400000.0,180000.0,1580000.0
2,0,La Reina,Fernando Castillo Velasco,1433.0,2.0,1.0,0.0,0.0,45.0,45.0,370000.0,5000.0,375000.0
3,0,La Reina,Sin estación cercana,3000.0,3.0,3.0,3.0,1.0,190.0,140.0,1898413.0,0.0,1898413.0
4,0,La Reina,Príncipe de Gales,966.0,4.0,3.0,2.0,0.0,162.0,137.0,1500000.0,200000.0,1700000.0


## Modelo SVM

In [3]:
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.metrics import confusion_matrix, f1_score, roc_auc_score
from sklearn import svm

In [4]:
# SVM estimators need numeric inputs, so the categorical columns
# "comuna" and "estacion_cercana" are expanded into dummy (indicator) columns.
df = pd.get_dummies(data=df)

In [7]:
# Preview the first five rows after dummy encoding.
df.head(n=5)

Unnamed: 0,preferencia,distancia_estacion,dormitorios,baños,estacionamientos,bodegas,superficie_total,superficie_util,precio,gastos_comunes,...,estacion_cercana_San José de la Estrella,estacion_cercana_Santa Isabel,estacion_cercana_Simón Bolívar,estacion_cercana_Sin estación cercana,estacion_cercana_Tobalaba,estacion_cercana_Vicente Valdés,estacion_cercana_Vicuña Mackenna,estacion_cercana_Villa Frei,estacion_cercana_Ñuble,estacion_cercana_Ñuñoa
0,0,412.0,3.0,2.0,1.0,1.0,118.0,98.0,820000.0,110000.0,...,0,0,0,0,0,0,0,0,0,0
1,0,425.0,4.0,3.0,3.0,2.0,135.0,128.0,1400000.0,180000.0,...,0,0,0,0,0,0,0,0,0,0
2,0,1433.0,2.0,1.0,0.0,0.0,45.0,45.0,370000.0,5000.0,...,0,0,0,0,0,0,0,0,0,0
3,0,3000.0,3.0,3.0,3.0,1.0,190.0,140.0,1898413.0,0.0,...,0,0,0,1,0,0,0,0,0,0
4,0,966.0,4.0,3.0,2.0,0.0,162.0,137.0,1500000.0,200000.0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Separate the label (y) from the features (X). Selecting by column name rather
# than by position keeps the split correct even if the column order changes.
y = df['preferencia']
X = df.drop(columns='preferencia')
# Hold out 25% of the rows for testing (75/25 split, as given by test_size).
# A fixed random_state makes the partition, and every metric computed from it,
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.25, random_state=42
)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

## Prueba automática de diferentes modelos SVM

In [None]:
# Sweep over SVM kernel types and regularisation strengths C, reporting for each
# combination: 5-fold cross-validated score on (X, y), then macro F1, the
# confusion matrix and ROC AUC on the held-out test split.
for kernel in ['linear', 'poly', 'rbf', 'sigmoid']:
    print('====> Kernel : ', kernel)
    for C in [0.01, 0.1, 1, 1.5, 5, 10, 20]:
        print('==> C : ', C)
        clf_iter = svm.SVC(kernel=kernel, C=C)
        # cross_val_score uses the estimator's default scorer (accuracy for
        # classifiers); the mean over the 5 folds is shown as a percentage.
        accuracy = cross_val_score(clf_iter, X, y, cv=5)
        print("Precisión promedio: ", accuracy.mean() * 100)
        # Refit on the training split only and evaluate on the held-out rows.
        clf_iter.fit(X_train, y_train)
        y_test_pred = clf_iter.predict(X_test)
        f1 = f1_score(y_test, y_test_pred, average='macro')
        print('f1 : ', f1)
        # A bare expression inside a loop is never displayed, so the confusion
        # matrix has to be printed explicitly to be visible.
        print(confusion_matrix(y_test, y_test_pred))
        # ROC AUC is a ranking metric: feed it the continuous decision values
        # rather than hard 0/1 predictions, which collapse the curve to a
        # single threshold.
        # NOTE(review): assumes a binary label, as 1-D scores require — confirm.
        y_test_score = clf_iter.decision_function(X_test)
        aucscore = roc_auc_score(y_test, y_test_score, average='macro')
        print('auc : ', aucscore)

# Testing

### Probando la Estandarización

In [9]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Standardise only the two numeric columns under test. Fitting on the whole
# frame would also rescale the label and every dummy column, and would not
# produce the two-feature result recorded for this cell.
cols_to_scale = ['precio', 'distancia_estacion']

# fit() returns the scaler itself, so this prints its repr.
print(scaler.fit(df[cols_to_scale]))

# Per-column means learned during fit.
print(scaler.mean_)

# Each column transformed to zero mean and unit variance.
print(scaler.transform(df[cols_to_scale]))

StandardScaler()
[1.28762794e+06 1.27533206e+03]
[[-4.92310153e-01 -7.72657037e-01]
 [ 1.18303257e-01 -7.61022416e-01]
 [-9.66061937e-01  1.41108216e-01]
 ...
 [-6.18643962e-01  1.54352756e+00]
 [-5.97912802e-04  1.54352756e+00]
 [-3.39346230e-01 -6.36621466e-01]]


In [10]:
# Show the first five rows of the 'precio' and 'dormitorios' columns.
df.loc[:, ['precio', 'dormitorios']].head(n=5)

Unnamed: 0,precio,dormitorios
0,820000.0,3.0
1,1400000.0,4.0
2,370000.0,2.0
3,1898413.0,3.0
4,1500000.0,4.0


In [8]:
from sklearn.preprocessing import StandardScaler

# Fit a fresh scaler on three selected columns (one of them a dummy column)
# and print the fitted estimator, the learned means and the scaled values.
scaler = StandardScaler()
selected_cols = ['precio', 'dormitorios', 'estacion_cercana_Tobalaba']
subset = df[selected_cols]

fitted_scaler = scaler.fit(subset)
print(fitted_scaler)

print(scaler.mean_)

print(scaler.transform(subset))

StandardScaler()
[1.28762794e+06 2.50191571e+00 4.64027246e-02]
[[-4.92310153e-01  4.78122161e-01 -2.20591737e-01]
 [ 1.18303257e-01  1.43804435e+00 -2.20591737e-01]
 [-9.66061937e-01 -4.81800024e-01 -2.20591737e-01]
 ...
 [-6.18643962e-01 -1.44172221e+00 -2.20591737e-01]
 [-5.97912802e-04  1.43804435e+00 -2.20591737e-01]
 [-3.39346230e-01  4.78122161e-01 -2.20591737e-01]]


### Usando Pipeline

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline