In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Apply seaborn's default theme to every matplotlib figure drawn after this
# cell, then switch the active color cycle to the "colorblind" palette.
sns.set_theme()
sns.set_palette("colorblind")

In [None]:
# Load the property listings from a CSV file located in the working directory.
# NOTE(review): the column listing printed further down contains 'Unnamed: 0',
# which usually means the file was written together with its index; passing
# index_col=0 here would avoid carrying that column around — confirm against
# the CSV before changing.
df = pd.read_csv("propiedades.csv")

In [None]:
# One-hot encode the two categorical columns. drop_first=True omits the
# indicator for the first category of each column, so that category is
# represented by all of its sibling indicator columns being zero.
df_dummies = pd.get_dummies(df, columns=["property_type", "neighbourhood"], drop_first=True)

In [None]:
# Display the column names of the encoded frame to check the generated
# property_type_* / neighbourhood_* indicator columns.
df_dummies.columns

Index(['Unnamed: 0', 'id', 'start_date', 'end_date', 'latitud', 'longitud',
       'property_rooms', 'property_bedrooms', 'property_surface_total',
       'property_surface_covered', 'property_price', 'property_title',
       'precio_m2', 'tipo_precio', 'property_type_Departamento',
       'property_type_PH', 'neighbourhood_Agronomía', 'neighbourhood_Almagro',
       'neighbourhood_Balvanera', 'neighbourhood_Barracas',
       'neighbourhood_Barrio Norte', 'neighbourhood_Belgrano',
       'neighbourhood_Boca', 'neighbourhood_Boedo', 'neighbourhood_Caballito',
       'neighbourhood_Catalinas', 'neighbourhood_Centro / Microcentro',
       'neighbourhood_Chacarita', 'neighbourhood_Coghlan',
       'neighbourhood_Colegiales', 'neighbourhood_Congreso',
       'neighbourhood_Constitución', 'neighbourhood_Flores',
       'neighbourhood_Floresta', 'neighbourhood_Las Cañitas',
       'neighbourhood_Liniers', 'neighbourhood_Mataderos',
       'neighbourhood_Monserrat', 'neighbourhood_Monte Castro

In [None]:
# Rebind `df` to the encoded frame (same object as df_dummies, not a copy).
# NOTE(review): after this cell `df` no longer refers to the raw frame, so
# re-running the get_dummies cell without reloading the CSV would presumably
# fail because the categorical source columns are gone — consider using
# df_dummies directly in the cells below instead of reusing the name.
df = df_dummies

In [None]:
from sklearn.model_selection import train_test_split

# Feature set: every indicator column produced by the one-hot encoding plus
# the four numeric room/surface columns.
onehot_cols = [
    col for col in df.columns
    if col.startswith(("property_type_", "neighbourhood_"))
]
num_cols = ["property_rooms", "property_bedrooms", "property_surface_total", "property_surface_covered"]

X = df[onehot_cols + num_cols]
y = df["tipo_precio"]  # class label predicted by the SVM

# Hold out 20% of the rows for testing. Stratify on the label so that train and
# test keep the class proportions of the full data set, consistent with the
# stratified splitter used for cross-validation during the model search.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=137, stratify=y
)

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

# Standardise only the numeric columns; the one-hot indicator columns are
# passed through unchanged. In the transformed output the scaled columns come
# first, followed by the passthrough columns.
ct = ColumnTransformer([
    ("scaling", StandardScaler(), num_cols)
], remainder="passthrough")

# Fit the scaler on the training split only, then apply the very same fitted
# transformer to the test split. The model is trained on X_train_trans, so any
# later prediction on held-out data must use X_test_trans, not the raw X_test.
X_train_trans = ct.fit_transform(X_train)
X_test_trans = ct.transform(X_test)

In [None]:
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV, StratifiedShuffleSplit
from sklearn.svm import SVC

# Regularisation strengths shared by all three kernel families.
Cs = [1, 10, 100, 1000]

# One sub-grid per kernel so that kernel-specific hyper-parameters are only
# combined with the kernel that uses them (4 + 12 + 52 = 68 candidates).
params = [
    {"C": Cs, "kernel": ["linear"]},
    {"C": Cs, "degree": [2, 3, 4], "kernel": ["poly"]},
    {"C": Cs, "gamma": np.logspace(-9, 3, 13), "kernel": ["rbf"]},
]

# Five stratified random 80/20 splits of the training data, seeded so the
# folds are the same on every run.
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)

# Successive-halving search: early iterations score all candidates on a small
# subsample of rows and only the best ones advance to larger subsamples.
# random_state pins that row subsampling so reruns select the same candidates;
# n_jobs=-1 spreads the independent fits over all available cores.
clf = HalvingGridSearchCV(estimator=SVC(),
                          param_grid=params,
                          cv=cv,
                          scoring="accuracy",
                          random_state=42,
                          n_jobs=-1,
                          verbose=2)

In [None]:
# Mount the user's Google Drive inside the Colab VM; the trained model is
# written under /content/drive further down. The google.colab module is only
# available when the notebook runs on Colab.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Run the successive-halving search on the scaled training features.
# NOTE(review): the saved output of this cell ends in KeyboardInterrupt, i.e.
# the recorded run was stopped before finishing; the cells that read
# clf.best_estimator_ need this fit to run to completion first.
clf.fit(X_train_trans, y_train)

n_iterations: 4
n_required_iterations: 4
n_possible_iterations: 4
min_resources_: 2480
max_resources_: 66977
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 68
n_resources: 2480
Fitting 5 folds for each of 68 candidates, totalling 340 fits
[CV] END .................................C=1, kernel=linear; total time=   1.8s
[CV] END .................................C=1, kernel=linear; total time=   1.4s
[CV] END .................................C=1, kernel=linear; total time=   0.7s
[CV] END .................................C=1, kernel=linear; total time=   0.7s
[CV] END .................................C=1, kernel=linear; total time=   0.8s
[CV] END ................................C=10, kernel=linear; total time=   1.0s
[CV] END ................................C=10, kernel=linear; total time=   0.7s
[CV] END ................................C=10, kernel=linear; total time=   0.8s
[CV] END ................................C=10, kernel=linear; total time=   0.7s
[CV] E

KeyboardInterrupt: 

In [None]:
# Take the estimator the search selected as best and display its
# configuration. Requires that clf.fit above has completed successfully.
model = clf.best_estimator_
model

In [None]:
from joblib import dump
# Persist the selected SVM to the mounted Google Drive.
dump(model, '/content/drive/My Drive/svm.joblib')
# The SVM was trained on ct-transformed features, so store the fitted
# transformer next to it; without it the saved model cannot be applied to
# raw feature rows after the notebook session ends.
dump(ct, '/content/drive/My Drive/svm_preprocessor.joblib')

In [None]:
# from sklearn.metrics import confusion_matrix
#
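# NOTE: the model was fit on ct-transformed features, so the test split must
# go through the same fitted transformer before predicting.
# y_pred = model.predict(ct.transform(X_test))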
#
# conf = confusion_matrix(y_test, y_pred)
#
# sns.heatmap(conf, cmap='GnBu', annot=True, fmt='g')
# plt.xlabel('Predicted')
# plt.ylabel('True')