Split the dataset into training and test sets.
2. Train an SVM regression model on the training set using an RBF kernel.
3. Evaluate the performance of the model on the test set using the mean
squared error (MSE) metric.
4. Fine-tune the hyperparameters of the SVM model (such as the
regularization parameter and kernel bandwidth) using cross-validation to
further improve its performance.
5. Once you are satisfied with the performance of the model, use it to make
predictions on new, unseen properties.

In [1]:
#load the dataset
import pandas as pd
import numpy as np
sh=pd.read_csv("C:\\Users\\Sree Harini\\Downloads\\residential_properties.csv")

In [2]:
sh.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19 entries, 0 to 18
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Size (sq ft)             19 non-null     int64  
 1   Bedrooms                 19 non-null     int64  
 2   Bathrooms                19 non-null     int64  
 3   Location                 19 non-null     object 
 4   Year Built               19 non-null     int64  
 5   Garage Size              19 non-null     int64  
 6   Distance to School (mi)  19 non-null     float64
 7   Selling Price (k$)       19 non-null     int64  
dtypes: float64(1), int64(6), object(1)
memory usage: 1.3+ KB


In [3]:
#cleaning
sh.columns=sh.columns.str.replace(' ','_')
sh.columns

Index(['Size_(sq_ft)', 'Bedrooms', 'Bathrooms', 'Location', 'Year_Built',
       'Garage_Size', 'Distance_to_School_(mi)', 'Selling_Price_(k$)'],
      dtype='object')

In [4]:
#for model training
sh1=sh.select_dtypes(exclude=['object'])

In [5]:
#EDA
sh1.describe()

Unnamed: 0,Size_(sq_ft),Bedrooms,Bathrooms,Year_Built,Garage_Size,Distance_to_School_(mi),Selling_Price_(k$)
count,19.0,19.0,19.0,19.0,19.0,19.0,19.0
mean,2142.105263,3.210526,2.263158,1997.105263,1.526316,1.968421,405.789474
std,696.272448,1.182227,1.097578,18.438772,1.073334,1.459091,242.837755
min,1000.0,1.0,1.0,1950.0,0.0,0.3,100.0
25%,1700.0,2.0,1.0,1987.5,1.0,0.75,187.5
50%,2000.0,3.0,2.0,2003.0,2.0,1.5,375.0
75%,2650.0,4.0,3.0,2010.0,2.0,2.95,575.0
max,3400.0,5.0,4.0,2020.0,3.0,5.2,900.0


In [6]:
#Scaling
from sklearn import preprocessing
sh2=sh1.copy()
#scaler=preprocessing.MinMaxScaler()->default feature_range=(0,1)
scalar=preprocessing.MinMaxScaler(feature_range=(0,1))
sh2.iloc[:,:]=scalar.fit_transform(sh2)

In [7]:
#independent and target varaibles
x=sh2.drop('Selling_Price_(k$)',axis=1)
y=sh2['Selling_Price_(k$)']

In [8]:
#1.spliting data into train and test data
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2, random_state=2)

In [9]:
#2.training SVR model, rbf kernel
from sklearn.svm import SVR
model=SVR(kernel='rbf')
model.fit(x_train,y_train)

SVR()

In [10]:
#predicting values for test data
predit=model.predict(x_test)

In [11]:
#error rate
from sklearn import metrics
metrics.mean_absolute_error(y_test,predit)

0.054289902402331634

In [12]:
#3.MSE
metrics.mean_squared_error(y_test,predit)

0.005238273799967599

In [13]:
import numpy as np
rmse=np.sqrt(metrics.mean_squared_error(y_test,predit))
print(rmse)

0.07237592002847079


In [14]:
#score for test data
model.score(x_test,y_test)

0.9006668079413551

In [15]:
#score for train data
model.score(x_train,y_train)

0.9358902779279757

In [16]:
#4.#Regularization using Cross validation

from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.model_selection import cross_val_score
import numpy as np

# Set up Ridge regression with cross-validation
ridge = RidgeCV(alphas=np.logspace(-10, 10, 21), cv=5)

# Perform cross-validation and compute mean squared error
mse = -cross_val_score(ridge, x, y, scoring='neg_mean_squared_error', cv=5)

# Print mean squared error for each alpha
for alpha, mse_val in zip(ridge.alphas, mse):
    print("Alpha: {:.2e} -- Mean squared error: {:.2f}".format(alpha, mse_val))

Alpha: 1.00e-10 -- Mean squared error: 0.01
Alpha: 1.00e-09 -- Mean squared error: 0.00
Alpha: 1.00e-08 -- Mean squared error: 0.01
Alpha: 1.00e-07 -- Mean squared error: 0.01
Alpha: 1.00e-06 -- Mean squared error: 0.00


In [17]:
#4.Lasso regularization with cross-validation
lasso = LassoCV(alphas=np.logspace(-10, 10, 21), cv=5)

# Perform cross-validation and compute mean squared error
mse = -cross_val_score(lasso, x, y, scoring='neg_mean_squared_error', cv=5)

# Print mean squared error for each alpha
for alpha, mse_val in zip(lasso.alphas, mse):
    print("Alpha: {:.2e} -- Mean squared error: {:.2f}".format(alpha, mse_val))


Alpha: 1.00e-10 -- Mean squared error: 0.01
Alpha: 1.00e-09 -- Mean squared error: 0.00
Alpha: 1.00e-08 -- Mean squared error: 0.00
Alpha: 1.00e-07 -- Mean squared error: 0.01
Alpha: 1.00e-06 -- Mean squared error: 0.00


  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(


In [18]:
#4.optimising kernel bandwidth using CV
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KernelDensity

# Set up grid search to select bandwidth parameter
params = {'bandwidth': np.logspace(-1, 1, 20)}
grid = GridSearchCV(KernelDensity(), params, cv=5)

# Fit grid search on data and obtain optimal bandwidth
grid.fit(sh2)
bw = grid.best_params_['bandwidth']

# Create KDE estimator with optimal bandwidth
kde = KernelDensity(bandwidth=bw)

# Fit KDE estimator on data
kde.fit(sh2)


KernelDensity(bandwidth=0.12742749857031338)

In [19]:
(model.predict([[2600,3,2,2003,1,2.95]]))*1000



array([422.04914258])