In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import warnings
warnings.filterwarnings('ignore')
import optuna
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import category_encoders as ce
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder,StandardScaler,OrdinalEncoder,TargetEncoder,PowerTransformer
from sklearn.compose import ColumnTransformer,TransformedTargetRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split,GridSearchCV,cross_val_score,KFold,RandomizedSearchCV
from lightgbm import LGBMRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
import xgboost as xgb
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score
import optuna
from plotly.io import show
pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows',None)

In [2]:
df = pd.read_csv('final_data.csv')
df = df.drop(['Title','Price_per_sqft','Is_in_Dhaka'],axis=1)
df.head()

Unnamed: 0,Bedrooms,Bathrooms,Floor_area,City,location_area,Price_in_Cr,floor_level
0,3.0,4.0,1960.0,dhaka,gulshan,3.9,low
1,3.0,3.0,1705.0,dhaka,kalabagan,1.69,low
2,3.0,3.0,1370.0,dhaka,dhanmondi,1.25,medium
3,3.0,3.0,2125.0,dhaka,bashundhara,2.0,medium
4,3.0,3.0,2687.0,dhaka,banani,4.75,medium


In [3]:
x = df.drop('Price_in_Cr',axis=1)
y = df['Price_in_Cr']

# Optuna 

In [8]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=0)

floor_area_pipe = Pipeline([
    ('log_transform',PowerTransformer(method='yeo-johnson')),
    ('scalar',StandardScaler())
])

transformer = ColumnTransformer([
    ("floor_area",floor_area_pipe,['Floor_area']),
    ("log_transform",PowerTransformer(method='yeo-johnson'),['Floor_area']),
    ('Onehot',OneHotEncoder(handle_unknown='ignore',sparse_output=False),['City','Bedrooms','Bathrooms','location_area']),
    ('Ordinal',OrdinalEncoder(handle_unknown='use_encoded_value',unknown_value=-1),['floor_level']),
],remainder='passthrough')


def objective(trial,preprocessor):
     regressor_name = trial.suggest_categorical('regressor',['SVR'])

     if regressor_name == 'SVR':
        kernel = trial.suggest_categorical('model__kernel',['rbf','linear','poly','sigmoid'])
        C = trial.suggest_float('model__C',0.01,1000)
        epsilon = trial.suggest_float('model__epsilon', 0.01,1)
        gamma = trial.suggest_categorical('model__gamma', ['scale', 'auto'])

        if kernel == 'poly':
            degree = trial.suggest_int('model__degree',2,5)
            coef0 = trial.suggest_float('model__coef0',0.01,1)
        elif kernel == 'sigmoid':
            coef0 = trial.suggest_float('model__coef0',0.01,1)
        else:
            degree = 3
            coef0 = 0.0


        model_pipe = Pipeline([
            ('preprocessing',transformer),
            ('model',SVR(kernel=kernel,C=C,epsilon=epsilon,gamma=gamma,degree=degree if kernel == 'poly' else 3,coef0=coef0))
        ])

                              
        final_pipe = TransformedTargetRegressor(regressor=model_pipe,func=np.log1p,inverse_func=np.expm1)

        trial.set_user_attr('model',final_pipe)

        cv = cross_val_score(final_pipe,x_train,y_train,cv=KFold(n_splits=5,shuffle=True,random_state=0),scoring='neg_mean_squared_error')

        return cv.mean()
     

study = optuna.create_study(direction='maximize',sampler=optuna.samplers.TPESampler(seed=0),pruner=optuna.pruners.HyperbandPruner())
study.optimize(lambda trial: objective(trial,transformer),n_trials=100,n_jobs=-1)

print(f'Best neg score: {study.best_value}')
print(f'Best Params: {study.best_params}')

final_model = study.best_trial.user_attrs['model']
final_model.fit(x_train,y_train)
y_pred = final_model.predict(x_test)

print(f'MSE: {mean_squared_error(y_test,y_pred)}')
print(f'MAE: {mean_absolute_error(y_test,y_pred)}')
print(f'R2: {r2_score(y_test,y_pred)}')

[I 2025-09-26 18:20:01,043] A new study created in memory with name: no-name-f0818eec-30af-4174-bbca-a16b0a613dd3
[I 2025-09-26 18:20:02,406] Trial 0 finished with value: -1.1007230935011303 and parameters: {'regressor': 'SVR', 'model__kernel': 'linear', 'model__C': 853.1861740602301, 'model__epsilon': 0.78251768688683, 'model__gamma': 'auto'}. Best is trial 0 with value: -1.1007230935011303.
[I 2025-09-26 18:20:02,446] Trial 10 finished with value: -1.4642348333020827 and parameters: {'regressor': 'SVR', 'model__kernel': 'sigmoid', 'model__C': 42.63500851438657, 'model__epsilon': 0.8381913989287384, 'model__gamma': 'auto', 'model__coef0': 0.6307895734220429}. Best is trial 0 with value: -1.1007230935011303.
[I 2025-09-26 18:20:02,459] Trial 2 finished with value: -0.6255638475226657 and parameters: {'regressor': 'SVR', 'model__kernel': 'rbf', 'model__C': 187.37898164776, 'model__epsilon': 0.6717882881948473, 'model__gamma': 'auto'}. Best is trial 2 with value: -0.6255638475226657.
[I 

Best neg score: -0.062694331212951
Best Params: {'regressor': 'SVR', 'model__kernel': 'poly', 'model__C': 655.8335042440599, 'model__epsilon': 0.06403339452008569, 'model__gamma': 'auto', 'model__degree': 2, 'model__coef0': 0.452473706007096}
MSE: 0.05293977750345322
MAE: 0.14167954072518918
R2: 0.8919481537676762


In [5]:
final_model

In [9]:
import pickle

with open('SVR_model.pkl','wb') as f:
    pickle.dump(final_model,f)

In [13]:
!nbmerge \
"(1) Data_ Gathering.ipynb" \
"(2) Data_Cleaning.ipynb" \
"(3) Feature_Engineering.ipynb" \
"(4) EDA-1.ipynb" \
"(5) Outliers.ipynb" \
"(6) EDA-2 & Missing_Imputations.ipynb" \
"(7) Feature Selection.ipynb" \
"(8) Model Selection.ipynb" \
"(9) Gradient_Boosting.ipynb" \
"(10) XGboost.ipynb" \
"(11) SVR.ipynb" \
> Final.ipynb

  validate(nb)
