In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV

from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from catboost import CatBoostRegressor

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

import pickle

In [2]:
df = pd.read_csv('C:/Users/admin/Downloads/ML1 Mid Prj/ML1 Mid Prj/data/data_for_model.csv', index_col = 0)

In [7]:
df.head()

Unnamed: 0,log_price,area,bedrooms,bathrooms,floors,property_type,furniture,legal_status,distance_to_center
0,23.362323,88,4,4,4.0,villa,other,has_title,16.574719
1,23.544645,96,3,2,4.0,villa,other,has_title,16.598702
2,23.035801,65,1,1,1.0,shophouse,other,has_title,11.669648
3,23.520547,96,3,2,5.0,villa,other,other,16.486178
4,23.550579,75,6,7,5.0,private_house,full,has_title,2.263692


In [3]:
X = df.drop('log_price', axis = 1)
y = df[['log_price']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [4]:
cat_col = X.select_dtypes(include = 'object').columns
num_col = X.select_dtypes(exclude = 'object').columns

scaler = StandardScaler()
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

ct = ColumnTransformer(
    [
        ('encoder', encoder, cat_col),
        ('scaler', scaler, num_col)
    ]
)

X_train = pd.DataFrame(ct.fit_transform(X_train))
X_test = pd.DataFrame(ct.transform(X_test))

In [12]:
with open('C:/Users/admin/Downloads/ML1 Mid Prj/ML1 Mid Prj/artifacts/processor.pkl', 'wb') as file:
    pickle.dump(ct, file)

In [5]:
models = {
        "Linear Regressor": LinearRegression(),
        "Decision Tree": DecisionTreeRegressor(),
        "Random Forest Regressor": RandomForestRegressor(),
        "XGBRegressor": XGBRegressor(n_jobs = -1), 
        "AdaBoost Regressor": AdaBoostRegressor(),
        'CatBoosting Regressor': CatBoostRegressor(),
        'Gradient Boosting': GradientBoostingRegressor(),
        'K Nearest Neighbors': KNeighborsRegressor(),
        'Ridge': Ridge(),
        'Lasso': Lasso()
    }
    
params={
    "Decision Tree": {
        'criterion':['squared_error', 'friedman_mse', 'absolute_error'],
        # 'splitter':['best','random'],
        # 'max_features':['sqrt','log2'],
    },
    "Random Forest Regressor":{
        'criterion':['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
        
        # 'max_features':['sqrt','log2',None],
        'n_estimators': [8,16,32,64,128]
    },
    "K Nearest Neighbors": {
        'n_neighbors': [3, 4, 5, 6]
    },
    "Gradient Boosting":{
        'loss':['squared_error', 'huber', 'absolute_error', 'quantile'],
        'learning_rate':[.1,.01,.05,.001],
        'subsample':[0.6,0.7,0.75,0.8,0.85,0.9],
        'criterion':['squared_error', 'friedman_mse'],
        # 'max_features':['auto','sqrt','log2'],
        'n_estimators': [8,16,32,64,128]
    },
    "Linear Regressor":{},
    "XGBRegressor":{
        'learning_rate':[.1,.01,.05,.001],
        'n_estimators': [8,16,32,64,128]
    },
    "CatBoosting Regressor":{
        'depth': [6,8,10],
        'learning_rate': [0.01, 0.05, 0.1],
        'iterations': [30, 50, 100]
    },
    "AdaBoost Regressor":{
        'learning_rate':[.1,.01,0.5,.001],
        # 'loss':['linear','square','exponential'],
        'n_estimators': [8,16,32,64,128]
    },
    'Ridge': {
        'alpha': [1, 0.1, 0.01, 0.001]
    },
    'Lasso': {
        'alpha': [1, 0.1, 0.01, 0.001]
    }
}

In [6]:
model_best_params = {}
def get_best():
    for model_name, param in params.items():
        model = GridSearchCV(models[model_name], param, n_jobs=-1, verbose = False)
        model.fit(X_train, y_train)
        model_best_params[model_name] = model.best_params_
        
    return model_best_params
    
best = get_best()
for key, value in best.items():
    print(f'{key}: {value}')

  return fit_method(estimator, *args, **kwargs)
  y = column_or_1d(y, warn=True)  # TODO: Is this still required?


0:	learn: 0.8951280	total: 139ms	remaining: 13.8s
1:	learn: 0.8444239	total: 144ms	remaining: 7.04s
2:	learn: 0.8014872	total: 147ms	remaining: 4.76s
3:	learn: 0.7722932	total: 149ms	remaining: 3.58s
4:	learn: 0.7466244	total: 152ms	remaining: 2.89s
5:	learn: 0.7085685	total: 155ms	remaining: 2.43s
6:	learn: 0.6711601	total: 157ms	remaining: 2.08s
7:	learn: 0.6387499	total: 159ms	remaining: 1.83s
8:	learn: 0.6126480	total: 161ms	remaining: 1.62s
9:	learn: 0.5968012	total: 162ms	remaining: 1.46s
10:	learn: 0.5737123	total: 164ms	remaining: 1.33s
11:	learn: 0.5498189	total: 167ms	remaining: 1.22s
12:	learn: 0.5359439	total: 168ms	remaining: 1.13s
13:	learn: 0.5193322	total: 170ms	remaining: 1.04s
14:	learn: 0.5025874	total: 171ms	remaining: 971ms
15:	learn: 0.4832875	total: 173ms	remaining: 908ms
16:	learn: 0.4704290	total: 175ms	remaining: 852ms
17:	learn: 0.4600377	total: 176ms	remaining: 802ms
18:	learn: 0.4436799	total: 178ms	remaining: 758ms
19:	learn: 0.4336543	total: 179ms	remaini

  y = column_or_1d(y, warn=True)


Decision Tree: {'criterion': 'squared_error'}
Random Forest Regressor: {'criterion': 'absolute_error', 'n_estimators': 128}
K Nearest Neighbors: {'n_neighbors': 3}
Gradient Boosting: {'criterion': 'friedman_mse', 'learning_rate': 0.1, 'loss': 'squared_error', 'n_estimators': 64, 'subsample': 0.6}
Linear Regressor: {}
XGBRegressor: {'learning_rate': 0.05, 'n_estimators': 64}
CatBoosting Regressor: {'depth': 6, 'iterations': 100, 'learning_rate': 0.1}
AdaBoost Regressor: {'learning_rate': 0.5, 'n_estimators': 128}
Ridge: {'alpha': 1}
Lasso: {'alpha': 0.001}


In [7]:
models = {
            "Linear Regressor": LinearRegression(),
            "Decision Tree": DecisionTreeRegressor(criterion='squared_error'),
            "Random Forest Regressor": RandomForestRegressor(criterion='absolute_error', n_estimators=128),
            "XGBRegressor": XGBRegressor(n_jobs = -1, learning_rate = 0.05, n_estimators = 128), 
            "AdaBoost Regressor": AdaBoostRegressor(learning_rate=0.05, n_estimators=64),
            'CatBoosting Regressor': CatBoostRegressor(depth=6, iterations=100, learning_rate=0.1),
            'Gradient Boosting': GradientBoostingRegressor(criterion='friedman_mse', learning_rate=0.05, loss = 'squared_error', n_estimators=64, subsample=0.6),
            'K Nearest Neighbors': KNeighborsRegressor(n_neighbors=3),
            'Ridge': Ridge(alpha = 1),
            'Lasso': Lasso(alpha = 0.001)
        }

def model_metrics():
    model_result = {}
    for model_name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        r2 = r2_score(y_test, y_pred)
        
        model_result[model_name] = r2
        model_resut = dict(sorted(model_result.items(), key=lambda item: item[1], reverse=True))
    return model_result  
    
    
metrics = model_metrics()
for key, value in metrics.items():
    print(f'{key}: {value}')

  return fit_method(estimator, *args, **kwargs)
  y = column_or_1d(y, warn=True)


0:	learn: 0.8951280	total: 1.96ms	remaining: 194ms
1:	learn: 0.8444239	total: 3.79ms	remaining: 186ms
2:	learn: 0.8014872	total: 5.35ms	remaining: 173ms
3:	learn: 0.7722932	total: 6.78ms	remaining: 163ms
4:	learn: 0.7466244	total: 8.73ms	remaining: 166ms
5:	learn: 0.7085685	total: 10.4ms	remaining: 163ms
6:	learn: 0.6711601	total: 12.1ms	remaining: 161ms
7:	learn: 0.6387499	total: 13.8ms	remaining: 159ms
8:	learn: 0.6126480	total: 15.5ms	remaining: 157ms
9:	learn: 0.5968012	total: 17.1ms	remaining: 153ms
10:	learn: 0.5737123	total: 18.6ms	remaining: 151ms
11:	learn: 0.5498189	total: 20.2ms	remaining: 148ms
12:	learn: 0.5359439	total: 22.1ms	remaining: 148ms
13:	learn: 0.5193322	total: 23.9ms	remaining: 147ms
14:	learn: 0.5025874	total: 25.5ms	remaining: 145ms
15:	learn: 0.4832875	total: 27.2ms	remaining: 143ms
16:	learn: 0.4704290	total: 28.9ms	remaining: 141ms
17:	learn: 0.4600377	total: 30.5ms	remaining: 139ms
18:	learn: 0.4436799	total: 32ms	remaining: 136ms
19:	learn: 0.4336543	tot

  y = column_or_1d(y, warn=True)  # TODO: Is this still required?


Linear Regressor: 0.6098930508473182
Decision Tree: 0.46547844890972223
Random Forest Regressor: 0.8148374903101538
XGBRegressor: 0.7254796624183655
AdaBoost Regressor: 0.7050510239847936
CatBoosting Regressor: 0.8418637662561664
Gradient Boosting: 0.8175901033883688
K Nearest Neighbors: 0.7013639345482832
Ridge: 0.6128987433506715
Lasso: 0.6140699851945914


In [8]:
best_model = models['Gradient Boosting']

with open('C:/Users/admin/Downloads/ML1 Mid Prj/ML1 Mid Prj/artifacts/model.pkl', 'wb') as file:
    pickle.dump(best_model, file)