In [1]:
import numpy as np

import pandas as pd

import sklearn
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split, cross_val_score

from feature_engine.datetime import DatetimeFeatures

import joblib

import matplotlib.pyplot as plt

In [2]:
# Show every column when a DataFrame is displayed instead of eliding the middle ones.
pd.options.display.max_columns = None

In [3]:
# Explicitly request scikit-learn's "default" transform output setting so the
# notebook does not depend on a value configured elsewhere in the session.
sklearn.set_config(
    transform_output="default",
)

In [4]:
# Load the housing dataset from the repository-relative data folder.
data = pd.read_csv(filepath_or_buffer="data_included/housing.csv")

In [5]:
# Number of missing values in each column of the raw frame.
nan_counts = data.isna().sum()
print('nan_counts')  # plain-text label printed ahead of the displayed Series

nan_counts


In [6]:
# Engineered feature: bedrooms and bathrooms combined into a single room count.
data['total_rooms'] = data['bedrooms'].add(data['bathrooms'])

In [7]:
# Remove the two columns already folded into total_rooms, plus 'prefarea' and
# 'stories', which are left out of the feature set. The frame is reassigned
# rather than mutated with inplace=True, so the cell reads as "new frame from
# old frame" and never half-modifies an object other code may be holding.
data = data.drop(columns=['bedrooms', 'bathrooms', 'prefarea', 'stories'])

In [8]:
# Split the frame into the regression target (price) and the feature matrix.
y = data['price']
X = data.drop('price', axis=1)

In [9]:
# Hold out 20% of the rows for the final evaluation; the fixed seed keeps the
# split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Display the training feature matrix produced by train_test_split.
X_train

Unnamed: 0,area,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,furnishingstatus,total_rooms
46,6000,yes,no,no,no,yes,1,furnished,5
93,7200,yes,no,yes,no,yes,3,semi-furnished,5
335,3816,yes,no,yes,no,yes,2,furnished,3
412,2610,yes,no,yes,no,no,0,unfurnished,4
471,3750,yes,no,no,no,no,0,unfurnished,4
...,...,...,...,...,...,...,...,...,...
71,6000,yes,no,no,no,yes,0,unfurnished,6
106,5450,yes,no,yes,no,yes,0,semi-furnished,6
270,4500,yes,no,no,yes,no,1,furnished,5
435,4040,yes,no,no,no,no,0,unfurnished,3


In [11]:
# (rows, columns) of the training features -- the 80% side of the 0.2 test split.
X_train.shape

(436, 9)

In [12]:
# (rows, columns) of the held-out test features -- the 20% side of the split.
X_test.shape

(109, 9)

In [13]:
# Column dtypes and non-null counts of the frame after the feature engineering
# cells (total_rooms added; bedrooms, bathrooms, prefarea and stories dropped).
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   price             545 non-null    int64 
 1   area              545 non-null    int64 
 2   mainroad          545 non-null    object
 3   guestroom         545 non-null    object
 4   basement          545 non-null    object
 5   hotwaterheating   545 non-null    object
 6   airconditioning   545 non-null    object
 7   parking           545 non-null    int64 
 8   furnishingstatus  545 non-null    object
 9   total_rooms       545 non-null    int64 
dtypes: int64(4), object(6)
memory usage: 42.7+ KB


In [14]:
# Peek at the first rows of the engineered frame.
data.head()


Unnamed: 0,price,area,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,furnishingstatus,total_rooms
0,13300000,7420,yes,no,no,no,yes,2,furnished,6
1,12250000,8960,yes,no,no,no,yes,3,furnished,8
2,12250000,9960,yes,no,yes,no,no,2,semi-furnished,5
3,12215000,7500,yes,no,yes,no,yes,3,furnished,6
4,11410000,7420,yes,yes,yes,no,yes,2,furnished,5


In [15]:
# Route columns to the numeric / categorical branches of the preprocessor by dtype.
# 'number' matches every numeric dtype (any int or float width), so a numeric
# column that is not exactly int64 is still scaled instead of being silently
# left out of both lists. On the current data this selects the same columns as
# the int64-only filter did.
num_cols = X.select_dtypes(include='number').columns
# String-valued (object dtype) columns are one-hot encoded.
cat_cols = X.select_dtypes(include=['object']).columns

In [16]:
# Numeric branch: standardise each column with StandardScaler.
num_transformer = Pipeline([('scaler', StandardScaler())])

# Categorical branch: one-hot encode; handle_unknown='ignore' means categories
# not seen during fit do not raise at transform time.
cat_transformer = Pipeline([('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [17]:
# Apply each branch to its own column group and concatenate the results.
preprocessor = ColumnTransformer([
    ('num', num_transformer, num_cols),
    ('cat', cat_transformer, cat_cols),
])

In [18]:
# Candidate regressors to compare, keyed by a display name.
algorithms = {
    "Linear Regression": LinearRegression(),
    "Support Vector Machine": SVR(),
    # Seeded so the forest (and therefore the model comparison) gives the same
    # result on every run; 42 matches the seed used for the train/test split.
    "Random Forest": RandomForestRegressor(n_estimators=10, random_state=42),
}

In [19]:
# Model selection: score every candidate with 5-fold cross-validation on the
# training split, refit the best one, and report its R^2 on the held-out test
# split. Pipeline, cross_val_score and r2_score are already imported in the
# notebook's first cell, so they are not re-imported here.

# Fit the shared preprocessor on the training split only; the processed matrix
# is used further down to refit the winning estimator.
X_train_processed = preprocessor.fit_transform(X_train)

results = {}

for name, alg in algorithms.items():
    # Cross-validate the full pipeline on the raw features so preprocessing is
    # part of what gets fitted and scored, not a step applied beforehand.
    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('algorithm', alg)
    ])

    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='r2', n_jobs=-1)
    mean_score = np.mean(scores)
    std_score = np.std(scores)

    results[name] = {
        'mean_r2': mean_score,
        'std_r2': std_score
    }

# R^2 has no lower bound, so a fixed starting threshold such as -1 could leave
# the winner unset (None) when every candidate scores below it. max() always
# returns a candidate and, like a strict ">" scan, keeps the first one on ties.
best_name = max(results, key=lambda candidate: results[candidate]['mean_r2'])
best_algorithm = algorithms[best_name]
best_accuracy = results[best_name]['mean_r2']

# NOTE: the values labelled "Accuracy" below are R^2 scores (scoring='r2' and
# r2_score), not classification accuracy.
print(f"Best Algorithm: {type(best_algorithm).__name__}")
print(f"Mean Accuracy: {best_accuracy:.2f}")


# Refit the winner on the full (preprocessed) training split.
best_algorithm.fit(X_train_processed, y_train)

# The test split is only transformed, never fitted on.
X_test_processed = preprocessor.transform(X_test)
y_pred = best_algorithm.predict(X_test_processed)

accuracy_on_test = r2_score(y_test, y_pred)
print(f"Accuracy on Test Set: {accuracy_on_test:.2f}")


Best Algorithm: LinearRegression
Mean Accuracy: 0.56
Accuracy on Test Set: 0.59


In [20]:
# Final deployable model: preprocessing and linear regression bundled into one
# pipeline so raw feature frames can be passed straight to predict().
final_steps = [
    ("pre", preprocessor),
    ("Linear Regression", LinearRegression()),
]
model = Pipeline(steps=final_steps)

In [21]:
# Train the final pipeline on the raw training split (preprocessing is fitted inside).
model.fit(X=X_train, y=y_train)

In [22]:
# Persist the fitted pipeline to disk; the call returns the list of written files.
joblib.dump(value=model, filename="model.joblib")

['model.joblib']

In [23]:
# Reload the artifact written by joblib.dump to confirm it round-trips.
# joblib files are pickle-based: only load files from a trusted source.
saved = joblib.load(filename="model.joblib")
saved