In [5]:
import numpy as np

import pandas as pd

import sklearn
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split, cross_val_score

from feature_engine.datetime import DatetimeFeatures

import joblib

import matplotlib.pyplot as plt

In [6]:
# Render every column of a DataFrame instead of eliding the middle ones.
pd.options.display.max_columns = None

In [7]:
# Explicitly keep scikit-learn transformers on their default output container
# (transform_output="default") rather than switching them to DataFrame output.
sklearn.set_config(transform_output="default")

In [8]:
# Load the housing table; the path is resolved relative to the kernel's working directory.
data = pd.read_csv(filepath_or_buffer="data_yoo/housing.csv")

In [9]:
# Separate the regression target (price) from the predictor columns.
y = data["price"]
X = data.drop("price", axis=1)

In [10]:
# Hold out 20% of the rows for final evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [11]:
# Preview the training features; rows keep their original index labels from `data`.
X_train

Unnamed: 0,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
46,6000,3,2,4,yes,no,no,no,yes,1,no,furnished
93,7200,3,2,1,yes,no,yes,no,yes,3,no,semi-furnished
335,3816,2,1,1,yes,no,yes,no,yes,2,no,furnished
412,2610,3,1,2,yes,no,yes,no,no,0,yes,unfurnished
471,3750,3,1,2,yes,no,no,no,no,0,no,unfurnished
...,...,...,...,...,...,...,...,...,...,...,...,...
71,6000,4,2,4,yes,no,no,no,yes,0,no,unfurnished
106,5450,4,2,1,yes,no,yes,no,yes,0,yes,semi-furnished
270,4500,3,2,3,yes,no,no,yes,no,1,no,furnished
435,4040,2,1,1,yes,no,no,no,no,0,no,unfurnished


In [12]:
# (rows, columns) of the training split.
X_train.shape

(436, 12)

In [13]:
# (rows, columns) of the held-out test split.
X_test.shape

(109, 12)

In [14]:
# Summarise column dtypes and non-null counts for the full table.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   price             545 non-null    int64 
 1   area              545 non-null    int64 
 2   bedrooms          545 non-null    int64 
 3   bathrooms         545 non-null    int64 
 4   stories           545 non-null    int64 
 5   mainroad          545 non-null    object
 6   guestroom         545 non-null    object
 7   basement          545 non-null    object
 8   hotwaterheating   545 non-null    object
 9   airconditioning   545 non-null    object
 10  parking           545 non-null    int64 
 11  prefarea          545 non-null    object
 12  furnishingstatus  545 non-null    object
dtypes: int64(6), object(7)
memory usage: 55.5+ KB


In [15]:
# Derive a combined room count from the bedroom and bathroom counts.
# NOTE(review): X / X_train / X_test were already built from `data` in earlier cells,
# so this column exists only on `data` and never reaches the models fitted later.
# Confirm whether the feature was meant to be engineered before the split.
data = data.assign(total_rooms=data["bedrooms"] + data["bathrooms"])

In [16]:
# Remove the two room-count columns from `data`, rebinding the name instead of mutating in place.
data = data.drop(columns=["bedrooms", "bathrooms"])

In [17]:
# Preview the first rows of `data` after the column changes above.
data.head()


Unnamed: 0,price,area,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus,total_rooms
0,13300000,7420,3,yes,no,no,no,yes,2,yes,furnished,6
1,12250000,8960,4,yes,no,no,no,yes,3,no,furnished,8
2,12250000,9960,2,yes,no,yes,no,no,2,yes,semi-furnished,5
3,12215000,7500,2,yes,no,yes,no,yes,3,yes,furnished,6
4,11410000,7420,2,yes,yes,yes,no,yes,2,no,furnished,5


In [18]:
# Partition the feature names by dtype so each group can get its own preprocessing.
# The names are read from X (the predictors split off from `data` earlier), not from `data`.
cat_cols = X.select_dtypes(include="object").columns
num_cols = X.select_dtypes(include=["int64", "float64"]).columns

In [19]:
# Numeric features: standardise each column with StandardScaler.
num_transformer = Pipeline([("scaler", StandardScaler())])

# Categorical features: one-hot encode; categories not seen during fit are
# ignored (encoded as all zeros) instead of raising at transform time.
cat_transformer = Pipeline([("onehot", OneHotEncoder(handle_unknown="ignore"))])

In [20]:
# Route each column group through its matching sub-pipeline and concatenate the results.
preprocessor = ColumnTransformer(
    [
        ("num", num_transformer, num_cols),
        ("cat", cat_transformer, cat_cols),
    ]
)

In [21]:
# Candidate regressors compared by cross-validation, keyed by a display name.
algorithms = {
    "Linear Regression": LinearRegression(),
    "Support Vector Machine": SVR(),
    # Seeded so the forest, and therefore its cross-validation score and the
    # resulting model choice, is reproducible across kernel restarts.
    "Random Forest": RandomForestRegressor(n_estimators=10, random_state=42),
}

In [22]:
# Model selection: score every candidate with 3-fold cross-validated R^2 on the
# training split, refit the winner on the whole training split, and report its
# R^2 on the held-out test split.
#
# Pipeline, cross_val_score, r2_score and np come from the notebook's import cell.
# Several names below say "accuracy" but hold R^2 values; the names and the printed
# labels are kept unchanged so later cells and previously recorded output still match.

# Fit the preprocessor on the training split only; the transformed matrix is
# reused for the final refit of the winning estimator.
X_train_processed = preprocessor.fit_transform(X_train)

# Cross-validated score summary per candidate, keyed like `algorithms`.
results = {}

for name, alg in algorithms.items():
    # Preprocessing is part of the pipeline, so cross_val_score (which clones the
    # estimator for every fold) refits it on each training fold.
    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('algorithm', alg)
    ])

    scores = cross_val_score(model, X_train, y_train, cv=3, scoring='r2', n_jobs=-1)

    results[name] = {
        'mean_r2': np.mean(scores),
        'std_r2': np.std(scores)
    }

# Pick the candidate with the highest mean R^2.
# R^2 has no lower bound, so the running best starts at -inf; a finite sentinel
# such as -1 would silently reject every candidate scoring at or below it.
best_algorithm = None
best_accuracy = float("-inf")

for name, result in results.items():
    if result['mean_r2'] > best_accuracy:
        best_accuracy = result['mean_r2']
        best_algorithm = algorithms[name]  # the estimator object, not its name

# Reached only when `algorithms` is empty or every mean score is NaN.
if best_algorithm is None:
    raise ValueError("No best algorithm found. Check your algorithm selection logic.")

print(f"Best Algorithm: {type(best_algorithm).__name__}")
print(f"Mean Accuracy: {best_accuracy:.2f}")

# Refit the winning estimator on the full (preprocessed) training split.
best_algorithm.fit(X_train_processed, y_train)

# Apply the preprocessor fitted on X_train to the test split (transform only, no refit).
X_test_processed = preprocessor.transform(X_test)

y_pred = best_algorithm.predict(X_test_processed)

# Held-out R^2 of the selected model.
accuracy_on_test = r2_score(y_test, y_pred)
print(f"Accuracy on Test Set: {accuracy_on_test:.2f}")


Best Algorithm: LinearRegression
Mean Accuracy: 0.64
Accuracy on Test Set: 0.65


In [24]:
# Final pipeline: the shared preprocessor followed by a linear regression.
# NOTE(review): the estimator is hard-coded to LinearRegression (the cross-validation
# winner printed above) rather than taken from `best_algorithm`; revisit this if the
# candidate ranking changes.
model = Pipeline(
    [
        ("pre", preprocessor),
        ("Linear Regression", LinearRegression()),
    ]
)

In [25]:
# Fit preprocessing and regression together on the raw training split.
model.fit(X_train, y_train)

In [26]:
# Persist the fitted pipeline (preprocessing + regressor) to the working directory.
joblib.dump(value=model, filename="model.joblib")

['model.joblib']

In [27]:
# Reload the persisted pipeline from disk and display it.
# joblib artifacts are pickle-based: only load files from a trusted source.
saved = joblib.load(filename="model.joblib")
saved