### Linear regression with log-transformation

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import numpy as np
from scipy import stats


# Data integration: read the listings spreadsheet (path is relative to the
# notebook's working directory).
# NOTE(review): the sheet is expected to provide the columns price,
# livingSize, year, propType and regija used below — confirm against the file.
df = pd.read_excel('podatkiZaPython.xlsx')

# Apply log transformations; log1p(x) = log(1 + x)
df['Log_Price'] = np.log1p(df['price'])
df['Log_LivingSize'] = np.log1p(df['livingSize'])
df['Log_Year'] = np.log1p(df['year'])

# Calculate price per square meter (not part of `features` and not used
# anywhere else in this cell)
df['Price_Per_Sqm'] = df['price'] / df['livingSize']

# Convert categorical features to numerical using Label Encoding.
# The string columns are overwritten with integer codes; the fitted encoders
# are kept so predict_log_price can encode user input with the same mapping.
le_propType = LabelEncoder()
le_regija = LabelEncoder()
df['propType'] = le_propType.fit_transform(df['propType'])
df['regija'] = le_regija.fit_transform(df['regija'])

# Feature selection
features = ['propType', 'Log_LivingSize', 'Log_Year', 'regija']
X = df[features]
y = df['Log_Price']  # Use log-transformed price as the target variable

# Split the dataset into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features using Standard Scaler; the scaler is fitted on the
# training split only and then reused for the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train a linear regression model on the scaled training features
model = LinearRegression()
model.fit(X_train_scaled, y_train)

# Function to predict log-transformed price based on user input
def predict_log_price(propType, livingSize, year, regija):
    """Predict log1p(price) for a single property.

    Relies on the module-level ``le_propType``, ``le_regija``, ``features``,
    ``scaler`` and ``model`` objects, which must already be fitted.

    Args:
        propType: Property type label; must be a label ``le_propType`` was
            fitted on.
        livingSize: Living size, on the same scale as the ``livingSize`` column.
        year: Year value, on the same scale as the ``year`` column.
        regija: Region label; must be a label ``le_regija`` was fitted on.

    Returns:
        The predicted log1p-transformed price. Apply ``np.expm1`` to get a
        price on the original scale.
    """
    # Encode categorical features with the encoders fitted on the dataset
    propType_encoded = le_propType.transform([propType])[0]
    regija_encoded = le_regija.transform([regija])[0]

    # Apply the same log1p transformation that produced Log_LivingSize / Log_Year
    livingSize_log = np.log1p(livingSize)
    year_log = np.log1p(year)

    # The scaler was fitted on a DataFrame, so hand it a DataFrame with the
    # same column names; a bare list makes sklearn warn about missing feature
    # names on every call.
    input_frame = pd.DataFrame(
        [[propType_encoded, livingSize_log, year_log, regija_encoded]],
        columns=features,
    )
    input_features = scaler.transform(input_frame)
    predicted_log_price = model.predict(input_features)

    return predicted_log_price[0]

# Example user input (replace with actual values).
# The two labels must be values the encoders were fitted on.
user_propType = 'Stanovanje'
user_livingSize = 50  # Replace with actual value
user_year = 2020  # Replace with actual value
user_regija = 'Dolenjska'

# Predict log-transformed price based on user input
predicted_log_price = predict_log_price(user_propType, user_livingSize, user_year, user_regija)

# Convert the predicted log price back to the original scale
# (expm1 is the inverse of the log1p applied to the target)
predicted_price = np.expm1(predicted_log_price)

print(f'Predicted Price: {predicted_price}')

from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Evaluate the model on the held-out test split.
# y_test holds Log_Price, so MSE and MAE are in log1p(price) units,
# not in currency units.
y_pred = model.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')
print(f'Mean Absolute Error: {mae}')

Predicted Price: 229129.89412348837
Mean Squared Error: 0.3243280488925497
R-squared: 0.4146675840007449
Mean Absolute Error: 0.4296943468817331




### Removing the outliers

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import numpy as np
from scipy import stats


# Data integration: read the listings spreadsheet (path is relative to the
# notebook's working directory)
df = pd.read_excel('podatkiZaPython.xlsx')

# Apply log transformations; log1p(x) = log(1 + x)
df['Log_Price'] = np.log1p(df['price'])
df['Log_LivingSize'] = np.log1p(df['livingSize'])
df['Log_Year'] = np.log1p(df['year'])

# Calculate price per square meter (used below only to filter outliers;
# it is not one of the model features)
df['Price_Per_Sqm'] = df['price'] / df['livingSize']

# Define a function to remove outliers based on Z-scores
def remove_outliers_zscore(df, column, threshold=4):
    """Return the rows of ``df`` whose ``column`` value is not a z-score outlier.

    A row is kept when the absolute z-score of its ``column`` value is
    strictly below ``threshold``. Rows whose value is NaN or +/-inf are
    dropped. The input frame itself is not modified.

    Args:
        df: DataFrame to filter.
        column: Name of the numeric column the z-scores are computed on.
        threshold: Exclusive upper bound on the absolute z-score.

    Returns:
        The filtered DataFrame, with the same columns as ``df``.
    """
    # +/-inf (e.g. from a division by zero upstream) would turn the mean into
    # inf and every z-score into NaN, so treat it like a missing value.
    values = df[column].replace([np.inf, -np.inf], np.nan)
    # nan_policy='omit' stops a single missing value from turning every
    # z-score into NaN, which used to filter out the whole frame.
    z_scores = np.abs(stats.zscore(values, nan_policy='omit'))
    # NaN z-scores compare False, so rows without a usable value drop out.
    return df[z_scores < threshold]

# Remove outliers: keep rows whose 'year' z-score, and then whose
# 'Price_Per_Sqm' z-score, is below 3 in absolute value. The second pass
# computes its z-scores on the rows that survived the first.
df = remove_outliers_zscore(df, 'year', 3)
df = remove_outliers_zscore(df, 'Price_Per_Sqm', 3)

# Convert categorical features to numerical using Label Encoding.
# The encoders are fitted after outlier removal, so they only know labels
# present in the remaining rows.
le_propType = LabelEncoder()
le_regija = LabelEncoder()
df['propType'] = le_propType.fit_transform(df['propType'])
df['regija'] = le_regija.fit_transform(df['regija'])

# Feature selection
features = ['propType', 'Log_LivingSize', 'Log_Year', 'regija']
X = df[features]
y = df['Log_Price']  # Use log-transformed price as the target variable

# Split the dataset into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features using Standard Scaler; the scaler is fitted on the
# training split only and then reused for the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train a linear regression model on the scaled training features
model = LinearRegression()
model.fit(X_train_scaled, y_train)

# Function to predict log-transformed price based on user input
def predict_log_price(propType, livingSize, year, regija):
    """Predict log1p(price) for a single property.

    Relies on the module-level ``le_propType``, ``le_regija``, ``features``,
    ``scaler`` and ``model`` objects, which must already be fitted.

    Args:
        propType: Property type label; must be a label ``le_propType`` was
            fitted on.
        livingSize: Living size, on the same scale as the ``livingSize`` column.
        year: Year value, on the same scale as the ``year`` column.
        regija: Region label; must be a label ``le_regija`` was fitted on.

    Returns:
        The predicted log1p-transformed price. Apply ``np.expm1`` to get a
        price on the original scale.
    """
    # Encode categorical features with the encoders fitted on the dataset
    propType_encoded = le_propType.transform([propType])[0]
    regija_encoded = le_regija.transform([regija])[0]

    # Apply the same log1p transformation that produced Log_LivingSize / Log_Year
    livingSize_log = np.log1p(livingSize)
    year_log = np.log1p(year)

    # The scaler was fitted on a DataFrame, so hand it a DataFrame with the
    # same column names; a bare list makes sklearn warn about missing feature
    # names on every call.
    input_frame = pd.DataFrame(
        [[propType_encoded, livingSize_log, year_log, regija_encoded]],
        columns=features,
    )
    input_features = scaler.transform(input_frame)
    predicted_log_price = model.predict(input_features)

    return predicted_log_price[0]

# Example user input (replace with actual values).
# The two labels must be values the encoders were fitted on.
user_propType = 'Stanovanje'
user_livingSize = 50  # Replace with actual value
user_year = 2020  # Replace with actual value
user_regija = 'Dolenjska'

# Predict log-transformed price based on user input
predicted_log_price = predict_log_price(user_propType, user_livingSize, user_year, user_regija)

# Convert the predicted log price back to the original scale
# (expm1 is the inverse of the log1p applied to the target)
predicted_price = np.expm1(predicted_log_price)

print(f'Predicted Price: {predicted_price}')

from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Evaluate the model on the held-out test split.
# y_test holds Log_Price, so MSE and MAE are in log1p(price) units,
# not in currency units.
y_pred = model.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')
print(f'Mean Absolute Error: {mae}')

Predicted Price: 235326.71753721792
Mean Squared Error: 0.2982527724275919
R-squared: 0.42030856323504096
Mean Absolute Error: 0.4138075694517879




### Decision tree regression

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import numpy as np
from scipy import stats
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Data integration: read the listings spreadsheet (path is relative to the
# notebook's working directory)
df = pd.read_excel('podatkiZaPython.xlsx')

# Apply log transformations; log1p(x) = log(1 + x)
df['Log_Price'] = np.log1p(df['price'])
df['Log_LivingSize'] = np.log1p(df['livingSize'])
df['Log_Year'] = np.log1p(df['year'])

# Calculate price per square meter (used below only to filter outliers;
# it is not one of the model features)
df['Price_Per_Sqm'] = df['price'] / df['livingSize']

# Define a function to remove outliers based on Z-scores
def remove_outliers_zscore(df, column, threshold=3):
    """Return the rows of ``df`` whose ``column`` value is not a z-score outlier.

    A row is kept when the absolute z-score of its ``column`` value is
    strictly below ``threshold``. Rows whose value is NaN or +/-inf are
    dropped. The input frame itself is not modified.

    Args:
        df: DataFrame to filter.
        column: Name of the numeric column the z-scores are computed on.
        threshold: Exclusive upper bound on the absolute z-score.

    Returns:
        The filtered DataFrame, with the same columns as ``df``.
    """
    # +/-inf (e.g. from a division by zero upstream) would turn the mean into
    # inf and every z-score into NaN, so treat it like a missing value.
    values = df[column].replace([np.inf, -np.inf], np.nan)
    # nan_policy='omit' stops a single missing value from turning every
    # z-score into NaN, which used to filter out the whole frame.
    z_scores = np.abs(stats.zscore(values, nan_policy='omit'))
    # NaN z-scores compare False, so rows without a usable value drop out.
    return df[z_scores < threshold]

# Remove outliers: keep rows whose 'year' z-score, and then whose
# 'Price_Per_Sqm' z-score, is below 3 in absolute value. The second pass
# computes its z-scores on the rows that survived the first.
df = remove_outliers_zscore(df, 'year', 3)
df = remove_outliers_zscore(df, 'Price_Per_Sqm', 3)

# Convert categorical features to numerical using Label Encoding.
# The encoders are fitted after outlier removal, so they only know labels
# present in the remaining rows.
le_propType = LabelEncoder()
le_regija = LabelEncoder()
df['propType'] = le_propType.fit_transform(df['propType'])
df['regija'] = le_regija.fit_transform(df['regija'])

# Feature selection
features = ['propType', 'Log_LivingSize', 'Log_Year', 'regija']
X = df[features]
y = df['Log_Price']  # Use log-transformed price as the target variable

# Split the dataset into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features using Standard Scaler; the scaler is fitted on the
# training split only and then reused for the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train a Decision Tree model (default hyperparameters, fixed seed)
model = DecisionTreeRegressor(random_state=42)
model.fit(X_train_scaled, y_train)

# Function to predict log-transformed price based on user input
def predict_log_price(propType, livingSize, year, regija):
    """Predict log1p(price) for a single property.

    Relies on the module-level ``le_propType``, ``le_regija``, ``features``,
    ``scaler`` and ``model`` objects, which must already be fitted.

    Args:
        propType: Property type label; must be a label ``le_propType`` was
            fitted on.
        livingSize: Living size, on the same scale as the ``livingSize`` column.
        year: Year value, on the same scale as the ``year`` column.
        regija: Region label; must be a label ``le_regija`` was fitted on.

    Returns:
        The predicted log1p-transformed price. Apply ``np.expm1`` to get a
        price on the original scale.
    """
    # Encode categorical features with the encoders fitted on the dataset
    propType_encoded = le_propType.transform([propType])[0]
    regija_encoded = le_regija.transform([regija])[0]

    # Apply the same log1p transformation that produced Log_LivingSize / Log_Year
    livingSize_log = np.log1p(livingSize)
    year_log = np.log1p(year)

    # The scaler was fitted on a DataFrame, so hand it a DataFrame with the
    # same column names; a bare list makes sklearn warn about missing feature
    # names on every call.
    input_frame = pd.DataFrame(
        [[propType_encoded, livingSize_log, year_log, regija_encoded]],
        columns=features,
    )
    input_features = scaler.transform(input_frame)
    predicted_log_price = model.predict(input_features)

    return predicted_log_price[0]

# Example user input (replace with actual values).
# The two labels must be values the encoders were fitted on.
user_propType = 'Stanovanje'
user_livingSize = 50  # Replace with actual value
user_year = 2020  # Replace with actual value
user_regija = 'Dolenjska'

# Predict log-transformed price based on user input
predicted_log_price = predict_log_price(user_propType, user_livingSize, user_year, user_regija)

# Convert the predicted log price back to the original scale
# (expm1 is the inverse of the log1p applied to the target)
predicted_price = np.expm1(predicted_log_price)

print(f'Predicted Price: {predicted_price}')

# Evaluate the Decision Tree model on the held-out test split.
# y_test holds Log_Price, so MSE and MAE are in log1p(price) units,
# not in currency units.
y_pred = model.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')
print(f'Mean Absolute Error: {mae}')

Predicted Price: 116000.0000000001
Mean Squared Error: 0.31818931887188046
R-squared: 0.38155940037444513
Mean Absolute Error: 0.3710976277917724




### Random Forest regression and Gradient Boosting regression

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import numpy as np
from scipy import stats
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Data integration: read the listings spreadsheet (path is relative to the
# notebook's working directory)
df = pd.read_excel('podatkiZaPython.xlsx')

# Apply log transformations; log1p(x) = log(1 + x)
df['Log_Price'] = np.log1p(df['price'])
df['Log_LivingSize'] = np.log1p(df['livingSize'])
df['Log_Year'] = np.log1p(df['year'])

# Calculate price per square meter (used below only to filter outliers;
# it is not one of the model features)
df['Price_Per_Sqm'] = df['price'] / df['livingSize']

# Define a function to remove outliers based on Z-scores
def remove_outliers_zscore(df, column, threshold=3):
    """Return the rows of ``df`` whose ``column`` value is not a z-score outlier.

    A row is kept when the absolute z-score of its ``column`` value is
    strictly below ``threshold``. Rows whose value is NaN or +/-inf are
    dropped. The input frame itself is not modified.

    Args:
        df: DataFrame to filter.
        column: Name of the numeric column the z-scores are computed on.
        threshold: Exclusive upper bound on the absolute z-score.

    Returns:
        The filtered DataFrame, with the same columns as ``df``.
    """
    # +/-inf (e.g. from a division by zero upstream) would turn the mean into
    # inf and every z-score into NaN, so treat it like a missing value.
    values = df[column].replace([np.inf, -np.inf], np.nan)
    # nan_policy='omit' stops a single missing value from turning every
    # z-score into NaN, which used to filter out the whole frame.
    z_scores = np.abs(stats.zscore(values, nan_policy='omit'))
    # NaN z-scores compare False, so rows without a usable value drop out.
    return df[z_scores < threshold]

# Remove outliers: keep rows whose 'year' z-score, and then whose
# 'Price_Per_Sqm' z-score, is below 3 in absolute value. The second pass
# computes its z-scores on the rows that survived the first.
df = remove_outliers_zscore(df, 'year', 3)
df = remove_outliers_zscore(df, 'Price_Per_Sqm', 3)

# Convert categorical features to numerical using Label Encoding.
# The encoders are fitted after outlier removal, so they only know labels
# present in the remaining rows.
le_propType = LabelEncoder()
le_regija = LabelEncoder()
df['propType'] = le_propType.fit_transform(df['propType'])
df['regija'] = le_regija.fit_transform(df['regija'])

# Feature selection
features = ['propType', 'Log_LivingSize', 'Log_Year', 'regija']
X = df[features]
y = df['Log_Price']  # Use log-transformed price as the target variable

# Split the dataset into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features using Standard Scaler; the scaler is fitted on the
# training split only and then reused for the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train a Random Forest model (default hyperparameters, fixed seed)
random_forest_model = RandomForestRegressor(random_state=42)
random_forest_model.fit(X_train_scaled, y_train)

# Train a Gradient Boosting model on the same split (default hyperparameters,
# fixed seed)
gradient_boosting_model = GradientBoostingRegressor(random_state=42)
gradient_boosting_model.fit(X_train_scaled, y_train)

# Function to predict log-transformed price based on user input
def predict_log_price(model, propType, livingSize, year, regija):
    """Predict log1p(price) for a single property with the given model.

    Relies on the module-level ``le_propType``, ``le_regija``, ``features``
    and ``scaler`` objects, which must already be fitted.

    Args:
        model: Fitted regressor exposing ``predict``; it receives the scaled
            feature row.
        propType: Property type label; must be a label ``le_propType`` was
            fitted on.
        livingSize: Living size, on the same scale as the ``livingSize`` column.
        year: Year value, on the same scale as the ``year`` column.
        regija: Region label; must be a label ``le_regija`` was fitted on.

    Returns:
        The predicted log1p-transformed price. Apply ``np.expm1`` to get a
        price on the original scale.
    """
    # Encode categorical features with the encoders fitted on the dataset
    propType_encoded = le_propType.transform([propType])[0]
    regija_encoded = le_regija.transform([regija])[0]

    # Apply the same log1p transformation that produced Log_LivingSize / Log_Year
    livingSize_log = np.log1p(livingSize)
    year_log = np.log1p(year)

    # The scaler was fitted on a DataFrame, so hand it a DataFrame with the
    # same column names; a bare list makes sklearn warn about missing feature
    # names on every call.
    input_frame = pd.DataFrame(
        [[propType_encoded, livingSize_log, year_log, regija_encoded]],
        columns=features,
    )
    input_features = scaler.transform(input_frame)
    predicted_log_price = model.predict(input_features)

    return predicted_log_price[0]

# Example user input (replace with actual values).
# The two labels must be values the encoders were fitted on.
user_propType = 'Stanovanje'
user_livingSize = 50  # Replace with actual value
user_year = 2020  # Replace with actual value
user_regija = 'Dolenjska'

# Predict log-transformed price based on user input using Random Forest model
predicted_log_price_rf = predict_log_price(random_forest_model, user_propType, user_livingSize, user_year, user_regija)

# Predict log-transformed price based on user input using Gradient Boosting model
predicted_log_price_gb = predict_log_price(gradient_boosting_model, user_propType, user_livingSize, user_year, user_regija)

# Convert the predicted log prices back to the original scale
# (expm1 is the inverse of the log1p applied to the target)
predicted_price_rf = np.expm1(predicted_log_price_rf)
predicted_price_gb = np.expm1(predicted_log_price_gb)

print(f'Random Forest Predicted Price: {predicted_price_rf}')
print(f'Gradient Boosting Predicted Price: {predicted_price_gb}')

# Evaluate the Random Forest model on the held-out test split.
# y_test holds Log_Price, so MSE and MAE are in log1p(price) units.
y_pred_rf = random_forest_model.predict(X_test_scaled)
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)
mae_rf = mean_absolute_error(y_test, y_pred_rf)

print('\nRandom Forest Model Metrics:')
print(f'Mean Squared Error: {mse_rf}')
print(f'R-squared: {r2_rf}')
print(f'Mean Absolute Error: {mae_rf}')

# Evaluate the Gradient Boosting model on the same test split
# (again in log1p(price) units)
y_pred_gb = gradient_boosting_model.predict(X_test_scaled)
mse_gb = mean_squared_error(y_test, y_pred_gb)
r2_gb = r2_score(y_test, y_pred_gb)
mae_gb = mean_absolute_error(y_test, y_pred_gb)

print('\nGradient Boosting Model Metrics:')
print(f'Mean Squared Error: {mse_gb}')
print(f'R-squared: {r2_gb}')
print(f'Mean Absolute Error: {mae_gb}')


Random Forest Predicted Price: 142696.54774545555
Gradient Boosting Predicted Price: 171307.77846271315

Random Forest Model Metrics:
Mean Squared Error: 0.1797247614846413
R-squared: 0.650682525566247
Mean Absolute Error: 0.2888203509810451

Gradient Boosting Model Metrics:
Mean Squared Error: 0.17656912732191696
R-squared: 0.6568158938726166
Mean Absolute Error: 0.30052425225632295




### Support Vector Regression

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.preprocessing import LabelEncoder, StandardScaler
import numpy as np
from scipy import stats
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Data integration: read the listings spreadsheet (path is relative to the
# notebook's working directory)
df = pd.read_excel('podatkiZaPython.xlsx')

# Apply log transformations; log1p(x) = log(1 + x)
df['Log_Price'] = np.log1p(df['price'])
df['Log_LivingSize'] = np.log1p(df['livingSize'])
df['Log_Year'] = np.log1p(df['year'])

# Calculate price per square meter (used below only to filter outliers;
# it is not one of the model features)
df['Price_Per_Sqm'] = df['price'] / df['livingSize']

# Define a function to remove outliers based on Z-scores
def remove_outliers_zscore(df, column, threshold=3):
    """Return the rows of ``df`` whose ``column`` value is not a z-score outlier.

    A row is kept when the absolute z-score of its ``column`` value is
    strictly below ``threshold``. Rows whose value is NaN or +/-inf are
    dropped. The input frame itself is not modified.

    Args:
        df: DataFrame to filter.
        column: Name of the numeric column the z-scores are computed on.
        threshold: Exclusive upper bound on the absolute z-score.

    Returns:
        The filtered DataFrame, with the same columns as ``df``.
    """
    # +/-inf (e.g. from a division by zero upstream) would turn the mean into
    # inf and every z-score into NaN, so treat it like a missing value.
    values = df[column].replace([np.inf, -np.inf], np.nan)
    # nan_policy='omit' stops a single missing value from turning every
    # z-score into NaN, which used to filter out the whole frame.
    z_scores = np.abs(stats.zscore(values, nan_policy='omit'))
    # NaN z-scores compare False, so rows without a usable value drop out.
    return df[z_scores < threshold]

# Remove outliers: keep rows whose 'year' z-score, and then whose
# 'Price_Per_Sqm' z-score, is below 3 in absolute value. The second pass
# computes its z-scores on the rows that survived the first.
df = remove_outliers_zscore(df, 'year', 3)
df = remove_outliers_zscore(df, 'Price_Per_Sqm', 3)

# Convert categorical features to numerical using Label Encoding.
# The encoders are fitted after outlier removal, so they only know labels
# present in the remaining rows.
le_propType = LabelEncoder()
le_regija = LabelEncoder()
df['propType'] = le_propType.fit_transform(df['propType'])
df['regija'] = le_regija.fit_transform(df['regija'])

# Feature selection
features = ['propType', 'Log_LivingSize', 'Log_Year', 'regija']
X = df[features]
y = df['Log_Price']  # Use log-transformed price as the target variable

# Split the dataset into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features using Standard Scaler; the scaler is fitted on the
# training split only and then reused for the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train a Support Vector Regression model (default hyperparameters)
svr_model = SVR()
svr_model.fit(X_train_scaled, y_train)

# Function to predict log-transformed price based on user input
def predict_log_price(model, propType, livingSize, year, regija):
    """Predict log1p(price) for a single property with the given model.

    Relies on the module-level ``le_propType``, ``le_regija``, ``features``
    and ``scaler`` objects, which must already be fitted.

    Args:
        model: Fitted regressor exposing ``predict``; it receives the scaled
            feature row.
        propType: Property type label; must be a label ``le_propType`` was
            fitted on.
        livingSize: Living size, on the same scale as the ``livingSize`` column.
        year: Year value, on the same scale as the ``year`` column.
        regija: Region label; must be a label ``le_regija`` was fitted on.

    Returns:
        The predicted log1p-transformed price. Apply ``np.expm1`` to get a
        price on the original scale.
    """
    # Encode categorical features with the encoders fitted on the dataset
    propType_encoded = le_propType.transform([propType])[0]
    regija_encoded = le_regija.transform([regija])[0]

    # Apply the same log1p transformation that produced Log_LivingSize / Log_Year
    livingSize_log = np.log1p(livingSize)
    year_log = np.log1p(year)

    # The scaler was fitted on a DataFrame, so hand it a DataFrame with the
    # same column names; a bare list makes sklearn warn about missing feature
    # names on every call.
    input_frame = pd.DataFrame(
        [[propType_encoded, livingSize_log, year_log, regija_encoded]],
        columns=features,
    )
    input_features = scaler.transform(input_frame)
    predicted_log_price = model.predict(input_features)

    return predicted_log_price[0]

# Example user input (replace with actual values).
# The two labels must be values the encoders were fitted on.
user_propType = 'Stanovanje'
user_livingSize = 50  # Replace with actual value
user_year = 2020  # Replace with actual value
user_regija = 'Dolenjska'

# Predict log-transformed price based on user input using SVR model
predicted_log_price_svr = predict_log_price(svr_model, user_propType, user_livingSize, user_year, user_regija)

# Convert the predicted log price back to the original scale
# (expm1 is the inverse of the log1p applied to the target)
predicted_price_svr = np.expm1(predicted_log_price_svr)

print(f'SVR Predicted Price: {predicted_price_svr}')

# Evaluate the SVR model on the held-out test split.
# y_test holds Log_Price, so MSE and MAE are in log1p(price) units,
# not in currency units.
y_pred_svr = svr_model.predict(X_test_scaled)
mse_svr = mean_squared_error(y_test, y_pred_svr)
r2_svr = r2_score(y_test, y_pred_svr)
mae_svr = mean_absolute_error(y_test, y_pred_svr)

print('\nSVR Model Metrics:')
print(f'Mean Squared Error: {mse_svr}')
print(f'R-squared: {r2_svr}')
print(f'Mean Absolute Error: {mae_svr}')

SVR Predicted Price: 156815.28789783764

SVR Model Metrics:
Mean Squared Error: 0.22157975057186713
R-squared: 0.5693321375639158
Mean Absolute Error: 0.33142423962327966




### Cross-validation for 3 models (not useful, skip this)

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import LabelEncoder, StandardScaler
import numpy as np
from scipy import stats
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Data integration: read the listings spreadsheet (path is relative to the
# notebook's working directory)
df = pd.read_excel('podatkiZaPython.xlsx')

# Apply log transformations; log1p(x) = log(1 + x)
df['Log_Price'] = np.log1p(df['price'])
df['Log_LivingSize'] = np.log1p(df['livingSize'])
df['Log_Year'] = np.log1p(df['year'])

# Calculate price per square meter (used below only to filter outliers;
# it is not one of the model features)
df['Price_Per_Sqm'] = df['price'] / df['livingSize']

# Define a function to remove outliers based on Z-scores
def remove_outliers_zscore(df, column, threshold=3):
    """Return the rows of ``df`` whose ``column`` value is not a z-score outlier.

    A row is kept when the absolute z-score of its ``column`` value is
    strictly below ``threshold``. Rows whose value is NaN or +/-inf are
    dropped. The input frame itself is not modified.

    Args:
        df: DataFrame to filter.
        column: Name of the numeric column the z-scores are computed on.
        threshold: Exclusive upper bound on the absolute z-score.

    Returns:
        The filtered DataFrame, with the same columns as ``df``.
    """
    # +/-inf (e.g. from a division by zero upstream) would turn the mean into
    # inf and every z-score into NaN, so treat it like a missing value.
    values = df[column].replace([np.inf, -np.inf], np.nan)
    # nan_policy='omit' stops a single missing value from turning every
    # z-score into NaN, which used to filter out the whole frame.
    z_scores = np.abs(stats.zscore(values, nan_policy='omit'))
    # NaN z-scores compare False, so rows without a usable value drop out.
    return df[z_scores < threshold]

# Remove outliers: keep rows whose 'year' z-score, and then whose
# 'Price_Per_Sqm' z-score, is below 3 in absolute value. The second pass
# computes its z-scores on the rows that survived the first.
df = remove_outliers_zscore(df, 'year', 3)
df = remove_outliers_zscore(df, 'Price_Per_Sqm', 3)

# Convert categorical features to numerical using Label Encoding.
# The encoders are fitted after outlier removal, so they only know labels
# present in the remaining rows.
le_propType = LabelEncoder()
le_regija = LabelEncoder()
df['propType'] = le_propType.fit_transform(df['propType'])
df['regija'] = le_regija.fit_transform(df['regija'])

# Feature selection
features = ['propType', 'Log_LivingSize', 'Log_Year', 'regija']
X = df[features]
y = df['Log_Price']  # Use log-transformed price as the target variable

# Split the dataset into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features using Standard Scaler; the scaler is fitted on the
# training split only and then reused for the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Decision Tree Regression with 5-fold cross-validation on the training split.
# The scorer is 'neg_mean_squared_error', so the mean is negated to report a
# positive MSE.
decision_tree_model = DecisionTreeRegressor(random_state=42)
cv_scores_dt = cross_val_score(decision_tree_model, X_train_scaled, y_train, cv=5, scoring='neg_mean_squared_error')
cv_mse_dt = -cv_scores_dt.mean()

# Fit the Decision Tree model on the entire training set
decision_tree_model.fit(X_train_scaled, y_train)

# Function to predict log-transformed price based on user input
def predict_log_price(model, propType, livingSize, year, regija):
    """Predict log1p(price) for a single property with the given model.

    Relies on the module-level ``le_propType``, ``le_regija``, ``features``
    and ``scaler`` objects, which must already be fitted.

    Args:
        model: Fitted regressor exposing ``predict``; it receives the scaled
            feature row.
        propType: Property type label; must be a label ``le_propType`` was
            fitted on.
        livingSize: Living size, on the same scale as the ``livingSize`` column.
        year: Year value, on the same scale as the ``year`` column.
        regija: Region label; must be a label ``le_regija`` was fitted on.

    Returns:
        The predicted log1p-transformed price. Apply ``np.expm1`` to get a
        price on the original scale.
    """
    # Encode categorical features with the encoders fitted on the dataset
    propType_encoded = le_propType.transform([propType])[0]
    regija_encoded = le_regija.transform([regija])[0]

    # Apply the same log1p transformation that produced Log_LivingSize / Log_Year
    livingSize_log = np.log1p(livingSize)
    year_log = np.log1p(year)

    # The scaler was fitted on a DataFrame, so hand it a DataFrame with the
    # same column names; a bare list makes sklearn warn about missing feature
    # names on every call.
    input_frame = pd.DataFrame(
        [[propType_encoded, livingSize_log, year_log, regija_encoded]],
        columns=features,
    )
    input_features = scaler.transform(input_frame)
    predicted_log_price = model.predict(input_features)

    return predicted_log_price[0]

# Example user input (replace with actual values).
# The two labels must be values the encoders were fitted on.
user_propType = 'Stanovanje'
user_livingSize = 50  # Replace with actual value
user_year = 2020  # Replace with actual value
user_regija = 'Ljubljana-okolica'

# Predict log-transformed price based on user input using Decision Tree model
predicted_log_price_dt = predict_log_price(decision_tree_model, user_propType, user_livingSize, user_year, user_regija)

# Convert the predicted log price back to the original scale
# (expm1 is the inverse of the log1p applied to the target)
predicted_price_dt = np.expm1(predicted_log_price_dt)

# Evaluate Decision Tree Regression model on the test set.
# All metrics in this cell are on Log_Price, i.e. in log1p(price) units.
y_pred_dt = decision_tree_model.predict(X_test_scaled)
mse_dt = mean_squared_error(y_test, y_pred_dt)
r2_dt = r2_score(y_test, y_pred_dt)
mae_dt = mean_absolute_error(y_test, y_pred_dt)

print('\nDecision Tree Regression Model Metrics:')
print(f'Mean Squared Error (CV): {cv_mse_dt}')
print(f'Mean Squared Error (Test): {mse_dt}')
print(f'R-squared: {r2_dt}')
print(f'Mean Absolute Error: {mae_dt}')
print(f'Predicted Price (user input): {predicted_price_dt}')

# RandomForestRegressor with 5-fold cross-validation on the training split
# (negated because the scorer returns negative MSE)
random_forest_model = RandomForestRegressor(random_state=42)
cv_scores_rf = cross_val_score(random_forest_model, X_train_scaled, y_train, cv=5, scoring='neg_mean_squared_error')
cv_mse_rf = -cv_scores_rf.mean()

# Fit the Random Forest model on the entire training set
random_forest_model.fit(X_train_scaled, y_train)

# Predict log-transformed price based on user input using Random Forest model
predicted_log_price_rf = predict_log_price(random_forest_model, user_propType, user_livingSize, user_year, user_regija)

# Convert the predicted log price back to the original scale
predicted_price_rf = np.expm1(predicted_log_price_rf)

# Evaluate Random Forest Regression model on the test set
y_pred_rf = random_forest_model.predict(X_test_scaled)
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)
mae_rf = mean_absolute_error(y_test, y_pred_rf)

print('\nRandom Forest Regression Model Metrics:')
print(f'Mean Squared Error (CV): {cv_mse_rf}')
print(f'Mean Squared Error (Test): {mse_rf}')
print(f'R-squared: {r2_rf}')
print(f'Mean Absolute Error: {mae_rf}')
print(f'Predicted Price (user input): {predicted_price_rf}')

# SVR with 5-fold cross-validation on the training split
# (negated because the scorer returns negative MSE)
svr_model = SVR()
cv_scores_svr = cross_val_score(svr_model, X_train_scaled, y_train, cv=5, scoring='neg_mean_squared_error')
cv_mse_svr = -cv_scores_svr.mean()

# Fit the SVR model on the entire training set
svr_model.fit(X_train_scaled, y_train)

# Predict log-transformed price based on user input using SVR model
predicted_log_price_svr = predict_log_price(svr_model, user_propType, user_livingSize, user_year, user_regija)

# Convert the predicted log price back to the original scale
predicted_price_svr = np.expm1(predicted_log_price_svr)

# Evaluate SVR model on the test set
y_pred_svr = svr_model.predict(X_test_scaled)
mse_svr = mean_squared_error(y_test, y_pred_svr)
r2_svr = r2_score(y_test, y_pred_svr)
mae_svr = mean_absolute_error(y_test, y_pred_svr)

print('\nSVR Model Metrics:')
print(f'Mean Squared Error (CV): {cv_mse_svr}')
print(f'Mean Squared Error (Test): {mse_svr}')
print(f'R-squared: {r2_svr}')
print(f'Mean Absolute Error: {mae_svr}')
print(f'Predicted Price (user input): {predicted_price_svr}')





Decision Tree Regression Model Metrics:
Mean Squared Error (CV): 0.3311355308047916
Mean Squared Error (Test): 0.31818931887188046
R-squared: 0.38155940037444513
Mean Absolute Error: 0.3710976277917724
Predicted Price (user input): 191000.00000000015





Random Forest Regression Model Metrics:
Mean Squared Error (CV): 0.19235051028861655
Mean Squared Error (Test): 0.1797247614846413
R-squared: 0.650682525566247
Mean Absolute Error: 0.2888203509810451
Predicted Price (user input): 187412.2751329893

SVR Model Metrics:
Mean Squared Error (CV): 0.2183289349134306
Mean Squared Error (Test): 0.22157975057186713
R-squared: 0.5693321375639158
Mean Absolute Error: 0.33142423962327966
Predicted Price (user input): 187977.87752563364




### Hyperparameter tuning

In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler

# This cell uses numpy and remove_outliers_zscore, but its import list brings
# in neither numpy nor scipy.stats and the cell never defines the helper, so it
# only ran when an earlier cell had already been executed in the same kernel.
# Importing and defining them here makes the cell self-contained.
import numpy as np
from scipy import stats

# Data integration: read the listings spreadsheet (path is relative to the
# notebook's working directory)
df = pd.read_excel('podatkiZaPython.xlsx')

# Apply log transformations; log1p(x) = log(1 + x)
df['Log_Price'] = np.log1p(df['price'])
df['Log_LivingSize'] = np.log1p(df['livingSize'])
df['Log_Year'] = np.log1p(df['year'])

# Calculate price per square meter (used below only to filter outliers;
# it is not one of the model features)
df['Price_Per_Sqm'] = df['price'] / df['livingSize']

# Define a function to remove outliers based on Z-scores
def remove_outliers_zscore(df, column, threshold=3):
    """Return the rows of ``df`` whose ``column`` value is not a z-score outlier.

    A row is kept when the absolute z-score of its ``column`` value is
    strictly below ``threshold``. Rows whose value is NaN or +/-inf are
    dropped. The input frame itself is not modified.
    """
    # Treat +/-inf like a missing value and let scipy skip missing values, so
    # one bad entry cannot turn every z-score into NaN.
    values = df[column].replace([np.inf, -np.inf], np.nan)
    z_scores = np.abs(stats.zscore(values, nan_policy='omit'))
    return df[z_scores < threshold]

# Remove outliers on 'year' first, then on 'Price_Per_Sqm' (|z| < 3 each);
# the second pass computes its z-scores on the rows left by the first
df = remove_outliers_zscore(df, 'year', 3)
df = remove_outliers_zscore(df, 'Price_Per_Sqm', 3)

# Convert categorical features to numerical using Label Encoding.
# The encoders are fitted after outlier removal, so they only know labels
# present in the remaining rows.
le_propType = LabelEncoder()
le_regija = LabelEncoder()
df['propType'] = le_propType.fit_transform(df['propType'])
df['regija'] = le_regija.fit_transform(df['regija'])

# Feature selection
features = ['propType', 'Log_LivingSize', 'Log_Year', 'regija']
X = df[features]
y = df['Log_Price']  # Use log-transformed price as the target variable

# Split the dataset into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features using Standard Scaler; the scaler is fitted on the
# training split only and then reused for the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Random Forest model with hyperparameter tuning using Grid Search.
# The grid has 3 * 4 * 3 * 3 = 108 combinations, each evaluated with 5-fold
# cross-validation (540 fits plus the final refit), so this cell is slow.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf_model = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print(f'Best Hyperparameters: {best_params}')

# GridSearchCV (refit=True by default) has already refitted a clone of
# rf_model with the best parameters on the whole training split, so reuse it
# instead of training an identical forest a second time.
best_rf_model = grid_search.best_estimator_

# Function to predict log-transformed price based on user input
def predict_log_price_rf(propType, livingSize, year, regija):
    """Predict log1p(price) for a single property with ``best_rf_model``.

    Relies on the module-level ``le_propType``, ``le_regija``, ``features``,
    ``scaler`` and ``best_rf_model`` objects, which must already be fitted.

    Args:
        propType: Property type label; must be a label ``le_propType`` was
            fitted on.
        livingSize: Living size, on the same scale as the ``livingSize`` column.
        year: Year value, on the same scale as the ``year`` column.
        regija: Region label; must be a label ``le_regija`` was fitted on.

    Returns:
        The predicted log1p-transformed price. Apply ``np.expm1`` to get a
        price on the original scale.
    """
    # Encode categorical features with the encoders fitted on the dataset
    propType_encoded = le_propType.transform([propType])[0]
    regija_encoded = le_regija.transform([regija])[0]

    # Apply the same log1p transformation that produced Log_LivingSize / Log_Year
    livingSize_log = np.log1p(livingSize)
    year_log = np.log1p(year)

    # The scaler was fitted on a DataFrame, so hand it a DataFrame with the
    # same column names; a bare list makes sklearn warn about missing feature
    # names on every call.
    input_frame = pd.DataFrame(
        [[propType_encoded, livingSize_log, year_log, regija_encoded]],
        columns=features,
    )
    input_features = scaler.transform(input_frame)
    predicted_log_price = best_rf_model.predict(input_features)

    return predicted_log_price[0]

# Example user input (replace with actual values).
# The two labels must be values the encoders were fitted on.
user_propType = 'Stanovanje'
user_livingSize = 50  # Replace with actual value
user_year = 2020  # Replace with actual value
user_regija = 'Dolenjska'

# Predict log-transformed price based on user input
predicted_log_price_rf = predict_log_price_rf(user_propType, user_livingSize, user_year, user_regija)

# Convert the predicted log price back to the original scale
# (expm1 is the inverse of the log1p applied to the target)
predicted_price_rf = np.expm1(predicted_log_price_rf)

print(f'Random Forest Predicted Price: {predicted_price_rf}')

# Evaluate the tuned model on the held-out test split.
# y_test holds Log_Price, so MSE and MAE are in log1p(price) units,
# not in currency units.
y_pred_rf = best_rf_model.predict(X_test_scaled)
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)
mae_rf = mean_absolute_error(y_test, y_pred_rf)

print(f'Random Forest Mean Squared Error: {mse_rf}')
print(f'Random Forest R-squared: {r2_rf}')
print(f'Random Forest Mean Absolute Error: {mae_rf}')


KeyboardInterrupt: 

### Random Forest with the best hyperparameters

In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler
import numpy as np

# Data integration: read the listings spreadsheet (path is relative to the
# notebook's working directory)
df = pd.read_excel('podatkiZaPython.xlsx')

# Apply log transformations; log1p(x) = log(1 + x)
df['Log_Price'] = np.log1p(df['price'])
df['Log_LivingSize'] = np.log1p(df['livingSize'])
df['Log_Year'] = np.log1p(df['year'])

# Calculate price per square meter (used below only to filter outliers;
# it is not one of the model features)
df['Price_Per_Sqm'] = df['price'] / df['livingSize']

# Define a function to remove outliers based on Z-scores
def remove_outliers_zscore(df, column, threshold=3):
    """Return the rows of ``df`` whose ``column`` value is not a z-score outlier.

    A row is kept when the absolute z-score of its ``column`` value is
    strictly below ``threshold``. Rows whose value is NaN or +/-inf are
    dropped. The input frame itself is not modified.

    Args:
        df: DataFrame to filter.
        column: Name of the numeric column the z-scores are computed on.
        threshold: Exclusive upper bound on the absolute z-score.

    Returns:
        The filtered DataFrame, with the same columns as ``df``.
    """
    # This cell's import list does not include scipy.stats, so import it here
    # instead of relying on a name left behind by an earlier cell.
    from scipy import stats

    # +/-inf (e.g. from a division by zero upstream) would turn the mean into
    # inf and every z-score into NaN, so treat it like a missing value.
    values = df[column].replace([np.inf, -np.inf], np.nan)
    # nan_policy='omit' stops a single missing value from turning every
    # z-score into NaN, which used to filter out the whole frame.
    z_scores = np.abs(stats.zscore(values, nan_policy='omit'))
    # NaN z-scores compare False, so rows without a usable value drop out.
    return df[z_scores < threshold]

# Remove outliers: keep rows whose 'year' z-score, and then whose
# 'Price_Per_Sqm' z-score, is below 3 in absolute value. The second pass
# computes its z-scores on the rows that survived the first.
df = remove_outliers_zscore(df, 'year', 3)
df = remove_outliers_zscore(df, 'Price_Per_Sqm', 3)

# Convert categorical features to numerical using Label Encoding.
# The encoders are fitted after outlier removal, so they only know labels
# present in the remaining rows.
le_propType = LabelEncoder()
le_regija = LabelEncoder()
df['propType'] = le_propType.fit_transform(df['propType'])
df['regija'] = le_regija.fit_transform(df['regija'])

# Feature selection
features = ['propType', 'Log_LivingSize', 'Log_Year', 'regija']
X = df[features]
y = df['Log_Price']  # Use log-transformed price as the target variable

# Split the dataset into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features using Standard Scaler; the scaler is fitted on the
# training split only and then reused for the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Best hyperparameters obtained from the grid search.
# NOTE(review): these values are hard-coded; the grid-search cell in this
# notebook shows a KeyboardInterrupt, so confirm which run produced them.
best_hyperparameters = {'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 200}

# Train the model with the best hyperparameters
best_rf_model = RandomForestRegressor(random_state=42, **best_hyperparameters)
best_rf_model.fit(X_train_scaled, y_train)

# Function to predict log-transformed price based on user input
def predict_log_price_rf(propType, livingSize, year, regija):
    """Predict log1p(price) for a single property with ``best_rf_model``.

    Relies on the module-level ``le_propType``, ``le_regija``, ``features``,
    ``scaler`` and ``best_rf_model`` objects, which must already be fitted.

    Args:
        propType: Property type label; must be a label ``le_propType`` was
            fitted on.
        livingSize: Living size, on the same scale as the ``livingSize`` column.
        year: Year value, on the same scale as the ``year`` column.
        regija: Region label; must be a label ``le_regija`` was fitted on.

    Returns:
        The predicted log1p-transformed price. Apply ``np.expm1`` to get a
        price on the original scale.
    """
    # Encode categorical features with the encoders fitted on the dataset
    propType_encoded = le_propType.transform([propType])[0]
    regija_encoded = le_regija.transform([regija])[0]

    # Apply the same log1p transformation that produced Log_LivingSize / Log_Year
    livingSize_log = np.log1p(livingSize)
    year_log = np.log1p(year)

    # The scaler was fitted on a DataFrame, so hand it a DataFrame with the
    # same column names; a bare list makes sklearn warn about missing feature
    # names on every call.
    input_frame = pd.DataFrame(
        [[propType_encoded, livingSize_log, year_log, regija_encoded]],
        columns=features,
    )
    input_features = scaler.transform(input_frame)
    predicted_log_price = best_rf_model.predict(input_features)

    return predicted_log_price[0]

# Region names kept for reference when choosing user_regija below.
# NOTE(review): presumably these are the values of the 'regija' column —
# confirm against the spreadsheet.
#Ljubljana-mesto
#Severna-primorska
#Savinjska
#Južna-primorska
#Ljubljana-okolica
#Gorenjska
#Dolenjska
#Posavska
#Notranjska
#Podravska
#Koroška
#Pomurska
#Zasavska

# Example user input (replace with actual values).
# The two labels must be values the encoders were fitted on.
user_propType = 'Hiša'
user_livingSize = 150  # Replace with actual value
user_year = 2020  # Replace with actual value
user_regija = 'Gorenjska'

# Predict log-transformed price based on user input
predicted_log_price_rf = predict_log_price_rf(user_propType, user_livingSize, user_year, user_regija)

# Convert the predicted log price back to the original scale
# (expm1 is the inverse of the log1p applied to the target)
predicted_price_rf = np.expm1(predicted_log_price_rf)

print(f'Random Forest Predicted Price: {predicted_price_rf}')

# Evaluate the model on the held-out test split.
# y_test holds Log_Price, so MSE and MAE are in log1p(price) units,
# not in currency units.
y_pred_rf = best_rf_model.predict(X_test_scaled)
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)
mae_rf = mean_absolute_error(y_test, y_pred_rf)

print(f'Random Forest Mean Squared Error: {mse_rf}')
print(f'Random Forest R-squared: {r2_rf}')
print(f'Random Forest Mean Absolute Error: {mae_rf}')

Random Forest Predicted Price: 429330.5255122348
Random Forest Mean Squared Error: 0.17546161086033804
Random Forest R-squared: 0.6589684901540456
Random Forest Mean Absolute Error: 0.2902934257310538


