In [3]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

import warnings
# Suppress every warning for the rest of the session. Note that this also
# hides deprecation and convergence warnings raised by the libraries below.
warnings.filterwarnings(action='ignore')

In [4]:
# Read the property listings CSV and preview the resulting frame.
data = pd.read_csv(
    '../input/property-prices-in-tunisia/Property Prices in Tunisia.csv'
)
data

Unnamed: 0,category,room_count,bathroom_count,size,type,price,city,region,log_price
0,Terrains et Fermes,-1.0,-1.0,-1.0,À Vendre,100000.0,Ariana,Raoued,5.000000
1,Terrains et Fermes,-1.0,-1.0,-1.0,À Vendre,316000.0,Ariana,Autres villes,5.499687
2,Appartements,2.0,1.0,80.0,À Louer,380.0,Ariana,Autres villes,2.579784
3,Locations de vacances,1.0,1.0,90.0,À Louer,70.0,Ariana,Autres villes,1.845098
4,Appartements,2.0,2.0,113.0,À Vendre,170000.0,Ariana,Ariana Ville,5.230449
...,...,...,...,...,...,...,...,...,...
12743,Terrains et Fermes,-1.0,-1.0,-1.0,À Vendre,3200000.0,Tunis,Sidi Bou Said,6.505150
12744,Appartements,1.0,1.0,100.0,À Louer,600.0,Tunis,Autres villes,2.778151
12745,Maisons et Villas,3.0,1.0,760.0,À Vendre,1950000.0,Tunis,La Marsa,6.290035
12746,Maisons et Villas,3.0,1.0,190.0,À Vendre,240000.0,Tunis,La Marsa,5.380211


In [5]:
# Summarise the column names, non-null counts and dtypes of the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12748 entries, 0 to 12747
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   category        12748 non-null  object 
 1   room_count      12748 non-null  float64
 2   bathroom_count  12748 non-null  float64
 3   size            12748 non-null  float64
 4   type            12748 non-null  object 
 5   price           12748 non-null  float64
 6   city            12748 non-null  object 
 7   region          12748 non-null  object 
 8   log_price       12748 non-null  float64
dtypes: float64(5), object(4)
memory usage: 896.5+ KB


In [6]:
def preprocess_inputs(df):
    """Turn the raw listings frame into scaled train/test features and targets.

    The input frame is not modified. Processing steps, in order:

    1. Replace the ``-1`` placeholder with NaN.
    2. Encode ``type`` as 0 ('À Louer') / 1 ('À Vendre').
    3. One-hot encode ``category``, ``city`` and ``region``.
    4. Drop ``log_price`` and separate ``price`` as the target.
    5. Split 70/30 into train and test sets (shuffled, ``random_state=1``).
    6. Fill missing ``room_count``, ``bathroom_count`` and ``size`` values
       with medians computed on the training split only.
    7. Standardise all features with a scaler fitted on the training split.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw listings containing the columns ``category``, ``room_count``,
        ``bathroom_count``, ``size``, ``type``, ``price``, ``city``,
        ``region`` and ``log_price``.

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        Standardised feature matrices that keep the original row index and
        share the same columns.
    y_train, y_test : pandas.Series
        The matching ``price`` targets.

    Raises
    ------
    ValueError
        If ``type`` contains a value other than 'À Louer' or 'À Vendre'.
    """
    df = df.copy()

    # Encode missing values properly. Lowercase ``np.nan`` is used because the
    # ``np.NaN`` alias was removed in NumPy 2.0.
    df = df.replace(-1, np.nan)

    # Binary encoding. ``map`` always yields a numeric column; anything it
    # cannot map is reported here instead of failing later inside the scaler.
    type_codes = df['type'].map({'À Louer': 0, 'À Vendre': 1})
    unmapped = type_codes.isna()
    if unmapped.any():
        unknown = sorted(df.loc[unmapped, 'type'].astype(str).unique())
        raise ValueError("Unexpected value(s) in 'type' column: {}".format(unknown))
    df['type'] = type_codes.astype(int)

    # One-hot encoding (dummy columns are appended in the listed order and
    # prefixed with the source column name).
    df = pd.get_dummies(df, columns=['category', 'city', 'region'])

    # Drop log_price column: it is derived from the target and would leak it.
    df = df.drop('log_price', axis=1)

    # Split df into X and y
    y = df['price']
    X = df.drop('price', axis=1)

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=1)

    # Fill missing values with column medians. The medians come from the
    # training rows only so that no test-set statistics leak into training.
    medians = X_train[['room_count', 'bathroom_count', 'size']].median()
    X_train = X_train.fillna(medians)
    X_test = X_test.fillna(medians)

    # Scale X (scaler statistics are likewise estimated on the training split)
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

    return X_train, X_test, y_train, y_test

In [7]:
# Build the scaled train/test feature frames and their price targets from the raw frame.
X_train, X_test, y_train, y_test = preprocess_inputs(data)

In [8]:
# Candidate regressors keyed by display label. The labels are left-padded to a
# common width so that the printed lines align.
# Every estimator that draws random numbers is given a fixed seed (the same
# value used for the train/test split) so that a fresh kernel reproduces the
# same fitted models and scores.
models = {
    "                     Linear Regression": LinearRegression(),
    " Linear Regression (L2 Regularization)": Ridge(),
    " Linear Regression (L1 Regularization)": Lasso(),
    "                   K-Nearest Neighbors": KNeighborsRegressor(),
    "                        Neural Network": MLPRegressor(random_state=1),
    "                         Decision Tree": DecisionTreeRegressor(random_state=1),
    "                         Random Forest": RandomForestRegressor(random_state=1),
    "                     Gradient Boosting": GradientBoostingRegressor(random_state=1)
}

# Fit every model on the training split.
for name, model in models.items():
    model.fit(X_train, y_train)
    print(name + " trained.")

                     Linear Regression trained.
 Linear Regression (L2 Regularization) trained.
 Linear Regression (L1 Regularization) trained.
                   K-Nearest Neighbors trained.
                        Neural Network trained.
                         Decision Tree trained.
                         Random Forest trained.
                     Gradient Boosting trained.


In [10]:
# Report each fitted model's root-mean-squared error on the held-out test set.
for name, model in models.items():
    y_pred = model.predict(X_test)
    squared_errors = (y_test - y_pred) ** 2
    rmse = np.sqrt(squared_errors.mean())
    print(name + " RMSE: {:.4f}".format(rmse))

                     Linear Regression RMSE: 261804423238592266240.0000
 Linear Regression (L2 Regularization) RMSE: 1618496260.6453
 Linear Regression (L1 Regularization) RMSE: 1618365373.2866
                   K-Nearest Neighbors RMSE: 1636680381.8575
                        Neural Network RMSE: 1617133667.8701
                         Decision Tree RMSE: 1619254136.7632
                         Random Forest RMSE: 1618575021.5216
                     Gradient Boosting RMSE: 1618062157.0167


In [11]:
# Report each fitted model's coefficient of determination on the test set:
# one minus the residual sum of squares over the total sum of squares.
for name, model in models.items():
    y_pred = model.predict(X_test)
    residual_ss = ((y_test - y_pred) ** 2).sum()
    total_ss = ((y_test - y_test.mean()) ** 2).sum()
    r2 = 1 - (residual_ss / total_ss)
    print(name + " R^2: {:.5f}".format(r2))

                     Linear Regression R^2: -26217791367284604796928.00000
 Linear Regression (L2 Regularization) R^2: -0.00199
 Linear Regression (L1 Regularization) R^2: -0.00183
                   K-Nearest Neighbors R^2: -0.02464
                        Neural Network R^2: -0.00031
                         Decision Tree R^2: -0.00293
                         Random Forest R^2: -0.00209
                     Gradient Boosting R^2: -0.00146


In [18]:
# Print RMSE and R^2 for every fitted model, then show both metrics side by
# side in a single summary table.
modell, rmsel, r2l = [], [], []
print('###########################################################')
for name, model in models.items():
    y_pred = model.predict(X_test)
    squared_errors = (y_test - y_pred) ** 2

    # Root-mean-squared error on the test set.
    rmse = np.sqrt(squared_errors.mean())
    # R^2 = 1 - residual sum of squares / total sum of squares.
    r2 = 1 - (squared_errors.sum() / ((y_test - y_test.mean()) ** 2).sum())

    print(name + " RMSE: {:.4f}".format(rmse))
    print(name + " R^2: {:.5f}".format(r2))

    modell.append(name)
    rmsel.append(rmse)
    r2l.append(r2)
print('###########################################################')
df = pd.DataFrame({'ModelName': modell, 'RMSE': rmsel, 'R_squared': r2l})
display(df)
print('###########################################################')

###########################################################
                     Linear Regression RMSE: 261804423238592266240.0000
                     Linear Regression R^2: -26217791367284604796928.00000
 Linear Regression (L2 Regularization) RMSE: 1618496260.6453
 Linear Regression (L2 Regularization) R^2: -0.00199
 Linear Regression (L1 Regularization) RMSE: 1618365373.2866
 Linear Regression (L1 Regularization) R^2: -0.00183
                   K-Nearest Neighbors RMSE: 1636680381.8575
                   K-Nearest Neighbors R^2: -0.02464
                        Neural Network RMSE: 1617133667.8701
                        Neural Network R^2: -0.00031
                         Decision Tree RMSE: 1619254136.7632
                         Decision Tree R^2: -0.00293
                         Random Forest RMSE: 1618575021.5216
                         Random Forest R^2: -0.00209
                     Gradient Boosting RMSE: 1618062157.0167
                     Gradient Boosting R^2: -0.00146

Unnamed: 0,ModelName,RMSE,R_squared
0,Linear Regression,2.618044e+20,-2.621779e+22
1,Linear Regression (L2 Regularization),1618496000.0,-0.001994977
2,Linear Regression (L1 Regularization),1618365000.0,-0.001832921
3,K-Nearest Neighbors,1636680000.0,-0.02463668
4,Neural Network,1617134000.0,-0.0003085516
5,Decision Tree,1619254000.0,-0.002933584
6,Random Forest,1618575000.0,-0.002092499
7,Gradient Boosting,1618062000.0,-0.00145755


###########################################################
