# House price prediction

In [9]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
import numpy as np

### `load dataset`

In [10]:
data=pd.read_csv('global_house_purchase_dataset.csv')

In [11]:
data.head()

Unnamed: 0,property_id,country,city,property_type,furnishing_status,property_size_sqft,price,constructed_year,previous_owners,rooms,...,customer_salary,loan_amount,loan_tenure_years,monthly_expenses,down_payment,emi_to_income_ratio,satisfaction_score,neighbourhood_rating,connectivity_score,decision
0,1,France,Marseille,Farmhouse,Semi-Furnished,991,412935,1989,6,6,...,10745,193949,15,6545,218986,0.16,1,5,6,0
1,2,South Africa,Cape Town,Apartment,Semi-Furnished,1244,224538,1990,4,8,...,16970,181465,20,8605,43073,0.08,9,1,2,0
2,3,South Africa,Johannesburg,Farmhouse,Semi-Furnished,4152,745104,2019,5,2,...,21914,307953,30,2510,437151,0.09,6,8,1,0
3,4,Germany,Frankfurt,Farmhouse,Semi-Furnished,3714,1110959,2008,1,3,...,17980,674720,15,8805,436239,0.33,2,6,6,0
4,5,South Africa,Johannesburg,Townhouse,Fully-Furnished,531,99041,2007,6,3,...,17676,65833,25,8965,33208,0.03,3,3,4,0


# data cleaning

In [12]:
def clean_data(df):
    return(
        df
        .drop(columns=['property_id', 'customer_salary','loan_amount','loan_tenure_years',
                                 'monthly_expenses','down_payment',"emi_to_income_ratio",
                                 "satisfaction_score",'neighbourhood_rating','connectivity_score','decision'])

         .assign(
            country= lambda df_:(                 #    country
                  df_
                  .country
                  .str.lower()
             )
        )
        .assign(
            city= lambda df_:(                       #   city
                  df_
                  .city
                  .str.lower()
             )
        )
        .assign(
            property_type= lambda df_:(                # property_type
                  df_
                  .property_type
                  .str.lower()
             )
        )
        .assign(
            furnishing_status= lambda df_:(           # furnishing_status
                  df_
                  .furnishing_status
                  .str.lower()
             )
        )
        .assign(
        price = lambda df_: (
            np.log1p(df_.price.astype(float)) # here we convert into log because some value is show negtive price 
        )
    
      )
        
       
        

        

    )
             

In [13]:
dataset=clean_data(data)

In [14]:
dataset.head()

Unnamed: 0,country,city,property_type,furnishing_status,property_size_sqft,price,constructed_year,previous_owners,rooms,bathrooms,garage,garden,crime_cases_reported,legal_cases_on_property
0,france,marseille,farmhouse,semi-furnished,991,12.931048,1989,6,6,2,1,1,1,0
1,south africa,cape town,apartment,semi-furnished,1244,12.321805,1990,4,8,8,1,1,1,1
2,south africa,johannesburg,farmhouse,semi-furnished,4152,13.52128,2019,5,2,1,1,1,0,0
3,germany,frankfurt,farmhouse,semi-furnished,3714,13.920735,2008,1,3,3,0,1,0,0
4,south africa,johannesburg,townhouse,fully-furnished,531,11.503299,2007,6,3,3,1,1,3,1


#### input and output

In [17]:
input_data=data.drop(columns="price")
output_data=data["price"]

#### train_test_split

In [18]:
from sklearn.model_selection import train_test_split

In [24]:
# Split into training and testing sets (80% train, 20% test)
x_train, x_test, y_train, y_test = train_test_split(input_data,output_data, test_size=0.2, random_state=42)

# Further split the training set into training and validation sets (80% train, 20% val)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.25, random_state=42)


In [30]:
y_train = y_train.astype(float) + 1e-6


#### load some library

In [31]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

#from category_encoders import TargetEncoder
from sklearn.preprocessing import StandardScaler, TargetEncoder

#  `Apply Columns transformer & pipline`

In [32]:
cat_cols = ["country", "city", "property_type"]

num_cols = [
    "property_size_sqft",
    "constructed_year",
    "previous_owners",
    "rooms",
    "bathrooms",
    "garage",
    "garden",
    "crime_cases_reported",
    "legal_cases_on_property"
]



In [33]:
num_pipeline = Pipeline(
    steps=[
        ("scaler", StandardScaler())    # here apply stander scaler on numerical column
    ]
)

cat_pipeline = Pipeline(
    steps=[
        ("target_enc", TargetEncoder()),   # Here also apply target encoding and after the target encoding also apply scaling  
        ("scaler", StandardScaler())  
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_cols),
        ("cat", cat_pipeline, cat_cols)
    ]
)

model = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("regressor", LinearRegression())          # here apply linear reg
    ]
)



In [34]:
model.fit(x_train, y_train)

# calculate mae & r2 score

In [35]:
from sklearn.metrics import r2_score, mean_absolute_error

# Train prediction
y_train_pred = model.predict(x_train)

# Test prediction
y_test_pred = model.predict(x_test)

print("Train R2:", r2_score(y_train, y_train_pred))
print("Test R2 :", r2_score(y_test, y_test_pred))

print("Train MAE:", mean_absolute_error(y_train, y_train_pred))
print("Test MAE :", mean_absolute_error(y_test, y_test_pred))


Train R2: 0.9093703046836069
Test R2 : 0.9090839634895154
Train MAE: 168864.35060403426
Test MAE : 170285.39143547745


# exporting the pipeline

In [37]:
import pickle

In [38]:
pickle.dump(model,open('model@.pkl','wb'))

In [39]:
pickle.dump(data,open('dataset_@.pkl','wb'))
