# Processing data

In [490]:
import pandas as pd
import seaborn as sns
import numpy as np 
from sklearn.preprocessing import OneHotEncoder 
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [491]:
url = r"..\data\raw\clean_house.csv"
house = pd.read_csv(url, sep=",")

## subsetting columns and rows

In [492]:
house.head(5)

Unnamed: 0,property_id,locality_name,postal_code,latitude,longitude,property_type,property_subtype,price,type_of_sale,number_of_rooms,...,terrace,terrace_area,garden,garden_area,surface_of_good,number_of_facades,swimming_pool,state_of_building,main_city,province
0,11157483,ronse,9600,50.741809,3.607601,HOUSE,HOUSE,245000.0,BUY_REGULAR,,...,1.0,13.0,1.0,161.0,250.0,2.0,0.0,GOOD,ronse,oost-vlaanderen
1,11156843,merelbeke,9820,51.01925,3.760957,HOUSE,HOUSE,315000.0,BUY_REGULAR,,...,1.0,23.0,,,77.0,2.0,,GOOD,merelbeke,oost-vlaanderen
2,11161362,forêt,4870,50.563966,5.675489,HOUSE,HOUSE,325000.0,BUY_REGULAR,,...,1.0,16.0,1.0,420.0,572.0,3.0,,AS_NEW,trooz,luik
3,11153591,ottignies,1340,50.671784,4.575929,HOUSE,HOUSE,395000.0,BUY_REGULAR,,...,1.0,37.0,1.0,76.0,165.0,2.0,,GOOD,ottignies-louvain-la-neuve,waals-brabant
4,11128275,antwerpen merksem,2170,51.243743,4.443372,HOUSE,MIXED_USE_BUILDING,399900.0,BUY_REGULAR,,...,,,,,65.0,2.0,0.0,GOOD,antwerpen,antwerpen


## drop columns

In [493]:
def drop_columns(df, clist):
    ndf = df.drop(columns=clist)
    return ndf


In [494]:
todropcolumns = ["property_id", "locality_name", "latitude", "longitude", "property_type", "type_of_sale", "kitchen_type", "main_city", "postal_code", "province"]
nhouse = drop_columns(house, todropcolumns)

In [495]:

# dropping the NaN values in the columns selected
def droping_null(df, colist):
    df.dropna(subset=colist, inplace=True)
    return df



In [496]:
nhouse = droping_null(nhouse, "terrace")

In [497]:
nhouse.isnull().sum(), len(nhouse)

(property_subtype             0
 price                        0
 number_of_rooms           5465
 living_area                  0
 fully_equipped_kitchen     936
 furnished                 3598
 open_fire                    0
 terrace                      0
 terrace_area              3156
 garden                    3303
 garden_area               3732
 surface_of_good              0
 number_of_facades          437
 swimming_pool             3694
 state_of_building         1089
 dtype: int64,
 6581)

## transform categorical data to numerical data

In [498]:
def transform_categorical(df, clist):
    # First convert type of columns to category
    for i in clist:
        df[i] = df[i].astype("category")

    # Create an instance of One-Hot-encoder
    enc = OneHotEncoder(handle_unknown='ignore')
                        

    # Passing encoded columns
    enc_data = pd.DataFrame(enc.fit_transform(df[clist]).toarray())

    # Drop the original categorical columns
    df.drop(clist, axis=1, inplace=True)

    # Merge with main
    new_df = pd.concat([df, enc_data], axis=1)

    return new_df

In [499]:
def transform_categorical2(df, clist):
    for column in clist:
        # One-hot encode the current column
        ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
        X_ohe = ohe.fit_transform(df[[column]])
        
        # Convert the one-hot encoded features to a DataFrame
        ohe_df = pd.DataFrame(X_ohe, columns=ohe.categories_[0])
        
        # Concatenate the original DataFrame with the one-hot encoded DataFrame
        df = pd.concat([df, ohe_df], axis=1)
        
        # Drop the original categorical column
        df.drop(columns=[column], inplace=True)

    return df

In [None]:
chouse = droping_null(chouse, "terrace")
chouse = droping_null(chouse, "garden")

In [500]:
clist = ["property_subtype", "state_of_building"]
chouse = transform_categorical2(nhouse, clist)
chouse = droping_null(chouse, "terrace")
chouse = droping_null(chouse, "garden")

## Make the training set and test set

In [501]:
def create_X_y(df, Y):
    X = df.drop(columns=[Y])
    y = df[Y]
    return X, y




In [502]:
X, y = create_X_y(chouse, "price")

In [503]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2, random_state=0 )

## Imputation on the X_train, X_test

In [504]:
X_test.shape
X_train.shape 

(2622, 32)

In [505]:
def impute_date(X_train, X_test):
    imputer = KNNImputer(n_neighbors=4)
    X_train_imputed = imputer.fit_transform(X_train)
    X_test_imputed = imputer.transform(X_test)
    return X_train_imputed, X_test_imputed

In [507]:
X_train.head(5)

Unnamed: 0,number_of_rooms,living_area,fully_equipped_kitchen,furnished,open_fire,terrace,terrace_area,garden,garden_area,surface_of_good,...,OTHER_PROPERTY,TOWN_HOUSE,VILLA,AS_NEW,GOOD,JUST_RENOVATED,TO_BE_DONE_UP,TO_RENOVATE,TO_RESTORE,NaN
2330,,95.0,1.0,,0.0,1.0,18.0,1.0,124.0,223.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
10529,,275.0,1.0,,0.0,1.0,30.0,1.0,500.0,787.0,...,,,,,,,,,,
85,,143.0,1.0,0.0,0.0,1.0,,1.0,200.0,492.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
8964,,156.0,1.0,,0.0,1.0,20.0,1.0,400.0,672.0,...,,,,0.0,0.0,0.0,0.0,0.0,0.0,1.0
11072,,150.0,1.0,0.0,0.0,1.0,35.0,1.0,92.0,220.0,...,,,,,,,,,,


In [None]:
def replace_NaN_Categorical(df, comlist):
    df[comlist] = df[comlist].fillna(0.0)
    