In [16]:
import numpy as np
import pandas as pd

In [17]:
df = pd.read_csv('flats_cleaned_v6.csv')

In [18]:
df.head()

Unnamed: 0,place,bedRoom,bathroom,balcony,facing,agePossession,Direction,built_up_area,servant room,furnishing_type,luxury_category,floor_category,price
0,5.0,2.0,2.0,1.0,7.0,2.0,0.0,653.0,0.0,0.0,1.0,1.0,0.22
1,9.0,2.0,2.0,1.0,4.0,0.0,1.0,684.0,0.0,0.0,1.0,2.0,0.38
2,36.0,2.0,2.0,1.0,4.0,2.0,2.0,749.0,0.0,2.0,1.0,2.0,0.55
3,104.0,3.0,2.0,1.0,0.0,3.0,0.0,1080.0,0.0,0.0,2.0,0.0,0.82
4,104.0,2.0,2.0,2.0,5.0,2.0,0.0,836.0,0.0,0.0,2.0,2.0,0.36


In [19]:
# We will use linear regression, So we need one hot encoding, scaling, and log transformation
# one hot encode -> sector, balcony, agePossession, furnishing type, luxury category, floor category

In [20]:
X = df.drop(columns=['price', 'place'])
y = df['price']

In [21]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR

In [22]:
# Applying the log1p transformation to the target variable
y_transformed = np.log1p(y)

In [23]:
columns_to_encode = ['balcony', 'facing', 'agePossession', 'Direction', 'furnishing_type', 'luxury_category', 'floor_category']

In [24]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), [ 'bedRoom', 'bathroom', 'built_up_area', 'servant room']),
        ('cat', OneHotEncoder(drop='first'), columns_to_encode)
    ],
    remainder='passthrough'
)

In [25]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', SVR(kernel='rbf'))
    #('regressor', LinearRegression())
])

# K-fold cross-validation

In [None]:
#kfold = KFold(n_splits=10, shuffle=True, random_state=42)
#scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [None]:
scores.mean()

In [None]:
scores.std()

# pipeline test

In [26]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

In [27]:
preprocessor.fit_transform(X_train)

array([[ 2.39296249,  1.71576222,  1.49595985, ...,  0.        ,
         1.        ,  0.        ],
       [-0.72626119, -0.15152791, -0.96115277, ...,  0.        ,
         0.        ,  1.        ],
       [-0.72626119, -0.15152791, -0.0212178 , ...,  1.        ,
         0.        ,  1.        ],
       ...,
       [-0.72626119, -0.15152791, -0.39157261, ...,  1.        ,
         1.        ,  0.        ],
       [-0.72626119, -2.01881805, -0.97136946, ...,  1.        ,
         0.        ,  1.        ],
       [-0.72626119, -2.01881805, -1.00457368, ...,  0.        ,
         1.        ,  0.        ]])

In [28]:
preprocessor.fit_transform(X_test)

array([[ 0.74247419, -0.16109796,  0.18295961, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.74247419, -0.16109796, -0.02860235, ...,  0.        ,
         1.        ,  0.        ],
       [-0.74989893, -0.16109796, -0.84532992, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [-0.74989893, -0.16109796, -0.62146785, ...,  0.        ,
         0.        ,  1.        ],
       [-0.74989893, -0.16109796, -0.66082821, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.74247419,  1.54314886,  1.2186292 , ...,  1.        ,
         0.        ,  0.        ]])

In [29]:
pipeline.fit(X_train,y_train)

In [30]:
y_pred = pipeline.predict(X_test)

In [31]:
y_pred = np.expm1(y_pred)

In [32]:
from sklearn.metrics import mean_absolute_error
mean_absolute_error(np.expm1(y_test),y_pred)

0.15359791221822305