In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [34]:
df = pd.read_csv('data_cleaned.csv')

In [35]:
print(df.info)

<bound method DataFrame.info of      photo-count  list-view-price list-view-date           left  \
0             12            13250     25-03-2024  Kiralık Daire   
1             14            19000     27-03-2024  Kiralık Daire   
2             21            11000     06-04-2024  Kiralık Daire   
3              9            10000     28-03-2024  Kiralık Daire   
4             34            29000     01-03-2024  Kiralık Daire   
..           ...              ...            ...            ...   
690           36            50000     27-01-2024  Kiralık Villa   
691           46            20000     07-03-2024  Kiralık Daire   
692           17            17000     11-03-2024  Kiralık Daire   
693           39            15000     08-04-2024  Kiralık Daire   
694           49            30000     01-03-2024  Kiralık Daire   

                            celly celly 2                           celly 3  \
0    3 +                        1  140 m²  15                       Yaşında   
1    

In [40]:
df['city'] = df['city'].astype('category')
df['district'] = df['district'].astype('category')
df['neighborhood'] = df['neighborhood'].astype('category')
df['room'] = df['room'].astype('int')
df['living_room'] = df['living_room'].astype('int')
df['area'] = df['area'].astype('int')
df['age'] = df['age'].astype('int')
df['floor'] = df['floor'].astype('int')
df['price'] = df['price'].astype('int')
# df['feature1'] = df['feature1'].astype('int')  # Removed because 'feature1' does not exist in df
# df['target'] = df['target'].astype('int')  # Removed because 'target' does not exist in df

In [41]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 695 entries, 0 to 694
Data columns (total 22 columns):
 #   Column               Non-Null Count  Dtype   
---  ------               --------------  -----   
 0   photo-count          695 non-null    int64   
 1   list-view-price      695 non-null    int64   
 2   list-view-date       695 non-null    object  
 3   left                 695 non-null    object  
 4   celly                695 non-null    object  
 5   celly 2              695 non-null    object  
 6   celly 3              695 non-null    object  
 7   celly 4              695 non-null    object  
 8   list-view-header     695 non-null    object  
 9   list-view-location   695 non-null    object  
 10  img-wrp href         281 non-null    object  
 11  he-lazy-image src    638 non-null    object  
 12  he-lazy-image src 2  251 non-null    object  
 13  city                 695 non-null    category
 14  district             695 non-null    category
 15  neighborhood         69

In [42]:
categorical_features = ['city', 'district', 'neighborhood']
numerical_features = ['room', 'living_room', 'area', 'age', 'floor']

In [43]:
full_pipeline = ColumnTransformer([
    ('num', StandardScaler(), numerical_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
])

In [44]:
X = df.drop('price', axis=1)
y = df['price']

In [45]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [46]:
model = Pipeline([
    ('preparation', full_pipeline),
    ('model', LinearRegression())
])

In [47]:
model.fit(X_train, y_train)

In [48]:
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

In [49]:
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"R^2: {r2}")

MSE: 48380476.52679424
RMSE: 6955.6075598609095
R^2: 0.28293707422940806


In [50]:
feature_importances = model.named_steps['model'].coef_
print(len(feature_importances))
print(feature_importances)

239
[-1.70297904e+02  0.00000000e+00  3.38760789e+03 -2.00873947e+03
  7.77627900e+02  1.42992612e-09  3.10395433e+03  1.87757202e+03
 -3.01822647e+03 -3.15438068e+03 -2.54914174e+03 -2.62170130e+03
  8.14179664e+03 -5.31111576e+03  8.19944299e+02  6.08929404e+03
 -3.10715738e+03  5.13367093e+03  2.42651718e+03 -1.94829945e+03
  1.76611957e+02  1.38561883e+04 -4.37847255e+03 -4.96642379e+03
 -5.46444206e+03 -6.94301876e+03  3.49003023e+03 -8.72817501e+03
  2.90704847e+04 -1.10990564e+03 -2.30696653e+03 -7.92673671e+03
 -1.06519008e+04  5.95841362e+03  9.92590391e+03 -7.18180787e+03
  5.60333061e+03 -8.07187343e+02  2.26988361e+03 -4.00286185e+03
 -2.52174285e+03 -3.03966192e+02 -1.41691156e+03  2.31858157e+03
 -2.36416467e+02 -3.11152553e+02  1.27084049e+04 -1.63076395e+04
  7.66353128e+03  7.23214257e+02 -2.70804522e+03 -3.84278121e+03
 -2.24931516e+04 -2.20031602e+04  9.21648269e+03  1.01058773e+04
 -2.29957330e+03  4.21516360e+03 -1.81523003e+03  1.70848198e+03
 -6.88552588e+03 -4.4

In [None]:
print("Numerical Features")
for i in range(len(numerical_features)):
    print(numerical_features[i], feature_importances[i])

In [51]:
print("Categorical Features")
for i in range(len(categorical_features)):
    for j in range(len(model.named_steps['preparation'].transformers_[1][1].categories_[i])):
        print(model.named_steps['preparation'].transformers_[1][1].categories_[i][j], feature_importances[len(numerical_features) + j])

Categorical Features
izmir 1.4299261166828535e-09
aliaga 1.4299261166828535e-09
balcova 3103.9543326956305
bayrakli 1877.572015512863
bergama -3018.2264657812348
bornova -3154.380675604048
buca -2549.141743204317
cesme -2621.7013049135862
cigli 8141.796641849079
dikili -5311.115756006479
foca 819.9442992329226
gaziemir 6089.294039189855
guzelbahce -3107.157378590629
karabaglar 5133.670928424385
karaburun 2426.517180440619
karsiyaka -1948.2994549278142
kemalpasa 176.61195697109227
kinik 13856.18828047167
konak -4378.472551799987
menderes -4966.423787299547
menemen -5464.442057621713
narlidere -6943.018757753248
odemis 3490.0302339146415
seferihisar -8728.17500624082
selcuk 29070.48468108963
tire -1109.9056447178214
torbali -2306.96652671689
urla -7926.736711099674
29_ekim 1.4299261166828535e-09
2_inonu 3103.9543326956305
4_eylul 1877.572015512863
75_yil -3018.2264657812348
85_yil_cumhuriyet -3154.380675604048
9_eylul -2549.141743204317
adalet -2621.7013049135862
adatepe 8141.79664184907

In [53]:

new_data = pd.DataFrame({
    'city': ['manisa'],
    'district': ['yunusemre'],
    'neighborhood': ['guzelyurt'],
    'room': [4],
    'living_room': [1],
    'area': [200],
    'age': [5],
    'floor': [3]
})

print(model.predict(new_data))

[29748.80875048]


In [54]:
print(df[(df['city'] == 'manisa') & (df['district'] == 'yunusemre') & (df['neighborhood'] == 'guzelyurt')])

Empty DataFrame
Columns: [photo-count, list-view-price, list-view-date, left, celly, celly 2, celly 3, celly 4, list-view-header, list-view-location, img-wrp href, he-lazy-image src, he-lazy-image src 2, city, district, neighborhood, room, living_room, area, age, floor, price]
Index: []

[0 rows x 22 columns]


In [55]:
def tolerance_r2(y_true, y_pred, tolerance):
    residuals = y_pred - y_true
    residuals[np.abs(residuals) <= tolerance] = 0
    ssr = np.sum(residuals**2)
    sst = np.sum((y_true - np.mean(y_true))**2)
    return 1 - (ssr / sst)

def tolerance_percentage_r2(y_true, y_pred, tolerance):
    residuals = y_pred - y_true
    residuals[(np.abs(residuals) / y_true) <= tolerance] = 0
    ssr = np.sum(residuals**2)
    sst = np.sum((y_true - np.mean(y_true))**2)
    return 1 - (ssr / sst)



In [56]:
print(r2_score(y_test, y_pred))
print(tolerance_r2(y_test, y_pred, 10000))
print(tolerance_percentage_r2(y_test, y_pred, 0.50))

0.28293707422940806
0.505857142539933
0.5929875713645179
