In [18]:
import re
import pandas as pd
import requests
from bs4 import BeautifulSoup
import numpy
import time
import pickle
import numpy as np
import datetime
import math
from sklearn.linear_model import LinearRegression, Lasso, Ridge, RANSACRegressor, SGDRegressor, ElasticNet
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.svm import SVC, SVR
from sklearn import metrics
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder, PolynomialFeatures
from sklearn.model_selection import cross_val_score
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
import warnings
import os
warnings.filterwarnings("ignore")

# Pobranie danych

In [92]:
class Parameters:

    def __init__(self, city, region, date=datetime.datetime.now().strftime("%B")):
        self.city = city
        self.region = region
        self.date = date
    def get_links(self):
        core_url = 'https://www.otodom.pl'
        url = f'https://www.otodom.pl/pl/oferty/sprzedaz/mieszkanie/{self.city}/{self.region}'
        n_of_offers = int(BeautifulSoup(requests.get(url).content, 'html.parser').find('span', class_="css-klxieh e1ia8j2v11").getText())
        n_of_pages = int(math.ceil(n_of_offers/36))
        main_page_urls = [f'https://www.otodom.pl/pl/oferty/sprzedaz/mieszkanie/{self.city}/{self.region}?page={a}' for a in range(n_of_pages)]
        
        offers_links = []
        for i in main_page_urls[: n_of_pages]:
            time.sleep(1)
            try:
                page = requests.get(i)
                soup = BeautifulSoup(page.content, 'html.parser')
                for i in soup.find_all('div'):
                    i = i.find_all('a')

                    for a in i:
                        offers_links.append(a.get('href'))

            except:
                continue
        offers_links = list(set(offers_links))
        flats = []
        for i in offers_links:
            flat = re.findall(r'(/pl/oferta.*)', i)
            flats.append(flat)
        flats_updated = list(set([str(core_url) + i[0] for i in flats if len(i)>0]))
        
        return flats_updated
    
    def save_the_links(self):
        with open("links_otodom_"+ self.date +'_'+ self.city +'_'+ self.region + ".pickle", "wb") as output_file:
            pickle.dump(row_parameters.get_links(self), output_file)
        return row_parameters.get_links(self)
    def get_the_parameters(self):
        try:    
            with open("links_otodom_"+ self.date +'_'+ self.city +'_'+ self.region + ".pickle", "rb") as input_file:
                urls = pickle.load(input_file)
        except:
            urls = row_parameters.save_the_links(self)
        try:
            with open("parameters_"+ self.date +'_'+ self.city +'_'+ self.region + ".pickle", "rb") as input_file:
                parameters = pickle.load(input_file)
            return parameters
        except:
            desc = []
            prices = []
            parameters = {'Powierzchnia': [], 'Liczba pokoi': [], 'Rynek': [], 'Rodzaj zabudowy': [], 'Piętro': [],
                          'Ogrzewanie': [], 'Rok budowy': [], 'Stan wykończenia': [], 'Czynsz': [], 'Forma własności': [],
                          'Materiał budynku':[], 'Cena':prices, 'Opis': desc, 'Lokalizacja': []}
            for ofert in urls:
                page = requests.get(ofert)
                soup = BeautifulSoup(page.content, 'html.parser')
                link_parms = {}

                all_data = []
                for data in soup.find_all('div', class_="css-1qzszy5 estckra8"):
                    all_data.append(data.getText())



                price = re.findall(r'adPageHeaderPrice">(.*)</strong><style data-emotion', str(soup))
                all_data.append('Cena')
                try:
                    all_data.append(price[0]) 
                except:
                    all_data.append(price)
                for i in range(len(all_data)):
                    if (i % 2) == 0:
                        link_parms[all_data[i]] = all_data[i+1]

                for i in [i for i in parameters.keys() if i not in ['Opis']]:
                    try:
                        parameters[i].append(link_parms[i])
                    except:
                        parameters[i].append(np.nan)
                price = re.findall(r'adPageHeaderPrice">(.*)</strong><style data-emotion', str(soup))
            parameters['Lokalizacja'] = self.region
            parameters['Miasto'] = self.city

            with open("parameters_"+ self.date +'_'+ self.city +'_'+ self.region + ".pickle", "wb") as output_file:
                pickle.dump(parameters, output_file)
            return parameters
       
            
    

# Przygotowanie danych



In [20]:
class Prepare_the_data:
    
    def __init__(self, city=None, month=datetime.datetime.now().strftime("%B")):
        self.month = month
        self.city = city if city is not None else ''
    
    def get_data(self):
        try:
            with open("cleaned_data_"+ self.month +'_'+ self.city + ".pickle", "rb") as input_file:
                df = pickle.load(input_file)
            return df
        except: 
            files = [f for f in os.listdir() if f.startswith('parameters_'+self.month+'_'+self.city)]
            df = pd.DataFrame()
            for file in files:
                with open(file, "rb") as input_file:
                    parameters = pickle.load(input_file)
                    del parameters['Opis']

                df_parms = pd.DataFrame(parameters)
                df = pd.concat([df, df_parms])
            return self.clean_the_data(df)
    def clean_the_data(self, data):
        try:
            with open("cleaned_data_"+ self.month +'_'+ self.city + ".pickle", "wb") as input_file:
                    final_df = pickle.load(input_file)
        except:
            df = data
            df = df.dropna(subset=['Powierzchnia','Piętro'])
            df['Powierzchnia'] = df['Powierzchnia'].apply(lambda x: x[:-2].replace(' ','').replace(',','.').strip())               
            df['Liczba pokoi'] = df['Liczba pokoi'].apply(lambda x: x.replace('więcej niż 10','10')).astype('int64')
            df = df[df['Piętro'] !='zapytaj']
            df = df.reset_index(drop=True)

            for i in df.columns:
                for a in list(df[(df[i] == 'zapytaj')].index):
                    df[i].iloc[a] = np.nan

            for i in df.columns:
                for a in list(df[(df[i] == 'nan')].index):
                    df[i].iloc[a] = np.nan
            for i in range(df.shape[0]):
                try:
                    df.loc[i,'Piętro'] = df.loc[i,'Piętro'].split('/')[0].replace('parter','0').replace('poddasze', '11').replace('suterena', '12').replace('> ','')
                except:
                    df.loc[i,'Piętro'] = df.loc[i,'Piętro'].replace('parter','0').replace('poddasze', '11').replace('suterena', '12').replace('> ','')

            df = df[df['Cena'] != 'Zapytaj o cenę']
            df = df.dropna(subset=['Cena'])
            df['Cena'] = df['Cena'].astype('str').apply(lambda x: ''.join(c for c in x if c.isdigit()))
            df['Cena'] = df['Cena'].astype('str').apply(lambda x: ''.join(c for c in x if ((c.isdigit())| (c=='.'))))
            df['Powierzchnia'] = df['Powierzchnia'].astype('str').apply(lambda x: ''.join(c for c in x if (c.isdigit())|(c=='.')))
            df = df.reset_index(drop=True)

            #missing values
            index_NaN_rok_pierwotny = list(df[(df["Rok budowy"].isnull()) & (df['Rynek'] == 'pierwotny')].index)
            index_NaN_rok_wtorny = list(df[(df["Rok budowy"].isnull()) & (df['Rynek'] == 'wtórny')].index)
            index_NaN_rok_nan = list(df[(df["Rok budowy"].isnull()) & (df['Rynek'] != 'wtórny')& (df['Rynek'] != 'pierwotny')].index)

            for i in index_NaN_rok_pierwotny :
                rok_med = df["Rok budowy"][df['Rynek']=='pierwotny'].median()
                df['Rok budowy'].iloc[i] = rok_med

            for i in index_NaN_rok_wtorny :
                rok_med = df["Rok budowy"][df['Rynek']=='wtórny'].median()
                df['Rok budowy'].iloc[i] = rok_med

            for i in index_NaN_rok_nan :
                rok_med = df["Rok budowy"].median()
                df['Rok budowy'].iloc[i] = rok_med

            df['Rok budowy'] = df['Rok budowy'].astype('int64')
            df['Cena'] = df['Cena'].astype('float64')
            df['Piętro'] = df['Piętro'].astype('int64')
            df['Powierzchnia'] = df['Powierzchnia'].astype('float64')
            df['Cena_m2'] = df['Cena']/df['Powierzchnia']
            df = df.drop(columns=['Cena'])
            # wysoka korelacja ze zmienną 'Powierzchnia', ponad 0.7
            df = df.drop(columns=['Liczba pokoi'])

            final_df = self.remove_outliers(self.fill_na(df))

            with open("cleaned_data_"+ self.month +'_'+ self.city + ".pickle", "wb") as output_file:
                    pickle.dump(final_df, output_file)
        return final_df

        
    def remove_outliers(self, data):
        df = data
        for x in df.select_dtypes(include=['float','int64']).columns:
            # calculate summary statistics
            data_mean, data_std = np.mean(df[x]), np.std(df[x])
            # identify outliers
            cut_off = data_std * 3
            lower, upper = data_mean - cut_off, data_mean + cut_off

            df = df[(df[x] > lower)&(df[x] < upper)]

        df = df.reset_index(drop=True)

        return df
        
    def fill_na(self, data):
        
        df = data
        
        # za dużo NA
        df = df.drop(columns=['Materiał budynku','Rodzaj zabudowy','Czynsz', 'Ogrzewanie','Forma własności'])
        
        # rok budowy dla wartosci NA wynosi mniej niż 2020/21/22 dlatego możemy uznać, że jest to rynek wtórny
        for i in list(df[(df['Rynek'].isna())].index):
            df['Rynek'].iloc[i] = 'wtórny'

        # rynek pierwotny w większosci przypadków ma stan do wykonczenia
        for i in list(df[(df['Rynek']=='pierwotny') & (df['Stan wykończenia'].isna())].index):
            df['Stan wykończenia'].iloc[i] = 'do wykończenia'

        # brakujace wartosci zmiennej 'Stan wykonczenia' uzupelniam na podstawie mediany roku budowy
        r_median = df[df['Rynek']=='wtórny'].groupby('Stan wykończenia')['Rok budowy'].median()['do remontu']
        z_median = df[df['Rynek']=='wtórny'].groupby('Stan wykończenia')['Rok budowy'].median()['do zamieszkania']
        w_median = df[df['Rynek']=='wtórny'].groupby('Stan wykończenia')['Rok budowy'].median()['do wykończenia']
        for i in list(df[(df['Stan wykończenia'].isna()) & (df['Rok budowy']>= (w_median+z_median)/2)].index):
            df['Stan wykończenia'].iloc[i] = 'do wykończenia'

        for i in list(df[(df['Stan wykończenia'].isna()) & (df['Rok budowy']< (w_median+z_median)/2) & (df['Rok budowy']>= (r_median+z_median)/2)].index):
            df['Stan wykończenia'].iloc[i] = 'do zamieszkania'

        for i in list(df[(df['Stan wykończenia'].isna()) & (df['Rok budowy']< (r_median+z_median)/2)].index):
            df['Stan wykończenia'].iloc[i] = 'do remontu'
            
        return df
                

In [21]:
df = prepare_the_data(month='March', city='warszawa').get_data()

In [33]:
df_sample = df.sample(frac=1)[:1000].reset_index(drop=True)

# Budowa modelu

In [73]:
def choose_best_model(data):
    
    X = data.drop(columns=['Cena_m2'])
    y = data['Cena_m2']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    models = [('LR',LinearRegression()), ('LASSO', Lasso()), ('EN', ElasticNet()), ('KNN', KNeighborsRegressor()),
              ('CART', DecisionTreeRegressor()), ('GBR', GradientBoostingRegressor()), ('XGB', XGBRegressor()),
             ('RF', RandomForestRegressor())]
    categorical_features = X_train.select_dtypes(include=['object']).columns

    categorical_transformer = Pipeline(steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])


    numerical_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, ['Rok budowy','Powierzchnia']),
            ('cat', categorical_transformer, categorical_features),
        ])
    
    results = []
    names = []
    pipelines = []
    for i in models:
        pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                       ('poly', PolynomialFeatures(degree=4)),
                       ('pca', pca),
                       i
                       ])
        pipelines.append(pipeline)
        names.append(i[0])
        
    for model in pipelines:
        kfold = KFold(n_splits=5, random_state=21, shuffle=True)
        cv_results = cross_val_score(model, X, y, cv=kfold, scoring='neg_mean_absolute_error')
        results.append([round(cv_results.mean(),2), round(cv_results.std(),2)])
    
    df_results = pd.DataFrame(data=results, index=names, columns=['MAE', 'STD'])

       
    return df_results

In [74]:
results = choose_best_model(df_sample)

In [76]:
# wybieram GradientBoostingRegressor -> najniższe std oraz niski MAE
results

Unnamed: 0,MAE,STD
LR,-1035331000000.0,1066739000000.0
LASSO,-2744.09,732.9
EN,-1847.96,196.84
KNN,-1891.89,153.37
CART,-2354.29,152.86
GBR,-1853.6,99.5
XGB,-1941.89,115.43
RF,-1809.01,129.07


Tworzenie pipeline

In [65]:
X = df_sample.drop(columns=['Cena_m2'])
y = df_sample['Cena_m2']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

categorical_features = X_train.select_dtypes(include=['object']).columns

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])


numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, ['Rok budowy','Powierzchnia']),
        ('cat', categorical_transformer, categorical_features),
    ])



rf = RandomForestRegressor(
                             criterion='mse',
                            )
pca = PCA()
dt = DecisionTreeRegressor(max_depth=12)
svr_rbf=SVR(C=1.0, epsilon=0.2, kernel='rbf')
xgb = XGBRegressor()
lr = LinearRegression()
knn = KNeighborsRegressor()
gb = GradientBoostingRegressor()

pipe = Pipeline(steps=[('preprocessor', preprocessor),
                       ('poly', PolynomialFeatures(degree=4)),
                       ('pca', pca),
                       ('model', gb)
                       ])



Hyperparameters

In [67]:
gb_parms = {'model__max_depth':[1,3,5], 'model__n_estimators':[20,50,100,200], 'model__alpha':[0.3,0.5,0.9,1.5]}
gd = GridSearchCV(pipe, gb_parms)


In [68]:
gd.fit(X_train, y_train)


(-0.24639540422580297, 2063.652707237465)

In [81]:
gd.best_estimator_


Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  ['Rok budowy',
                                                   'Powierzchnia']),
                                                 ('cat',
                                                  Pipeline(steps=[('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  Index(['Rynek', 'Stan wykończenia', 'Lokalizacja', 'Miasto'], dtype='object'))])),
                ('poly', PolynomialFeatures(degree=4)), ('pca', PCA()),
                ('model',
                 GradientBoostingRegressor(alpha=0.3, n_estimators=50))])

In [82]:
gb2 = gd.best_estimator_['model']

pipe = Pipeline(steps=[('preprocessor', preprocessor),
                       ('poly', PolynomialFeatures(degree=4)),
                       ('pca', pca),
                       ('model', gb2)
                       ])

In [90]:
X = df_sample.drop(columns=['Cena_m2'])
y = df_sample['Cena_m2']
kfold = KFold(n_splits=5, random_state=21, shuffle=True)
cv_results_final = cross_val_score(pipe, X, y, cv=kfold, scoring='neg_mean_absolute_error')
cv_results_final.mean(), cv_results_final.std()

(-1882.9724714234358, 110.1484538567394)

In [94]:
cv_results_final_r2 = cross_val_score(pipe, X, y, cv=kfold, scoring='r2')
cv_results_final_r2.mean()

0.5271641242553218