https://www.kaggle.com/hugoncosta/price-of-flats-in-moscow

price - цена квартиры, тыс. долларов США

totsp - общая площадь квартиры, кв.м.

livesp - жилая площадь квартиры, кв.м.

kitsp - площадь кухни, кв.м.

dist - расстояние от центра в км.

metrdist - расстояние до метро в минутах

walk - 1 – пешком от метро, 0 – на транспорте

brick - 1 – кирпичный, монолит ж/б, 0 – другой

floor - 1 – этаж кроме первого и последнего, 0 – иначе.

code - число от 1 до 8, при помощи которого мы группируем наблюдения по подвыборкам:

1. Наблюдения сгруппированы на севере, вокруг Калужско-Рижской линии метрополитена
2. Север, вокруг Серпуховско-Тимирязевской линии метрополитена
3. Северо-запад, вокруг Замоскворецкой линии метрополитена
4. Северо-запад, вокруг Таганско-Краснопресненской линии метрополитена
5. Юго-восток, вокруг Люблинской линии метрополитена
6. Юго-восток, вокруг Таганско-Краснопресненской линии метрополитена
7. Восток, вокруг Калининской линии метрополитена
8. Восток, вокруг Арбатско-Покровской линии метрополитена

In [154]:
import pandas as pd
import numpy as np
import dill
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import mean_absolute_error 
from sklearn.metrics import r2_score
from sklearn.pipeline import Pipeline, FeatureUnion


In [155]:
# Load the flats dataset; the first CSV column is used as the row index
# rather than as a feature.
df = pd.read_csv('flats_moscow.csv', index_col=0)

In [156]:
# Check column dtypes and non-null counts before any modelling.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2040 entries, 1 to 2040
Data columns (total 10 columns):
price       2040 non-null int64
totsp       2040 non-null int64
livesp      2040 non-null int64
kitsp       2040 non-null float64
dist        2040 non-null float64
metrdist    2040 non-null int64
walk        2040 non-null int64
brick       2040 non-null int64
floor       2040 non-null int64
code        2040 non-null int64
dtypes: float64(2), int64(8)
memory usage: 175.3 KB


In [157]:
# Look at the first rows to sanity-check that the CSV was parsed as expected.
df.head()

Unnamed: 0,price,totsp,livesp,kitsp,dist,metrdist,walk,brick,floor,code
1,81,58,40,6.0,12.5,7,1,1,1,3
2,75,44,28,6.0,13.5,7,1,0,1,6
3,128,70,42,6.0,14.5,3,1,1,1,3
4,95,61,37,6.0,13.5,7,1,0,1,1
5,330,104,60,11.0,10.5,7,0,1,1,3


In [158]:
# Per-column summary statistics, to see the value range each feature covers.
df.describe()

Unnamed: 0,price,totsp,livesp,kitsp,dist,metrdist,walk,brick,floor,code
count,2040.0,2040.0,2040.0,2040.0,2040.0,2040.0,2040.0,2040.0,2040.0,2040.0
mean,127.496569,73.084314,46.337255,8.898529,11.015686,8.117157,0.685784,0.323039,0.790686,4.322059
std,51.87822,15.12345,7.894348,2.787073,3.375539,3.815574,0.464317,0.467752,0.406918,2.183289
min,50.0,44.0,28.0,5.0,3.0,1.0,0.0,0.0,0.0,1.0
25%,95.0,62.0,42.0,7.0,9.0,5.0,0.0,0.0,1.0,3.0
50%,115.0,73.5,45.0,9.0,12.0,7.0,1.0,0.0,1.0,4.0
75%,142.0,79.0,50.0,10.0,13.5,10.0,1.0,1.0,1.0,6.0
max,730.0,192.0,102.0,25.0,17.0,20.0,1.0,1.0,1.0,8.0


In [159]:
# Explicit count of missing values per column (no imputation step follows,
# so this is expected to be all zeros).
df.isnull().sum()

price       0
totsp       0
livesp      0
kitsp       0
dist        0
metrdist    0
walk        0
brick       0
floor       0
code        0
dtype: int64

In [160]:
# Hold out 33% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
# The target column is removed from the feature frame so that it can never be
# picked up as a model input by accident. The rows selected for each side of
# the split are unaffected by this.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['price']),
    df['price'],
    test_size=0.33,
    random_state=23,
)


In [161]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that returns ``X[column]``.

    Nothing is learned during ``fit``; the transformer only remembers which
    ``column`` to index the input with.
    NOTE(review): presumably X is a pandas DataFrame, in which case a single
    label yields a 1-D Series -- confirm against callers.
    """

    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        """No-op fit; returns ``self`` so the object works inside a pipeline."""
        return self

    def transform(self, X, y=None):
        """Return the part of ``X`` addressed by ``self.column``."""
        selected = X[self.column]
        return selected
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Pick the single column named ``key`` out of the incoming data frame so
    that later pipeline steps can operate on it in isolation.
    Intended for numeric columns.
    """

    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        # Stateless: there is nothing to estimate from the data.
        return self

    def transform(self, X):
        # Indexing with a one-element list (rather than a bare label) is what
        # the original code does; on a DataFrame this keeps the result 2-D.
        wanted = [self.key]
        return X[wanted]
    
class OHEEncoder(BaseEstimator, TransformerMixin):
    """One-hot encoder based on ``pandas.get_dummies`` with a fixed output layout.

    ``fit`` records the dummy column names produced for the training data.
    ``transform`` encodes new data the same way, adds an all-zero column for
    every recorded name that is absent, and returns exactly the recorded
    columns in the recorded order (so categories unseen at fit time are
    dropped).
    NOTE(review): assumes X is a pandas object accepted by ``get_dummies`` --
    confirm against callers.
    """

    def __init__(self, key):
        self.key = key
        self.columns = []

    def fit(self, X, y=None):
        """Remember the dummy column names generated from ``X``."""
        self.columns = list(pd.get_dummies(X, prefix=self.key).columns)
        return self

    def transform(self, X):
        """Encode ``X`` and align the result to the columns seen during ``fit``."""
        encoded = pd.get_dummies(X, prefix=self.key)
        # Snapshot of what this batch produced, taken before any columns are added.
        produced = set(encoded.columns)
        for expected in self.columns:
            if expected not in produced:
                encoded[expected] = 0
        return encoded[self.columns]

In [162]:
# Columns used as model inputs.
features = [
    "totsp",
    "livesp",
    "kitsp",
    "dist",
    "metrdist",
    "walk",
    "brick",
    "floor",
    "code",
]
# Column the models are trained to predict.
target = "price"

In [163]:
# One (name, pipeline) pair per input column: select the column, then rescale
# it with its own MinMaxScaler.
final_transformers = [
    (
        feature,
        Pipeline([
            ('selector', NumberSelector(feature)),
            ('scaler', MinMaxScaler()),
        ]),
    )
    for feature in features
]

# Combine the per-column pipelines into a single transformer.
feats = FeatureUnion(final_transformers)

# The same feature union wrapped as a stand-alone preprocessing pipeline.
feature_processing = Pipeline([('feats', feats)])

In [164]:

from sklearn.base import clone

# Quantile levels of the lower and upper models.
LOWER_ALPHA = 0.1
UPPER_ALPHA = 0.9

# Each pipeline receives its own unfitted copy of the feature union.
# Pipeline.fit fits its steps in place, so sharing one instance would let the
# fit of one model silently overwrite the preprocessing state of the others.
#
# The step names keep their original 'classifier_*' labels because they are
# part of each pipeline's parameter names.

# Mean model. The loss is left at its default, which is least squares in every
# scikit-learn release; the explicit spelling "ls" was deprecated in 1.0 and
# removed in 1.2, while its replacement "squared_error" is rejected by older
# releases.
regressor_mid = Pipeline([
    ('features', clone(feats)),
    ('classifier_mid', GradientBoostingRegressor()),
])

# Lower-quantile model (LOWER_ALPHA).
regressor_low = Pipeline([
    ('features', clone(feats)),
    ('classifier_low', GradientBoostingRegressor(loss="quantile",
                                                 alpha=LOWER_ALPHA)),
])

# Upper-quantile model (UPPER_ALPHA).
regressor_up = Pipeline([
    ('features', clone(feats)),
    ('classifier_up', GradientBoostingRegressor(loss="quantile",
                                                alpha=UPPER_ALPHA)),
])

In [165]:
# Train all three models on the same training split. Each fit() call also
# fits the feature-scaling steps held by that pipeline.
regressor_mid.fit(X_train, y_train)
regressor_low.fit(X_train, y_train)
regressor_up.fit(X_train, y_train)

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('totsp',
                                                 Pipeline(steps=[('selector',
                                                                  NumberSelector(key='totsp')),
                                                                 ('scaler',
                                                                  MinMaxScaler())])),
                                                ('livesp',
                                                 Pipeline(steps=[('selector',
                                                                  NumberSelector(key='livesp')),
                                                                 ('scaler',
                                                                  MinMaxScaler())])),
                                                ('kitsp',
                                                 Pipeline(steps=[('selector',
                                             

In [166]:
# Hold-out performance of the mean model, displayed as (MAE, R^2).
predictions_mid = regressor_mid.predict(X_test)
(
    mean_absolute_error(y_true=y_test, y_pred=predictions_mid),
    r2_score(y_true=y_test, y_pred=predictions_mid),
)

(15.468524868247084, 0.7135376789694861)

In [167]:
# Hold-out (MAE, R^2) of the lower-quantile model.
# NOTE(review): both metrics score this quantile forecast as if it were a
# point estimate of the price, so they are not directly comparable with the
# mean model's numbers; pinball loss or interval coverage would likely be more
# informative -- confirm what this cell is meant to show.
predictions_low = regressor_low.predict(X_test)
(
    mean_absolute_error(y_true=y_test, y_pred=predictions_low),
    r2_score(y_true=y_test, y_pred=predictions_low),
)

(24.014084834495037, 0.3770011780984185)

In [168]:
# Hold-out (MAE, R^2) of the upper-quantile model.
# NOTE(review): both metrics score this quantile forecast as if it were a
# point estimate of the price, so they are not directly comparable with the
# mean model's numbers; pinball loss or interval coverage would likely be more
# informative -- confirm what this cell is meant to show.
predictions_up = regressor_up.predict(X_test)
(
    mean_absolute_error(y_true=y_test, y_pred=predictions_up),
    r2_score(y_true=y_test, y_pred=predictions_up),
)

(25.56608781068943, 0.5176466644952058)

In [169]:
# Serialise the fitted mean model to disk with dill. Only regressor_mid is
# written here; the two quantile models are not persisted by this cell.
with open("regressor_mid.dill", "wb") as model_file:
    dill.dump(regressor_mid, model_file)

In [175]:
import urllib.request
import json      

def get_prediction(totsp, livesp, kitsp, dist, metrdist, walk, brick, floor, code,
                   *, url="http://127.0.0.1:8180/predict"):
    """Request a price prediction for one flat from the prediction HTTP service.

    The nine feature values are sent as a JSON object in the body of a POST
    request, and the ``'predictions'`` field of the JSON reply is returned.

    Args:
        totsp, livesp, kitsp, dist, metrdist, walk, brick, floor, code:
            Feature values; each is sent under the JSON key of the same name
            and must therefore be JSON-serialisable.
        url: Endpoint to call. Defaults to the local service address that was
            previously hard-coded.

    Returns:
        Whatever the service placed under ``'predictions'`` in its reply.

    Raises:
        urllib.error.URLError: If the service cannot be reached (``HTTPError``
            for an error status code).
        KeyError: If the reply contains no ``'predictions'`` field.
    """
    body = {
        'totsp': totsp,
        'livesp': livesp,
        'kitsp': kitsp,
        'dist': dist,
        'metrdist': metrdist,
        'walk': walk,
        'brick': brick,
        'floor': floor,
        'code': code,
    }
    payload = json.dumps(body).encode('utf-8')  # the request body must be bytes

    # Supplying `data` makes this a POST. urllib derives Content-Length from
    # the payload itself, so the header does not need to be set by hand.
    req = urllib.request.Request(
        url,
        data=payload,
        headers={'Content-Type': 'application/json; charset=utf-8'},
    )

    # The context manager closes the HTTP response (and its socket) even when
    # decoding the reply fails.
    with urllib.request.urlopen(req) as response:
        return json.loads(response.read())['predictions']

In [176]:
import sys
!pip freeze > requirements.txt

In [184]:
# Smoke-test the prediction endpoint with one hand-written flat (requires the
# service to be listening).
get_prediction(
    totsp=40,
    livesp=20,
    kitsp=10,
    dist=3,
    metrdist=5,
    walk=1,
    brick=1,
    floor=1,
    code=6,
)

138.79084209339868