In [1]:
#Model training with XGboost
!pip install xgboost

import xgboost as xgb
from sklearn.feature_extraction import DictVectorizer


# from sklearn.tree import DecisionTreeClassifier
# from sklearn.metrics import roc_auc_score

def train(dataFrame, y, max_depth, eta):
    # Hot Encoding
    dicts = dataFrame.to_dict(orient="records")
    dv = DictVectorizer(sparse=False)
    X = dv.fit_transform(dicts)
    features = dv.get_feature_names()
    dtrain = xgb.DMatrix(X, label=y, feature_names=features)

    # train
    xgb_params = {
        'eta': eta,
        'max_depth': max_depth,
        'min_child_weight': 1,
        'objective': 'reg:squarederror',
        'nthread': 8,
        'seed':1,
        'verbosity':0
    }
    model = xgb.train(xgb_params, dtrain, num_boost_round=10)
    return dv, model


def predict(dataFrame, dv, model):
    dicts = dataFrame.to_dict(orient="records")
    X = dv.transform(dicts)
    features = dv.get_feature_names()
    dval = xgb.DMatrix(X, feature_names=features)
    y_pred = model.predict(dval)
    return y_pred, X





In [5]:
import numpy as np
import pandas as pd

df = pd.DataFrame([
    {'rent': 1000, 'neighbourhood': 'Mitte', 'city': 'Berlin'},
    {'rent': 1200, 'neighbourhood': 'Mitte', 'city': 'Munich'},
    {'rent': 1100, 'neighbourhood': 'Mitte', 'city': 'Berlin'},
])

df


Unnamed: 0,rent,neighbourhood,city
0,1000,Mitte,Berlin
1,1200,Mitte,Munich
2,1100,Mitte,Berlin


In [3]:
df_train = df[['neighbourhood']]
y_train = df[['rent']]

display("Train", df_train)
display("Y", y_train)


dv, model = train(df_train, y_train, 6, 1)
y_pred_val, X_val = predict(df_train, dv, model)


df.assign(predicted_rent = y_pred_val.astype('int'))

'Train'

Unnamed: 0,neighbourhood
0,Mitte
1,Mitte
2,Mitte


'Y'

Unnamed: 0,rent
0,1000
1,1200
2,1100


Unnamed: 0,rent,neighbourhood,city,predicted_rent
0,1000,Mitte,Berlin,1099
1,1200,Mitte,Munich,1099
2,1100,Mitte,Berlin,1099


In [4]:
cities = df['city'].unique()
targets = {}
for city in cities:
    targets[city] = df[df['city']==city].rent.mean()
df['target_rent'] = df['city'].apply(lambda x: targets[x])


df_train = df[['neighbourhood', 'target_rent']]
y_train = df[['rent']]

display("df", df)
display("Train", df_train)
display("Y", y_train)


dv, model = train(df_train, y_train, 6, 1)
y_pred_val, X_val = predict(df_train, dv, model)


df.assign(predicted_rent = y_pred_val.astype('int'))

'df'

Unnamed: 0,rent,neighbourhood,city,target_rent
0,1000,Mitte,Berlin,1050.0
1,1200,Mitte,Munich,1200.0
2,1100,Mitte,Berlin,1050.0


'Train'

Unnamed: 0,neighbourhood,target_rent
0,Mitte,1050.0
1,Mitte,1200.0
2,Mitte,1050.0


'Y'

Unnamed: 0,rent
0,1000
1,1200
2,1100


Unnamed: 0,rent,neighbourhood,city,target_rent,predicted_rent
0,1000,Mitte,Berlin,1050.0,1049
1,1200,Mitte,Munich,1200.0,1199
2,1100,Mitte,Berlin,1050.0,1049
