In [None]:
# The dataset has 13 fields.

#date - date of publication of the announcement;
#time - the time when the ad was published;
#geo_lat - Latitude
#geo_lon - Longitude
#region - Region of Russia. There are 85 subjects in the country in total.
#building_type - Facade type. 0 - Other. 1 - Panel. 2 - Monolithic. 3 - Brick. 4 - Blocky. 5 - Wooden
#object_type - Apartment type. 1 - Secondary real estate market; 2 - New building;
#level - Apartment floor
#levels - Number of storeys
#rooms - the number of living rooms. If the value is "-1", then it means "studio apartment"
#area - the total area of the apartment, in square meters
#kitchen_area - Kitchen area
#price - Price. in rubles

from sklearn.linear_model import LogisticRegression, Ridge, LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from category_encoders import OrdinalEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.pipeline import make_pipeline
import pandas as pd
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.formula.api import ols
from matplotlib.dates import date2num 
from scipy import stats
import matplotlib.dates as dates
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier 
from pandas_profiling import ProfileReport
from pdpbox.pdp import pdp_isolate, pdp_plot, pdp_interact, pdp_interact_plot
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
# The dataset is loaded in the next cell by `wrangle`, which also parses the
# `date` column and drops duplicates, so no separate raw read is done here.

In [None]:
#Read in data, set the date as the index col, remove extreme outliers in the price column
def wrangle(filepath):
    """Load the listings CSV at `filepath` into a date-indexed DataFrame.

    The `date` column is parsed as datetimes and used as the index, the
    `time` column is discarded, and duplicate rows are removed (pandas
    compares column values only; the index is not part of the comparison).
    """
    listings = pd.read_csv(filepath, parse_dates=['date'], index_col='date')
    return listings.drop(columns=['time']).drop_duplicates()

df = wrangle('../input/russian-homes/russian_homes.csv')

# Price bounds (rubles) outside which a listing is treated as an outlier.
PRICE_FLOOR = 10000
PRICE_CEILING = 365865000

# Filter with a row-wise boolean mask instead of `df.drop(<index labels>)`.
# The index is the publication date, which repeats across listings, and
# dropping by label removes *every* row carrying that label -- i.e. all
# normally priced listings published on the same day as an outlier.
# Rows with a missing price compare False on both sides and are kept.
is_outlier = (df['price'] >= PRICE_CEILING) | (df['price'] < PRICE_FLOOR)
df = df[~is_outlier].copy()

df.head()

In [None]:
# Rows and columns remaining after loading and price filtering.
df.shape

In [None]:
# Separate the prediction target (listing price) from the feature matrix.
target = 'price'
X = df.drop(columns=[target])
y = df[target]

In [None]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Size of the training target (80% of the rows, given test_size=0.2).
y_train.shape

In [None]:
# Size of the validation target (the held-out 20% of the rows).
y_val.shape

In [None]:
# Naive baseline: predict the mean training price for every training row
# and measure how far off that constant guess is on average.
mean_price = y_train.mean()
y_pred = [mean_price] * len(y_train)

print("Mean Price", mean_price)
print('Baseline MAE: ', mean_absolute_error(y_train, y_pred))

In [None]:
# Exploratory profiling report of the cleaned frame (accessor provided by pandas_profiling).
# NOTE(review): correlations=None presumably skips the correlation section -- confirm
# against the installed pandas_profiling version.
df.profile_report(correlations=None)

In [None]:
# Initial, untuned model: ordinary least-squares linear regression with default settings.
model_lr = LinearRegression()

model_lr.fit(X_train, y_train)

In [None]:
# Initial, untuned model: ridge regression with default settings.
model_r = Ridge()

model_r.fit(X_train, y_train)

In [None]:
# Initial, untuned model: random forest with default hyperparameters
# (seeded for reproducibility, using all available cores).
model_rf = RandomForestRegressor(random_state=42, n_jobs=-1)

model_rf.fit(X_train, y_train)

In [None]:
# Initial, untuned model: XGBoost regressor with default hyperparameters
# (seeded for reproducibility, using all available cores).
model_xgb = XGBRegressor(random_state=42, n_jobs=-1)

model_xgb.fit(X_train, y_train)

In [None]:
def check_metrics(model):
    """Print training/validation MAE and R^2 for a fitted regressor.

    Reads the notebook-level `X_train`, `y_train`, `X_val` and `y_val`.
    Each split is predicted exactly once and both metrics are computed from
    those predictions; calling `model.score` would run a second, equally
    expensive prediction pass per split to obtain the same R^2.

    Parameters
    ----------
    model : fitted regressor exposing `predict(X)`.
    """
    train_pred = model.predict(X_train)
    val_pred = model.predict(X_val)

    # Name the estimator so blocks printed in a loop can be told apart.
    print(type(model).__name__)
    print('Training MAE', mean_absolute_error(y_train, train_pred))
    print('Validation MAE', mean_absolute_error(y_val, val_pred))
    print('Validation R^2', r2_score(y_val, val_pred))
    print('Training R^2', r2_score(y_train, train_pred))
    print()


# Report the metrics of each untuned model in turn.
models = [model_lr, model_r, model_rf, model_xgb]
for fitted_model in models:
    check_metrics(fitted_model)

In [None]:
# Tuning pipeline for XGBoost: default imputation of missing values, then the regressor.
clf_xgb = make_pipeline(
    SimpleImputer(),
    XGBRegressor(random_state=42, n_jobs=-1),
)



In [None]:
# Search space for the XGBoost pipeline. Keys carry the `xgbregressor__`
# prefix so the search routes them to that pipeline step.
params = {
    'xgbregressor__learning_rate'   : [0.03, 0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
    'xgbregressor__max_depth'       : [3, 4, 5, 6, 7, 8, 10, 12, 15],
    'xgbregressor__min_child_weight': [1, 3, 5, 7, 8, 9, 10],
    'xgbregressor__gamma'           : [0.0, 0.1, 0.2, 0.3, 0.4],
    # Fraction of columns sampled per tree. The upper values are 0.8 / 0.9;
    # 0.08 / 0.09 would leave each tree with (at most) a single column.
    'xgbregressor__colsample_bytree': [0.3, 0.4, 0.5, 0.7, 0.8, 0.9],
    # No `max_leaf_nodes` entry: that is a scikit-learn tree option which
    # XGBoost does not use, so searching over it has no effect. XGBoost's
    # own leaf limit is `max_leaves`, should leaf count need tuning.
    'xgbregressor__n_estimators'    : [50, 75, 100, 125, 150, 175, 200],
}

In [None]:
# Randomized hyperparameter search over the XGBoost pipeline
# (10 sampled candidates, 5-fold cross-validation each).
# `random_state` pins which candidates are sampled, so the best parameters
# reported below can be reproduced after a kernel restart.
model_RfRs = RandomizedSearchCV(
    clf_xgb,
    param_distributions=params,
    n_iter=10,
    cv=5,
    n_jobs=-1,
    verbose=1,
    random_state=42,
)

model_RfRs.fit(X_train, y_train)

In [None]:
# Best cross-validated score found by the XGBoost search and the parameters behind it.
best_score, best_params = model_RfRs.best_score_, model_RfRs.best_params_

print('Best score for `model`:', best_score, end='\n\n')
print('Best params for `model`:', best_params)

In [None]:
# Tuning pipeline for the random forest: default imputation of missing values, then the regressor.
clf_rf = make_pipeline(SimpleImputer(), RandomForestRegressor(random_state=42, n_jobs=-1))


In [None]:
# Search space for the random-forest pipeline. Keys carry the
# `randomforestregressor__` prefix so the search routes them to that step.
params = {
    'randomforestregressor__n_estimators'     : [50, 75, 100, 125, 150, 175, 200],
    'randomforestregressor__max_depth'        : range(25, 40, 5),
    'randomforestregressor__min_samples_leaf' : range(25, 400, 25),
    'randomforestregressor__max_samples'      : [.1, .2, .3, .4, .5, .6, .7, .8, .9],
    'randomforestregressor__max_leaf_nodes'   : range(140, 170, 10),
    # An integer `max_features` must be >= 1 (0 makes the fit fail), and no
    # split can consider more columns than the training data has, so the
    # range runs from 1 to the number of feature columns inclusive.
    'randomforestregressor__max_features'     : range(1, X_train.shape[1] + 1),
}

In [None]:
# Randomized hyperparameter search over the random-forest pipeline
# (10 sampled candidates, 5-fold cross-validation each).
# `random_state` pins which candidates are sampled, so the best parameters
# reported below can be reproduced after a kernel restart.
model_RfR = RandomizedSearchCV(
    clf_rf,
    param_distributions=params,
    n_iter=10,
    cv=5,
    n_jobs=-1,
    verbose=1,
    random_state=42,
)

model_RfR.fit(X_train, y_train)

In [None]:
# Best cross-validated score found by the random-forest search and the parameters behind it.
best_score, best_params = model_RfR.best_score_, model_RfR.best_params_

print('Best score for `model`:', best_score, end='\n\n')
print('Best params for `model`:', best_params)

In [None]:
# Default linear regression, refit here so it sits alongside the final
# models in the comparison below (same settings as the initial fit).
model_lr = LinearRegression()

model_lr.fit(X_train, y_train)

In [None]:
# Default ridge regression, refit here so it sits alongside the final
# models in the comparison below (same settings as the initial fit).
model_r = Ridge()

model_r.fit(X_train, y_train)

In [None]:
# Final random forest with explicitly fixed hyperparameters.
# NOTE(review): values presumably come from the randomized search above -- confirm.
model_rf = RandomForestRegressor(
    n_estimators=125,
    max_depth=35,
    max_leaf_nodes=150,
    min_samples_leaf=125,
    max_features=8,
    max_samples=0.9,
    random_state=42,
    n_jobs=-1,
)

model_rf.fit(X_train, y_train)

In [None]:
# Final XGBoost regressor with explicitly fixed hyperparameters.
# `max_leaf_nodes` is deliberately not passed: it is a scikit-learn tree
# option that XGBoost does not use, so listing it only suggested a leaf
# limit that was never applied. Tree size here is governed by `max_depth`,
# `min_child_weight` and `gamma`.
model_xgb = XGBRegressor(
    random_state=42,
    n_jobs=-1,
    n_estimators=175,
    learning_rate=0.2,
    max_depth=15,
    min_child_weight=8,
    gamma=0.0,
    colsample_bytree=0.7,
)

model_xgb.fit(X_train, y_train)

In [None]:
# `check_metrics` is defined once, in the earlier evaluation cell, and is
# reused here instead of being redefined: a second definition silently
# shadows the first and lets the two reports drift apart. The final models
# are therefore reported with the same metrics as the untuned ones,
# including training R^2.
models = [model_lr, model_r, model_rf, model_xgb]

for m in models:
    check_metrics(m)

In [None]:
# Permutation importance of each feature for the final XGBoost model,
# measured on the validation split with 5 shuffles per feature.
perm_imp = permutation_importance(
    model_xgb, X_val, y_val,
    n_repeats=5, n_jobs=-1, random_state=42,
)

In [None]:
# Heatmap of the pairwise correlations between the columns of the cleaned frame.
sns.heatmap(df.corr())

In [None]:
# Tabulate the permutation importances per feature, ordered from least to most important.
data = dict(
    imp_mean=perm_imp['importances_mean'],
    imp_std=perm_imp['importances_std'],
)

importances = pd.DataFrame(data, index=X_val.columns)
importances = importances.sort_values(by='imp_mean')

importances.head()

In [None]:
# Horizontal bar chart of the (up to) ten largest mean importances; the frame is
# sorted ascending by `imp_mean`, so these are its last rows.
importances['imp_mean'].tail(10).plot(kind='barh')

In [None]:
# Scatter of listing coordinates.
# NOTE(review): latitude is on the x-axis and longitude on the y-axis, so the point
# cloud is transposed relative to a conventional map -- confirm this is intended.
sns.scatterplot(x='geo_lat', y='geo_lon', data=df)