In [None]:
# The dataset has 13 fields.

# date - date of publication of the announcement;
# time - the time when the ad was published;
# geo_lat - Latitude
# geo_lon - Longitude
# region - Region of Russia. There are 85 subjects in the country in total.
# building_type - Facade type. 0 - Other. 1 - Panel. 2 - Monolithic. 3 - Brick. 4 - Blocky. 5 - Wooden
# object_type - Apartment type. 1 - Secondary real estate market; 2 - New building;
# level - Apartment floor
# levels - Number of storeys
# rooms - the number of living rooms. If the value is "-1", then it means "studio apartment"
# area - the total area of the apartment, in square meters
# kitchen_area - Kitchen area
# price - Price, in rubles
from sklearn.linear_model import LogisticRegression, Ridge, LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from category_encoders import OrdinalEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.pipeline import make_pipeline
import pandas as pd
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.formula.api import ols
from matplotlib.dates import date2num 
from scipy import stats
import matplotlib.dates as dates
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier 
from pandas_profiling import ProfileReport
from pdpbox.pdp import pdp_isolate, pdp_plot, pdp_interact, pdp_interact_plot
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import classification_report, roc_curve, plot_confusion_matrix, plot_roc_curve, mean_absolute_error, roc_auc_score
# The dataset is loaded (with date parsing and clean-up) by `wrangle()` in the
# next cell, so it is not read here as well; doing so would parse the same CSV
# twice only for the result to be overwritten.

In [None]:
#Read in data, set the date as the index col, remove extreme outliers in the price column
def wrangle(filepath):
    """Load the listings CSV into a date-indexed DataFrame.

    The ``date`` column is parsed as datetimes and used as the index, the
    ``time`` column is discarded, and duplicated rows are removed.
    Duplicate detection compares column values only; pandas ignores the
    index when looking for duplicates.

    Parameters
    ----------
    filepath : path or file-like object accepted by ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The de-duplicated listings, indexed by ``date``.
    """
    listings = pd.read_csv(filepath, index_col='date', parse_dates=['date'])
    return listings.drop(columns=['time']).drop_duplicates()

# Load and clean the raw listings.
df = wrangle('../input/russian-homescsv/russian_homes.csv')

# Price bounds (in rubles): listings priced below PRICE_MIN or at/above
# PRICE_MAX are treated as extreme outliers and removed.
PRICE_MIN = 10000
PRICE_MAX = 365865000

# Filter with a boolean mask instead of df.drop(<index labels>): the index is
# the publication date, which is not unique, so dropping by label would also
# remove every other listing that shares a date with an outlier.
price_is_outlier = (df['price'] >= PRICE_MAX) | (df['price'] < PRICE_MIN)
df = df[~price_is_outlier]


df.head()

In [None]:
# Target is the building (facade) type; every remaining column is a feature.
target = 'building_type'
X, y = df.drop(columns=[target]), df[target]

In [None]:
# Hold out 20% of the rows for validation; the seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Share of the most frequent class in the training labels, i.e. the accuracy
# obtained by always predicting that class.
majority_share = y_train.value_counts(normalize=True).max()
print('Baseline Accuracy Score:', majority_share)

In [None]:
# Untuned random-forest baseline: ordinal-encode -> impute -> classify.
rf_baseline = RandomForestClassifier(n_estimators=25, n_jobs=-1, random_state=42)
model_rf = make_pipeline(OrdinalEncoder(), SimpleImputer(), rf_baseline)

model_rf.fit(X_train, y_train);

In [None]:
# Untuned XGBoost baseline: ordinal-encode -> impute -> classify.
xgb_baseline = XGBClassifier(n_estimators=25, n_jobs=-1, random_state=42)
model_xgb = make_pipeline(OrdinalEncoder(), SimpleImputer(), xgb_baseline)

model_xgb.fit(X_train, y_train);

In [None]:
# Accuracy of the untuned random forest on the training and validation splits.
rf_train_acc = model_rf.score(X_train, y_train)
print('sklearn Training Accuracy:', rf_train_acc)
rf_val_acc = model_rf.score(X_val, y_val)
print('sklearn Validation Accuracy:', rf_val_acc)

In [None]:
# Accuracy of the untuned XGBoost model on the training and validation splits.
xgb_train_acc = model_xgb.score(X_train, y_train)
print('XGBoost Training Accuracy:', xgb_train_acc)
xgb_val_acc = model_xgb.score(X_val, y_val)
print('XGBoost Validation Accuracy:', xgb_val_acc)

In [None]:
# Unfitted random-forest pipeline that serves as the estimator to be tuned.
rf_to_tune = RandomForestClassifier(n_estimators=25, n_jobs=-1, random_state=42)
model_rf2 = make_pipeline(OrdinalEncoder(), SimpleImputer(), rf_to_tune)


In [None]:
# Search space for the random-forest step of the pipeline.
# Keys follow the `<pipeline step name>__<parameter>` convention.
params = dict(
    randomforestclassifier__max_depth=range(80, 200, 20),
    randomforestclassifier__min_samples_leaf=range(250, 360, 20),
    randomforestclassifier__max_samples=[.1, .2, .3, .4, .5, .6, .7, .8, .9],
    randomforestclassifier__max_leaf_nodes=range(200, 400, 25),
    randomforestclassifier__min_samples_split=[14, 16, 18, 20, 22, 24, 26],
)

In [None]:
# Randomised hyper-parameter search over `params` for the random-forest
# pipeline: 10 sampled candidates, each scored with 5-fold cross-validation.
model_RfR = RandomizedSearchCV(
    model_rf2,
    param_distributions=params,
    n_iter=10,
    cv=5,
    n_jobs=-1,
    verbose=1,
    random_state=42,  # fix which candidates are sampled so reruns are reproducible
)

model_RfR.fit(X_train, y_train)

In [None]:
# Best cross-validated score and the parameter combination that produced it
# (random-forest search).
best_score, best_params = model_RfR.best_score_, model_RfR.best_params_

print('Best score for `model`:', best_score)
print()
print('Best params for `model`:', best_params)

In [None]:
# XGBoost pipeline that serves as the estimator to be tuned.
# It is intentionally left unfitted: RandomizedSearchCV fits clones of the
# estimator it receives, so fitting it here would only add a full training run
# whose result is never used.
model_xgb2 = make_pipeline(
    OrdinalEncoder(),
    SimpleImputer(),
    XGBClassifier(random_state=42, n_jobs=-1, n_estimators=25)
)

In [None]:
# Search space for the XGBoost step of the pipeline.
# Keys follow the `<pipeline step name>__<parameter>` convention.
# NOTE(review): colsample_bytree mixes 0.01/0.02/0.08/0.09 with 0.3-0.7; these
# may be typos for 0.1/0.2/0.8/0.9 - confirm before relying on the search.
# NOTE(review): max_depth only covers 80-180 here, while the final model uses
# max_depth=40 - confirm which range was actually intended.
param = dict(
    xgbclassifier__learning_rate=[0.03, 0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
    xgbclassifier__max_depth=range(80, 200, 20),
    xgbclassifier__min_child_weight=[8, 9, 10, 11, 12, 13, 14],
    xgbclassifier__gamma=[0.2, 0.3, 0.4, 0.5, 0.6, 0.7],
    xgbclassifier__colsample_bytree=[0.01, 0.02, 0.3, 0.4, 0.5, 0.7, 0.08, 0.09],
)

In [None]:
# Randomised hyper-parameter search over `param` for the XGBoost pipeline:
# 10 sampled candidates, each scored with 5-fold cross-validation.
model_RfR2 = RandomizedSearchCV(
    model_xgb2,
    param_distributions=param,
    n_iter=10,
    cv=5,
    n_jobs=-1,
    verbose=1,
    random_state=42,  # fix which candidates are sampled so reruns are reproducible
)

model_RfR2.fit(X_train, y_train)

In [None]:
# Best cross-validated score and the parameter combination that produced it
# (XGBoost search).
best_score, best_params = model_RfR2.best_score_, model_RfR2.best_params_

print('Best score for `model`:', best_score)
print()
print('Best params for `model`:', best_params)

In [None]:
# Final model: XGBoost pipeline with hard-coded hyper-parameters.
# NOTE(review): max_depth=40 is outside the range(80, 200, 20) used in the
# XGBoost search space - confirm where these values came from.
xgb_final_kwargs = dict(
    random_state=42,
    n_jobs=-1,
    n_estimators=25,
    min_child_weight=13,
    max_depth=40,
    learning_rate=0.25,
    gamma=0.4,
    colsample_bytree=0.5,
)
model_xgb3 = make_pipeline(
    OrdinalEncoder(),
    SimpleImputer(),
    XGBClassifier(**xgb_final_kwargs),
)

model_xgb3.fit(X_train, y_train);

In [None]:
# Accuracy of the final XGBoost model on the training and validation splits.
final_train_acc = model_xgb3.score(X_train, y_train)
print('XGBoost Training Accuracy:', final_train_acc)
final_val_acc = model_xgb3.score(X_val, y_val)
print('XGBoost Validation Accuracy:', final_val_acc)

In [None]:
# Permutation importance of each feature for the final model, measured on the
# validation split with 5 shuffles per feature.
perm_imp = permutation_importance(
    model_xgb3,
    X_val,
    y_val,
    n_repeats=5,
    random_state=42,
    n_jobs=-1,
)

In [None]:
# Tabulate mean / std permutation importance per feature, least important first.
data = dict(
    imp_mean=perm_imp['importances_mean'],
    imp_std=perm_imp['importances_std'],
)

importances = pd.DataFrame(data, index=X_val.columns)
importances = importances.sort_values(by='imp_mean')

importances.head()

In [None]:
# Horizontal bar chart of the last 10 rows of `importances` (the frame is
# sorted ascending by imp_mean, so these are the largest mean importances).
importances['imp_mean'].tail(10).plot.barh()

In [None]:
# Classification report and confusion matrix for the final model on the
# validation split. The header names XGBoost because `model_xgb3` is the
# XGBoost pipeline, not a random forest.
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2; switch to
# ConfusionMatrixDisplay.from_estimator when the environment is upgraded.
print("XGBoost")
print(classification_report(y_val, model_xgb3.predict(X_val)))
plot_confusion_matrix(model_xgb3, X_val, y_val);