In [110]:
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor, Pool
import xgboost as xgb
import lightgbm as lgb
from sklearn.feature_selection import SelectKBest
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, KFold
from sklearn import ensemble
import scipy
from pandas.api.types import is_object_dtype
from metrics import metrics_stat


In [169]:
def data_read(file, sep=None):
    data = pd.read_csv(file, sep=sep)
    return data

def check_data_info(data):
    descr = data.describe()
    info = data.info()
    return descr, info

def fill_nans(data):
    for col in data.columns:
        if data[col].isnull().values.any() == True:
            if data[col].dtype == 'float64':
                data[col].fillna(data[col].mean(), inplace=True)
            else:
                data[col].fillna(data[col].mode()[0], inplace=True)
        else:
            pass
    return data

def features_label(data, label):
    features = data.drop(label,  axis=1)
    label = data[label]
    return features, label

def train_test(features, label):
    X_train, X_test, y_train, y_test = train_test_split(
        features, label, test_size=0.33, random_state=42)
    return X_train, X_test, y_train, y_test

def kfold_cv(data, k=2):
    kf = KFold(n_splits=k)
    cv_clf = ensemble.GradientBoostingClassifier(**params)
    val_scores = np.zeros((n_estimators,), dtype=np.float64)
    for train, test in kf.split(X_train, y_train):
        cv_clf.fit(X_train[train], y_train[train])
        val_scores += heldout_score(cv_clf, X_train[test], y_train[test])
    val_scores /= n_splits
    return val_scores



In [213]:
train = data_read("data/train.csv")
test = data_read("data/test.csv")


  return func(*args, **kwargs)


In [215]:
train = fill_nans(train)

Unnamed: 0,city,floor,id,lat,lng,osm_amenity_points_in_0.001,osm_amenity_points_in_0.005,osm_amenity_points_in_0.0075,osm_amenity_points_in_0.01,osm_building_points_in_0.001,...,reform_mean_floor_count_1000,reform_mean_floor_count_500,reform_mean_year_building_1000,reform_mean_year_building_500,region,total_square,street,date,realty_type,price_type
0,Пермь,1.0,COL_0,57.998207,56.292797,4,19,35,52,0,...,5.762963,5.530612,1964.118519,1960.959184,Пермский край,32.000000,S27289,2020-01-05,10,0
1,Шатура,1.0,COL_1,55.574284,39.543835,3,24,37,59,0,...,2.894366,3.527778,1952.321678,1957.222222,Московская область,280.000000,S17052,2020-01-05,10,0
2,Ярославль,1.0,COL_2,57.619140,39.850525,1,30,67,128,0,...,6.141414,7.222222,1968.150000,1973.370370,Ярославская область,297.400000,S16913,2020-01-05,110,0
3,Новокузнецк,1.0,COL_3,53.897083,87.108604,0,0,5,21,0,...,8.581081,9.000000,1992.716216,2014.000000,Кемеровская область,190.000000,S10148,2020-01-05,110,0
4,Москва,1.0,COL_4,55.802590,37.487110,1,23,64,153,0,...,7.263889,5.684211,1963.229167,1960.500000,Москва,60.200000,S1338,2020-01-05,10,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
279787,Томск,1.0,COL_280111,56.459183,84.979334,2,33,111,222,0,...,5.714286,5.882353,1972.260870,1973.460000,Томская область,358.320073,S11114,2020-08-23,10,1
279788,Санкт-Петербург,1.0,COL_280479,59.936954,30.356383,10,274,718,1340,0,...,4.719388,4.706667,1876.994898,1873.186667,Санкт-Петербург,119.637556,S28440,2020-08-23,110,1
279789,Калининград,3.0,COL_280518,54.729233,20.514968,0,12,34,84,0,...,3.950413,4.885714,1964.258333,1970.571429,Калининградская область,312.789725,S6671,2020-08-23,10,1
279790,Кемерово,1.0,COL_280529,55.360680,86.081460,5,57,100,134,0,...,4.691489,4.125000,1957.425532,1954.625000,Кемеровская область,89.201305,S17667,2020-08-23,110,1


In [216]:
train = train.drop(["street","floor", "id", "city", "date", "osm_city_nearest_name","region"], axis=1)


Unnamed: 0,lat,lng,osm_amenity_points_in_0.001,osm_amenity_points_in_0.005,osm_amenity_points_in_0.0075,osm_amenity_points_in_0.01,osm_building_points_in_0.001,osm_building_points_in_0.005,osm_building_points_in_0.0075,osm_building_points_in_0.01,...,reform_count_of_houses_500,reform_house_population_1000,reform_house_population_500,reform_mean_floor_count_1000,reform_mean_floor_count_500,reform_mean_year_building_1000,reform_mean_year_building_500,total_square,realty_type,price_type
0,57.998207,56.292797,4,19,35,52,0,0,0,0,...,49,2503.0,765.0,5.762963,5.530612,1964.118519,1960.959184,32.000000,10,0
1,55.574284,39.543835,3,24,37,59,0,0,0,1,...,37,1336.0,514.0,2.894366,3.527778,1952.321678,1957.222222,280.000000,10,0
2,57.619140,39.850525,1,30,67,128,0,0,1,1,...,27,1883.0,573.0,6.141414,7.222222,1968.150000,1973.370370,297.400000,110,0
3,53.897083,87.108604,0,0,5,21,0,0,0,1,...,2,1801.0,54.0,8.581081,9.000000,1992.716216,2014.000000,190.000000,110,0
4,55.802590,37.487110,1,23,64,153,0,1,1,1,...,38,3090.0,619.0,7.263889,5.684211,1963.229167,1960.500000,60.200000,10,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
279787,56.459183,84.979334,2,33,111,222,0,0,1,1,...,62,3008.0,979.0,5.714286,5.882353,1972.260870,1973.460000,358.320073,10,1
279788,59.936954,30.356383,10,274,718,1340,0,2,5,16,...,80,7661.0,1659.0,4.719388,4.706667,1876.994898,1873.186667,119.637556,110,1
279789,54.729233,20.514968,0,12,34,84,0,2,2,2,...,37,1225.0,460.0,3.950413,4.885714,1964.258333,1970.571429,312.789725,10,1
279790,55.360680,86.081460,5,57,100,134,0,0,0,0,...,24,1649.0,429.0,4.691489,4.125000,1957.425532,1954.625000,89.201305,110,1


In [237]:
test = test.drop(["street","floor", "id", "city", "date", "osm_city_nearest_name","region"], axis=1)

KeyError: "['street' 'floor' 'id' 'city' 'date' 'osm_city_nearest_name' 'region'] not found in axis"

In [217]:
features, label = features_label(train, 'per_square_meter_price')


In [218]:
X_train, X_test, y_train, y_test = train_test(features, label)

In [238]:
params = {'n_estimators': 3000, 'learning_rate': 0.001, 'random_state': 3, 'verbose':250}


model = CatBoostRegressor(**params)

model.fit(features, label)

prediction = model.predict(test)
#metrics = metrics_stat(test, prediction)

#print("Raif metric: {:.4f}".format(metrics))


0:	learn: 171979.3549629	total: 17.8ms	remaining: 53.2s
250:	learn: 157096.7833606	total: 3.21s	remaining: 35.1s
500:	learn: 146685.2009255	total: 6.57s	remaining: 32.8s
750:	learn: 139183.1726365	total: 10.6s	remaining: 31.8s
1000:	learn: 133733.5751028	total: 14.3s	remaining: 28.6s
1250:	learn: 129617.1559494	total: 18.1s	remaining: 25.2s
1500:	learn: 126515.4004135	total: 21.9s	remaining: 21.9s
1750:	learn: 124138.4415922	total: 26s	remaining: 18.6s
2000:	learn: 122295.9275851	total: 29.9s	remaining: 14.9s
2250:	learn: 120828.8913359	total: 33.5s	remaining: 11.1s
2500:	learn: 119602.6809816	total: 37.1s	remaining: 7.41s
2750:	learn: 118574.4375272	total: 40.8s	remaining: 3.69s
2999:	learn: 117724.8566737	total: 44.4s	remaining: 0us


In [245]:
solution = test_1[['id']].copy()
solution['per_square_meter_price'] = pd.Series(np.full(len(test_1), prediction))
solution.to_csv('first.csv', sep=',', index=False)

Unnamed: 0,id,per_square_meter_price
0,COL_289284,67294.456923
1,COL_289305,60985.843486
2,COL_289318,46997.302808
3,COL_289354,61899.022239
4,COL_289399,106545.241482
...,...,...
2969,COL_455089,45076.282791
2970,COL_455212,53018.893559
2971,COL_455261,70543.329949
2972,COL_455381,68595.501140
