In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.model_selection import cross_val_score, KFold, GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import scale
from catboost import CatBoostRegressor
from sklearn.linear_model import LinearRegression
import lightgbm as lgb
import xgboost as xgb
from matplotlib import pyplot as plt
import seaborn as sns
import math
import time
import datetime
from pandas.tseries.offsets import *
from xgboost import XGBRegressor
from sklearn import preprocessing
from sklearn import linear_model

from pylab import rcParams
%matplotlib inline
rcParams['figure.figsize'] = 10, 8
pd.options.display.max_rows = 999

In [14]:
%%time
flat = pd.read_csv('flat_fix.csv', parse_dates=['sale','date_salestart', 'date_settle','flat_startsale'])
price = pd.read_csv('price.csv',encoding='cp1251',parse_dates=['datefrom','dateto','date_salestart'])
status = pd.read_csv('status.csv',encoding='cp1251',parse_dates=['datefrom','dateto'])
test = pd.read_csv('test.csv', parse_dates=['date1'],encoding='cp1251')
train = pd.read_csv('train.csv', parse_dates=['date1'],encoding='cp1251')

y = train.value

train = train.drop(['value','id','start_square','plan_s','plan_m','plan_l','vid_0','vid_1','vid_2'], axis=1)
test = test.drop(['id'], axis=1)

columns = ['id_bulk', 'spalen', 'date1', 'price', 'mean_sq', 'mean_fl','month', 'month_cnt', 'objclass', 'roomcount', 'wall', 'area', 'vh_groups', 'kindergarten', 'school',
           'hospital', 'phok', 'trainarea', 'carwash', 'stores', 'wheelchairs', 'conditioner', 'ventilation', 'elevator', 'garbage_col', 'cctv', 'u_parking', 
           'noauto_yard', 'auto_places', 'factory_zone', 'green_zone', 'to_kremlin', 'to_mall', 'to_sadoviy', 'to_bigroad', 'to_autocross', 'to_undeground_walk',
           'to_factory', 'to_park', 'to_park_walk', 'u_station_ring', 'yard_area', 'dollar', 'mortgage', 'deposit1', 'deposit13', 'depostit3']

train.columns = columns
test.columns = columns

train['value'] = y

train = pd.concat([train, pd.get_dummies(train['objclass'],prefix='c')], axis=1)
test = pd.concat([test, pd.get_dummies(test['objclass'],prefix='c')], axis=1)

for i in ['wall', 'vh_groups','carwash', 'stores', 'wheelchairs', 'u_parking', 'noauto_yard']:
    train[i] = train[i].replace('да', 1).replace('нет', 0)
    test[i] = test[i].replace('да', 1).replace('нет', 0)
    
train = train.drop(['garbage_col','elevator','trainarea', 'objclass'], axis=1)
test = test.drop(['garbage_col','elevator','trainarea', 'objclass'], axis=1)

train = pd.concat([train, pd.get_dummies(train['month'],prefix='m')], axis=1).drop(['month'], axis=1)
months = pd.concat([pd.get_dummies(test['month'],prefix='m'), pd.DataFrame(data = np.zeros((test.shape[0],9)),\
                columns = ['m_1','m_2','m_6','m_7','m_8','m_9','m_10','m_11','m_12'])], axis=1).sort_index(axis=1)
test = pd.concat([test, months], axis=1).drop(['month'], axis=1)

train['id_bulk_sp'] = train['id_bulk'].map(str)+'_'+train['spalen'].map(str)
test['id_bulk_sp'] = test['id_bulk'].map(str)+'_'+test['spalen'].map(str)
flat['id_bulk_sp'] = flat['bulk_id'].map(str)+'_'+flat['spalen'].astype(int).map(str)

flat['otdelka'] = flat.apply(lambda x: 1 if x.otdelka not in ['Не производится', None] else 0, axis=1)

train = pd.concat([train, pd.get_dummies(train['spalen'],prefix='sp')], axis=1)
test = pd.concat([test, pd.get_dummies(test['spalen'],prefix='sp')], axis=1)

# salestart = pd.to_datetime(pd.concat([test,train],ignore_index=False).reset_index(drop=True).groupby('id_bulk_sp')['date1'].min())
salestart = pd.to_datetime(flat.reset_index(drop=True).groupby('id_bulk_sp')['flat_startsale'].min())
train['dayfromstart'] = train.apply(lambda x: float((x['date1'] - salestart[x['id_bulk_sp']]).days), axis=1)
test['dayfromstart'] = test.apply(lambda x: float((x['date1'] - salestart[x['id_bulk_sp']]).days), axis=1)

flat.date_settle = flat.date_settle.fillna(flat.date_settle.min())
train['daytosettle'] = train.apply(lambda x: float(( flat[flat.id_bulk_sp == x['id_bulk_sp']]['date_settle'].iloc[0] - x['date1']).days), axis=1)
test['daytosettle'] = test.apply(lambda x: float(( flat[flat.id_bulk_sp == x['id_bulk_sp']]['date_settle'].iloc[0] - x['date1']).days), axis=1)

test['flat_count'] = test.apply(lambda x: len(flat[flat.id_bulk_sp == x['id_bulk_sp']]), axis=1)
train['flat_count'] = train.apply(lambda x: len(flat[flat.id_bulk_sp == x['id_bulk_sp']]), axis=1)

test['flat_left'] = test.apply(lambda x: len(flat[(flat.id_bulk_sp == x['id_bulk_sp'])&(flat.sale >= x['date1'])]), axis=1) 
train['flat_left'] = train.apply(lambda x: len(flat[(flat.id_bulk_sp == x['id_bulk_sp'])&(flat.sale >= x['date1'])]), axis=1)

def getasaled(x,df,period):
    try:
        t = df[(df.id_bulk_sp == x['id_bulk_sp'])&(df.date1 == x['date1'] - DateOffset(months=period))].sort_values('date1').iloc[0]['value']
        return(t)
    except:
        return(None)
    
def getlastprice(x,df,period):
    try:
        t = df[(df.id_bulk_sp == x['id_bulk_sp'])&(df.date1 == x['date1'] - DateOffset(months=period))].sort_values('date1').iloc[0]['price']
        return(t)
    except:
        return(None)

##лаги продаж
train['saledlastmonth'] = train.apply(lambda x: getasaled(x,train,1), axis=1)
train['saledlast2month'] = train.apply(lambda x: getasaled(x,train,2), axis=1)
train['saledlast3month'] = train.apply(lambda x: getasaled(x,train,3), axis=1)
train['saledlast4month'] = train.apply(lambda x: getasaled(x,train,4), axis=1)
train['saledlast5month'] = train.apply(lambda x: getasaled(x,train,5), axis=1)
train['saledlast6month'] = train.apply(lambda x: getasaled(x,train,6), axis=1)

test['saledlastmonth'] = test.apply(lambda x: getasaled(x,train,1), axis=1)
test['saledlast2month'] = test.apply(lambda x: getasaled(x,train,2), axis=1)
test['saledlast3month'] = test.apply(lambda x: getasaled(x,train,3), axis=1)
test['saledlast4month'] = test.apply(lambda x: getasaled(x,train,4), axis=1)
test['saledlast5month'] = test.apply(lambda x: getasaled(x,train,5), axis=1)
test['saledlast6month'] = test.apply(lambda x: getasaled(x,train,6), axis=1)

##лаги цен
train['pricelastmonth'] = train.apply(lambda x: getlastprice(x,train,1), axis=1)
train['pricelast2month'] = train.apply(lambda x: getlastprice(x,train,2), axis=1)
train['pricelast3month'] = train.apply(lambda x: getlastprice(x,train,3), axis=1)
train['pricelast4month'] = train.apply(lambda x: getlastprice(x,train,4), axis=1)
train['pricelast5month'] = train.apply(lambda x: getlastprice(x,train,5), axis=1)
train['pricelast6month'] = train.apply(lambda x: getlastprice(x,train,6), axis=1)

test['pricelastmonth'] = test.apply(lambda x: getlastprice(x,pd.concat([train,test]),1), axis=1)
test['pricelast2month'] = test.apply(lambda x: getlastprice(x,pd.concat([train,test]),2), axis=1)
test['pricelast3month'] = test.apply(lambda x: getlastprice(x,pd.concat([train,test]),3), axis=1)
test['pricelast4month'] = test.apply(lambda x: getlastprice(x,pd.concat([train,test]),4), axis=1)
test['pricelast5month'] = test.apply(lambda x: getlastprice(x,pd.concat([train,test]),5), axis=1)
test['pricelast6month'] = test.apply(lambda x: getlastprice(x,pd.concat([train,test]),6), axis=1)

##лаги остатков

train['prevcount'] = train.apply(lambda x: len(train[(train.date1 < x['date1'])&(train.id_bulk_sp == x['id_bulk_sp'])]), axis=1)
test['prevcount'] = test.apply(lambda x: len(pd.concat([train,test])[(pd.concat([train,test]).date1 < x['date1'])\
                                                                     &(pd.concat([train,test]).id_bulk_sp == x['id_bulk_sp'])]), axis=1)

train['otdelka'] = train.apply(lambda x: flat[flat.id_bulk_sp == x['id_bulk_sp']]['otdelka'].mean(), axis=1)
train['balcon'] = train.apply(lambda x: flat[flat.id_bulk_sp == x['id_bulk_sp']]['balcon'].mean(), axis=1)
test['otdelka'] = test.apply(lambda x: flat[flat.id_bulk_sp == x['id_bulk_sp']]['otdelka'].mean(), axis=1)
test['balcon'] = test.apply(lambda x: flat[flat.id_bulk_sp == x['id_bulk_sp']]['balcon'].mean(), axis=1)

euro = pd.read_csv('euro.csv',encoding='cp1251', sep=';')
euro['Средний курс'] = euro['Средний курс'].str.replace(',','.').astype(float)
euro['Месяц'] = pd.to_datetime(euro['Месяц'], errors='raise', dayfirst=True)
train['euro'] = train.apply(lambda x: euro[euro['Месяц'] == x['date1']]['Средний курс'].values[0], axis=1)
test['euro'] = test.apply(lambda x: euro[euro['Месяц'] == x['date1']]['Средний курс'].values[0], axis=1)

trainfull = train
testfull = test

train = train.drop(['id_bulk_sp', 'date1', 'id_bulk', 'value'], axis=1)
test = test.drop(['id_bulk_sp', 'date1', 'id_bulk'], axis=1)

cv = KFold(n_splits=5, shuffle=True, random_state=12)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




CPU times: user 22min 12s, sys: 527 ms, total: 22min 12s
Wall time: 22min 12s


In [96]:
euro = pd.read_csv('euro.csv',encoding='cp1251', sep=';')
euro['Средний курс'] = euro['Средний курс'].str.replace(',','.').astype(float)
euro['Месяц'] = pd.to_datetime(euro['Месяц'], errors='raise', dayfirst=True)
train['euro'] = train.apply(lambda x: euro[euro['Месяц'] == x['date1']]['Средний курс'].values[0], axis=1)
test['euro'] = test.apply(lambda x: euro[euro['Месяц'] == x['date1']]['Средний курс'].values[0], axis=1)
# euro.index = euro['Месяц']

In [38]:
train = trainfull
test = testfull

train = train.drop(['id_bulk_sp', 'date1', 'id_bulk', 'value'], axis=1)
test = test.drop(['id_bulk_sp', 'date1', 'id_bulk'], axis=1)

In [115]:
flat = pd.concat([flat,pd.get_dummies(flat.vid,prefix='vid')],axis=1)

In [116]:
flat.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,111350,111351,111352,111353,111354,111355,111356,111357,111358,111359
id_sec,846EA675-93FF-E411-8098-001EC9D56418,846EA675-93FF-E411-8098-001EC9D56418,846EA675-93FF-E411-8098-001EC9D56418,846EA675-93FF-E411-8098-001EC9D56418,846EA675-93FF-E411-8098-001EC9D56418,846EA675-93FF-E411-8098-001EC9D56418,846EA675-93FF-E411-8098-001EC9D56418,846EA675-93FF-E411-8098-001EC9D56418,846EA675-93FF-E411-8098-001EC9D56418,846EA675-93FF-E411-8098-001EC9D56418,...,0998511F-CA05-E811-893D-00505688958B,0998511F-CA05-E811-893D-00505688958B,0998511F-CA05-E811-893D-00505688958B,0998511F-CA05-E811-893D-00505688958B,0998511F-CA05-E811-893D-00505688958B,0998511F-CA05-E811-893D-00505688958B,0998511F-CA05-E811-893D-00505688958B,0998511F-CA05-E811-893D-00505688958B,0998511F-CA05-E811-893D-00505688958B,0998511F-CA05-E811-893D-00505688958B
floor,2,2,2,2,2,2,2,3,3,3,...,25,25,25,25,25,25,25,25,25,25
spalen,3,1,1,0,2,1,3,3,1,1,...,0,1,0,3,1,2,2,2,1,3
stage_number,1,2,3,4,5,6,7,1,2,3,...,4,5,6,7,8,9,10,11,12,13
square,85.9,34.2,33.7,22.4,57.5,34.2,85.9,85.9,34.2,33.7,...,20.3,35.9,20.3,73.6,42.3,60.2,56.7,56.3,39.5,93.3
balcon,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
otdelka,Не производится,Не производится,Не производится,Не производится,Не производится,Не производится,Не производится,Не производится,Не производится,Не производится,...,ПИК2-лоукост,ПИК2-лоукост,ПИК2-лоукост,ПИК2-лоукост,ПИК2-лоукост,ПИК2-лоукост,ПИК2-лоукост,ПИК2-лоукост,ПИК2-лоукост,ПИК2-лоукост
plan0,3M-1*,1S-1*,1S-2*,S-1*,2M-1,1S-1,3M-2,3M-1*,1S-1*,1S-2*,...,1NS1_3.6-2,1ES3_6.3-1,1NS1_3.6-2*,3KM24_10.5-1,1EL21_8.7-1,2EM7_10.5-1,2EM9_9.9-2,2EM7_9.9-2,1EM3*_6.9-1,3EL23_10.5-2
bulk_id,4B3B6D6A-93FF-E411-8098-001EC9D56418,4B3B6D6A-93FF-E411-8098-001EC9D56418,4B3B6D6A-93FF-E411-8098-001EC9D56418,4B3B6D6A-93FF-E411-8098-001EC9D56418,4B3B6D6A-93FF-E411-8098-001EC9D56418,4B3B6D6A-93FF-E411-8098-001EC9D56418,4B3B6D6A-93FF-E411-8098-001EC9D56418,4B3B6D6A-93FF-E411-8098-001EC9D56418,4B3B6D6A-93FF-E411-8098-001EC9D56418,4B3B6D6A-93FF-E411-8098-001EC9D56418,...,F07BF3C7-C905-E811-893D-00505688958B,F07BF3C7-C905-E811-893D-00505688958B,F07BF3C7-C905-E811-893D-00505688958B,F07BF3C7-C905-E811-893D-00505688958B,F07BF3C7-C905-E811-893D-00505688958B,F07BF3C7-C905-E811-893D-00505688958B,F07BF3C7-C905-E811-893D-00505688958B,F07BF3C7-C905-E811-893D-00505688958B,F07BF3C7-C905-E811-893D-00505688958B,F07BF3C7-C905-E811-893D-00505688958B
section,1,1,1,1,1,1,1,1,1,1,...,2,2,2,2,2,2,2,2,2,2


In [119]:
train['vid_средний'] = trainfull.apply(lambda x: flat[(flat.id_bulk_sp==x['id_bulk_sp'])\
                                            &(flat.sale < x['date1'])]['vid_средний'].mean(), axis=1)
train['vid_двор'] = trainfull.apply(lambda x: flat[(flat.id_bulk_sp==x['id_bulk_sp'])\
                                            &(flat.sale < x['date1'])]['vid_двор'].mean(), axis=1)
train['vid_хороший'] = trainfull.apply(lambda x: flat[(flat.id_bulk_sp==x['id_bulk_sp'])\
                                            &(flat.sale < x['date1'])]['vid_хороший'].mean(), axis=1)
train['vid_эконом'] = trainfull.apply(lambda x: flat[(flat.id_bulk_sp==x['id_bulk_sp'])\
                                            &(flat.sale < x['date1'])]['vid_эконом'].mean(), axis=1)

0       0.555556
1       0.166667
2       0.545455
3       0.000000
4       0.736842
5       0.194444
6       0.740741
7       0.058824
8       0.711538
9       0.254237
10      0.774194
11      0.125000
12      0.619048
13      0.283784
14      0.775000
15      0.137931
16      0.609091
17      0.319588
18      0.787234
19      0.162162
20           NaN
21      1.000000
22      1.000000
23           NaN
24      0.000000
25           NaN
26           NaN
27      0.000000
28      1.000000
29      1.000000
30      0.000000
31      0.000000
32      0.000000
33      0.000000
34      0.000000
35      1.000000
36      1.000000
37      0.000000
38      0.000000
39      0.000000
40      0.000000
41      0.000000
42      1.000000
43      1.000000
44      0.000000
45      0.000000
46      0.000000
47      0.000000
48      0.000000
49      1.000000
50      1.000000
51      0.000000
52      0.000000
53      0.000000
54      0.000000
55      0.000000
56      1.000000
57      1.000000
58      0.0000

In [53]:
mashatest = pd.read_csv('Masha/means/test.csv')
mashatrain = pd.read_csv('Masha/means/train.csv')
train2 = pd.concat([train, mashatrain], axis=1) 
test2 = pd.concat([test, mashatest], axis=1)

In [48]:
flat = pd.read_csv('flat_fix.csv', parse_dates=['sale','date_salestart', 'date_settle','flat_startsale'])
flat['id_bulk_sp'] = flat['bulk_id'].map(str)+'_'+flat['spalen'].astype(int).map(str)
# salestart = pd.to_datetime(pd.concat([test,train],ignore_index=False).reset_index(drop=True).groupby('id_bulk_sp')['date1'].min())
salestart = pd.to_datetime(flat.reset_index(drop=True).groupby('id_bulk_sp')['flat_startsale'].min())
train['dayfromstart'] = trainfull.apply(lambda x: float((x['date1'] - salestart[x['id_bulk_sp']]).days), axis=1)
test['dayfromstart'] = testfull.apply(lambda x: float((x['date1'] - salestart[x['id_bulk_sp']]).days), axis=1)

In [106]:
X_train, X_test, y_train, y_test = train_test_split(train1, y, test_size=0.2, random_state=1)
# trainfull1 = X_train
# X_train = X_train.drop(['id_bulk_sp', 'date1', 'id_bulk', 'value'], axis=1).reset_index(drop=True)
# X_test = X_test.drop(['id_bulk_sp', 'date1', 'id_bulk'], axis=1).reset_index(drop=True)

def postproccesing(output):
    output.columns = ['id', 'value']
    output.value = output.value.apply(lambda x: 0 if x<0 else x)
#     output['value'] = output.apply(lambda x: ((X_test.loc[x['id'],'mean_sq']*X_test.loc[x['id'],'flat_left']) 
#                                               if x['value']>(X_test.loc[x['id'],'mean_sq']*X_test.loc[x['id'],'flat_left']) 
#                                               else x['value']), axis=1)
#     df = pd.concat([testfull1, output.value], axis=1)
#     df['max'] = df.mean_sq*df.flat_left
#     merged = pd.concat([df.groupby('id_bulk_sp')['value'].sum(),df.groupby('id_bulk_sp')['max'].mean()], axis=1)

#     koef = merged[merged.value>merged['max']]['max']/merged[merged.value>merged['max']]['value']
#     df['value'] = df.apply(lambda x: x['value'] if x['id_bulk_sp'] not in koef else x['value']*koef[x['id_bulk_sp']], axis=1)

#     output = pd.DataFrame(data = df.value).reset_index()
#     output.columns = ['id', 'value']
#     output.to_csv('result.csv', index=False)
    return(output)

clf = XGBRegressor(n_estimators=400, learning_rate=0.08, gamma=1, subsample=0.75, colsample_bytree=1, max_depth=6, nthread=-1)
clf.fit(X_train,y_train)
pr = clf.predict(X_test)
output1 = pd.DataFrame(data = pr).reset_index()
# output1 = postproccesing(output)

cat = CatBoostRegressor(logging_level='Silent',n_estimators=1500)
cat.fit(X_train,y_train)
pr = cat.predict(X_test)
output2 = pd.DataFrame(data = pr).reset_index()
# output2 = postproccesing(output)

gbm = lgb.LGBMRegressor(objective='regression',
                        num_leaves=80,
                        max_depth = 6,
                        learning_rate=0.03,
                        n_estimators=2000,
                       nthread=-1)

gbm.fit(X_train,y_train)
pr = gbm.predict(X_test)
output3 = pd.DataFrame(data = pr).reset_index()

# output3 = postproccesing(output)
# df = pd.DataFrame(clf.feature_importances_,index=list(train.T.index))

# %matplotlib inline
# df[0].sort_values(ascending=False).plot(kind='bar',figsize=(20,10))

comparsion = pd.concat([output1[0], output2[0], output3[0], y_test.reset_index(drop=True)], axis=1)
comparsion.columns = ['xgb', 'cat', 'lgb', 'test']
comparsion['differ_xgb'] = comparsion.xgb - comparsion.test
comparsion['differ_cat'] = comparsion.cat - comparsion.test
comparsion['differ_lgb'] = comparsion.lgb - comparsion.test
comparsion['mean1'] = (comparsion.lgb*0.5+comparsion.xgb*0.5)
comparsion['mean2'] = (comparsion.cat+comparsion.xgb+comparsion.lgb)/3
# comparsion['differ_mean'] = abs(comparsion.mean2 - comparsion.test)

np.sqrt(mean_squared_error(comparsion['mean2'], y_test))

198.99019945915953

In [None]:
comparsion.sort_values('differ_ridge', ascending=False)

In [27]:
np.sqrt(mean_squared_error(comparsion['mean2'], y_test))

201.0973672120065

In [None]:
comparsion.loc[:,['mean2','test','differ_mean']].sort_values('differ_mean', ascending=False)

In [103]:
from sklearn.feature_selection import RFECV

gbm = lgb.LGBMRegressor(objective='regression',
                        num_leaves=80,
                        max_depth = 6,
                        learning_rate=0.03,
                        n_estimators=2000, nthread=-1)
# clf = XGBRegressor(n_estimators=400, learning_rate=0.08, gamma=1, subsample=0.75, colsample_bytree=1, max_depth=6, nthread=-1)
# train1 = train.fillna(0)
rfecv = RFECV(estimator=gbm, step=1, cv=cv, scoring='neg_mean_squared_error', n_jobs=-1)
rfecv.fit(train1, y)

print("Optimal number of features : %d" % rfecv.n_features_)

Optimal number of features : 67


In [104]:
np.sqrt(abs(rfecv.grid_scores_))

array([309.06000606, 299.73298927, 267.40745136, 257.26557161,
       243.34603692, 237.88662076, 228.89184035, 225.2179408 ,
       223.4124908 , 222.5647934 , 220.45658921, 215.99582419,
       214.8422561 , 213.24831611, 213.52338756, 214.82097166,
       212.12681576, 212.06813053, 210.80674628, 211.07379585,
       210.8382259 , 211.10700737, 210.58673501, 210.15359389,
       209.42146513, 209.76038018, 210.27888672, 209.69068231,
       210.75434351, 209.86404159, 210.85181186, 210.41475531,
       210.41505425, 209.2707632 , 210.42240312, 209.72864849,
       209.14795699, 209.30624742, 209.5912339 , 209.23851801,
       208.80791985, 208.58696431, 208.44754903, 208.47718186,
       208.85018539, 208.25802551, 208.44260822, 208.52279202,
       208.60692869, 208.91552396, 208.89136432, 208.28058018,
       208.94729614, 208.71346671, 208.90466326, 208.1928179 ,
       208.51794935, 209.22585625, 208.46296791, 208.04442815,
       208.43237193, 208.3201479 , 208.59548409, 208.69

In [None]:
ranks = rfecv.ranking_.tolist()
names = train.T.index.tolist()
ch = pd.DataFrame(data=ranks, columns=['ranks'])
ch['names'] = names
ch.sort_values('ranks', ascending=False)

In [None]:
laststatus = status.sort_values('datefrom').drop_duplicates(subset='id_flatwork', keep='last') 
laststatus.index = laststatus.id_flatwork
laststatus = laststatus[laststatus.stat_name=='Реализован']

sorteddates = pd.concat([flat, laststatus.datefrom], axis=1).loc[:,['date_settle','date_salestart','sale','datefrom']].dropna()

sorteddates['diffe'] = abs(sorteddates['sale']-sorteddates['datefrom'])

sorteddates[sorteddates.diffe>=pd.Timedelta('2 days')].sort_values('datefrom', ascending=False)

In [None]:
flat = pd.read_csv('flat.csv', parse_dates=['sale','date_salestart', 'date_settle'],encoding='cp1251')
price = pd.read_csv('price.csv',encoding='cp1251',parse_dates=['datefrom','dateto','date_salestart'])
# fixedsaledate = []
notinstatus = []
counter = 0
maximum = flat.sort_values('sale',ascending=False).iloc[0]['sale']
minimum = flat.sort_values('sale',ascending=True).iloc[0]['sale']
def flatgreatagain(x):
    global counter
    counter+=1
    if counter % 10000 == 0:
        print(str(counter))
    try:
        laststatus = status[status.id_flatwork==x['id_flatwork']].sort_values('dateto').iloc[-1]
    except:
        notinstatus.append(x['id_flatwork'])
        return(x['sale'])
    if (x['sale'].date().strftime("%Y-%m") == laststatus['datefrom'].date().strftime("%Y-%m"))\
       &(laststatus['stat_name']=='Реализован'):
        return(x['sale'])
    else:
        if laststatus['stat_name']=='Статус после покупки':
            return(x['sale'])
        elif laststatus['stat_name']=='Реализован':
#             fixedsaledate.append(x['id_flatwork'])
#             return(laststatus['datefrom'])
            return(x['sale'])
        elif laststatus['stat_name']=='Не реализуется':
            listofstrange.append(x['id_flatwork'])
            return(minimum)
        else:
            return(x['sale'])

fixedsale = flat.apply(flatgreatagain, axis=1)

In [18]:
laststatus = status.sort_values('datefrom').drop_duplicates(subset=['id_flatwork'],keep='last').reset_index(drop=True).drop('stat', axis=1)
flats = flat.loc[:,['id_flatwork','sale']].reset_index(drop=True)
flats.index=flats.id_flatwork
laststatus.index=laststatus.id_flatwork
result = pd.concat([flats, laststatus], axis=1).dropna()
result['differ'] = abs(result.datefrom-result.sale)

In [49]:
errors = result.drop('id_flatwork', axis=1).sort_values(['sale','differ'],ascending=False)\
[(result.stat_name=='Реализован')&(result['differ'] > pd.Timedelta('5 days'))]

  """Entry point for launching an IPython kernel.


In [50]:
errors.to_csv('errors.csv')

In [246]:
startper = testfull.date1.min()
test_first = testfull[testfull.date1==startper]
test_second = testfull[testfull.date1==startper+DateOffset(months=1)]
test_third = testfull[testfull.date1==startper+DateOffset(months=2)]

In [None]:
test_first['flat_left'] = test_first.apply(lambda x: len(flat[(flat.id_bulk_sp == x['id_bulk_sp'])&(flat.sale >= x['date1'])]), axis=1)
test_first['saledlastmonth'] = test_first.apply(lambda x: len(flat[(flat.id_bulk_sp == x['id_bulk_sp'])&(flat.sale < x['date1'])&(flat.sale >= x['date1'] - DateOffset(months=1))]), axis=1)
test_first['saledlast2month'] = test_first.apply(lambda x: len(flat[(flat.id_bulk_sp == x['id_bulk_sp'])&(flat.sale < x['date1'])&(flat.sale >= x['date1'] - DateOffset(months=2))]), axis=1)
test_first['saledlast3month'] = test_first.apply(lambda x: len(flat[(flat.id_bulk_sp == x['id_bulk_sp'])&(flat.sale < x['date1'])&(flat.sale >= x['date1'] - DateOffset(months=3))]), axis=1)

In [None]:
test_second['flat_left'] = test_first['flat_left'] - test_first['value']/test_first['mean_sq']
test_second['saledlastmonth'] = test_first['value']/test_first['mean_sq']

In [185]:
test = pd.read_csv('test.csv', parse_dates=['date1'])
test['id_bulk_sp'] = test['bulk_id'].map(str)+'_'+test['spalen'].map(str)
test.apply(lambda x: x['price'] - test[(test.id_bulk_sp == x['id_bulk_sp'])&(test.date1 == (x['date1'] - DateOffset(months=1)))]['price'].get(0,0))

In [108]:
train1 = train.fillna(0)
linreg = LinearRegression()
ridge = linear_model.Ridge(alpha = 0.5)
rf = RandomForestRegressor(n_estimators=1000, criterion='mse', n_jobs=-1)
print('Linear: '+str(np.sqrt(abs(np.mean(cross_val_score(linreg, train1, y, cv=cv, scoring='neg_mean_squared_error'))))))
print('RFReg: '+str(np.sqrt(abs(np.mean(cross_val_score(rf, train1, y, cv=cv, scoring='neg_mean_squared_error'))))))
# print('Ridge: '+str(np.sqrt(abs(np.mean(cross_val_score(ridge, train1, y, cv=cv, scoring='neg_mean_squared_error'))))))

Linear: 252.5586498556621
RFReg: 218.6779725916835


In [55]:
train1 = train2.fillna(0)
t = datetime.datetime.now()
xgb = XGBRegressor(n_estimators=500, learning_rate=0.08, gamma=1, subsample=0.75, max_depth=6, nthread=-1, loss_function='RMSE', random_state=59)
print(np.sqrt(abs(np.mean(cross_val_score(xgb, train1, y, cv=cv, scoring='neg_mean_squared_error',n_jobs=-1)))))
print((datetime.datetime.now() - t).total_seconds())

210.15652154404563
18.798876


In [107]:
train1 = train1.sort_index(axis=1)
test1 = test1.sort_index(axis=1)

def postproccesing(output):
    output.columns = ['id', 'value']
    output.value = output.value.apply(lambda x: 0 if x<0 else x)
    output['value'] = output.apply(lambda x: ((test.loc[x['id'],'mean_sq']*test.loc[x['id'],'flat_left']) 
                                              if x['value']>(test.loc[x['id'],'mean_sq']*test.loc[x['id'],'flat_left']) 
                                              else x['value']), axis=1)
    df = pd.concat([testfull, output.value], axis=1)
    df['max'] = df.mean_sq*df.flat_left
    merged = pd.concat([df.groupby('id_bulk_sp')['value'].sum(),df.groupby('id_bulk_sp')['max'].mean()], axis=1)

    koef = merged[merged.value>merged['max']]['max']/merged[merged.value>merged['max']]['value']
    df['value'] = df.apply(lambda x: x['value'] if x['id_bulk_sp'] not in koef else x['value']*koef[x['id_bulk_sp']], axis=1)

    output = pd.DataFrame(data = df.value).reset_index()
    output.columns = ['id', 'value']
#     output.to_csv('result.csv', index=False)
    return(output)

clf = XGBRegressor(n_estimators=390, learning_rate=0.08, gamma=1, subsample=0.75, colsample_bytree=1, max_depth=5)
clf.fit(train1,y)
pr = clf.predict(test1)
output = pd.DataFrame(data = pr).reset_index()
output1 = postproccesing(output)

cat = CatBoostRegressor(logging_level='Silent',n_estimators=1500)
cat.fit(train1,y)
pr = cat.predict(test1)
output = pd.DataFrame(data = pr).reset_index()
output2 = postproccesing(output)

gbm = lgb.LGBMRegressor(objective='regression',
                        num_leaves=80,
                        max_depth = 6,
                        learning_rate=0.03,
                        n_estimators=2000)

gbm.fit(train1,y)
pr = gbm.predict(test1)
output = pd.DataFrame(data = pr).reset_index()
output3 = postproccesing(output)

comparsion = pd.concat([output1.value, output2.value, output3.value], axis=1)
comparsion.columns = ['xgb', 'cat', 'lgb']
comparsion['mean1'] = (comparsion.cat*0.5+comparsion.xgb*0.5)
comparsion['mean2'] = (comparsion.cat+comparsion.xgb+comparsion.lgb)/3
output = pd.DataFrame(data = comparsion['mean2']).reset_index()
output.columns = ['id', 'value']
output.to_csv('result.csv', index=False)

In [None]:
df = pd.DataFrame(clf.feature_importances_,index=list(train.T.index))

%matplotlib inline
df[0].sort_values(ascending=False).plot(kind='bar',figsize=(20,10))

In [62]:
comparsion = pd.concat([output1.value, output2.value, output3.value], axis=1)
comparsion.columns = ['xgb', 'cat', 'lgb']
comparsion['mean1'] = (comparsion.cat*0.5+comparsion.xgb*0.5)
comparsion['mean2'] = (comparsion.cat+comparsion.xgb+comparsion.lgb)/3
output = pd.DataFrame(data = comparsion['mean1']).reset_index()
output.columns = ['id', 'value']
output.to_csv('result.csv', index=False)

In [398]:
train = train.sort_index(axis=1)
test = test.sort_index(axis=1)

In [113]:
output = pd.DataFrame(data = comparsion['mean2']).reset_index()
output.columns = ['id', 'value']
output.to_csv('result.csv', index=False)

In [None]:
205.92 - LB 187.6409
205.71 - 

In [None]:
xgb = XGBRegressor()
cat = CatBoostRegressor(logging_level='Silent')
parameters = { 
#               'min_child_weight': [1, 5, 10],
#               'gamma': [0,0.5, 1, 1.5, 2, 5],
#               'subsample': [0.6, 0.8, 1.0],
#               'colsample_bytree': [0.6, 1, 1.2],
#               'learning_rate':[0.02,0.06,0.10],
              'n_estimators':[390,395,400],
              'max_depth': [4,5]
              }
clf = GridSearchCV(xgb, parameters, scoring='neg_mean_squared_error', cv=cv,return_train_score=False, n_jobs=-1)
clf.fit(train, y)
gradresults = pd.DataFrame(clf.cv_results_)
gradresults.sort_values('rank_test_score', axis=0).T

In [None]:
t = datetime.datetime.now()
gbm = lgb.LGBMRegressor(objective='regression',
                        num_leaves=80,
                        max_depth = 12,
                        learning_rate=0.05,
                        n_estimators=400)                       
print(np.sqrt(abs(np.mean(cross_val_score(gbm, train, y, cv=cv, scoring='neg_mean_squared_error')))))
print((datetime.datetime.now() - t).total_seconds())

t = datetime.datetime.now()
cat = CatBoostRegressor(n_estimators=1500, logging_level='Silent')
print(np.sqrt(abs(np.mean(cross_val_score(cat, train, y, cv=cv, scoring='neg_mean_squared_error')))))
print((datetime.datetime.now() - t).total_seconds())

In [166]:
t = datetime.datetime.now()
cat = CatBoostRegressor(n_estimators=2500, logging_level='Silent',loss_function='RMSE')
print(np.sqrt(abs(np.mean(cross_val_score(cat, train, y, cv=cv, scoring='neg_mean_squared_error')))))
print((datetime.datetime.now() - t).total_seconds())

207.3920863516101
110.311268
