In [1]:
import pandas as pd, numpy as np

In [2]:
pd.options.display.max_columns = 99
from collections import Counter
import re
import xml.etree.ElementTree as ET
from tqdm import tqdm

In [3]:
from sklearn.externals import joblib

In [4]:
from scipy.stats import gmean
from datetime import datetime, timedelta

In [48]:
from sklearn.model_selection import KFold, ShuffleSplit
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import LabelEncoder
# from sklearn.linear_model import Au
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
import xgboost as xgb

In [6]:
import matplotlib.pyplot as plt
%matplotlib inline

In [7]:
from collections import Iterable

class LabelCountEncoder(object):
    """
    Class to encode labels from most to least frequent
    """
    
    def __init__(self, most_freq_labels=None):
        if issubclass(type(most_freq_labels), Iterable) and not issubclass(type(most_freq_labels), str):
            self.mfl = list(most_freq_labels)
        else:
            self.mfl = [most_freq_labels]
        
    def fit_transform(self, X):
        lbls, inv, cnts = np.unique(X, return_inverse=True, return_counts=True)
        inf = np.iinfo(np.int32).max
        cnts[np.isin(lbls, self.mfl)] = inf
        self.sorted_idx = np.argsort(-cnts)
        self.inv_sorted_idx = np.argsort(self.sorted_idx)
        self.invt = lbls[self.sorted_idx]
        return self.inv_sorted_idx[inv]
    
    def inverse_transform(self, y):
        return self.invt[y]

In [8]:
train = pd.read_csv("./data/train.csv", parse_dates=[0, 5, 6, 7, 8]) 

In [9]:
test = pd.read_csv("./data/test.csv", parse_dates=[0, 5, 6])

# Dataset processing

In [10]:
train = train.append(test, ignore_index=True, sort=False)

In [11]:
# train
target = train["Задержка отправления в минутах"]
train = train.drop(["Дата рейса", "Время отправления фактическое", "Время прибытия фактическое", #"1 КЗ Код", 
                    "Задержка отправления в минутах"], axis=1)

In [12]:
col = "Время отправления по расписанию"
train[col + "_year"] = train[col].dt.year
train[col + "_month"] = train[col].dt.month
train[col + "_day"] = train[col].dt.day
train[col + "_hour"] = train[col].dt.hour
train[col + "_minute"] = train[col].dt.minute
train[col + "_dow"] = train[col].dt.dayofweek
train[col + "_epoch"] = train[col].astype(np.int64) // 1000000000

col = "Время прибытия по расписанию"
train[col + "_year"] = train[col].dt.year
train[col + "_month"] = train[col].dt.month
train[col + "_day"] = train[col].dt.day
train[col + "_hour"] = train[col].dt.hour
train[col + "_minute"] = train[col].dt.minute
train[col + "_dow"] = train[col].dt.dayofweek
train[col + "_epoch"] = train[col].astype(np.int64) // 1000000000

In [13]:
train["arrival_diff"] = (train["Время прибытия по расписанию"] - train["Время отправления по расписанию"]).dt.seconds

In [14]:
def parse_xml(root):
    
    keys = ["url", "price", "currencyId", "categoryId", "picture", "vendor", "vendorCode", "model"]
    
    result = {}
    for airport in root.getchildren():
        res_airp = {}
        res_airp["type"] = airport.attrib['type']
        res_airp["season"] = airport.attrib['season']
        res_airp["town"] = airport.find("town").attrib['town']
        res_airp["country"] = airport.find("town").attrib['country']
        res_airp["latitude"] = airport.find("coordinates").find('latitude').attrib['value']
        res_airp["longitude"] = airport.find("coordinates").find('longitude').attrib['value']
        if 'value' in airport.find("size").attrib:
            res_airp["size_value"] = airport.find("size").attrib['value']
        else: 
            res_airp["size_value"] = "Medium"
        
        if 'pax' in airport.find("size").attrib:
            res_airp["size_pax"] = int(airport.find("size").attrib['pax'])
        else:
            res_airp["size_pax"] = np.NaN
            
        terminals = airport.find("terminals").getchildren()
        
        res_airp["num_terminals"] = terminals.__len__()
        res_airp["num_gates"] = sum([int(t.attrib['gates']) for t in terminals])
        
        runways = airport.find("runways").getchildren()
        res_airp["num_runways"] = runways.__len__()
        c = Counter([r.attrib['surface'] for r in runways])
        if (c.__len__() > 0):
            res_airp["runway_material"] = c.most_common(1)[0][0]
        else:
            res_airp["runway_material"] = np.NaN
        res_airp["runway_len"] = np.mean([int(r.attrib['length']) for r in runways])
        result[airport.attrib['iata']] = res_airp
    return result

In [15]:
def parse_lat(latitude):
    N = 'N' in latitude
    ch_arr = [str(i) for i in range(10)]
    while(latitude[-1] not in ch_arr):
        latitude = latitude[:-1]
        
    rms = lambda x: re.sub("\D", '', x)
    
    d = float(rms(latitude.split('°')[0]))
    m, s = latitude.split('°')[1].split("''")[0].split("'")
    m, s = float(rms(m)), float(rms(s))
    ret = (d + m / 60. + s / 3600.) * (1 if N else -1)
    return ret

def parse_lon(longitude):
    W = 'W' in longitude
    ch_arr = [str(i) for i in range(10)]
    while(longitude[-1] not in ch_arr):
        longitude = longitude[:-1]
    
    rms = lambda x: re.sub("\D", '', x)
    d = float(rms(longitude.split('°')[0]))
    m, s = longitude.split('°')[1].split("''")[0].split("'")
    
    m, s = float(rms(m)), float(rms(s))
    ret = (d + m / 60. + s / 3600.) * (1 if W else -1)
    return ret

In [17]:
etree = ET.parse("./data/airports.xml") #create an ElementTree object 
root = etree.getroot()
airport_data = parse_xml(root)
etree = ET.parse("./data/airports_asia.xml") #create an ElementTree object 
root = etree.getroot()
airport_data.update(parse_xml(root))
etree = ET.parse("./data/airports_africa.xml") #create an ElementTree object 
root = etree.getroot()
airport_data.update(parse_xml(root))
etree = ET.parse("./data/airports_south_america.xml") #create an ElementTree object 
root = etree.getroot()
airport_data.update(parse_xml(root))
etree = ET.parse("./data/airports_north_america.xml") #create an ElementTree object 
root = etree.getroot()
airport_data.update(parse_xml(root))

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [18]:
df_airport = pd.DataFrame(airport_data).transpose()
df_airport.loc[df_airport.size_pax.isna(), "size_pax"] = 960
df_airport.size_pax = df_airport.size_pax.astype(np.int32)
df_airport.loc[df_airport.runway_len.isna(), "runway_len"] = 0
df_airport.loc[df_airport.runway_len.isna(), "runway_material"] = "Asphalt"
df_airport.loc[:, ["num_gates", "num_runways", "num_terminals", "runway_len"]] = \
    df_airport[["num_gates", "num_runways", "num_terminals", "runway_len"]].astype(np.int32)
df_airport.loc[df_airport.longitude == "109'57'48''E", "longitude"] = "109°57'48''E"

In [19]:
df_airport.loc[:, "latitude"] = df_airport.latitude.apply(parse_lat)
df_airport.loc[:, "longitude"] = df_airport.longitude.apply(parse_lon)

In [20]:
mtrain = train.join(df_airport, on="А/П отправл", rsuffix="departure").join(df_airport, on="А/П прибыт", rsuffix="_arrival")

In [21]:
cols = ["Рейс", "А/П отправл", "А/П прибыт", "Номер ВС"]
lces = dict(zip(cols, [LabelCountEncoder() for i in range(len(cols))]))

In [22]:
for k, v in lces.items():
    mtrain[k + "_lce"] = lces[k].fit_transform(mtrain[k])

In [23]:
mtrain

Unnamed: 0,Рейс,А/П отправл,А/П прибыт,Номер ВС,Время отправления по расписанию,Время прибытия по расписанию,1 КЗ Код,Время отправления по расписанию_year,Время отправления по расписанию_month,Время отправления по расписанию_day,Время отправления по расписанию_hour,Время отправления по расписанию_minute,Время отправления по расписанию_dow,Время отправления по расписанию_epoch,Время прибытия по расписанию_year,Время прибытия по расписанию_month,Время прибытия по расписанию_day,Время прибытия по расписанию_hour,Время прибытия по расписанию_minute,Время прибытия по расписанию_dow,Время прибытия по расписанию_epoch,arrival_diff,country,latitude,longitude,num_gates,num_runways,num_terminals,runway_len,runway_material,season,size_pax,size_value,town,type,country_arrival,latitude_arrival,longitude_arrival,num_gates_arrival,num_runways_arrival,num_terminals_arrival,runway_len_arrival,runway_material_arrival,season_arrival,size_pax_arrival,size_value_arrival,town_arrival,type_arrival,Рейс_lce,А/П отправл_lce,А/П прибыт_lce,Номер ВС_lce
0,387,SVO,HAV,127,2015-10-27 07:40:00,2015-10-27 20:45:00,,2015,10,27,7,40,1,1445931600,2015,10,27,20,45,1,1445978700,47100,149,55.971667,-37.415000,28,2,1,3625,Concrete,AllYear,29256,Large,Moscow,LongHaulInternational,198,23.016667,82.383056,17,1,1,4000,Asphalt,AllYear,4119,Medium,Havana,LongHaulInternational,452,0,109,196
1,1,SVO,JFK,235,2015-10-27 09:50:00,2015-10-27 20:35:00,03.608,2015,10,27,9,50,1,1445939400,2015,10,27,20,35,1,1445978100,38700,149,55.971667,-37.415000,28,2,1,3625,Concrete,AllYear,29256,Large,Moscow,LongHaulInternational,122,40.639722,73.778889,137,4,7,3372,Asphalt,AllYear,50423,Verylarge,"New York, NY",LongHaulInternational,134,0,46,174
2,37,SVO,MIA,194,2015-10-27 10:45:00,2015-10-27 23:35:00,,2015,10,27,10,45,1,1445942700,2015,10,27,23,35,1,1445988900,46200,149,55.971667,-37.415000,28,2,1,3625,Concrete,AllYear,29256,Large,Moscow,LongHaulInternational,122,25.793056,80.290556,120,4,3,3159,Asphalt,Summer,40500,Verylarge,"Miami, FL",LongHaulInternational,669,0,124,187
3,29,SVO,LAX,196,2015-10-27 12:30:00,2015-10-28 01:20:00,,2015,10,27,12,30,1,1445949000,2015,10,28,1,20,2,1445995200,46200,149,55.971667,-37.415000,28,2,1,3625,Concrete,AllYear,29256,Large,Moscow,LongHaulInternational,122,33.942500,118.408056,112,4,9,3230,Concrete,AllYear,66667,Largest,"Los Angeles, CA",LongHaulInternational,101,0,101,190
4,671,OTP,SVO,18,2015-10-27 14:15:00,2015-10-27 16:40:00,55,2015,10,27,14,15,1,1445955300,2015,10,27,16,40,1,1445964000,8700,147,44.573611,-26.103333,24,2,1,3500,Asphalt,AllYear,7643,Medium,Bucharest,LongHaulInternational,149,55.971667,-37.415000,28,2,1,3625,Concrete,AllYear,29256,Large,Moscow,LongHaulInternational,215,90,0,122
5,845,HAM,SVO,164,2015-10-27 14:30:00,2015-10-27 17:15:00,93,2015,10,27,14,30,1,1445956200,2015,10,27,17,15,1,1445966100,9900,1002,53.630278,-9.988056,25,2,1,3458,Asphalt,AllYear,13558,Large,Hamburg,LongHaulInternational,149,55.971667,-37.415000,28,2,1,3625,Concrete,AllYear,29256,Large,Moscow,LongHaulInternational,18,84,0,53
6,15,SVO,JFK,241,2015-10-27 14:35:00,2015-10-28 01:25:00,,2015,10,27,14,35,1,1445956500,2015,10,28,1,25,2,1445995500,39000,149,55.971667,-37.415000,28,2,1,3625,Concrete,AllYear,29256,Large,Moscow,LongHaulInternational,122,40.639722,73.778889,137,4,7,3372,Asphalt,AllYear,50423,Verylarge,"New York, NY",LongHaulInternational,222,0,46,185
7,1239,DXB,SVO,255,2015-10-27 15:40:00,2015-10-27 21:20:00,03.608,2015,10,27,15,40,1,1445960400,2015,10,27,21,20,1,1445980800,20400,156,25.254722,-55.364167,101,2,2,4223,Asphalt,AllYear,66431,Largest,Dubai,LongHaulInternational,149,55.971667,-37.415000,28,2,1,3625,Concrete,AllYear,29256,Large,Moscow,LongHaulInternational,541,73,0,160
8,478,SVO,VVO,203,2015-10-27 16:10:00,2015-10-28 00:35:00,,2015,10,27,16,10,1,1445962200,2015,10,28,0,35,2,1445992500,30300,149,55.971667,-37.415000,28,2,1,3625,Concrete,AllYear,29256,Large,Moscow,LongHaulInternational,149,43.399167,-132.151389,16,4,1,1893,Asphalt,AllYear,1853,Small,Vladivostok,LongHaulInternational,291,0,63,143
9,1227,TLV,SVO,200,2015-10-27 16:45:00,2015-10-27 20:55:00,,2015,10,27,16,45,1,1445964300,2015,10,27,20,55,1,1445979300,15000,128,32.011389,-34.886667,30,3,1,2984,Asphalt,AllYear,12978,Large,Tel Aviv,LongHaulInternational,149,55.971667,-37.415000,28,2,1,3625,Concrete,AllYear,29256,Large,Moscow,LongHaulInternational,338,16,0,262


In [24]:
cat_train = mtrain.drop(["Время отправления по расписанию", "Время прибытия по расписанию", "1 КЗ Код"], axis=1)

In [28]:
cat_train[["country_arrival", "country"]] = cat_train[["country_arrival", "country"]].astype(np.int32)

In [29]:
icats = [i for i, dtt in enumerate(cat_train.dtypes) if not np.issubdtype(dtt, np.number)]
icats.extend([0, 3, 9, 16, 19, 32, 45, 46, 47, 48])
icats = np.sort(icats)

# Additional dataset processing

In [31]:
def generate_mean_encoding(column, target):
    column_name = column.name
    return pd.concat([column, target],
                     axis='columns').groupby(column_name).mean()[target.name]

In [33]:
%%time
encodings_mean = [generate_mean_encoding(cat_train.iloc[:, i], target).fillna(0) for i in icats]
encodings_VC = [cat_train.iloc[:, i].value_counts() for i in icats]
encodings_lce = [LabelCountEncoder().fit_transform(cat_train.iloc[:, i]) for i in icats]
encodings_le = [LabelEncoder().fit_transform(cat_train.iloc[:, i]) for i in icats]

CPU times: user 11 s, sys: 27.2 ms, total: 11 s
Wall time: 11 s


In [34]:
%%time
r = [cat_train.iloc[:, col].apply(lambda x: encodings_VC[i].get(x)) for i, col in enumerate(icats)]

CPU times: user 2min 11s, sys: 207 ms, total: 2min 11s
Wall time: 2min 11s


In [35]:
%%time
z = [cat_train.iloc[:, col].apply(lambda x: encodings_mean[i].get(x)) for i, col in enumerate(icats)]

CPU times: user 1min 40s, sys: 236 ms, total: 1min 40s
Wall time: 1min 40s


In [37]:
data = pd.DataFrame(np.array(encodings_le).T, columns = cat_train.columns[icats])
data = data.join(pd.DataFrame(np.array(encodings_lce).T, columns = cat_train.columns[icats]), rsuffix="_lce")
data = data.join(pd.concat(r, axis=1), rsuffix="_VC", lsuffix="_lc")
data = data.join(pd.concat(z, axis=1), rsuffix="_ME", lsuffix="_lc")
data = data.join(cat_train.iloc[:, list(set(np.arange(cat_train.shape[1])) - set(icats))])

In [41]:
lx = train.shape[0] - test.shape[0]

# Extra trees (best one model* solution)
*blending of folds of same model, not different

In [43]:
X, y = data.values[:lx], target.values[:lx]

In [44]:
kf = KFold(n_splits=5, shuffle=True, random_state=1234)

In [49]:
regrs = []
predictions = []
for idx_tr, idx_te in kf.split(X):
    regressor = ExtraTreesRegressor(n_estimators=1000, n_jobs=-1, max_depth=7, random_state=142, verbose=True,
                                   bootstrap=True)
    regressor.fit(X[idx_tr], y[idx_tr])
    prediction = regressor.predict(X[idx_te])
    print(np.sqrt(mean_squared_error(y[idx_te], prediction)))
    predictions.append(prediction)
    regrs.append(regressor)

[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   15.0s finished
[Parallel(n_jobs=10)]: Done   2 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=10)]: Done  10 out of  10 | elapsed:    0.0s finished


38.98980437461934


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   15.0s finished
[Parallel(n_jobs=10)]: Done   2 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=10)]: Done  10 out of  10 | elapsed:    0.0s finished


38.146252661063144


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   14.9s finished
[Parallel(n_jobs=10)]: Done   2 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=10)]: Done  10 out of  10 | elapsed:    0.0s finished


39.2119591113676


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   15.3s finished
[Parallel(n_jobs=10)]: Done   2 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=10)]: Done  10 out of  10 | elapsed:    0.0s finished


37.50810080841932


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   15.3s finished


38.744890208083774


[Parallel(n_jobs=10)]: Done   2 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=10)]: Done  10 out of  10 | elapsed:    0.0s finished


In [51]:
ans = np.zeros((5, data.values[lx:].shape[0]))
for i, regr in enumerate(regrs):
    pred = regr.predict(data.values[lx:])
    ans[i] = pred

[Parallel(n_jobs=10)]: Done   2 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=10)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=10)]: Done   2 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=10)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=10)]: Done   2 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=10)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=10)]: Done   2 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=10)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=10)]: Done   2 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=10)]: Done  10 out of  10 | elapsed:    0.0s finished


In [54]:
xtrees_1000 = gmean(ans)
res_df = pd.DataFrame(xtrees_1000, columns=["Задержка отправления в минутах"])
res_df.to_csv("./out/gmean_xtrees_1000_kf.csv", index=True, index_label='index')

# Catboost

In [76]:
from catboost import CatBoostRegressor, Pool

In [78]:
X, y = cat_train.values[:lx], target.values[:lx]

In [83]:
cat_params = dict(bootstrap_type='Bernoulli', one_hot_max_size=8, loss_function='RMSE',
                     max_depth=8, learning_rate=0.1, n_estimators=1000, reg_lambda=1., verbose=True,
                  thread_count=6, subsample=1., rsm=1., use_best_model=True)

In [84]:
idx_tr, idx_te = next(ShuffleSplit(test_size=0.2, random_state=1234).split(X, y))

In [85]:
regrs = []
predictions = []
for idx_tr, idx_te in kf.split(X):
    cbr = CatBoostRegressor(**cat_params)
    cbr.fit(X[idx_tr], y[idx_tr], icats, eval_set=(X[idx_te], y[idx_te]))
    prediction = cbr.predict(X[idx_te])
    print(np.sqrt(mean_squared_error(target[idx_te], prediction)))
    predictions.append(prediction)
    regrs.append(cbr)

0:	learn: 44.6774003	test: 46.0074246	best: 46.0074246 (0)	total: 592ms	remaining: 5.33s
1:	learn: 43.8235622	test: 45.1411147	best: 45.1411147 (1)	total: 1.19s	remaining: 4.76s
2:	learn: 43.0488745	test: 44.3464089	best: 44.3464089 (2)	total: 1.82s	remaining: 4.24s
3:	learn: 42.4125611	test: 43.6661250	best: 43.6661250 (3)	total: 2.34s	remaining: 3.51s
4:	learn: 41.8732890	test: 43.0695110	best: 43.0695110 (4)	total: 2.89s	remaining: 2.89s
5:	learn: 41.4174191	test: 42.6050141	best: 42.6050141 (5)	total: 3.45s	remaining: 2.3s
6:	learn: 41.0503106	test: 42.2432152	best: 42.2432152 (6)	total: 4.04s	remaining: 1.73s
7:	learn: 40.7245944	test: 41.8898353	best: 41.8898353 (7)	total: 4.54s	remaining: 1.14s
8:	learn: 40.4705610	test: 41.6322666	best: 41.6322666 (8)	total: 5.02s	remaining: 558ms
9:	learn: 40.2265834	test: 41.4023241	best: 41.4023241 (9)	total: 5.56s	remaining: 0us

bestTest = 41.40232415
bestIteration = 9

41.40232414751297
0:	learn: 45.2937785	test: 43.2427350	best: 43.24273

In [86]:
ans = np.zeros((5, data.values[lx:].shape[0]))
for i, regr in enumerate(regrs):
    pred = regr.predict(data.values[lx:])
    ans[i] = pred

In [87]:
catboost_default = gmean(np.clip(ans, 0.01, None))
res_df = pd.DataFrame(catboost_default, columns=["Задержка отправления в минутах"])
res_df.to_csv("./out/catboost_default_kf.csv", index=True, index_label='index')

# Random forest

In [90]:
X, y = data.values[:lx], target.values[:lx]

In [168]:
regrs = []
predictions = []
for idx_tr, idx_te in kf.split(X):
    regressor = RandomForestRegressor(n_estimators=500, n_jobs=9, max_depth=7, random_state=42, verbose=True)
    regressor.fit(X[idx_tr], y[idx_tr])
    prediction = regressor.predict(X[idx_te])
    print(np.sqrt(mean_squared_error(y[idx_te], prediction)))
    predictions.append(prediction)
    regrs.append(regressor)

[Parallel(n_jobs=9)]: Done  32 tasks      | elapsed:  1.4min
[Parallel(n_jobs=9)]: Done 182 tasks      | elapsed:  7.5min
[Parallel(n_jobs=9)]: Done 432 tasks      | elapsed: 17.5min
[Parallel(n_jobs=9)]: Done 500 out of 500 | elapsed: 20.0min finished
[Parallel(n_jobs=9)]: Done  32 tasks      | elapsed:    0.0s
[Parallel(n_jobs=9)]: Done 182 tasks      | elapsed:    0.3s
[Parallel(n_jobs=9)]: Done 432 tasks      | elapsed:    0.6s
[Parallel(n_jobs=9)]: Done 500 out of 500 | elapsed:    0.7s finished


38.67325240387334


[Parallel(n_jobs=9)]: Done  32 tasks      | elapsed:  1.4min
[Parallel(n_jobs=9)]: Done 182 tasks      | elapsed:  7.1min
[Parallel(n_jobs=9)]: Done 432 tasks      | elapsed: 16.5min
[Parallel(n_jobs=9)]: Done 500 out of 500 | elapsed: 19.1min finished
[Parallel(n_jobs=9)]: Done  32 tasks      | elapsed:    0.0s
[Parallel(n_jobs=9)]: Done 182 tasks      | elapsed:    0.2s
[Parallel(n_jobs=9)]: Done 432 tasks      | elapsed:    0.6s
[Parallel(n_jobs=9)]: Done 500 out of 500 | elapsed:    0.7s finished


38.10449966876701


[Parallel(n_jobs=9)]: Done  32 tasks      | elapsed:  1.4min
[Parallel(n_jobs=9)]: Done 182 tasks      | elapsed:  7.0min
[Parallel(n_jobs=9)]: Done 432 tasks      | elapsed: 16.3min
[Parallel(n_jobs=9)]: Done 500 out of 500 | elapsed: 18.7min finished
[Parallel(n_jobs=9)]: Done  32 tasks      | elapsed:    0.0s
[Parallel(n_jobs=9)]: Done 182 tasks      | elapsed:    0.2s
[Parallel(n_jobs=9)]: Done 432 tasks      | elapsed:    0.6s
[Parallel(n_jobs=9)]: Done 500 out of 500 | elapsed:    0.7s finished


38.899669943088064


[Parallel(n_jobs=9)]: Done  32 tasks      | elapsed:  1.3min
[Parallel(n_jobs=9)]: Done 182 tasks      | elapsed:  6.9min
[Parallel(n_jobs=9)]: Done 432 tasks      | elapsed: 16.2min
[Parallel(n_jobs=9)]: Done 500 out of 500 | elapsed: 18.5min finished
[Parallel(n_jobs=9)]: Done  32 tasks      | elapsed:    0.0s
[Parallel(n_jobs=9)]: Done 182 tasks      | elapsed:    0.3s
[Parallel(n_jobs=9)]: Done 432 tasks      | elapsed:    0.6s
[Parallel(n_jobs=9)]: Done 500 out of 500 | elapsed:    0.7s finished


37.34021279239205


[Parallel(n_jobs=9)]: Done  32 tasks      | elapsed:  1.4min
[Parallel(n_jobs=9)]: Done 182 tasks      | elapsed:  6.8min
[Parallel(n_jobs=9)]: Done 432 tasks      | elapsed: 16.2min
[Parallel(n_jobs=9)]: Done 500 out of 500 | elapsed: 18.6min finished
[Parallel(n_jobs=9)]: Done  32 tasks      | elapsed:    0.0s
[Parallel(n_jobs=9)]: Done 182 tasks      | elapsed:    0.2s


38.50316779747674


[Parallel(n_jobs=9)]: Done 432 tasks      | elapsed:    0.6s
[Parallel(n_jobs=9)]: Done 500 out of 500 | elapsed:    0.7s finished


In [169]:
ans = np.zeros((5, data.values[lx:].shape[0]))
for i, regr in enumerate(regrs):
    pred = regr.predict(data.values[lx:])
    ans[i] = pred

[Parallel(n_jobs=9)]: Done  32 tasks      | elapsed:    0.0s
[Parallel(n_jobs=9)]: Done 182 tasks      | elapsed:    0.1s
[Parallel(n_jobs=9)]: Done 432 tasks      | elapsed:    0.1s
[Parallel(n_jobs=9)]: Done 500 out of 500 | elapsed:    0.1s finished
[Parallel(n_jobs=9)]: Done  32 tasks      | elapsed:    0.0s
[Parallel(n_jobs=9)]: Done 182 tasks      | elapsed:    0.1s
[Parallel(n_jobs=9)]: Done 432 tasks      | elapsed:    0.1s
[Parallel(n_jobs=9)]: Done 500 out of 500 | elapsed:    0.1s finished
[Parallel(n_jobs=9)]: Done  32 tasks      | elapsed:    0.0s
[Parallel(n_jobs=9)]: Done 182 tasks      | elapsed:    0.1s
[Parallel(n_jobs=9)]: Done 432 tasks      | elapsed:    0.1s
[Parallel(n_jobs=9)]: Done 500 out of 500 | elapsed:    0.1s finished
[Parallel(n_jobs=9)]: Done  32 tasks      | elapsed:    0.0s
[Parallel(n_jobs=9)]: Done 182 tasks      | elapsed:    0.1s
[Parallel(n_jobs=9)]: Done 432 tasks      | elapsed:    0.1s
[Parallel(n_jobs=9)]: Done 500 out of 500 | elapsed:    0.

In [None]:
rf_wide = gmean(ans)
res_df = pd.DataFrame(rf_wide, columns=["Задержка отправления в минутах"])
res_df.to_csv("./out/gmean_big_rf_wide_all_encoded_features.csv", index=True, index_label='index')