In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score, KFold
from sklearn.model_selection import learning_curve
from sklearn.model_selection import validation_curve
from sklearn.metrics import auc, make_scorer, roc_auc_score
from geopy.distance import vincenty
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import mean_absolute_error, mean_squared_error
import xgboost as xgb
import pickle

from hyperopt import hp
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

import tqdm
import time

sns.set_context("notebook")
plt.style.use('ggplot')
plt.rcParams['figure.figsize']=(10,5)
%matplotlib inline

In [2]:
data_train = pd.read_csv('CAX_TrainingData_McK.csv')
data_test = pd.read_csv('CAX_TestData_McK.csv')

In [3]:
data_train.head(2)

Unnamed: 0,offer_gk,weekday_key,hour_key,driver_gk,order_gk,driver_latitude,driver_longitude,origin_order_latitude,origin_order_longitude,distance_km,duration_min,offer_class_group,ride_type_desc,driver_response
0,1105373,5,20,6080,174182,55.818842,37.334562,55.814567,37.35501,-1.0,-1.0,Economy,private,0
1,759733,5,14,6080,358774,55.805342,37.515023,55.819329,37.466398,18.802,25.217,Standard,private,1


Убираем пропуски для трейна

In [4]:
data_train_copy = data_train.copy()
geocolumns = data_train.loc[:, 'driver_latitude':'origin_order_longitude'].columns
for column in geocolumns:
    data_train_copy[column].replace(-1, np.nan, inplace = True)
    data_train_copy[column].replace(0, np.nan, inplace = True)
data_origin_train = data_train_copy.loc[:, ['origin_order_latitude', 'origin_order_longitude']]
data_driver_train = data_train_copy.loc[:, ['driver_latitude', 'driver_longitude']]
data_origin_train.dropna(inplace = True)
data_driver_train.dropna(inplace = True)
data_train_copy.shape, data_origin_train.shape, data_driver_train.shape

((892557, 14), (892379, 2), (889776, 2))

Убираем пропуски для теста

In [5]:
data_test_copy = data_test.copy()
geocolumns = data_test.loc[:, 'driver_latitude':'origin_order_longitude'].columns
for column in geocolumns:
    data_test_copy[column].replace(-1, np.nan, inplace = True)
    data_test_copy[column].replace(0, np.nan, inplace = True)
data_origin_test = data_test_copy.loc[:, ['origin_order_latitude', 'origin_order_longitude']]
data_driver_test = data_test_copy.loc[:, ['driver_latitude', 'driver_longitude']]
data_origin_test.dropna(inplace = True)
data_driver_test.dropna(inplace = True)
data_test_copy.shape, data_origin_test.shape, data_driver_test.shape

((237813, 14), (237767, 2), (236929, 2))

In [6]:
data1 = pd.read_csv('10k_w_districts_1.csv', encoding = 'utf-8') #НИКИТЕ: вот с этого момента запускай

KNN для данных origin без пропусков

In [7]:
knn = KNeighborsClassifier(n_neighbors=3, n_jobs=-1,weights='distance')
data_t = data1[data1['part'] != 'UNKNOWN']
knn.fit(X=data_t[['origin_order_latitude', 'origin_order_longitude']], y=data_t.part)
predict_train_origin = knn.predict(X=data_origin_train[['origin_order_latitude', 'origin_order_longitude']])
predict_test_origin = knn.predict(X=data_origin_test[['origin_order_latitude', 'origin_order_longitude']])

In [8]:
data_origin_train['district_origin'] = predict_train_origin
data_origin_test['district_origin'] = predict_test_origin

In [9]:
data_train_copy['district_origin'] = data_origin_train['district_origin']
data_test_copy['district_origin'] = data_origin_test['district_origin']

KNN для данных driver без пропусков

In [10]:
knn = KNeighborsClassifier(n_neighbors=3, n_jobs=-1,weights='distance')
data_t = data1[data1['part_d'] != 'UNKNOWN']
knn.fit(X=data_t[['driver_latitude', 'driver_longitude']], y=data_t['part_d'])
predict_train_driver = knn.predict(X=data_driver_train[['driver_latitude', 'driver_longitude']])
predict_test_driver = knn.predict(X=data_driver_test[['driver_latitude', 'driver_longitude']])

# knn = KNeighborsClassifier(n_neighbors=3, n_jobs=-1,weights='distance')
# data_t = data1[data1['part'] != 'UNKNOWN']
# knn.fit(X=data_t[['driver_latitude', 'driver_longitude']], y=data_t.part_d)
# predict_train = knn.predict(X=data_train[['driver_latitude', 'driver_longitude']])
# predict_test = knn.predict(X=data_test[['driver_latitude', 'driver_longitude']])

In [11]:
data_driver_train['district_driver'] = predict_train_driver
data_driver_test['district_driver'] = predict_test_driver

In [12]:
data_train_copy['district_driver'] = data_driver_train['district_driver']
data_test_copy['district_driver'] = data_driver_test['district_driver']

Заполняем пропуски для 'district_driver' и 'district_origin'

In [13]:
data_train_copy.loc[data_train_copy['district_driver'].isnull(), ['district_driver']] = data_train_copy[data_train_copy['district_driver'].isnull()]['district_origin']
data_train_copy.loc[data_train_copy['district_origin'].isnull(), ['district_origin' ]] = data_train_copy[data_train_copy['district_origin' ].isnull()]['district_driver']

In [14]:
data_test_copy.loc[data_test_copy['district_driver'].isnull(), ['district_driver']] = data_test_copy[data_test_copy['district_driver'].isnull()]['district_origin']
data_test_copy.loc[data_test_copy['district_origin'].isnull(), ['district_origin' ]] = data_test_copy[data_test_copy['district_origin' ].isnull()]['district_driver']

Восстанавливаем пропущенные координаты:

In [15]:
#создадим словарь с координатами центров районов
centers_origin = {}
for district in data_train_copy.district_origin.unique():
    centers_origin.update({district:[data_train_copy[data_train_copy.district_origin == district]['origin_order_longitude'].mean(),
                                     data_train_copy[data_train_copy.district_origin == district]['origin_order_latitude'].mean()]})
    
centers_driver = {}
for district in data_train_copy.district_driver.unique():
    centers_driver.update({district:[data_train_copy[data_train_copy.district_driver == district]['driver_longitude'].mean(),
                                     data_train_copy[data_train_copy.district_driver == district]['driver_latitude'].mean()]}) 
centers_driver['Солнечногорский район'] = [37.125881282828281, 55.982758343434334]

In [16]:
# plt.figure(figsize = (8,5))
# plt.scatter(data_train_copy[(data_train_copy['district_origin'] == 'Центральный административный округ')].origin_order_longitude, 
#                 data_train_copy[(data_train_copy['district_origin'] == 'Центральный административный округ')].origin_order_latitude, c='purple')
# plt.scatter(centers_origin['Центральный административный округ'][0],centers_origin['Центральный административный округ'][1], c = 'red')

Заменяем пропущенные координаты на средние по районам на трейне

In [17]:
def fill_driver_null(row):
    row['driver_longitude'] = centers_driver[row['district_driver']][0]
    row['driver_latitude'] = centers_driver[row['district_driver']][1]
    return row

def fill_origin_null(row):
    row['origin_order_longitude'] = centers_driver[row['district_origin']][0]
    row['origin_order_latitude'] = centers_driver[row['district_origin']][1]
    return row

In [18]:
data_null_driver = data_train_copy.loc[data_train_copy['driver_latitude'].isnull(), ['driver_longitude', 'driver_latitude', 'district_driver']]
data_null_origin = data_train_copy.loc[data_train_copy['origin_order_latitude'].isnull(), ['origin_order_longitude', 'origin_order_latitude', 'district_origin']]

In [19]:
data_train_copy.loc[data_train_copy['driver_longitude'].isnull(), 
                    ['driver_longitude', 'driver_latitude']] = data_null_driver.apply(fill_driver_null, axis = 1)[['driver_longitude', 'driver_latitude']]
data_train_copy.loc[data_train_copy['origin_order_longitude'].isnull(), 
                    ['origin_order_longitude', 'origin_order_latitude']] = data_null_origin.apply(fill_origin_null, axis = 1)[['origin_order_longitude', 'origin_order_latitude']]

In [20]:
data_null_driver_test = data_test_copy.loc[data_test_copy['driver_latitude'].isnull(), ['driver_longitude', 'driver_latitude', 'district_driver']]
data_null_origin_test = data_test_copy.loc[data_test_copy['origin_order_latitude'].isnull(), ['origin_order_longitude', 'origin_order_latitude', 'district_origin']]

In [21]:
data_test_copy.loc[data_test_copy['driver_longitude'].isnull(), 
                    ['driver_longitude', 'driver_latitude']] = data_null_driver_test.apply(fill_driver_null, axis = 1)[['driver_longitude', 'driver_latitude']]
data_test_copy.loc[data_test_copy['origin_order_longitude'].isnull(), 
                    ['origin_order_longitude', 'origin_order_latitude']] = data_null_origin_test.apply(fill_origin_null, axis = 1)[['origin_order_longitude', 'origin_order_latitude']]

In [22]:
# plt.figure(figsize = (16,15))

# for i in data_train_copy['district_origin'].unique():
#     plt.scatter(data_train_copy[(data_train_copy['district_origin'] == i)].origin_order_longitude, 
#                 data_train_copy[(data_train_copy['district_origin'] == i)].origin_order_latitude, 
#                 label=i, 
#                 c = ([np.random.rand() for i in range(3)]))
# plt.legend()

Теперь разберемся с некорректными значениями в переменных distance и duration:    


In [23]:
for column in ['distance_km', 'duration_min']:
    data_train_copy[column].replace(-1, 0, inplace = True)
    data_test_copy[column].replace(-1, 0, inplace = True)

#### Дистанция:

Создаём обучающую выборку (объеденяем ненулевые значения для трейн/тест)

In [24]:
full_data = data_train_copy.append(data_test_copy).reset_index(drop = True)
full_data = full_data[full_data['distance_km'] > 0]

In [25]:
common_district = (full_data['district_origin'] == full_data['district_driver']).values.astype(int)
full_data['common_district'] = common_district
print(full_data.shape)

(777639, 17)


In [26]:
x_xgb = pd.get_dummies(data = full_data[['weekday_key', 
                                   'hour_key', 
                                   'offer_class_group', 
                                   'ride_type_desc',
                                   'driver_latitude', 
                                   'driver_longitude',
                                   'origin_order_latitude',
                                   'origin_order_longitude',
                                   'common_district']], 
                       columns = ['weekday_key', 
                                  'hour_key', 
                                  'offer_class_group', 
                                  'ride_type_desc'])
y_xgb = full_data['distance_km']

Обучаем модель

In [27]:
dfull_distance_train = xgb.DMatrix(x_xgb, label=y_xgb)
params = {
         'num_round': 150,
         'eta': 0.2,
         'max_depth': 5,
         'min_child_weight': 3,
         'subsample': 0.7,
         'gamma': 0,
         'objective': 'reg:linear',
         'nthread' : 4,
         'silent' : 1
         }
model = xgb.train(params, dfull_distance_train, int(params['num_round']))

Заполняем нулевые значения distance_km для трейна

In [28]:
nan_train = data_train_copy[data_train_copy['distance_km'] <= 0]
common_district = (nan_train['district_origin'] == nan_train['district_driver']).values.astype(int)
nan_train['common_district'] = common_district
print(nan_train.shape)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


(255598, 17)


In [29]:
to_predict_train = pd.get_dummies(data = nan_train[['weekday_key', 
                                   'hour_key', 
                                   'offer_class_group', 
                                   'ride_type_desc',
                                   'driver_latitude', 
                                   'driver_longitude',
                                   'origin_order_latitude',
                                   'origin_order_longitude',
                                   'common_district']], 
                       columns = ['weekday_key', 
                                  'hour_key', 
                                  'offer_class_group', 
                                  'ride_type_desc'])
to_predict_train.drop(['ride_type_desc_affiliate'], axis = 1, inplace = True)
to_predict_y = nan_train['distance_km']

In [30]:
d_to_predict_train = xgb.DMatrix(to_predict_train, label=to_predict_y)
predictions = model.predict(d_to_predict_train)
nan_train['predicted_distance'] = predictions
data_train_copy.loc[data_train_copy['distance_km'] <=0, ['distance_km']] = nan_train['predicted_distance']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Заполняем нулевые значения distance_km для теста

In [31]:
nan_test = data_test_copy[data_test_copy['distance_km'] <= 0]
common_district = (nan_test['district_origin'] == nan_test['district_driver']).values.astype(int)
nan_test['common_district'] = common_district
print(nan_test.shape)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


(97133, 17)


In [32]:
to_predict_test = pd.get_dummies(data = nan_test[['weekday_key', 
                                   'hour_key', 
                                   'offer_class_group', 
                                   'ride_type_desc',
                                   'driver_latitude', 
                                   'driver_longitude',
                                   'origin_order_latitude',
                                   'origin_order_longitude',
                                   'common_district']], 
                       columns = ['weekday_key', 
                                  'hour_key', 
                                  'offer_class_group', 
                                  'ride_type_desc'])
to_predict_test.drop(['ride_type_desc_affiliate'], axis = 1, inplace = True)
to_predict_y = nan_test['distance_km']

In [33]:
d_to_predict_test = xgb.DMatrix(to_predict_test, label=to_predict_y)
predictions = model.predict(d_to_predict_test)
nan_test['predicted_distance'] = predictions
data_test_copy.loc[data_test_copy['distance_km'] <=0, ['distance_km']] = nan_test['predicted_distance']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [34]:
data_train_copy.head()

Unnamed: 0,offer_gk,weekday_key,hour_key,driver_gk,order_gk,driver_latitude,driver_longitude,origin_order_latitude,origin_order_longitude,distance_km,duration_min,offer_class_group,ride_type_desc,driver_response,district_origin,district_driver
0,1105373,5,20,6080,174182,55.818842,37.334562,55.814567,37.35501,15.688855,0.0,Economy,private,0,городской округ Красногорск,городской округ Красногорск
1,759733,5,14,6080,358774,55.805342,37.515023,55.819329,37.466398,18.802,25.217,Standard,private,1,Северо-Западный административный округ,Северный административный округ
2,416977,6,14,6080,866260,55.813978,37.347688,55.814827,37.354074,6.747,9.8,Economy,private,0,городской округ Красногорск,городской округ Красногорск
3,889660,2,6,6080,163522,55.745922,37.421748,55.743469,37.43113,19.724066,0.0,Economy,private,1,Западный административный округ,Западный административный округ
4,1120055,4,16,6080,506710,55.803578,37.521602,55.812559,37.527407,12.383,19.25,Economy,private,1,Северный административный округ,Северный административный округ


#### Длительность поездки

In [35]:
full_data_duration = data_train_copy.append(data_test_copy).reset_index(drop = True)
full_data_duration = full_data_duration[full_data_duration['duration_min'] > 0]

In [36]:
common_district = (full_data_duration['district_origin'] == full_data_duration['district_driver']).values.astype(int)
full_data_duration['common_district'] = common_district
print(full_data_duration.shape)

(777627, 17)


In [37]:
x_xgb_duration = pd.get_dummies(data = full_data_duration[['weekday_key', 
                                   'hour_key', 
                                   'offer_class_group', 
                                   'ride_type_desc',
                                   'driver_latitude', 
                                   'driver_longitude',
                                   'origin_order_latitude',
                                   'origin_order_longitude',
                                   'common_district']], 
                       columns = ['weekday_key', 
                                  'hour_key', 
                                  'offer_class_group', 
                                  'ride_type_desc'])
y_xgb_duration = full_data_duration['duration_min']

In [38]:
X_train, X_test, y_train, y_test = train_test_split(
    x_xgb_duration, y_xgb_duration, test_size=0.3, random_state=17)

Обучаем модель для длительности поездки

In [39]:
dfull_duration_train = xgb.DMatrix(x_xgb_duration, label=y_xgb_duration)
params = {
         'num_round': 150,
         'eta': 0.2,
         'max_depth': 7,
         'min_child_weight': 1,
         'subsample': 0.8,
         'gamma': 0.4,
         'objective': 'reg:linear',
         'nthread' : 4,
         'silent' : 1
         }
model = xgb.train(params, dfull_duration_train, int(params['num_round']))

Заполняем нулевые значения duration_min для трейна

In [40]:
nan_train_duration = data_train_copy[data_train_copy['duration_min'] <= 0]
common_district = (nan_train_duration['district_origin'] == nan_train_duration['district_driver']).values.astype(int)
nan_train_duration['common_district'] = common_district
print(nan_train_duration.shape)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


(255607, 17)


In [41]:
to_predict_train_duration = pd.get_dummies(data = nan_train_duration[['weekday_key', 
                                   'hour_key', 
                                   'offer_class_group', 
                                   'ride_type_desc',
                                   'driver_latitude', 
                                   'driver_longitude',
                                   'origin_order_latitude',
                                   'origin_order_longitude',
                                   'common_district']], 
                       columns = ['weekday_key', 
                                  'hour_key', 
                                  'offer_class_group', 
                                  'ride_type_desc'])
to_predict_train_duration.drop(['ride_type_desc_affiliate'], axis = 1, inplace = True)
to_predict_y = nan_train_duration['duration_min']

In [42]:
d_to_predict_train_duration = xgb.DMatrix(to_predict_train_duration, label=to_predict_y)
predictions = model.predict(d_to_predict_train_duration)
nan_train_duration['predicted_duration'] = predictions
data_train_copy.loc[data_train_copy['duration_min'] <=0, ['duration_min']] = nan_train_duration['predicted_duration']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Заполняем нулевые значения duration_min для теста

In [43]:
nan_test_duration = data_test_copy[data_test_copy['duration_min'] <= 0]
common_district = (nan_test_duration['district_origin'] == nan_test_duration['district_driver']).values.astype(int)
nan_test_duration['common_district'] = common_district
print(nan_test_duration.shape)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


(97136, 17)


In [44]:
to_predict_test_duration = pd.get_dummies(data = nan_test_duration[['weekday_key', 
                                   'hour_key', 
                                   'offer_class_group', 
                                   'ride_type_desc',
                                   'driver_latitude', 
                                   'driver_longitude',
                                   'origin_order_latitude',
                                   'origin_order_longitude',
                                   'common_district']], 
                       columns = ['weekday_key', 
                                  'hour_key', 
                                  'offer_class_group', 
                                  'ride_type_desc'])
to_predict_test_duration.drop(['ride_type_desc_affiliate'], axis = 1, inplace = True)
to_predict_y = nan_test_duration['duration_min']

In [45]:
d_to_predict_test_duration = xgb.DMatrix(to_predict_test_duration, label=to_predict_y)
predictions = model.predict(d_to_predict_test_duration)
nan_test_duration['predicted_duration'] = predictions
data_test_copy.loc[data_test_copy['duration_min'] <=0, ['duration_min']] = nan_test_duration['predicted_duration']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


### NANS ARE OUT

### Извлечение расстояния из геоданных

In [46]:
def get_direction(lat1, lng1, lat2, lng2):
    AVG_EARTH_RADIUS = 6371  # in km
    lng_delta_rad = np.radians(lng2 - lng1)
    lat1, lng1, lat2, lng2 = map(np.radians, (lat1, lng1, lat2, lng2))
    y = np.sin(lng_delta_rad) * np.cos(lat2)
    x = np.cos(lat1) * np.sin(lat2) - np.sin(lat1) * np.cos(lat2) * np.cos(lng_delta_rad)
    return np.degrees(np.arctan2(y, x))

def get_vincenty_distance(latitude1, longitude1, latitude2, longitude2):
    vincenty_distance = np.zeros(latitude1.shape[0])
    i = 0
    for lat1,lng1,lat2,lng2 in zip(latitude1, longitude1, latitude2, longitude2):
        vincenty_distance[i] = vincenty((lng1, lat1), (lng2, lat2)).kilometers
        i += 1
    return vincenty_distance

def get_manhattan_distance(lat1, lng1, lat2, lng2):
    a = get_vincenty_distance(lat1, lng1, lat1, lng2)
    b = get_vincenty_distance(lat1, lng1, lat2, lng1)
    return a + b

Извлечение расстояния из геоданных для трейна

In [47]:
data_train_copy.loc[:, 'distance_to_origin'] = get_vincenty_distance(data_train_copy['driver_latitude'].values, 
                                             data_train_copy['driver_longitude'].values, 
                                             data_train_copy['origin_order_latitude'].values, 
                                             data_train_copy['origin_order_longitude'].values)
data_train_copy.loc[:, 'direction'] = get_direction(data_train_copy['driver_latitude'].values, 
                                             data_train_copy['driver_longitude'].values, 
                                             data_train_copy['origin_order_latitude'].values, 
                                             data_train_copy['origin_order_longitude'].values)
data_train_copy.loc[:, 'distance_manhattan'] = get_manhattan_distance(data_train_copy['driver_latitude'].values, 
                                                            data_train_copy['driver_longitude'].values, 
                                                            data_train_copy['origin_order_latitude'].values, 
                                                            data_train_copy['origin_order_longitude'].values)

In [48]:
%%time
data_test_copy.loc[:, 'distance_to_origin'] = get_vincenty_distance(data_test_copy['driver_latitude'].values, 
                                             data_test_copy['driver_longitude'].values, 
                                             data_test_copy['origin_order_latitude'].values, 
                                             data_test_copy['origin_order_longitude'].values)
data_test_copy.loc[:, 'direction'] = get_direction(data_test_copy['driver_latitude'].values, 
                                             data_test_copy['driver_longitude'].values, 
                                             data_test_copy['origin_order_latitude'].values, 
                                             data_test_copy['origin_order_longitude'].values)
data_test_copy.loc[:, 'distance_manhattan'] = get_manhattan_distance(data_test_copy['driver_latitude'].values, 
                                                            data_test_copy['driver_longitude'].values, 
                                                            data_test_copy['origin_order_latitude'].values, 
                                                            data_test_copy['origin_order_longitude'].values)

Wall time: 38.1 s


#### Фича - доля принятых заказов по районам

In [49]:
lab_encoder_origin = LabelEncoder()
lab_encoder_driver = LabelEncoder()

data_train_copy['district_origin'] = lab_encoder_origin.fit_transform(data_train_copy['district_origin'])
origin_avg_response = data_train_copy.groupby('district_origin')['driver_response'].mean().values
dict_origin_avg_response = dict(zip(list(range(0,len(origin_avg_response))), 
                                        origin_avg_response))

data_train_copy['district_driver'] = lab_encoder_driver.fit_transform(data_train_copy['district_driver'])
driver_avg_response = data_train_copy.groupby('district_driver')['driver_response'].mean().values
dict_driver_avg_response = dict(zip(list(range(0,len(driver_avg_response))), 
                                        driver_avg_response))

In [50]:
np_district_origin = data_train_copy['district_origin'].values
origin_avg_response = np.zeros(np_district_origin.shape[0])
for i in range(len(np_district_origin)):
    origin_avg_response[i] = dict_origin_avg_response[np_district_origin[i]]
data_train_copy['district_origin_rate'] = origin_avg_response

np_district_driver = data_train_copy['district_driver'].values
driver_avg_response = np.zeros(np_district_driver.shape[0])
for i in range(len(np_district_driver)):
    driver_avg_response[i] = dict_driver_avg_response[np_district_driver[i]]
data_train_copy['district_driver_rate'] = driver_avg_response

In [51]:
data_test_copy['district_origin'] = lab_encoder_origin.transform(data_test_copy['district_origin'])
data_test_copy['district_driver'] = lab_encoder_driver.transform(data_test_copy['district_driver'])

In [52]:
np_district_origin = data_test_copy['district_origin'].values
origin_avg_response = np.zeros(np_district_origin.shape[0])
for i in range(len(np_district_origin)):
    origin_avg_response[i] = dict_origin_avg_response[np_district_origin[i]]
data_test_copy['district_origin_rate'] = origin_avg_response

np_district_driver = data_test_copy['district_driver'].values
driver_avg_response = np.zeros(np_district_driver.shape[0])
for i in range(len(np_district_driver)):
    driver_avg_response[i] = dict_driver_avg_response[np_district_driver[i]]
data_test_copy['district_driver_rate'] = driver_avg_response

In [53]:
data_train_copy['common_district'] = (data_train_copy['district_origin'] == data_train_copy['district_driver']).values.astype(int)
data_test_copy['common_district'] = (data_test_copy['district_origin'] == data_test_copy['district_driver']).values.astype(int)

Сделаем численный признак для водителя:

#### driver_response_rate
Доля приняты заказов каждого водителя

In [54]:
drivers_rate = {}
for driver in data_train_copy.driver_gk.unique():
    drivers_rate.update({driver:data_train_copy[data_train_copy.driver_gk == driver].driver_response.mean()})

In [55]:
data_train_copy['driver_response_rate'] = data_train_copy['driver_gk'].map(drivers_rate)
data_test_copy['driver_response_rate'] = data_test_copy['driver_gk'].map(drivers_rate)

In [56]:
data_train_copy[['driver_response', 'district_origin_rate']].corr('spearman')

Unnamed: 0,driver_response,district_origin_rate
driver_response,1.0,0.286884
district_origin_rate,0.286884,1.0


#### order_count
Количество водителей, которым предложен данный заказ:

In [57]:
orders_train = dict(zip(data_train_copy.order_gk.value_counts().index, 
                        data_train_copy.order_gk.value_counts().values))
orders_test = dict(zip(data_test_copy.order_gk.value_counts().index, 
                        data_test_copy.order_gk.value_counts().values))

In [58]:
%%time
data_train_copy['order_count'] = data_train_copy['order_gk'].map(orders_train)
data_test_copy['order_count'] = data_test_copy['order_gk'].map(orders_test)

Wall time: 1.72 s


#### driver_orders
количество заказов, предложенных ему на трейне:

In [59]:
drivers_orders = {}
for driver in data_train_copy.driver_gk.unique():
    drivers_orders.update({driver:len(data_train_copy[data_train_copy.driver_gk == driver])})

In [60]:
data_train_copy['driver_orders'] = data_train_copy['driver_gk'].map(drivers_orders)
data_test_copy['driver_orders'] = data_test_copy['driver_gk'].map(drivers_orders)

#### driver_orders_accepted
количество заказов, принятых им на трейне:

In [61]:
data_train_copy['driver_orders_accepted'] = data_train_copy['driver_orders'] * data_train_copy['driver_response_rate']
data_test_copy['driver_orders_accepted'] = data_test_copy['driver_orders'] * data_test_copy['driver_response_rate']

#### order_speed
скорость на выполнении заказа

In [62]:
data_train_copy['order_speed'] = data_train_copy['distance_km'] / data_train_copy['duration_min']
data_test_copy['order_speed'] = data_test_copy['distance_km'] / data_test_copy['duration_min']

In [63]:
def get_district_class(df, column, lower = True, test = False, df_train = None):
    if test:
        if lower:
            district_indexes = df_train.groupby(column)['driver_response'].mean()[df_train.groupby(column)['driver_response'].mean() < 0.5].index
        else:
            district_indexes = df_train.groupby(column)['driver_response'].mean()[df_train.groupby(column)['driver_response'].mean() > 0.8].index
    else:    
        if lower:
            district_indexes = df.groupby(column)['driver_response'].mean()[df.groupby(column)['driver_response'].mean() < 0.5].index
        else:
            district_indexes = df.groupby(column)['driver_response'].mean()[df.groupby(column)['driver_response'].mean() > 0.8].index
    new_column_values = np.zeros(df.shape[0])
    iter_arr = 0
    for i in df[column].values:
        if i in district_indexes:
            new_column_values[iter_arr] = 1
        iter_arr += 1
    return new_column_values

In [64]:
data_train_copy['low_driver_rate'] = get_district_class(data_train_copy, 'district_driver', True)
data_train_copy['high_driver_rate'] = get_district_class(data_train_copy, 'district_driver', False)
data_train_copy['low_origin_rate'] = get_district_class(data_train_copy, 'district_origin', True)
data_train_copy['high_origin_rate'] = get_district_class(data_train_copy, 'district_origin', False)

In [65]:
data_test_copy['low_driver_rate'] = get_district_class(data_test_copy, 'district_driver', True, True, data_train_copy)
data_test_copy['high_driver_rate'] = get_district_class(data_test_copy, 'district_driver', False, True, data_train_copy)
data_test_copy['low_origin_rate'] = get_district_class(data_test_copy, 'district_origin', True, True, data_train_copy)
data_test_copy['high_origin_rate'] = get_district_class(data_test_copy, 'district_origin', False, True, data_train_copy)

#### driver_districts driver_origins

Признак: количество районов, где водитель был, и количество районов, куда ему поступали заявки

In [25]:
driver_districts = {}
for i in data_train_copy.driver_gk.unique():
    driver_districts.update({i:data_train_copy[data_train_copy.driver_gk == i]['district_driver'].nunique()})
    

driver_origins = {}
for i in data_train_copy.driver_gk.unique():
    driver_origins.update({i:data_train_copy[data_train_copy.driver_gk == i]['district_origin'].nunique()})
    

data_train_copy['driver_districts'] = data_train_copy.driver_gk.map(driver_districts)
data_train_copy['driver_origins'] = data_train_copy.driver_gk.map(driver_origins)
data_test_copy['driver_districts'] = data_test_copy.driver_gk.map(driver_districts)
data_test_copy['driver_origins'] = data_test_copy.driver_gk.map(driver_origins)

#### driver_districts_accepted driver_origins_accepted

Признак: количество районов, где водитель был, и количество районов, куда ему поступали заявки, при условии, что он согласился:

In [26]:
driver_districts_accepted = {}
for i in data_train_copy.driver_gk.unique():
    driver_districts_accepted.update({i:data_train_copy[(data_train_copy.driver_gk == i)&(data_train_copy.driver_response == 1)]['district_driver'].nunique()})
    

driver_origins_accepted = {}
for i in data_train_copy.driver_gk.unique():
    driver_origins_accepted.update({i:data_train_copy[(data_train_copy.driver_gk == i)&(data_train_copy.driver_response == 1)]['district_origin'].nunique()})
    

data_train_copy['driver_districts_accepted'] = data_train_copy.driver_gk.map(driver_districts_accepted)
data_train_copy['driver_origins_accepted'] = data_train_copy.driver_gk.map(driver_origins_accepted)
data_test_copy['driver_districts_accepted'] = data_test_copy.driver_gk.map(driver_districts_accepted)
data_test_copy['driver_origins_accepted'] = data_test_copy.driver_gk.map(driver_origins_accepted)

#### driver_districts_accepted_rate origin_districts_accepted_rate

Средний ответ водителя для района, где он, и куда заказ:

In [27]:
driver_districts_accepted_rate = {}
for i in data_train_copy.district_driver.unique():
    driver_districts_accepted_rate.update({i:data_train_copy[data_train_copy.district_driver == i]['driver_response'].mean()})
    

origin_districts_accepted_rate = {}
for i in data_train_copy.district_origin.unique():
    origin_districts_accepted_rate.update({i:data_train_copy[data_train_copy.district_origin == i]['driver_response'].mean()})
    

data_train_copy['driver_districts_accepted_rate'] = data_train_copy.district_driver.map(driver_districts_accepted_rate)
data_train_copy['origin_districts_accepted_rate'] = data_train_copy.district_origin.map(origin_districts_accepted_rate)
data_test_copy['driver_districts_accepted_rate'] = data_test_copy.district_driver.map(driver_districts_accepted_rate)
data_test_copy['origin_districts_accepted_rate'] = data_test_copy.district_origin.map(origin_districts_accepted_rate)

Время до клиента

In [29]:
data_train_copy['time_to_origin'] = data_train_copy['distance_to_origin']/data_train_copy['order_speed']
data_test_copy['time_to_origin'] = data_test_copy['distance_to_origin']/data_test_copy['order_speed']

Еще параметры водителя - параметры распределения временных признаков

In [30]:
driver_week_std = {}
for i in data_train_copy.driver_gk.unique():
    driver_week_std.update({i:data_train_copy[(data_train_copy.driver_gk == i)&(data_train_copy.driver_response == 1)]['weekday_key'].std()})

driver_hour_std = {}
for i in data_train_copy.driver_gk.unique():
    driver_hour_std.update({i:data_train_copy[(data_train_copy.driver_gk == i)&(data_train_copy.driver_response == 1)]['hour_key'].std()})

driver_week_mean = {}
for i in data_train_copy.driver_gk.unique():
    driver_week_mean.update({i:data_train_copy[(data_train_copy.driver_gk == i)&(data_train_copy.driver_response == 1)]['weekday_key'].mean()})

driver_hour_mean = {}
for i in data_train_copy.driver_gk.unique():
    driver_hour_mean.update({i:data_train_copy[(data_train_copy.driver_gk == i)&(data_train_copy.driver_response == 1)]['hour_key'].mean()})

data_train_copy['driver_week_std'] = data_train_copy.driver_gk.map(driver_week_std)
data_train_copy['driver_week_mean'] = data_train_copy.driver_gk.map(driver_week_mean)
data_test_copy['driver_week_std'] = data_test_copy.driver_gk.map(driver_week_std)
data_test_copy['driver_week_mean'] = data_test_copy.driver_gk.map(driver_week_mean)
data_train_copy['driver_hour_std'] = data_train_copy.driver_gk.map(driver_hour_std)
data_train_copy['driver_hour_mean'] = data_train_copy.driver_gk.map(driver_hour_mean)
data_test_copy['driver_hour_std'] = data_test_copy.driver_gk.map(driver_hour_std)
data_test_copy['driver_hour_mean'] = data_test_copy.driver_gk.map(driver_hour_mean)

Также добавим разницу между средним значением часа для водителя и часом заказа ( а также недели) в абс значении

In [31]:
data_train_copy['diff_hour'] = np.abs(data_train_copy['hour_key'] - data_train_copy['driver_hour_mean'])
data_train_copy['diff_weekday'] = np.abs(data_train_copy['weekday_key'] - data_train_copy['driver_week_mean'])
data_test_copy['diff_hour'] = np.abs(data_test_copy['hour_key'] - data_test_copy['driver_hour_mean'])
data_test_copy['diff_weekday'] = np.abs(data_test_copy['weekday_key'] - data_test_copy['driver_week_mean'])

расстояние до центра Москвы:

In [32]:
def get_center_distance(latitude1, longitude1, latitude2, longitude2):
    vincenty_distance = np.zeros(latitude2.shape[0])
    i = 0
    for lat2,lng2 in zip(latitude2, longitude2):
        vincenty_distance[i] = vincenty((longitude1, latitude1), (lng2, lat2)).kilometers
        i += 1
    return vincenty_distance

c_m = [55.7547, 37.6212]

data_train_copy.loc[:, 'distance_to_center_origin'] = get_center_distance(c_m[0], 
                                             c_m[1], 
                                             data_train_copy['origin_order_latitude'].values, 
                                             data_train_copy['origin_order_longitude'].values)

data_test_copy.loc[:, 'distance_to_center_origin'] = get_center_distance(c_m[0], 
                                             c_m[1], 
                                             data_test_copy['origin_order_latitude'].values, 
                                             data_test_copy['origin_order_longitude'].values)

data_train_copy.loc[:, 'distance_to_center_driver'] = get_center_distance(c_m[0], 
                                             c_m[1], 
                                             data_train_copy['driver_latitude'].values, 
                                             data_train_copy['driver_longitude'].values)

data_test_copy.loc[:, 'distance_to_center_driver'] = get_center_distance(c_m[0], 
                                             c_m[1], 
                                             data_test_copy['driver_latitude'].values, 
                                             data_test_copy['driver_longitude'].values)

data_train_copy['to_center_dist'] = data_train_copy['distance_to_center_driver'] - data_train_copy['distance_to_center_origin']
data_test_copy['to_center_dist'] = data_test_copy['distance_to_center_driver'] - data_test_copy['distance_to_center_origin']

In [33]:
data_test_copy.head()

Unnamed: 0,offer_gk,weekday_key,hour_key,driver_gk,order_gk,driver_latitude,driver_longitude,origin_order_latitude,origin_order_longitude,distance_km,...,time_to_origin,driver_week_std,driver_week_mean,driver_hour_std,driver_hour_mean,diff_hour,diff_weekday,distance_to_center_origin,distance_to_center_driver,to_center_dist
0,152446,5,0,5021,648419,55.763302,37.593368,55.75823,37.613689,17.445,...,3.211843,2.07266,3.469388,6.607111,13.632653,13.632653,1.530612,0.889991,3.181057,2.291067
1,281031,5,7,5817,405907,55.75547,37.648689,55.741544,37.622868,12.661445,...,6.555218,2.053035,3.166667,7.413794,11.37037,4.37037,1.833333,1.176091,3.051749,1.875658
2,779964,5,1,3870,894998,55.619002,37.59614,55.615923,37.607872,19.761181,...,1.964883,1.96684,3.146119,7.039667,13.826484,12.826484,1.853881,12.341621,12.300337,-0.041284
3,16720,5,13,5607,505054,55.620905,37.60655,55.614517,37.591161,14.576858,...,2.713209,1.706125,3.366412,4.844245,15.625954,2.625954,1.633588,12.819306,11.924271,-0.895035
4,492087,5,13,3786,459994,55.893228,37.673552,55.888084,37.662087,15.943624,...,1.6302,1.638936,3.37037,6.532634,13.728395,0.728395,1.62963,12.616616,13.535874,0.919258


In [34]:
data_train_copy.head()

Unnamed: 0,offer_gk,weekday_key,hour_key,driver_gk,order_gk,driver_latitude,driver_longitude,origin_order_latitude,origin_order_longitude,distance_km,...,time_to_origin,driver_week_std,driver_week_mean,driver_hour_std,driver_hour_mean,diff_hour,diff_weekday,distance_to_center_origin,distance_to_center_driver,to_center_dist
0,1105373,5,20,6080,174182,55.818842,37.334562,55.814567,37.35501,15.688855,...,3.045752,1.77443,3.819876,4.21686,11.360248,8.639752,1.180124,30.01426,32.314916,2.300656
1,759733,5,14,6080,358774,55.805342,37.515023,55.819329,37.466398,18.802,...,7.425724,1.77443,3.819876,4.21686,11.360248,2.639752,1.180124,18.105605,12.605104,-5.500502
2,416977,6,14,6080,866260,55.813978,37.347688,55.814827,37.354074,6.747,...,1.035233,1.77443,3.819876,4.21686,11.360248,2.639752,2.180124,30.120575,30.805613,0.685038
3,889660,2,6,6080,163522,55.745922,37.421748,55.743469,37.43113,19.724066,...,1.69468,1.77443,3.819876,4.21686,11.360248,5.360248,1.819876,21.118747,22.150263,1.031517
4,1120055,4,16,6080,506710,55.803578,37.521602,55.812559,37.527407,12.383,...,1.589417,1.77443,3.819876,4.21686,11.360248,4.639752,0.180124,11.597003,11.867619,0.270616


In [35]:
data_test_copy.shape, data_train_copy.shape

((237813, 47), (892557, 47))

In [36]:
data_train_copy.shape, data_test_copy.shape

((892557, 47), (237813, 47))

In [37]:
# data_train_copy.to_csv('nikita_47_features_train.csv', index = False)
# data_test_copy.to_csv('nikita_47_features_test.csv', index = False)

In [2]:
data_train_copy = pd.read_csv('nikita_47_features_train.csv')

In [3]:
data_test_copy = pd.read_csv('nikita_47_features_test.csv')

In [4]:
data_train_copy.head()

Unnamed: 0,offer_gk,weekday_key,hour_key,driver_gk,order_gk,driver_latitude,driver_longitude,origin_order_latitude,origin_order_longitude,distance_km,...,time_to_origin,driver_week_std,driver_week_mean,driver_hour_std,driver_hour_mean,diff_hour,diff_weekday,distance_to_center_origin,distance_to_center_driver,to_center_dist
0,1105373,5,20,6080,174182,55.818842,37.334562,55.814567,37.35501,15.688855,...,3.045752,1.77443,3.819876,4.21686,11.360248,8.639752,1.180124,30.01426,32.314916,2.300656
1,759733,5,14,6080,358774,55.805342,37.515023,55.819329,37.466398,18.802,...,7.425724,1.77443,3.819876,4.21686,11.360248,2.639752,1.180124,18.105605,12.605104,-5.500502
2,416977,6,14,6080,866260,55.813978,37.347688,55.814827,37.354074,6.747,...,1.035233,1.77443,3.819876,4.21686,11.360248,2.639752,2.180124,30.120575,30.805613,0.685038
3,889660,2,6,6080,163522,55.745922,37.421748,55.743469,37.43113,19.724066,...,1.69468,1.77443,3.819876,4.21686,11.360248,5.360248,1.819876,21.118747,22.150263,1.031517
4,1120055,4,16,6080,506710,55.803578,37.521602,55.812559,37.527407,12.383,...,1.589417,1.77443,3.819876,4.21686,11.360248,4.639752,0.180124,11.597003,11.867619,0.270616


In [5]:
data_test_copy.head()

Unnamed: 0,offer_gk,weekday_key,hour_key,driver_gk,order_gk,driver_latitude,driver_longitude,origin_order_latitude,origin_order_longitude,distance_km,...,time_to_origin,driver_week_std,driver_week_mean,driver_hour_std,driver_hour_mean,diff_hour,diff_weekday,distance_to_center_origin,distance_to_center_driver,to_center_dist
0,152446,5,0,5021,648419,55.763302,37.593368,55.75823,37.613689,17.445,...,3.211843,2.07266,3.469388,6.607111,13.632653,13.632653,1.530612,0.889991,3.181057,2.291067
1,281031,5,7,5817,405907,55.75547,37.648689,55.741544,37.622868,12.661445,...,6.555218,2.053035,3.166667,7.413794,11.37037,4.37037,1.833333,1.176091,3.051749,1.875658
2,779964,5,1,3870,894998,55.619002,37.59614,55.615923,37.607872,19.761181,...,1.964883,1.96684,3.146119,7.039667,13.826484,12.826484,1.853881,12.341621,12.300337,-0.041284
3,16720,5,13,5607,505054,55.620905,37.60655,55.614517,37.591161,14.576858,...,2.713209,1.706125,3.366412,4.844245,15.625954,2.625954,1.633588,12.819306,11.924271,-0.895035
4,492087,5,13,3786,459994,55.893228,37.673552,55.888084,37.662087,15.943624,...,1.6302,1.638936,3.37037,6.532634,13.728395,0.728395,1.62963,12.616616,13.535874,0.919258


In [6]:
data_train_copy.shape, data_test_copy.shape

((892557, 47), (237813, 47))

### МОДЕЛИ

In [7]:
data_for_train = data_train_copy.drop(['offer_gk', 'driver_gk', 'order_gk', 'district_driver_rate', 'driver_response'], axis = 1)
data_for_train.head()

Unnamed: 0,weekday_key,hour_key,driver_latitude,driver_longitude,origin_order_latitude,origin_order_longitude,distance_km,duration_min,offer_class_group,ride_type_desc,...,time_to_origin,driver_week_std,driver_week_mean,driver_hour_std,driver_hour_mean,diff_hour,diff_weekday,distance_to_center_origin,distance_to_center_driver,to_center_dist
0,5,20,55.818842,37.334562,55.814567,37.35501,15.688855,20.768587,Economy,private,...,3.045752,1.77443,3.819876,4.21686,11.360248,8.639752,1.180124,30.01426,32.314916,2.300656
1,5,14,55.805342,37.515023,55.819329,37.466398,18.802,25.217,Standard,private,...,7.425724,1.77443,3.819876,4.21686,11.360248,2.639752,1.180124,18.105605,12.605104,-5.500502
2,6,14,55.813978,37.347688,55.814827,37.354074,6.747,9.8,Economy,private,...,1.035233,1.77443,3.819876,4.21686,11.360248,2.639752,2.180124,30.120575,30.805613,0.685038
3,2,6,55.745922,37.421748,55.743469,37.43113,19.724066,31.42535,Economy,private,...,1.69468,1.77443,3.819876,4.21686,11.360248,5.360248,1.819876,21.118747,22.150263,1.031517
4,4,16,55.803578,37.521602,55.812559,37.527407,12.383,19.25,Economy,private,...,1.589417,1.77443,3.819876,4.21686,11.360248,4.639752,0.180124,11.597003,11.867619,0.270616


In [8]:
X_train_forest = pd.get_dummies(data_for_train, columns = ['weekday_key', 
                                                    'hour_key', 
                                                    'offer_class_group', 
                                                    'ride_type_desc'])
X_train_xgb = pd.get_dummies(data_train_copy[['driver_gk', 'weekday_key', 'hour_key', 'distance_km', 'duration_min',
       'offer_class_group', 'ride_type_desc',
       'district_origin', 'district_driver', 'distance_to_origin', 'direction',
       'distance_manhattan', 'driver_response_rate', 'order_count',
       'driver_orders', 'driver_orders_accepted', 'driver_districts',
       'driver_origins', 'driver_districts_accepted',
       'driver_origins_accepted', 'driver_districts_accepted_rate',
       'origin_districts_accepted_rate', 'driver_week_std', 'driver_week_mean',
       'driver_hour_std', 'driver_hour_mean', 'diff_hour', 'diff_weekday',
                                         'order_speed', 'time_to_origin',
                                         'distance_to_center_driver', 'distance_to_center_origin', 'to_center_dist',
                                         'driver_latitude', 'driver_longitude',
       'origin_order_latitude', 'origin_order_longitude']], columns = ['weekday_key', 'hour_key', 'offer_class_group', 
                                                                  'ride_type_desc', 'district_origin', 'district_driver'])
y_train = data_train_copy['driver_response']

In [9]:
X_train_xgb.shape, y_train.shape

((892557, 139), (892557,))

In [10]:
data_for_test = data_test_copy.drop(['offer_gk', 'driver_gk', 'order_gk', 'district_driver_rate', 'driver_response'], axis = 1)

In [11]:
X_test_forest = pd.get_dummies(data_for_test, columns = ['weekday_key', 
                                                    'hour_key', 
                                                    'offer_class_group', 
                                                    'ride_type_desc'])
X_test_xgb = pd.get_dummies(data_test_copy[['driver_gk', 'weekday_key', 'hour_key', 'distance_km', 'duration_min',
       'offer_class_group', 'ride_type_desc',
       'district_origin', 'district_driver', 'distance_to_origin', 'direction',
       'distance_manhattan', 'driver_response_rate', 'order_count',
       'driver_orders', 'driver_orders_accepted', 'driver_districts',
       'driver_origins', 'driver_districts_accepted',
       'driver_origins_accepted', 'driver_districts_accepted_rate',
       'origin_districts_accepted_rate', 'driver_week_std', 'driver_week_mean',
       'driver_hour_std', 'driver_hour_mean', 'diff_hour', 'diff_weekday',
                                        'order_speed', 'time_to_origin', 
                                        'distance_to_center_driver', 'distance_to_center_origin', 'to_center_dist',
                                        'driver_latitude', 'driver_longitude',
       'origin_order_latitude', 'origin_order_longitude']], columns = ['weekday_key', 'hour_key', 'offer_class_group', 
                                                                  'ride_type_desc', 'district_origin', 'district_driver'])
y_test = data_test_copy['driver_response']

In [12]:
X_test_xgb.shape, y_test.shape

((237813, 139), (237813,))

## Model Stacking

#### Подготовка данных для моделей, основаных на деревьях

In [13]:
X_train_models_forest, X_valid_models_forest, y_train_models, y_valid_models = train_test_split(X_train_forest, 
                                                                                  y_train, 
                                                                                  test_size=0.3, 
                                                                                  random_state=17)

X_train_stacking_forest, X_valid_stacking_forest, y_train_stacking, y_valid_stacking = train_test_split(X_valid_models_forest, 
                                                                                  y_valid_models, 
                                                                                  test_size=0.3, 
                                                                                  random_state=17)

In [14]:
X_train_models_xgb, X_valid_models_xgb, y_train_models, y_valid_models = train_test_split(X_train_xgb, 
                                                                                  y_train, 
                                                                                  test_size=0.3, 
                                                                                  random_state=17)

X_train_stacking_xgb, X_valid_stacking_xgb, y_train_stacking, y_valid_stacking = train_test_split(X_valid_models_xgb, 
                                                                                  y_valid_models, 
                                                                                  test_size=0.3, 
                                                                                  random_state=17)

#### XGBoost на деревьях

In [54]:
%%time
params = {
         'num_round': 500,
         'max_depth': 8,
         'eta': 0.08,
         'objective': "binary:logistic",
         'nthread' : 4,
         'silent' : 1
         }
dtrain = xgb.DMatrix(X_train_models_xgb, label=y_train_models)
clf_xgb_trees_0_7 = xgb.train(params, dtrain, int(params['num_round']))
# clf_xgb_trees = xgb.XGBClassifier(max_depth = 8, eta = 0.08, n_estimators = 5, nthread = 4, random_state = 42)
# clf_xgb_trees.fit(X_train_models_xgb, y_train_models)

Wall time: 45min 27s


In [55]:
# pickle.dump(clf_xgb_trees, open("clf_xgb_trees_0_7.pickle.dat", "wb"))

In [56]:
dvalid = xgb.DMatrix(X_valid_models_xgb, label=y_valid_models)
prediction_xgb_trees = clf_xgb_trees_0_7.predict(dvalid)
# prediction_xgb_trees = clf_xgb_trees.predict(X_valid_models_xgb)
score_xgb_trees = roc_auc_score(y_valid_models.values, prediction_xgb_trees)
print(score_xgb_trees)

0.940772066475


In [57]:
%%time
params = {
         'num_round': 500,
         'max_depth': 8,
         'eta': 0.08,
         'objective': "binary:logistic",
         'nthread' : 4,
         'silent' : 1
         }
dtrain = xgb.DMatrix(X_train_xgb, label=y_train)
clf_xgb_trees_10 = xgb.train(params, dtrain, int(params['num_round']))
# clf_xgb_trees = xgb.XGBClassifier(max_depth = 8, eta = 0.08, n_estimators = 5, nthread = 4, random_state = 42)
# clf_xgb_trees.fit(X_train_models_xgb, y_train_models)

Wall time: 57min 40s


In [58]:
# pickle.dump(clf_xgb_trees_10, open("clf_xgb_trees_1_0.pickle.dat", "wb"))

#### RandomForest

In [15]:
%%time
model_random_forest_0_7 = RandomForestClassifier(n_estimators=350, n_jobs=4, criterion='gini', random_state = 17)
model_random_forest_0_7.fit(X_train_models_forest, y_train_models)

Wall time: 21min 23s


In [17]:
prediction_random_forest = model_random_forest_0_7.predict_proba(X_valid_models_forest)[:, 1]
score_random_forest = roc_auc_score(y_valid_models.values, prediction_random_forest)
print(score_random_forest)

0.937399385565


In [None]:
# pickle.dump(model_random_forest_0_7, open("model_random_forest_0_7.pickle.dat", "wb"))

In [42]:
%%time
model_random_forest_1_0 = RandomForestClassifier(n_estimators=350, n_jobs=4, criterion='gini', random_state = 17)
model_random_forest_1_0.fit(X_train_forest, y_train)

Wall time: 23min 54s


#### Подготовка данных для моделей, основаных на регрессии

In [18]:
data_for_train_logistic = data_train_copy.drop(['offer_gk', 'driver_gk', 'order_gk', 'district_driver_rate', 
                                                'driver_response', 'duration_min', 'distance_manhattan'], axis = 1)
data_for_test_logistic = data_test_copy.drop(['offer_gk', 'driver_gk', 'order_gk', 'district_driver_rate', 
                                     'driver_response', 'duration_min', 'distance_manhattan'], axis = 1)

In [19]:
X_train_for_logistic = pd.get_dummies(data_for_train_logistic, columns = ['weekday_key', 
                                                                'hour_key', 
                                                                'offer_class_group', 
                                                                'ride_type_desc',
                                                                'district_origin',
                                                                'district_driver'])
X_test_for_logistic = pd.get_dummies(data_for_test_logistic, columns = ['weekday_key', 
                                                                'hour_key', 
                                                                'offer_class_group', 
                                                                'ride_type_desc',
                                                                'district_origin',
                                                                'district_driver'])

In [20]:
X_train_models_logistic, X_valid_models_logistic, y_train_models, y_valid_models = train_test_split(X_train_for_logistic, 
                                                                                  y_train, 
                                                                                  test_size=0.3, 
                                                                                  random_state=17)

In [21]:
X_train_stacking_logistic, X_valid_stacking_logistic, y_train_stacking, y_valid_stacking = train_test_split(X_valid_models_logistic, 
                                                                                  y_valid_models, 
                                                                                  test_size=0.3, 
                                                                                  random_state=17)

In [22]:
X_train_for_logistic.shape

(892557, 142)

In [23]:
X_test_for_logistic.shape

(237813, 142)

#### XGBoost на регрессии

In [24]:
%%time
params = {
    'num_round': 500,
    'booster': 'gblinear',
    'lambda': 1,
    'alpha': 1,
    'objective': "binary:logistic",
    'nthread' : 4,
    'silent' : 1
         }
dtrain = xgb.DMatrix(X_train_models_logistic, label=y_train_models)
dvalid = xgb.DMatrix(X_valid_models_logistic, label=y_valid_models)
clf_xgb_linear_0_7 = xgb.train(params, dtrain, int(params['num_round']))

Wall time: 7min 58s


In [25]:
predictions_xgb_linear = clf_xgb_linear_0_7.predict(dvalid)
score_xgb_linear = roc_auc_score(y_valid_models.values, predictions_xgb_linear)
print(score_xgb_linear)

0.914218164488


In [44]:
%%time
params = {
    'num_round': 500,
    'booster': 'gblinear',
    'lambda': 1,
    'alpha': 1,
    'objective': "binary:logistic",
    'nthread' : 4,
    'silent' : 1
         }
dtrain = xgb.DMatrix(X_train_for_logistic, label=y_train)
clf_xgb_linear_1_0 = xgb.train(params, dtrain, int(params['num_round']))

Wall time: 8min 53s


## Подготовка данных для стекинга

In [27]:
def prepare_stacking_data(xgb_trees_model, xgb_forest_model, xgb_linear_model, 
                              data_xgb_trees, data_trees, data_linear, y_labels):
    d_trees = xgb.DMatrix(data_xgb_trees, label=y_labels)
    prediction_xgb_trees = xgb_trees_model.predict(d_trees)
    
    prediction_random_forest = xgb_forest_model.predict_proba(data_trees)[:, 1]
    
    d_linear = xgb.DMatrix(data_linear, label=y_labels)
    prediction_xgb_linear = xgb_linear_model.predict(d_linear)
    
    stacking_data = pd.DataFrame(index = y_labels.index)
    stacking_data['xgb_trees'] = prediction_xgb_trees
    stacking_data['random_forest'] = prediction_random_forest
    stacking_data['xgb_linear'] = prediction_xgb_linear
    return stacking_data

In [30]:
clf_xgb_trees_0_7 = pickle.load(open("clf_xgb_trees_0_7.pickle.dat", "rb"))

In [45]:
clf_xgb_trees_1_0 = pickle.load(open("clf_xgb_trees_1_0.pickle.dat", "rb"))

In [31]:
stacking_train_data = prepare_stacking_data(clf_xgb_trees_0_7, model_random_forest_0_7, clf_xgb_linear_0_7,
                                            X_valid_models_xgb, X_valid_models_forest, X_valid_models_logistic, y_valid_models)

In [36]:
stacking_linear = LogisticRegression(random_state = 17, C = 1)
stacking_linear.fit(stacking_train_data, y_valid_models)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=17, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [41]:
# pickle.dump(stacking_linear, open("stacking_linear.pickle.dat", "wb"))

In [46]:
stacking_data_test = prepare_stacking_data(clf_xgb_trees_1_0, model_random_forest_1_0, clf_xgb_linear_1_0,
                                            X_test_xgb, X_test_forest, X_test_for_logistic, y_test)
prediction_stacking = stacking_linear.predict_proba(stacking_data_test)[:, 1]
submission = pd.read_csv('McK_SubmissionFormat.csv')
submission.driver_response = prediction_stacking
submission.to_csv('Stacking_4.csv', index = False)

In [47]:
submission.head()

Unnamed: 0,offer_gk,driver_response
0,152446,0.957585
1,281031,0.962028
2,779964,0.966284
3,16720,0.965578
4,492087,0.721087
