In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from joblib import load, dump
from sklearn.metrics import mean_squared_error
from math import sqrt, ceil, floor

In [2]:
# Load the previously fitted model from disk. The file name suggests an XGBoost model, but its
# concrete type is not visible in this notebook — confirm before relying on model-specific APIs.
# NOTE: joblib.load unpickles the file, so only load artefacts from a trusted source.
model = load('../processed_data_and_models/xgboost_model.joblib')

In [3]:
# Training frame: the hours column is stored as 'available_hours' and is renamed to
# 'online_hours', the column name used by the test file.
train = pd.read_csv('../processed_data_and_models/Training_Dataset.csv')
train = train.rename(columns={'available_hours': 'online_hours'})
train = train[['date', 'driver_id', 'dayofweek', 'weekend', 'gender', 'age', 'number_of_kids', 'online_hours']]

# Cast the three count-like columns in one call (no column-by-column writes on a sliced frame).
train = train.astype({'dayofweek': 'int64', 'age': 'int64', 'number_of_kids': 'int64'})

# org_test keeps the file exactly as read (including the true online_hours used for scoring);
# test is a sorted working copy from which the target column is removed.
org_test = pd.read_csv('./../data/test.csv')
test = org_test.sort_values(by=['driver_id', 'date'])

# errors='ignore' tolerates a test file without the target column, without hiding any
# other failure the way a bare `except: pass` would.
test = test.drop(columns=['online_hours'], errors='ignore')

In [4]:
# Preview the raw test file; org_test keeps the CSV's own row order (test is the sorted copy).
org_test.head(10)

Unnamed: 0,driver_id,date,online_hours
0,979863,2017-6-28,7
1,979863,2017-6-27,9
2,979863,2017-6-26,9
3,979863,2017-6-25,10
4,979863,2017-6-24,9
5,979863,2017-6-23,8
6,979863,2017-6-22,7
7,780123,2017-6-28,0
8,780123,2017-6-27,4
9,780123,2017-6-26,4


In [5]:
# Attach the per-driver profile columns to every test row, matching on driver_id.
driver_profile = pd.read_csv('../processed_data_and_models/driver.csv')
test = test.merge(driver_profile, on=['driver_id'])

In [6]:
# Encode gender numerically. Assign the result back instead of calling replace(inplace=True)
# on a column selection, which does not update the frame under pandas Copy-on-Write.
test['gender'] = test['gender'].replace({'FEMALE': 1, 'MALE': 0})

# Calendar features derived from the date: dayofweek is 0 (Monday) .. 6 (Sunday),
# and weekend is 1 for Saturday/Sunday (dayofweek >= 5), else 0.
test['date'] = pd.to_datetime(test['date'])
test['dayofweek'] = test['date'].dt.dayofweek
test['weekend'] = test['dayofweek'].ge(5).astype('int64')

In [7]:
# Keep a single row per driver and day, then preview the first rows.
test = test.drop_duplicates(subset=['driver_id', 'date'])
test.head(10)

Unnamed: 0,driver_id,date,gender,age,number_of_kids,dayofweek,weekend
0,111556,2017-06-22,1,49,4,3,0
1,111556,2017-06-23,1,49,4,4,0
2,111556,2017-06-24,1,49,4,5,1
3,111556,2017-06-25,1,49,4,6,1
4,111556,2017-06-26,1,49,4,0,0
5,111556,2017-06-27,1,49,4,1,0
6,111556,2017-06-28,1,49,4,2,0
7,111575,2017-06-22,0,49,0,3,0
8,111575,2017-06-23,0,49,0,4,0
9,111575,2017-06-24,0,49,0,5,1


In [8]:
# Spot-check the training rows of one driver (the first driver shown in the test preview).
train.loc[train['driver_id'] == 111556]

Unnamed: 0,date,driver_id,dayofweek,weekend,gender,age,number_of_kids,online_hours
0,2017-06-01,111556,3,0,1,49,4,2.2
1,2017-06-02,111556,4,0,1,49,4,2.5
2,2017-06-03,111556,5,1,1,49,4,0.0
3,2017-06-04,111556,6,1,1,49,4,0.0
4,2017-06-05,111556,0,0,1,49,4,4.7
5,2017-06-06,111556,1,0,1,49,4,3.1
6,2017-06-07,111556,2,0,1,49,4,2.6
7,2017-06-08,111556,3,0,1,49,4,3.2
8,2017-06-09,111556,4,0,1,49,4,4.4
9,2017-06-10,111556,5,1,1,49,4,0.0


Column layout of the feature frame assembled in the following cells:

```python
[['date', 'driver_id', 'dayofweek', 'weekend', 'gender', 'age',
  'number_of_kids', 'lag_1', 'lag_2', 'lag_3', 'lag_4',
  'lag_5', 'lag_6', 'lag_7', 'rolling_mean', 'online_hours']]
```

In [9]:
# Distinct driver ids that appear in the test set.
driver_test_list = np.unique(test['driver_id'].to_numpy())

In [10]:
# Keep only the training history of drivers that have rows in the test set.
in_test_set = train['driver_id'].isin(driver_test_list)
train = train.loc[in_test_set]

In [11]:
# Preview the training frame after restricting it to drivers present in the test set.
train.head(10)

Unnamed: 0,date,driver_id,dayofweek,weekend,gender,age,number_of_kids,online_hours
0,2017-06-01,111556,3,0,1,49,4,2.2
1,2017-06-02,111556,4,0,1,49,4,2.5
2,2017-06-03,111556,5,1,1,49,4,0.0
3,2017-06-04,111556,6,1,1,49,4,0.0
4,2017-06-05,111556,0,0,1,49,4,4.7
5,2017-06-06,111556,1,0,1,49,4,3.1
6,2017-06-07,111556,2,0,1,49,4,2.6
7,2017-06-08,111556,3,0,1,49,4,3.2
8,2017-06-09,111556,4,0,1,49,4,4.4
9,2017-06-10,111556,5,1,1,49,4,0.0


In [12]:
# Rows that still have to be predicted get a -1 placeholder target; the test columns are
# then put in the same order as the training frame so both can be stacked.
test['online_hours'] = -1
test = test[train.columns]

test_data = pd.concat([train, test])
test_data['date'] = pd.to_datetime(test_data['date'])

# Pivot to one column per (field, driver), forward-fill along the date axis, reindex to a
# daily calendar, then return to long format ordered by driver.
test_data = (
    test_data.set_index(['date', 'driver_id'])
    .unstack()
    .ffill()
    .asfreq('D')
    .stack()
    .sort_index(level=1)
    .reset_index()
)

# Re-derive the calendar features from the date (assign back rather than an in-place
# fillna on a column selection). weekend is 1 for dayofweek 5/6, else 0.
test_data['dayofweek'] = test_data['dayofweek'].fillna(test_data['date'].dt.dayofweek)
test_data['weekend'] = test_data['dayofweek'].ge(5).astype('int64')

test_data = test_data.sort_values(by=['driver_id', 'date']).drop_duplicates(subset=['date', 'driver_id'])
test_data = test_data.set_index(['date', 'driver_id', 'dayofweek', 'weekend', 'gender', 'age',
       'number_of_kids'])

# lag_k = the same driver's online_hours k days earlier (rows are ordered by date per driver).
for lag in range(1, 8):
    test_data['lag_{}'.format(lag)] = test_data.groupby(level=['driver_id'])['online_hours'].shift(lag)

# Mean of the previous seven days. The shift happens inside each driver's group, so a
# driver's first row can never pick up a value from the preceding driver, and transform
# returns a series aligned to the frame's own index.
test_data['rolling_mean'] = test_data.groupby(level=['driver_id'])['online_hours'].transform(
    lambda hours: hours.rolling(window=7).mean().shift(1)
)

# Rows without a full seven-day history contain NaN lags and are dropped here.
test_data = test_data.reset_index(drop=False).dropna()

# Fixed column order: date, driver_id, the model features, and online_hours last.
test_data = test_data[['date', 'driver_id', 'dayofweek', 'weekend', 'gender', 'age',
       'number_of_kids', 'lag_1', 'lag_2', 'lag_3', 'lag_4',
       'lag_5', 'lag_6', 'lag_7', 'rolling_mean', 'online_hours']]

In [13]:
# Inspect the assembled feature frame; rows still awaiting a prediction carry the -1
# placeholder in online_hours (and therefore in the lags derived from it).
test_data

Unnamed: 0,date,driver_id,dayofweek,weekend,gender,age,number_of_kids,lag_1,lag_2,lag_3,lag_4,lag_5,lag_6,lag_7,rolling_mean,online_hours
7,2017-06-08,111556,3,0,1,49,4,2.6,3.1,4.7,0.0,0.0,2.5,2.2,2.157143,3.2
8,2017-06-09,111556,4,0,1,49,4,3.2,2.6,3.1,4.7,0.0,0.0,2.5,2.300000,4.4
9,2017-06-10,111556,5,1,1,49,4,4.4,3.2,2.6,3.1,4.7,0.0,0.0,2.571429,0.0
10,2017-06-11,111556,6,1,1,49,4,0.0,4.4,3.2,2.6,3.1,4.7,0.0,2.571429,0.0
11,2017-06-12,111556,0,0,1,49,4,0.0,0.0,4.4,3.2,2.6,3.1,4.7,2.571429,3.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69911,2017-06-24,998740,5,1,0,27,0,-1.0,-1.0,0.0,0.0,0.0,0.0,0.0,-0.285714,-1.0
69912,2017-06-25,998740,6,1,0,27,0,-1.0,-1.0,-1.0,0.0,0.0,0.0,0.0,-0.428571,-1.0
69913,2017-06-26,998740,0,0,0,27,0,-1.0,-1.0,-1.0,-1.0,0.0,0.0,0.0,-0.571429,-1.0
69914,2017-06-27,998740,1,0,0,27,0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,0.0,-0.714286,-1.0


In [14]:
def reset_test(test_data):
    """Rebuild lag_1..lag_7 and rolling_mean from the current online_hours values.

    The frame is indexed by the identifying/static columns, each lag_k is set to the
    same driver's online_hours k rows earlier, and rolling_mean becomes the row-wise
    mean of the seven lag columns (missing lags are skipped by the mean).

    Parameters
    ----------
    test_data : pandas.DataFrame
        Long-format frame with the columns date, driver_id, dayofweek, weekend, gender,
        age, number_of_kids and online_hours. Rows are expected to be ordered by date
        within each driver, because the lags are positional shifts.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh integer index; the input frame is not modified.
    """
    key_columns = ['date', 'driver_id', 'dayofweek', 'weekend', 'gender', 'age', 'number_of_kids']
    lag_columns = ['lag_1', 'lag_2', 'lag_3', 'lag_4', 'lag_5', 'lag_6', 'lag_7']

    indexed = test_data.set_index(key_columns)
    for steps_back, column in enumerate(lag_columns, start=1):
        # Shifting online_hours by k within a driver equals shifting lag_(k-1) by one.
        indexed[column] = indexed.groupby(level=['driver_id'])['online_hours'].shift(steps_back)

    refreshed = indexed.reset_index()
    refreshed['rolling_mean'] = refreshed[lag_columns].mean(axis=1)
    return refreshed

In [15]:
# Recursive day-by-day forecast: predict one calendar day, write the predictions into
# online_hours, rebuild the lag features from them, then move on to the next day.
start_date = datetime.strptime('2017-06-22','%Y-%m-%d')
end_date = pd.to_datetime(np.max(test_data['date'].values))

for current_date in pd.date_range(start_date, end_date, freq='D'):
    day_mask = test_data['date'] == current_date
    if not day_mask.any():
        # No rows for this date: nothing to predict and no lags to refresh.
        continue

    chunk = test_data.loc[day_mask]
    # Feature columns only: skips date and driver_id in front and online_hours at the end
    # (relies on the column order date, driver_id, <features>, online_hours).
    X = chunk.iloc[:, 2:-1]
    y = [round(i, 1) for i in model.predict(X)]

    # Write straight into test_data with .loc; assigning into `chunk` would only modify a
    # temporary slice and raise SettingWithCopyWarning.
    test_data.loc[day_mask, 'online_hours'] = y

    # Refresh lag_1..lag_7 / rolling_mean so the next day sees today's predictions.
    test_data = reset_test(test_data)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


In [16]:
# Keep only the identifying columns and the predicted hours. The explicit copy makes
# `result` an independent frame, so assigning to its columns cannot raise
# SettingWithCopyWarning or write through to test_data.
result = test_data[['date', 'driver_id', 'online_hours']].copy()

In [17]:
# Bring both date columns to datetime64 so the merge keys compare equal (the raw test
# file stores dates as strings such as '2017-6-28'). assign() builds a new frame instead
# of writing into what may be a slice of test_data.
result = result.assign(date=pd.to_datetime(result['date']))
org_test['date'] = pd.to_datetime(org_test['date'])

# A left join keeps every row of the original test file in its original order, so the
# predictions stay row-aligned with org_test; a missing prediction shows up as NaN
# instead of the row silently disappearing, as it would with the default inner join.
pred_result = pd.merge(
    org_test[['date', 'driver_id']],
    result,
    on=['date', 'driver_id'],
    how='left',
)
pred_result.to_csv('../processed_data_and_models/PREDICTION.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [18]:
# Predicted hours, in the row order of the merged prediction frame.
y_pred = pred_result['online_hours'].to_numpy()

In [19]:
# Actual hours as recorded in the original test file.
y_true = org_test['online_hours'].to_numpy()

In [20]:
# Root-mean-squared error of the predictions against the actual test hours.
sqrt(mean_squared_error(y_true=y_true, y_pred=y_pred))

3.0375597341599287