In [10]:
# Suppress every warning category for the rest of the session.
import warnings
warnings.filterwarnings("ignore")
# Load the two input tables from CSV.
# NOTE(review): `pd` is used here, but the only `import pandas as pd` visible
# in this file sits in a later cell — confirm pandas is imported before this
# cell runs in a fresh kernel.
orders = pd.read_csv(r"data/final_test.csv")
nodes = pd.read_csv(r"data/nodes_test.csv")

In [67]:
import pandas as pd
import requests
import pytz
import json
import datetime
import numpy as np


def get_pick_hour_rate(orders):
    """Attach the per-hour order rate to every order.

    The hour of ``running_time`` is used as the join key against the table
    stored in ``orders_per_hour.pkl`` (read from the current working
    directory).  When both tables carry an ``Id`` column, the one coming from
    the pickle is renamed to ``orders_per_hour`` and the order's own ``Id`` is
    kept under its original name.
    NOTE(review): presumably the pickle holds one row per hour with the order
    count stored in its ``Id`` column — confirm against the code that
    produced it.

    Parameters
    ----------
    orders : pandas.DataFrame
        Must contain a ``running_time`` column parseable by
        ``pd.to_datetime``.  The frame is NOT modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``running_time`` converted to datetime and the
        columns of the hourly table merged in.  The merge is an inner join,
        so orders whose hour is absent from the hourly table are dropped.
    """
    # Work on a derived frame so the caller's DataFrame keeps its original
    # dtypes and does not gain a stray ``hour`` column.
    stamped = orders.assign(running_time=pd.to_datetime(orders['running_time']))
    stamped['hour'] = stamped['running_time'].dt.hour

    # NOTE(review): unpickling can execute arbitrary code; only load this file
    # from a trusted location.
    orders_per_hour = pd.read_pickle('orders_per_hour.pkl')

    merged = stamped.merge(orders_per_hour, on='hour')
    merged = merged.rename(columns={'Id_y': 'orders_per_hour', 'Id_x': 'Id'})
    # ``hour`` was only the join key; it is not part of the result.
    return merged.drop(columns=['hour'])

def get_coordinates(nodes):
    """Replace the OSM node ids of every edge with rounded coordinates.

    Each distinct id found in ``node_start`` / ``node_finish`` is looked up
    through the OpenStreetMap node API.  Lookups are best-effort: a node that
    cannot be fetched or parsed is skipped (with a warning), and edges that
    reference a skipped node disappear from the result because both merges
    are inner joins.

    Parameters
    ----------
    nodes : pandas.DataFrame
        Must contain ``node_start`` and ``node_finish`` columns.  The frame
        is not modified.

    Returns
    -------
    pandas.DataFrame
        The input rows without ``node_start`` / ``node_finish`` and with
        ``lat_start``, ``lon_start``, ``lat_finish``, ``lon_finish`` appended
        (rounded to 2 decimals).  Only rows with both latitudes strictly
        inside (45, 47) and both longitudes strictly inside (29.5, 31.5)
        are kept.
    """
    import warnings

    node_ids = np.union1d(nodes.node_start.unique(), nodes.node_finish.unique())

    # Collect one small frame per node and concatenate once at the end:
    # ``DataFrame.append`` no longer exists in pandas >= 2.0 and growing a
    # frame row by row is quadratic anyway.
    fetched = []
    for node in node_ids:
        url = f"https://www.openstreetmap.org/api/0.6/node/{node}"
        try:
            fetched.append(pd.read_xml(url)[["id", "lat", "lon"]])
        except Exception as exc:
            # Best-effort lookup: skip this node, but do not hide the reason
            # and do not trap KeyboardInterrupt / SystemExit.
            warnings.warn(f"Skipping OSM node {node}: {exc!r}")

    if fetched:
        nodes_info = pd.concat(fetched, ignore_index=True)
    else:
        # Nothing could be fetched (or there was nothing to fetch): keep the
        # expected schema so the merges below yield an empty, well-formed
        # result instead of raising.
        nodes_info = pd.DataFrame({
            "id": pd.Series(dtype="int64"),
            "lat": pd.Series(dtype="float64"),
            "lon": pd.Series(dtype="float64"),
        })

    start_info = nodes_info.rename(
        columns={"id": "id_start", "lat": "lat_start", "lon": "lon_start"}
    )
    finish_info = nodes_info.rename(
        columns={"id": "id_finish", "lat": "lat_finish", "lon": "lon_finish"}
    )
    nodes = nodes.merge(start_info, left_on="node_start", right_on="id_start")
    nodes = nodes.merge(finish_info, left_on="node_finish", right_on="id_finish")
    nodes = nodes.drop(
        columns=["id_start", "id_finish", "node_start", "node_finish"]
    )

    coordinate_columns = ["lat_start", "lon_start", "lat_finish", "lon_finish"]
    nodes[coordinate_columns] = nodes[coordinate_columns].round(2)

    # Keep only edges whose both ends fall inside the area of interest.
    nodes = nodes.query('(45<lat_start<47)&(45<lat_finish<47)&(29.5<lon_start<31.5)&(29.5<lon_finish<31.5)')
    return nodes
        
def _kyiv_sun_times(lat, lon, date):
    """Return ``(sunrise, sunset)`` as ``'HH:MM:SS'`` Europe/Kiev wall-clock strings for one place and date."""
    url = f"https://api.sunrise-sunset.org/json?lat={lat}&lng={lon}&date={date}"
    # One request serves both values; fail loudly on HTTP errors instead of
    # hitting a KeyError further down.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    results = response.json()['results']

    utc_timezone = pytz.timezone('UTC')
    kyiv_timezone = pytz.timezone('Europe/Kiev')
    local_clocks = []
    for key in ('sunrise', 'sunset'):
        utc_clock = datetime.datetime.strptime(results[key], '%I:%M:%S %p').time()
        # Attach the real calendar date before converting: a bare clock time
        # parses as 1900-01-01, for which pytz applies the historical local
        # mean offset instead of the +2h/+3h offset valid on the order date.
        utc_moment = utc_timezone.localize(datetime.datetime.combine(date, utc_clock))
        local_clocks.append(utc_moment.astimezone(kyiv_timezone).strftime('%H:%M:%S'))
    return local_clocks[0], local_clocks[1]


def get_day_time(all_data):
    """Split ``running_time`` into date and time and attach sunrise/sunset.

    The sunrise-sunset.org API is queried once per distinct
    (``lon_start``, ``lat_start``, ``date_of_order``) combination; rows whose
    key contains a missing value are not queried and end up with missing
    ``sunrise`` / ``sunset``.

    Parameters
    ----------
    all_data : pandas.DataFrame
        Must contain ``running_time``, ``lat_start`` and ``lon_start``.
        Modified in place: ``date_of_order`` and ``time_of_order`` are added
        and ``running_time`` is dropped.

    Returns
    -------
    pandas.DataFrame
        A new frame (left merge) with ``sunrise`` and ``sunset`` columns
        holding ``'HH:MM:SS'`` strings in Europe/Kiev time.

    Raises
    ------
    requests.RequestException
        If a request times out or returns an HTTP error status.
    """
    order_moments = pd.to_datetime(all_data.running_time)
    all_data['date_of_order'] = order_moments.dt.date
    all_data['time_of_order'] = order_moments.dt.time
    all_data.drop('running_time', axis=1, inplace=True)

    key_columns = ['lat_start', 'lon_start', 'date_of_order']
    # ``groupby`` skips keys containing NaN, so only complete keys are queried.
    group_keys = all_data.groupby(['lon_start', 'lat_start', 'date_of_order']).size().index

    rows = []
    for lon_start, lat_start, date in group_keys:
        sunrise, sunset = _kyiv_sun_times(lat_start, lon_start, date)
        rows.append({
            'lat_start': lat_start,
            'lon_start': lon_start,
            'date_of_order': date,
            'sunrise': sunrise,
            'sunset': sunset,
        })

    sun_data = pd.DataFrame(rows, columns=key_columns + ['sunrise', 'sunset'])
    # Keep numeric join keys numeric even when no row was collected.
    sun_data = sun_data.astype({'lat_start': 'float64', 'lon_start': 'float64'})
    return all_data.merge(sun_data, how='left', on=key_columns)

def _hourly_weather(lat, lon, date):
    """Fetch one day of hourly readings as ``(timestamp, temperature, precipitation)`` tuples."""
    # NOTE(review): the API is asked for Europe/Berlin local hours while
    # sunrise/sunset are expressed in Europe/Kiev time — confirm which zone
    # the order timestamps use; the two differ by one hour.
    url = f"https://archive-api.open-meteo.com/v1/archive?latitude={lat}&longitude={lon}&start_date={date}&end_date={date}&hourly=temperature_2m,precipitation&timezone=Europe%2FBerlin"
    # A single request carries all three hourly series.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    hourly = response.json()['hourly']
    return list(zip(hourly['time'], hourly['temperature_2m'], hourly['precipitation']))


def get_weather(all_data):
    """Attach hourly temperature/precipitation and a daylight flag.

    The Open-Meteo archive API is queried once per distinct
    (``lon_start``, ``lat_start``, ``date_of_order``) combination; rows whose
    key contains a missing value are not queried and end up with missing
    weather values.  Each order is matched to the reading whose hour equals
    the hour of ``time_of_order``.

    Parameters
    ----------
    all_data : pandas.DataFrame
        Must contain ``lat_start``, ``lon_start``, ``date_of_order``,
        ``time_of_order``, ``sunrise`` and ``sunset``.  Modified in place:
        a ``time`` column (``'HH:00:00'``) is added.

    Returns
    -------
    pandas.DataFrame
        A new frame (left merge) with ``temperature``, ``precipitation`` and
        a boolean ``is_day`` (True when ``sunrise < time_of_order < sunset``,
        compared as ``'HH:MM:SS'`` strings).

    Raises
    ------
    requests.RequestException
        If a request times out or returns an HTTP error status.
    """
    key_columns = ['lat_start', 'lon_start', 'date_of_order', 'time']
    group_keys = all_data.groupby(['lon_start', 'lat_start', 'date_of_order']).size().index

    rows = []
    for lon_start, lat_start, date in group_keys:
        for stamp, temperature, precipitation in _hourly_weather(lat_start, lon_start, date):
            hour_start = datetime.datetime.strptime(stamp, "%Y-%m-%dT%H:%M").time()
            rows.append({
                'lat_start': lat_start,
                'lon_start': lon_start,
                'date_of_order': date,
                # 'HH:MM:SS', the same text form as the order-side key below.
                'time': str(hour_start),
                'temperature': temperature,
                'precipitation': precipitation,
            })

    weather_data = pd.DataFrame(rows, columns=key_columns + ['temperature', 'precipitation'])
    # Keep numeric columns numeric even when no reading was collected
    # (missing readings become NaN).
    weather_data = weather_data.astype({
        'lat_start': 'float64',
        'lon_start': 'float64',
        'temperature': 'float64',
        'precipitation': 'float64',
    })

    # Truncate the order time to the start of its hour to match the readings.
    all_data["time"] = all_data.time_of_order.astype(str).str.slice(stop=2) + ":00:00"
    all_data = all_data.merge(weather_data, how='left', on=key_columns)

    order_clock = all_data["time_of_order"].astype('str')
    all_data["is_day"] = (all_data["sunrise"] < order_clock) & (order_clock < all_data["sunset"])
    return all_data

def final_prep(all_data) :
    """Collapse the enriched rows into one feature row per ``Id``.

    The helper columns (date, sunrise/sunset, hour key) and the four
    coordinate columns are removed from ``all_data`` in place, then the
    remaining columns are aggregated per ``Id``: number of rows (stored under
    ``time_of_order``), summed ``distance``, median ``is_day`` and the mean
    of every other feature.
    """
    helper_columns = ['date_of_order', 'sunrise', 'sunset', 'time']
    coordinate_columns = ['lat_start', 'lat_finish', 'lon_start','lon_finish']
    for obsolete in (helper_columns, coordinate_columns):
        all_data.drop(obsolete, axis=1, inplace=True)

    aggregations = {'time_of_order': 'count'}
    aggregations.update(dict.fromkeys(['route_distance_km', 'orders_per_hour', 'speed'], 'mean'))
    aggregations['distance'] = 'sum'
    aggregations['is_day'] = 'median'
    aggregations.update(dict.fromkeys(['temperature', 'precipitation'], 'mean'))

    per_order = all_data.groupby(['Id'], as_index=False)
    return per_order.agg(aggregations)
    
        
def complete_data(orders, nodes):
    """Run the whole feature pipeline and return one row per order ``Id``.

    Steps, in order: add the hourly order rate to ``orders``, resolve node
    coordinates for ``nodes``, left-join the two on ``Id``, then enrich the
    joined rows with sunrise/sunset and weather before the final per-``Id``
    aggregation.
    """
    rated_orders = get_pick_hour_rate(orders)
    located_nodes = get_coordinates(nodes)
    joined = rated_orders.merge(located_nodes, how='left', on='Id')
    with_weather = get_weather(get_day_time(joined))
    return final_prep(with_weather)


In [69]:
# Build the aggregated feature table from the loaded orders and nodes.
test = complete_data(orders, nodes)

In [72]:
# Persist the feature table as a pickle for later use.
test.to_pickle('data/test.pkl')