In [9]:
%pip install pyg_lib torch_scatter torch_sparse torch_cluster torch_spline_conv -q -f https://data.pyg.org/whl/torch-2.0.1+cu118.html


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [None]:
from itertools import combinations, groupby
from torch_geometric.utils import from_networkx
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.impute import SimpleImputer
from torch_geometric.data import Batch
from tqdm.notebook import tqdm
from tqdm import tqdm

import matplotlib.pyplot as plt
import geopandas as gpd
import numpy as np
import pandas as pd
import networkx as nx
import torch
import pickle
import os

In [153]:

SEED = 111

region = 777
target_business="kafe_delivery"

root_data_path = "/home/jupyter/datasphere/s3/s3-sirius/sirius_2024_participants/data/"
root_features_path = "geo_features/"
root_transaction_path = "transaction_data/"

root_graphs_path = "/home/jupyter/datasphere/s3/s3-sirius/sirius_2024_participants/catboost_train/data/graphs/"
root_metro_features_path = "/home/jupyter/datasphere/project/data/mrt/"


cols2drop = ["zone_id", "city_id", "datep"]
transaction_features = ["industry_type", "transaction_sum", "transaction_count", "customer_count"]
zone_id_column = "zone_id"

bussiness_features_agg = dict(
    transaction_sum_sum=("transaction_sum", "sum"),
    transaction_count_sum=("transaction_count", "sum"),
    transaction_count_mean=("transaction_count", "mean"),
    transaction_sum_mean=("transaction_sum", "mean"),
    customer_count_sum=("customer_count", "sum"),
    customer_count_mean=("customer_count", "mean")
)

na_features_to_fill_with_mean = ['construction_year_max', 'construction_year_mean', 'construction_year_min', 'construction_year_max_city_relative', 'construction_year_mean_city_relative',  'construction_year_min_city_relative',
'construction_year_max_neighbour_max', 'construction_year_mean_neighbour_max', 'construction_year_min_neighbour_max', 'construction_year_max_neighbour_mean', 'construction_year_mean_neighbour_mean',
'construction_year_min_neighbour_mean']

na_features_to_fill_with_median = ['price_meter_mean', 'price_meter_min', 'price_meter_mean_city_relative', 'price_meter_min_city_relative',  'price_meter_mean_neighbour_max', 'price_meter_min_neighbour_max']

target_agg_column = "transaction_sum"

kfold_splits_settings = dict(
    quantilies=10, 
    n_folds=5,
    use_stratification=True
)

cols2drop_from_final_graph = ['region_id',
     'city_id',
     'ldy',
     'ldx',
     'lty',
     'ltx',
     'rty',
     'rtx',
     'rdy',
     'rdx',
     'lat_centre',
     'lon_centre',
     'geometry',
    'datep'
]

regions_mapper = {
        777 : "Moscow",
        812 : "Saint-Petersburg",
        287 : "Kazan",
        473 : "Sochi"
    }

path_to_save_graph = f"../data/graph_preprocessing/{regions_mapper[region]}/graph_with_cv_full_new.pickle"
path_to_save_targets = f"../data/graph_preprocessing/{regions_mapper[region]}/targets_with_cv_full_new.parquet"
path_to_save_features = f"../data/graph_preprocessing/{regions_mapper[region]}/nodes_features_with_cv_full_new.parquet"

save_result = True

features_config = dict(add_business_features=True, metro_features=["number of subways", "distance_to_nearest_metro"])

In [154]:
def train_test_split(x, train_size, val_size):
    
    np.random.seed(SEED)
    x = x.sample(frac=1)
    
    train_actual_size = int(x.shape[0] * train_size)
    val_actual_size = int(x.shape[0] * val_size) + train_actual_size
    train = x.iloc[:train_actual_size, :]
    val = x.iloc[train_actual_size:val_actual_size + 1, :]
    test = x.iloc[val_actual_size + 1:, :]
    
    return list(train.index), list(val.index), list(test.index)

def split_folds(data, target_column, quantilies=10, n_folds=5, use_stratification=False):
    
    data_copy = data.copy()
    
    train_folds = []
    val_folds = []
    
    if use_stratification:
    
        data_copy["stratified_target"] = pd.qcut(data_copy[target_column], q=quantilies, labels=np.arange(quantilies))
        
        skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=SEED)
        
        for i, (train_ind, val_ind) in enumerate(skf.split(data_copy, data_copy["stratified_target"])):
            
            train_folds.append(data_copy.iloc[train_ind, :].index)
            val_folds.append(data_copy.iloc[val_ind, :].index)
            
    else:
        
        kf = KFold(n_splits=n_folds, shuffle=True, random_state=SEED)
        
        for i, (train_ind, val_ind) in enumerate(kf.split(data_copy)):
            
            train_folds.append(data_copy.iloc[train_ind, :].index)
            val_folds.append(data_copy.iloc[val_ind, :].index)
            
    return train_folds, val_folds

In [155]:
def preprocess_graph():
    
    
    #loading data
    features2keep = []
    
    print("Staring loading data...", end="\n")
    features = pd.read_parquet(
        os.path.join(
            root_data_path, 
            root_features_path, 
            f"geo_features_{region}_buffer_5.parquet"
        )
    )
                               
    features2keep.extend(
        list([col for col in features.columns if col != zone_id_column])
    )
    
                               
    transactions = pd.read_parquet(
        os.path.join(root_data_path, root_transaction_path, f"transaction_data_{region}_buffer_5.parquet")
    )[transaction_features + [zone_id_column]]
    
    target = pd.read_parquet(target_path)
                               
    with open(os.path.join(root_graphs_path, f"graph_{region}_buffer_5_road_connected.gpickle"), 'rb') as f:
        Graph = pickle.load(f)
    
    print("Finished loading data...", end="\n")
    

    #adding new features
    if features_config["add_business_features"]:
        
        print("Staring adding business features...", end="\n")
        
        #selecting all businesses except for target
        bs_types = [x for x in transactions['industry_type'].unique() if x != target_business]
        
        #creating new df for this features
        bs_features = pd.DataFrame({zone_id_column : transactions.zone_id.unique()})
                               
        #creating features
        for bs in tqdm(bs_types):
                               
            cur_bs_features = transactions[transactions.industry_type == bs].groupby(
                zone_id_column, as_index=False
            ).agg(**bussiness_features_agg)
            
            cur_bs_features.columns = [zone_id_column] + [f"{bs}_{x}" for x in cur_bs_features.columns[1:]]
            features2keep.extend(cur_bs_features.columns[1:])
            bs_features = bs_features.merge(cur_bs_features, on=zone_id_column, how="left")

        features = features.merge(bs_features, on=zone_id_column, how="left")
        
        print("Finished adding business features...", end="\n")
        

    #adding metro features (optionally, for some cities there are no Metro)
    if features_config["metro_features"]:
        
        print("Staring adding metro features...", end="\n")
        
        metro_features = pd.read_csv(
           os.path.join(root_metro_features_path, f"subway_{region}_features.csv")
        )[[zone_id_column] + features_config["metro_features"]]
        
        features = features.merge(metro_features, on=zone_id_column, how="left")

        features2keep.extend(features_config["metro_features"])
        
        print("Finished adding metro features...", end="\n")
    
    
    print("Staring adding features oh the graph", end="\n")
    #aggregating everything
    nodes_features = pd.DataFrame({zone_id_column : list(Graph.nodes())})
        
    nodes_features = nodes_features[[zone_id_column]].merge(
        features[[zone_id_column] + features2keep],  on=zone_id_column, how='left',
    ).set_index(zone_id_column)
        
    nodes_features[zone_id_column] = nodes_features.index
    
    #filling up na`s
    for col in na_features_to_fill_with_mean:
        nodes_features[col] = SimpleImputer(
            strategy="mean"
        ).fit_transform(nodes_features[col].values.reshape(-1,1))

    for col in na_features_to_fill_with_median:
        nodes_features[col] = SimpleImputer(
            strategy="median"
        ).fit_transform(nodes_features[col].values.reshape(-1,1))

    nodes_features.fillna(-1, inplace=True)
    
    print(list(nodes_features.columns))
    print(len(list(nodes_features.columns)))
    print(len(set(list(nodes_features.columns))))
    
    print([col for col in list(nodes_features.columns) if col not in past_arr])

    #setting features to graph
    nx.set_node_attributes(Graph, nodes_features.to_dict(orient="index"))
    
    print("Finished adding features oh the graph", end="\n")
    
    
    print("Staring creating target and masks", end="\n")
    #forming target
    target_transactions = transactions[moscow_transactions.industry_type == target_business]
    targets = target_transactions.groupby(
        zone_id_column, as_index=False
    )[target_agg_column].agg("sum").set_index(zone_id_column)
    
    #default split
    tar_train, tar_val, tar_test = train_test_split(x=targets, train_size=0.7, val_size=0.15)
    
    #split folds
    train_folds, val_folds = split_folds(
        data=targets, 
        target_column=target_agg_column, 
        **kfold_splits_settings
    )
    
    #df with masks
    targets_out = pd.DataFrame(
        {
            zone_id_column : list(Graph.nodes()), 
            "y" : [0] * len(list(Graph.nodes())),
        }
    ).set_index(zone_id_column)

    #setting target to all nodes
    for i in list(G_moscow.nodes()):
        if i in targets.index: targets_out.loc[i, "y"] = targets.loc[i, target_agg_column]
        else: targets_out.loc[i, "y"] = -1

    #creating mask for deafault splits
    targets_masks = pd.DataFrame({
        zone_id_column : list(Graph.nodes()), 
        "train_mask" : list(Graph.nodes()),
        "val_mask" : list(Graph.nodes()),
        "test_mask" : list(Graph.nodes()),
        "train_val_mask" : list(Graph.nodes()),
    }).set_index(zone_id_column).isin(
        {
            "train_mask" : tar_train, 
            "val_mask" : tar_val, 
            "test_mask" : tar_test, 
            "train_val_mask" : tar_train + tar_val
        }
    ).reset_index(drop=False)

    #creating mask for kfold splits
    k_folds_mask_df = pd.DataFrame({zone_id_column : list(Graph.nodes()),})
    isin_dict = {}

    for i, (train_fold_idx, val_fold_idx) in enumerate(zip(train_folds, val_folds)):
        k_folds_mask_df[f"train_fold_{i}"] = list(Graph.nodes())
        k_folds_mask_df[f"val_fold_{i}"] = list(Graph.nodes())
        isin_dict[f"train_fold_{i}"] = train_fold_idx
        isin_dict[f"val_fold_{i}"] = val_fold_idx

    k_folds_mask_df = k_folds_mask_df.set_index(zone_id_column).isin(isin_dict).reset_index(drop=False)

    targets_out = targets_out.merge(targets_masks, on=zone_id_column, how="left").set_index(zone_id_column)
    targets_out = targets_out.merge(k_folds_mask_df, on=zone_id_column, how="left").set_index(zone_id_column)
    
    #setting masks to graph
    nx.set_node_attributes(Graph, targets_out.to_dict(orient="index"))
    print("Finished creating target and masks", end="\n")
    
    print("Staring creating final graph", end="\n")
    # creating torch_geometric.Data class
    to_keep_as_single = [col for col in k_folds_mask_df.columns if "fold" in col] + \
    ["y", "train_mask", "val_mask", "test_mask", "train_val_mask", zone_id_column]
                               
    g_data = from_networkx(Graph,
                            group_node_attrs=[
                                x for x in list(next(iter(Graph.nodes(data=True)))[-1].keys())
                                if (x not in cols2drop_from_final_graph + to_keep_as_single)
                            ]
                          )
    print("Finished creating final graph", end="\n")
                               
    for col2drop in cols2drop_from_final_graph: 
        try: setattr(g_data, col2drop, None)
        except: continue

    if save_result:
        print("Saving...", end="\n")
        torch.save(g_data, path_to_save_graph)
        targets_out.to_parquet(path_to_save_targets, index=False)
        nodes_features.to_parquet(path_to_save_features, index=False)
        print("Saving finished succesfully!", end="\n")

In [156]:
# targets_out.to_csv("../data/zones_mask_473.csv", index=True)

In [157]:
cols = preprocess_graph()

Staring loading data...
Finished loading data...
Staring adding business features...


100%|██████████| 16/16 [00:00<00:00, 47.28it/s]


Finished adding business features...
Staring adding metro features...
Finished adding metro features...
Staring adding features oh the graph
['city_id', 'average_check', 'check_count', 'turnover', 'turnover_log', 'average_check_city_relative', 'check_count_city_relative', 'turnover_city_relative', 'turnover_log_city_relative', 'average_check_neighbour_max', 'check_count_neighbour_max', 'turnover_neighbour_max', 'turnover_log_neighbour_max', 'average_check_neighbour_mean', 'check_count_neighbour_mean', 'turnover_neighbour_mean', 'turnover_log_neighbour_mean', 'construction_year_max', 'construction_year_mean', 'construction_year_min', 'price_max', 'price_mean', 'price_min', 'price_meter_max', 'price_meter_mean', 'price_meter_min', 'room_space_max', 'room_space_mean', 'room_space_min', 'construction_year_max_city_relative', 'construction_year_mean_city_relative', 'construction_year_min_city_relative', 'price_max_city_relative', 'price_mean_city_relative', 'price_min_city_relative', 'price