In [1]:
import os
import pandas as pd

from tqdm.notebook import tqdm

DATA_DIR = '../../../data/rosstat/processed'

In [2]:
economics_df = pd.read_csv(os.path.join(DATA_DIR, 'result_economics_df_v1_0.csv'))

In [3]:
vc = economics_df['code'].value_counts()
vc.value_counts()

count
84    69
60     1
Name: count, dtype: int64

In [4]:
save_codes = list(vc[vc == 84].index)
economics_df = economics_df[economics_df['code'].isin(save_codes)]

In [5]:
assert economics_df.groupby(['code'])['date'].min().nunique() == 1
assert economics_df.groupby(['code'])['date'].max().nunique() == 1

In [6]:
df_copy = economics_df.copy()
df_copy['year'] = pd.to_datetime(df_copy['date']).dt.year

df_copy['split'] = 'train'

for code in tqdm(df_copy['code'].unique()):
    code_data = df_copy[df_copy['code'] == code]
    
    years = sorted(code_data['year'].unique())
    
    assert len(years) >= 3

    last_year = years[-1]
    second_last_year = years[-2]
    
    df_copy.loc[(df_copy['code'] == code) & (df_copy['year'] == last_year), 'split'] = 'test'
    
    df_copy.loc[(df_copy['code'] == code) & (df_copy['year'] == second_last_year), 'split'] = 'val'

train_df = df_copy[df_copy['split'] == 'train']
val_df = df_copy[df_copy['split'] == 'val']
test_df = df_copy[df_copy['split'] == 'test']

columns2drop = ['year', 'split']
train_df.drop(columns=columns2drop, inplace=True)
val_df.drop(columns=columns2drop, inplace=True)
test_df.drop(columns=columns2drop, inplace=True)

print(f"Обучающая выборка: {train_df.shape[0]} строк")
print(f"Валидационная выборка: {val_df.shape[0]} строк")
print(f"Тестовая выборка: {test_df.shape[0]} строк")

assert set(train_df['code'].unique()) == set(val_df['code'].unique()) == set(test_df['code'].unique()), 'Не все группы по code успешно разделены'

  0%|          | 0/69 [00:00<?, ?it/s]

Обучающая выборка: 4140 строк
Валидационная выборка: 828 строк
Тестовая выборка: 828 строк


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df.drop(columns=columns2drop, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_df.drop(columns=columns2drop, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df.drop(columns=columns2drop, inplace=True)


In [7]:
names = ['train', 'val', 'test']

for name in names:
    df = eval(f'{name}_df')
    file_path = os.path.join(DATA_DIR, name, 'data.csv')
    os.makedirs(os.path.dirname(file_path), exist_ok=True)

    df.to_csv(file_path, index=False)