In [19]:
from sklearn import datasets
import pandas as pd

# Load the breast-cancer toy dataset bundled with scikit-learn and wrap its
# feature matrix in a DataFrame whose columns carry the feature names.
data = datasets.load_breast_cancer()
data_df = pd.DataFrame(data["data"], columns=data["feature_names"])

In [20]:
# Partition the feature columns into four arbitrary groups (by position in
# data.feature_names) and keep the labels in their own single-column frame.
data_df1 = data_df.loc[:, data.feature_names[:5]]
data_df2 = data_df.loc[:, data.feature_names[5:10]]
data_df3 = data_df.loc[:, data.feature_names[10:17]]
data_df4 = data_df.loc[:, data.feature_names[17:30]]
target_df = pd.DataFrame({"target": data.target})

In [21]:
# Build one timestamp per row of data_df: a daily series whose last entry is
# the moment this cell is executed, stored in a column named "event_timestamp".
# NOTE(review): pd.Timestamp.now() is timezone-naive local time and differs on
# every run — confirm this is what the downstream consumer expects.
timestamps = pd.DataFrame(
    {
        "event_timestamp": pd.date_range(
            end=pd.Timestamp.now(),
            periods=len(data_df),
            freq='D',
        )
    }
)

In [22]:
# Adding the timestamp column to each DataFrame.
# Feast requires a timestamp.
#
# `assign` keeps this cell idempotent: re-running it overwrites the existing
# "event_timestamp" column instead of appending a duplicate one. Values are
# aligned on the row index, which every frame shares with `timestamps`.
data_df1, data_df2, data_df3, data_df4, target_df = [
    frame.assign(event_timestamp=timestamps["event_timestamp"])
    for frame in (data_df1, data_df2, data_df3, data_df4, target_df)
]

In [23]:
# Preview data_df1, which now carries the event_timestamp column.
data_df1

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,event_timestamp
0,17.99,10.38,122.80,1001.0,0.11840,2022-02-13 16:38:32.229263
1,20.57,17.77,132.90,1326.0,0.08474,2022-02-14 16:38:32.229263
2,19.69,21.25,130.00,1203.0,0.10960,2022-02-15 16:38:32.229263
3,11.42,20.38,77.58,386.1,0.14250,2022-02-16 16:38:32.229263
4,20.29,14.34,135.10,1297.0,0.10030,2022-02-17 16:38:32.229263
...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,2023-08-31 16:38:32.229263
565,20.13,28.25,131.20,1261.0,0.09780,2023-09-01 16:38:32.229263
566,16.60,28.08,108.30,858.1,0.08455,2023-09-02 16:38:32.229263
567,20.60,29.33,140.10,1265.0,0.11780,2023-09-03 16:38:32.229263


In [24]:
# Creating a list of arbitrary IDs for feature rows
# (arbitrary identifiers for the entity: 0, 1, ..., len(data_df) - 1).
patient_ids = pd.DataFrame(data=list(range(len(data_df))), columns=["patient_id"])

# Adding the patient ID column to each DataFrame.
#
# `assign` keeps this cell idempotent: re-running it overwrites the existing
# "patient_id" column instead of appending a duplicate one. Values are aligned
# on the row index, which every frame shares with `patient_ids`.
data_df1, data_df2, data_df3, data_df4, target_df = [
    frame.assign(patient_id=patient_ids["patient_id"])
    for frame in (data_df1, data_df2, data_df3, data_df4, target_df)
]

In [25]:
# Preview target_df: the target label plus event_timestamp and patient_id.
target_df

Unnamed: 0,target,event_timestamp,patient_id
0,0,2022-02-13 16:38:32.229263,0
1,0,2022-02-14 16:38:32.229263,1
2,0,2022-02-15 16:38:32.229263,2
3,0,2022-02-16 16:38:32.229263,3
4,0,2022-02-17 16:38:32.229263,4
...,...,...,...
564,0,2023-08-31 16:38:32.229263,564
565,0,2023-09-01 16:38:32.229263,565
566,0,2023-09-02 16:38:32.229263,566
567,0,2023-09-03 16:38:32.229263,567


In [26]:
# Persist every feature group and the target frame to its own Parquet file,
# using paths relative to the current working directory.
for parquet_path, frame_to_save in [
    ('data_df1.parquet', data_df1),
    ('data_df2.parquet', data_df2),
    ('data_df3.parquet', data_df3),
    ('data_df4.parquet', data_df4),
    ('target_df.parquet', target_df),
]:
    frame_to_save.to_parquet(path=parquet_path)

In [27]:
# Preview data_df1 with its feature columns, event_timestamp and patient_id.
data_df1

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,event_timestamp,patient_id
0,17.99,10.38,122.80,1001.0,0.11840,2022-02-13 16:38:32.229263,0
1,20.57,17.77,132.90,1326.0,0.08474,2022-02-14 16:38:32.229263,1
2,19.69,21.25,130.00,1203.0,0.10960,2022-02-15 16:38:32.229263,2
3,11.42,20.38,77.58,386.1,0.14250,2022-02-16 16:38:32.229263,3
4,20.29,14.34,135.10,1297.0,0.10030,2022-02-17 16:38:32.229263,4
...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,2023-08-31 16:38:32.229263,564
565,20.13,28.25,131.20,1261.0,0.09780,2023-09-01 16:38:32.229263,565
566,16.60,28.08,108.30,858.1,0.08455,2023-09-02 16:38:32.229263,566
567,20.60,29.33,140.10,1265.0,0.11780,2023-09-03 16:38:32.229263,567
