In [1]:
# Import necessary libraries
# NOTE(review): the Sequenzo import below is a wildcard import; no alias is created,
# so the Sequenzo functions used in later cells are called by their bare names.

from sequenzo import * # Wildcard import: brings Sequenzo's public names into the notebook namespace
import pandas as pd # Data manipulation

# Print the names of the example datasets that Sequenzo can load.
available_datasets = list_datasets()
print('Available datasets in Sequenzo: ', available_datasets)

# Load the dataset explored in this tutorial. `df` is the conventional short
# name for a dataframe. The cells below rely on this dataset's columns
# ("id", "weight40" and the time columns), so they need adapting if another
# dataset name is used here.
DATASET_NAME = 'pairfam_family'
df = load_dataset(DATASET_NAME)

# Keep the dataframe as the last expression so the notebook renders it.
df

Available datasets in Sequenzo:  ['country_co2_emissions_local_quintiles', 'country_co2_emissions_local_deciles', 'polyadic_seqc1', 'polyadic_samplep1', 'mvad', 'chinese_colonial_territories', 'polyadic_samplec1', 'country_gdp_per_capita', 'polyadic_seqp1', 'country_co2_emissions', 'biofam_married_domain', 'country_co2_emissions_global_deciles', 'country_co2_emissions_global_quintiles', 'biofam', 'pairfam_family', 'biofam_left_domain', 'biofam_child_domain']


Unnamed: 0,id,weight40,sex,doby_gen,dob,ethni,migstatus,yeduc,sat1i4,sat5,...,255,256,257,258,259,260,261,262,263,264
0,111000.0,0.343964,1,1971,855,1,1,11.5,5,7,...,4,4,4,4,4,4,4,4,4,4
1,1624000.0,1.467063,1,1973,880,1,1,11.5,9,8,...,8,8,8,8,8,8,8,8,8,8
2,2767000.0,0.463918,1,1971,853,1,1,9.0,9,-2,...,8,8,8,8,8,8,8,8,8,8
3,2931000.0,1.767455,0,1973,881,5,3,10.5,5,5,...,9,9,9,9,9,9,9,9,9,9
4,3167000.0,0.884521,1,1973,883,1,1,11.5,8,10,...,8,8,8,8,8,8,8,8,8,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1861,919347000.0,0.964979,0,1973,880,1,1,11.5,8,4,...,1,1,1,1,1,1,1,1,1,1
1862,919488000.0,1.981277,0,1971,863,1,1,10.5,10,2,...,9,9,9,9,9,9,9,9,9,9
1863,919910000.0,0.485150,1,1973,878,1,1,20.0,7,7,...,8,8,8,8,8,8,8,8,8,8
1864,920140000.0,2.022823,0,1971,859,1,1,11.5,10,10,...,7,7,7,7,7,7,7,7,7,7


In [2]:
# Time axis: 264 monthly time points, labelled "1" ... "264" as strings.
time_list = [str(month) for month in range(1, 264 + 1)]

# One entry per state: numeric code -> (label, hex colour).
# Keeping code, label and colour side by side makes their alignment explicit.
# NOTE(review): confirm the label order against the pairfam_family codebook.
# In the cross-sectional output further down, state 3 holds ~31% and state 5
# ~3% at the first time point, which would be unusual for "Cohabiting, no child"
# and "Single, with child(ren)". If the data use an interleaved coding
# (Single, Single+child, LAT, LAT+child, ...), these labels are misaligned
# -- TODO verify before interpreting results.
STATE_CATALOG = {
    1: ("Single, no child", "#74C9B4"),                 # verdant green (fresh mountain green)
    2: ("Living apart together, no child", "#A6E3D0"),  # aqua green (clear stream water)
    3: ("Cohabiting, no child", "#F9E79F"),             # apricot yellow (soft sunlight)
    4: ("Married, no child", "#F6CDA3"),                # apricot orange (warm sunset glow)
    5: ("Single, with child(ren)", "#F5B7B1"),          # rosy pink (peach blossom in spring)
    6: ("LAT, with child(ren)", "#D7BDE2"),             # pale purple (orchid)
    7: ("Cohabiting, with child(ren)", "#A3C4F3"),      # celadon blue (porcelain glaze)
    8: ("Married, 1 child", "#7FB3D5"),                 # azure blue (river meeting sky)
    9: ("Married, 2+ children", "#EAECEE"),             # cloud white (light mist)
}

# Derive the three parallel lists that SequenceData receives below.
states = list(STATE_CATALOG)
labels = [label for label, _ in STATE_CATALOG.values()]
colors = [color for _, color in STATE_CATALOG.values()]

# Build the SequenceData object from the dataframe.
sequence_data = SequenceData(
    df,
    time=time_list,                 # names of the time columns
    id_col="id",                    # column holding each sequence's ID
    states=states,
    labels=labels,
    weights=df['weight40'].values,  # values of the weight column, as an array
    custom_colors=colors
)



[>] SequenceData initialized successfully! Here's a summary:
[>] Number of sequences: 1866
[>] Number of time points: 264
[>] Min/Max sequence length: 264 / 264
[>] States: [1, 2, 3, 4, 5, 6, 7, 8, 9]
[>] Labels: ['Single, no child', 'Living apart together, no child', 'Cohabiting, no child', 'Married, no child', 'Single, with child(ren)', 'LAT, with child(ren)', 'Cohabiting, with child(ren)', 'Married, 1 child', 'Married, 2+ children']
[>] Weights: Provided (total weight=2346.268, mean=1.257, std=1.046)


In [3]:
# Complexity index per sequence; the displayed table has one row per sequence ID.
get_complexity_index(sequence_data)

Unnamed: 0,ID,Complexity Index
0,111000.0,0.128952
1,1624000.0,0.135689
2,2767000.0,0.085061
3,2931000.0,0.142920
4,3167000.0,0.079141
...,...,...
1861,919347000.0,0.023616
1862,919488000.0,0.088649
1863,919910000.0,0.099346
1864,920140000.0,0.161840


In [4]:
# Turbulence per sequence; the displayed table has one row per sequence ID.
# NOTE(review): this call prints a warning that missing values found while counting
# distinct subsequences were replaced with 1e15 -- check whether that affects the
# reported turbulence values.
get_turbulence(sequence_data)

[!] One or more missing values were found after calculating the number of distinct subsequences. They have been replaced with a large number of 1e15 to ensure the calculation continues.


Unnamed: 0,ID,Turbulence
0,111000.0,13.201720
1,1624000.0,10.986395
2,2767000.0,7.388162
3,2931000.0,11.855482
4,3167000.0,6.962700
...,...,...
1861,919347000.0,3.921091
1862,919488000.0,6.769877
1863,919910000.0,9.955445
1864,920140000.0,17.672635


In [5]:
# State distribution per sequence: one row per sequence ID, one column per state (1-9).
# NOTE(review): the displayed values look like counts of time points per state
# (the rows shown sum to 264), and no entropy column is visible in the displayed
# table despite the function name -- confirm against the Sequenzo documentation.
get_state_freq_and_entropy_per_seq(sequence_data)

[>] Computing state distribution for 1866 sequences and 9 states ...


Unnamed: 0,ID,1,2,3,4,5,6,7,8,9
0,111000.0,0.0,128.0,0.0,118.0,10.0,3.0,0.0,5.0,0.0
1,1624000.0,75.0,0.0,17.0,0.0,52.0,0.0,49.0,71.0,0.0
2,2767000.0,12.0,0.0,1.0,0.0,83.0,0.0,1.0,167.0,0.0
3,2931000.0,123.0,0.0,36.0,0.0,14.0,0.0,9.0,45.0,37.0
4,3167000.0,0.0,0.0,35.0,0.0,129.0,0.0,27.0,73.0,0.0
...,...,...,...,...,...,...,...,...,...,...
1861,919347000.0,254.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0
1862,919488000.0,53.0,0.0,11.0,14.0,0.0,161.0,0.0,0.0,25.0
1863,919910000.0,126.0,0.0,81.0,0.0,0.0,0.0,9.0,48.0,0.0
1864,920140000.0,50.0,0.0,67.0,0.0,133.0,0.0,14.0,0.0,0.0


In [7]:
# Cross-sectional state frequencies and entropy in long format. The displayed
# columns are: time, state, freq, Entropy_norm, N_valid, Effective States, rank, is_top.
# The returned frame also carries a "summary" entry in `.attrs`, which the next cell reads.
# NOTE(review): the displayed rows start at time 1 and end at time 99 although there
# are 264 time points, so `time` appears to be ordered as strings rather than
# numerically -- confirm before plotting or slicing by time.
result = get_cross_sectional_entropy(sequence_data)
result

Unnamed: 0,time,state,freq,Entropy_norm,N_valid,Effective States,rank,is_top
0,1,1,0.644610,0.380483,2346.268274,2.307133,1,True
1,1,3,0.309302,0.380483,2346.268274,2.307133,2,False
2,1,5,0.032307,0.380483,2346.268274,2.307133,3,False
3,1,7,0.006102,0.380483,2346.268274,2.307133,4,False
4,1,6,0.002092,0.380483,2346.268274,2.307133,5,False
...,...,...,...,...,...,...,...,...
2371,99,7,0.099196,0.863978,2346.268274,6.674902,5,False
2372,99,9,0.062783,0.863978,2346.268274,6.674902,6,False
2373,99,6,0.042628,0.863978,2346.268274,6.674902,7,False
2374,99,2,0.020513,0.863978,2346.268274,6.674902,8,False


In [None]:
# Fetch the "summary" entry stored in the entropy result's `attrs`.
summary = result.attrs["summary"]

# Print the three sentence fragments back-to-back on a single line
# (an empty separator keeps them joined exactly as written).
print(
    f"共有 {summary['n_states']} 个状态、{summary['n_timepoints']} 个时间点；",
    f"主导状态平均占比≈{summary['dominant_stability_ratio']:.2f}；",
    f"熵最高在 t={summary['peak_entropy_time']}，最低在 t={summary['lowest_entropy_time']}。",
    sep="",
)

# TODO: this should be moved inside the function

共有 9 个状态、264 个时间点；主导状态平均占比≈0.34；熵最高在 t=143，最低在 t=1。
