In [1]:
import pandas as pd
import re
path="tus_00startime.csv"
df = pd.read_csv(path)

print(df.head())
print("\ncolumns:", list(df.columns))

                    DATAFLOW        LAST UPDATE    freq  \
0  ESTAT:TUS_00STARTIME(1.0)  24/04/18 23:00:00  Annual   
1  ESTAT:TUS_00STARTIME(1.0)  24/04/18 23:00:00  Annual   
2  ESTAT:TUS_00STARTIME(1.0)  24/04/18 23:00:00  Annual   
3  ESTAT:TUS_00STARTIME(1.0)  24/04/18 23:00:00  Annual   
4  ESTAT:TUS_00STARTIME(1.0)  24/04/18 23:00:00  Annual   

                     unit    sex             startime   acl00       geo  \
0  Participation rate (%)  Total  From 04:00 to 04:09  Eating   Belgium   
1  Participation rate (%)  Total  From 04:00 to 04:09  Eating   Belgium   
2  Participation rate (%)  Total  From 04:00 to 04:09  Eating  Bulgaria   
3  Participation rate (%)  Total  From 04:00 to 04:09  Eating   Germany   
4  Participation rate (%)  Total  From 04:00 to 04:09  Eating   Germany   

   TIME_PERIOD  OBS_VALUE  OBS_FLAG  CONF_STATUS  
0         2000       0.15       NaN          NaN  
1         2010       0.10       NaN          NaN  
2         2000       0.01       NaN      

In [2]:
activities = sorted(df["acl00"].dropna().unique())

print("\nNumber of activities:", len(activities))
print("\nActivities:")
for a in activities:
    print("-", a)


Number of activities: 9

Activities:
- Eating
- Household and family care and related travel
- Leisure, social and associative life except TV and video
- Personal care except eating
- Television and video
- Total
- Travel to/from work/study
- Unspecified time use and travel
- Work and study


In [3]:
print("\nCountries:", sorted(df["geo"].dropna().unique()))
print("\nYears:", sorted(df["TIME_PERIOD"].dropna().unique()))
print("\nSex:", sorted(df["sex"].dropna().unique()))


Countries: ['Belgium', 'Bulgaria', 'Estonia', 'France', 'Germany', 'Greece', 'Italy', 'Latvia', 'Lithuania', 'Spain']

Years: [2000, 2010]

Sex: ['Total']


In [4]:
if "sex" in df.columns:
    df = df[df["sex"] == "Total"].copy()
else:
    raise ValueError("Column 'sex' not found in dataframe")

print("After sex=Total:", df.shape)

After sex=Total: (14400, 12)


In [5]:
keep_cols = ["geo", "TIME_PERIOD", "acl00", "startime", "OBS_VALUE"]
missing = [c for c in keep_cols if c not in df.columns]
if missing:
    raise ValueError(f"Missing expected columns: {missing}")

df = df[keep_cols].rename(columns={
    "geo": "country",
    "TIME_PERIOD": "year",
    "acl00": "activity",
    "startime": "time_label",
    "OBS_VALUE": "value"
})

print("After column seletion:", df.shape)
print(df.head())

After column seletion: (14400, 5)
    country  year activity           time_label  value
0   Belgium  2000   Eating  From 04:00 to 04:09   0.15
1   Belgium  2010   Eating  From 04:00 to 04:09   0.10
2  Bulgaria  2000   Eating  From 04:00 to 04:09   0.01
3   Germany  2000   Eating  From 04:00 to 04:09   0.06
4   Germany  2010   Eating  From 04:00 to 04:09   0.10


In [6]:
df = df[df["activity"].notna()].copy()
df = df[df["activity"].str.strip().str.lower() != "total"]

print("After removing 'total' activity rows:",df.shape)

After removing 'total' activity rows: (12800, 5)


In [7]:
def parse_time_label(s):
    
    if pd.isna(s):
        return None, None
    
    m = re.search(r"From\s+(\d{2}):(\d{2})\s+to\s+(\d{2}):(\d{2})", s)
    if not m:
        return None, None
    h = int(m.group(1))
    minute = int(m.group(2))
    hour_float = h + minute / 60.0
    minutes_since_midnight = h * 60 + minute
    return hour_float, minutes_since_midnight

df["hour"], df["minutes_since_midnight"] = zip(*df["time_label"].map(parse_time_label))


In [8]:
before = df.shape[0]
df = df[df["hour"].notna()].copy()
print(f"Dropped {before - df.shape[0]} rows with unparsable time_label")
print(df[["time_label", "hour"]].head())

Dropped 128 rows with unparsable time_label
            time_label  hour
0  From 04:00 to 04:09   4.0
1  From 04:00 to 04:09   4.0
2  From 04:00 to 04:09   4.0
3  From 04:00 to 04:09   4.0
4  From 04:00 to 04:09   4.0


In [9]:
print("\nUnique activities:")
for a in sorted(df["activity"].unique()):
    print("-", a)

activity_map = {
    "Eating": "Eating",
    "Personal care except eating": "Personal care",
    "Sleep": "Sleep",
    "Work and study": "Work & study",
    "Household and family care and related travel": "Household & family care",
    "Leisure, social and associative life except TV and video": "Leisure (social)",
    "Television and video": "TV & video",
    "Travel to/from work/study": "Commute",
    "Unspecified time use and travel": "Other / unspecified",
}

def map_activity(a):
    return activity_map.get(a, a)

df["activity_group"] = df["activity"].map(map_activity)

print("\nUnique grouped activities:")
print(sorted(df["activity_group"].unique()))



Unique activities:
- Eating
- Household and family care and related travel
- Leisure, social and associative life except TV and video
- Personal care except eating
- Television and video
- Travel to/from work/study
- Unspecified time use and travel
- Work and study

Unique grouped activities:
['Commute', 'Eating', 'Household & family care', 'Leisure (social)', 'Other / unspecified', 'Personal care', 'TV & video', 'Work & study']


In [10]:
countries_keep = [
    "Belgium",
    "Germany",
    "Spain",
    "France",
    "Italy",
    "Estonia",
]

df = df[df["country"].isin(countries_keep)].copy()
print("After country filter:", df.shape)
print("Countries in final data:", sorted(df["country"].unique()))

After country filter: (9504, 8)
Countries in final data: ['Belgium', 'Estonia', 'France', 'Germany', 'Italy', 'Spain']


In [11]:
df["year"] = df["year"].astype(int)

df_clean = df[[
    "country",
    "year",
    "activity",
    "activity_group",
    "time_label",
    "hour",
    "minutes_since_midnight",
    "value"
]].sort_values(["country", "year", "activity_group", "minutes_since_midnight"])

print("\nFinal clean shape:", df_clean.shape)
print(df_clean.head())


Final clean shape: (9504, 8)
     country  year                   activity activity_group  \
96   Belgium  2000  Travel to/from work/study        Commute   
240  Belgium  2000  Travel to/from work/study        Commute   
384  Belgium  2000  Travel to/from work/study        Commute   
528  Belgium  2000  Travel to/from work/study        Commute   
672  Belgium  2000  Travel to/from work/study        Commute   

              time_label      hour  minutes_since_midnight  value  
96   From 04:00 to 04:09  4.000000                   240.0   0.07  
240  From 04:10 to 04:19  4.166667                   250.0   0.09  
384  From 04:20 to 04:29  4.333333                   260.0   0.15  
528  From 04:30 to 04:39  4.500000                   270.0   0.35  
672  From 04:40 to 04:49  4.666667                   280.0   0.34  


In [12]:
out_path = "life_rhythms_clean.csv"
df_clean.to_csv(out_path, index=False)
print(f"\nSaved cleaned data to: {out_path}")


Saved cleaned data to: life_rhythms_clean.csv
