In [1]:
import json
import re

def norm_label(s: str) -> str:
    """Normalize a human-readable label into a snake_case identifier.

    Steps, in order:
      1. trim surrounding whitespace and lowercase;
      2. delete straight and curly apostrophes so possessives collapse
         ("McDonald's" -> "mcdonalds");
      3. replace every remaining character that is neither a word character
         nor whitespace with a space;
      4. trim again, then collapse each whitespace run into a single "_".

    Args:
        s: The raw label text.

    Returns:
        The normalized label. A label made only of punctuation and/or
        whitespace normalizes to the empty string.
    """
    s = s.strip().lower()
    s = re.sub(r"[’']", "", s)
    s = re.sub(r"[^\w\s]", " ", s)
    # Step 3 can leave spaces at either end (e.g. "rolex (watch)" becomes
    # "rolex  watch "); strip them so the result has no stray leading or
    # trailing underscore.
    s = re.sub(r"\s+", "_", s.strip())
    return s

def build_topicid_to_label(cfg: dict) -> dict:
    """Build a lookup from topic ID to normalized label.

    Walks cfg["categories"] -> "sub_groups" -> "queries"; any of these keys
    that is absent is treated as an empty list. Each query must provide
    "topic_id" and "label" (a missing key raises KeyError). The topic ID is
    whitespace-stripped and the label is passed through norm_label. If the
    same topic ID appears more than once, the last occurrence wins.

    Args:
        cfg: The parsed keyword configuration.

    Returns:
        A dict mapping stripped topic IDs to normalized labels.
    """
    # Flatten the three nesting levels into one lazy stream of query dicts.
    queries = (
        query
        for category in cfg.get("categories", [])
        for sub_group in category.get("sub_groups", [])
        for query in sub_group.get("queries", [])
    )

    topic_labels = {}
    for query in queries:
        key = query["topic_id"].strip()
        topic_labels[key] = norm_label(query["label"])
    return topic_labels

# Parse the keyword configuration (UTF-8 JSON) that pairs topic IDs with labels.
with open("data/raw/keywords_FINAL_2025.json", mode="r", encoding="utf-8") as config_file:
    cfg = json.loads(config_file.read())

# Topic ID -> normalized label lookup, used to rename the dataset columns.
topic2label = build_topicid_to_label(cfg)
print("Mapping size:", len(topic2label))


Mapping size: 178


In [2]:
import pandas as pd

# Load the raw dataset; its feature columns are still headed by topic IDs.
df_raw = pd.read_csv(filepath_or_buffer="data/raw/final_proje_dataset_CLEANED_ID.csv")

# Quick look at the frame's dimensions and its leading column headers.
print(df_raw.shape)
leading_columns = df_raw.columns[:7].tolist()
print("First columns:", leading_columns)


(11060, 109)
First columns: ['date', 'Country', '/m/05p0rrx', '/m/09jx2', '/g/11bc6pz4wc', '/m/0_t06w0', '/g/1ymzxwzdh']


In [3]:
# Map every column header to its label; headers with no entry in topic2label
# map to themselves and therefore stay unchanged.
rename_map = dict(
    (column, topic2label.get(column, column)) for column in df_raw.columns
)

# rename() yields a new frame, so df_raw keeps its original topic-ID headers.
df_labeled = df_raw.rename(columns=rename_map).copy()

print("After renaming:", df_labeled.columns[:7].tolist())


After renaming: ['date', 'Country', 'bitcoin', 'inflation', 'ferrari', 'patek_philippe', 'rolex']


In [4]:
# Check 1: columns whose name still starts with a topic-ID prefix ("/m/" or
# "/g/") were not matched by the rename step.
topic_id_left = [
    column
    for column in df_labeled.columns
    if str(column)[:3] in ("/m/", "/g/")
]
print("Unlabeled topic_id columns left:", len(topic_id_left))
print(topic_id_left[:10])

# Check 2: column names that repeat an earlier column name, i.e. distinct
# source columns that ended up with the same label.
duplicate_mask = df_labeled.columns.duplicated()
dups = df_labeled.columns[duplicate_mask].tolist()
print("Duplicate columns:", dups)


Unlabeled topic_id columns left: 0
[]
Duplicate columns: []


In [5]:
from pathlib import Path

# Create the output folder (and any missing parents); a no-op if it exists.
out_dir = Path("data/processed")
out_dir.mkdir(parents=True, exist_ok=True)

# Write the labeled frame as UTF-8 CSV without the row index.
out_path = out_dir.joinpath("labeled_time_series.csv")
df_labeled.to_csv(path_or_buf=out_path, index=False, encoding="utf-8")
print("Saved:", out_path)

Saved: data/processed/labeled_time_series.csv


In [6]:
import pandas as pd

# Round-trip check: reload the saved CSV (this rebinds df_labeled to the
# on-disk version) and show the last five column headers.
df_labeled = pd.read_csv(filepath_or_buffer="data/processed/labeled_time_series.csv")
trailing_columns = df_labeled.columns[-5:]
print(trailing_columns)  # check last columns


Index(['energy_efficiency', 'hacking', 'computer_security', 'mortgage_rate',
       'brain_chip'],
      dtype='object')


In [7]:
import pandas as pd


# Reload the labeled time series from disk.
# NOTE(review): pandas' default NA parsing converts the literal text "NA" to
# NaN, and groupby drops NaN keys by default. If Country holds ISO-2 codes,
# Namibia ("NA") would silently vanish here - confirm against the raw data.
df = pd.read_csv("data/processed/labeled_time_series.csv")

# Every column except the two identifier columns is treated as a feature.
feature_cols = [
    name for name in df.columns if name != "date" and name != "Country"
]

# Force the features to numeric; values that cannot be parsed become NaN.
df[feature_cols] = df[feature_cols].apply(
    lambda column: pd.to_numeric(column, errors="coerce")
)

# Collapse each country's rows into a single row of per-feature medians,
# with Country restored as an ordinary column.
country_row = df.groupby("Country")[feature_cols].median().reset_index()

print(f"New dataset structure: {country_row.shape}")
display(country_row.head())

New dataset structure: (70, 108)


Unnamed: 0,Country,bitcoin,inflation,ferrari,patek_philippe,rolex,lamborghini,louis_vuitton,nuclear_energy,recycling,...,gofundme_medical,greenhouse_effect,hydropower,free_healthcare,hip_thrust,energy_efficiency,hacking,computer_security,mortgage_rate,brain_chip
0,AE,32.0,32.0,27.0,52.5,24.0,76.0,48.0,15.0,66.5,...,0.0,1.0,3.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,AR,27.0,36.0,49.5,13.0,32.0,53.0,38.0,11.0,57.0,...,0.0,12.0,4.0,0.0,5.0,7.0,0.0,0.0,0.0,0.0
2,AT,31.0,56.0,40.0,45.5,35.0,64.5,43.5,15.0,64.0,...,0.0,1.0,6.0,0.0,4.0,1.0,0.0,0.0,0.0,0.0
3,AU,26.0,47.0,37.0,48.0,13.0,71.5,58.5,17.0,71.5,...,0.0,1.0,2.0,0.0,3.0,3.0,0.0,0.0,0.0,7.0
4,BD,21.0,42.0,49.5,51.0,23.5,37.0,27.0,19.0,41.5,...,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
