In [8]:
import json
import re

def norm_label(s: str) -> str:
    """Normalize a human-readable label into a snake_case identifier.

    The label is trimmed and lowercased, apostrophes (straight and curly)
    are removed outright, every other character that is neither a word
    character nor whitespace is replaced by a space, and finally each run
    of whitespace is collapsed into a single underscore.

    Whitespace is trimmed a second time after punctuation has been turned
    into spaces, so punctuation at the edges of the label (for example a
    closing parenthesis) does not leave a leading or trailing underscore:
    ``"Bitcoin (BTC)"`` becomes ``"bitcoin_btc"``.

    Args:
        s: The raw label text.

    Returns:
        The normalized label.
    """
    s = s.strip().lower()
    # Drop apostrophes instead of spacing them so "women's" -> "womens".
    s = re.sub(r"[’']", "", s)
    # Parentheses, periods, etc. become spaces (word separators).
    s = re.sub(r"[^\w\s]", " ", s)
    # Edge punctuation was just turned into spaces; trim them so they do
    # not end up as leading/trailing underscores below.
    s = s.strip()
    s = re.sub(r"\s+", "_", s)
    return s

def build_topicid_to_label(cfg: dict) -> dict:
    """Build a lookup from topic id to normalized label.

    Walks ``cfg["categories"] -> "sub_groups" -> "queries"``; any of these
    keys may be absent, in which case that level contributes nothing. Each
    query must provide ``"topic_id"`` and ``"label"``. The topic id is
    whitespace-stripped and the label is passed through ``norm_label``.
    If the same topic id occurs more than once, the last occurrence wins.

    Args:
        cfg: The parsed configuration dictionary.

    Returns:
        A dict mapping stripped topic id to normalized label.
    """
    # Flatten the three nesting levels into one lazy stream of queries.
    all_queries = (
        query
        for category in cfg.get("categories", [])
        for sub_group in category.get("sub_groups", [])
        for query in sub_group.get("queries", [])
    )
    return {
        query["topic_id"].strip(): norm_label(query["label"])
        for query in all_queries
    }

# Parse the keyword configuration file into a dictionary.
with open("data/raw/keywords_FINAL_2025.json", "r", encoding="utf-8") as f:
    cfg = json.loads(f.read())

# Derive the topic-id -> label lookup and report how many entries it has.
topic2label = build_topicid_to_label(cfg)
print("Mapping size:", len(topic2label))


Mapping size: 178


In [9]:
import pandas as pd

# Read the raw dataset (set the file path here).
df_raw = pd.read_csv(filepath_or_buffer="data/raw/final_proje_dataset_CLEAN.csv")
print(df_raw.shape)
# Show the first 12 column names as a plain list.
print("First columns:", df_raw.columns.tolist()[:12])


(11060, 180)
First columns: ['date', '/m/05p0rrx', '/m/0108bn2x', '/g/11tskkw5c9', '/g/11sfdkgmfn', '/g/11hcz1r4wl', '/g/11pcs9ny8w', '/g/11bw1zmnts', '/m/0273t5w', '/g/1234z6p9', '/m/020shm', '/g/11gt__kf44']


In [10]:
# For every column look up its label; a column with no entry in
# topic2label keeps its current name.
rename_map = {col: topic2label.get(col, col) for col in df_raw.columns}

# Apply the renaming, then take an explicit copy of the result.
df_labeled = df_raw.rename(columns=rename_map)
df_labeled = df_labeled.copy()

print("After renaming:", df_labeled.columns.tolist()[:15])


After renaming: ['date', 'bitcoin', 'ethereum', 'crypto_wallet', 'memecoin', 'binance', 'web3', 'real_estate', 'mortgage', 'housing_market', 'property_tax', 'housing_bubble', 'inflation', 'cost_of_living', 'price_hike']


In [11]:
# 1) Are there any topic-id columns left (names starting with "/m/" or "/g/")?
topic_id_left = [
    col
    for col in df_labeled.columns
    if str(col).startswith("/m/") or str(col).startswith("/g/")
]
print("Unlabeled topic_id columns left:", len(topic_id_left))
print(topic_id_left[:10])

# 2) Did the renaming create duplicate labels? (very important!)
dups = list(df_labeled.columns[df_labeled.columns.duplicated()])
print("Duplicate columns:", dups)


Unlabeled topic_id columns left: 0
[]
Duplicate columns: []


In [12]:
from pathlib import Path

# Make sure the output directory exists (no error if it already does).
out_dir = Path("data/processed")
out_dir.mkdir(exist_ok=True, parents=True)

# Write the labeled frame without the index column.
out_path = out_dir.joinpath("labeled_time_series.csv")
df_labeled.to_csv(path_or_buf=out_path, encoding="utf-8", index=False)
print("Saved:", out_path)

Saved: data\processed\labeled_time_series.csv


In [13]:
import pandas as pd

# Re-read the saved file from disk (this rebinds df_labeled).
df_labeled = pd.read_csv(filepath_or_buffer="data/processed/labeled_time_series.csv")
# Look at the last five columns.
print(df_labeled.columns[-5:])


Index(['islam_makhachev', 'alex_pereira', 'formula_1', 'ferrari_f1',
       'Country'],
      dtype='object')


In [14]:
import pandas as pd

# 1. Read the file
df = pd.read_csv("data/processed/labeled_time_series.csv")

# Everything except the "date" and "Country" columns is treated as a feature.
feature_cols = [col for col in df.columns if col != "date" and col != "Country"]

# 4. Numeric conversion: values that cannot be parsed become NaN.
df[feature_cols] = df[feature_cols].apply(
    lambda column: pd.to_numeric(column, errors="coerce")
)

# 5. Grouping: one row per country holding the median of each feature.
grouped_features = df.groupby("Country")[feature_cols]
country_row = grouped_features.median().reset_index()

print(f"Yeni Veri Yapısı: {country_row.shape}")
display(country_row.head())

Yeni Veri Yapısı: (70, 179)


Unnamed: 0,Country,bitcoin,ethereum,crypto_wallet,memecoin,binance,web3,real_estate,mortgage,housing_market,...,pilates,chloe_ting,hip_thrust,booty_workout,ufc,conor_mcgregor,islam_makhachev,alex_pereira,formula_1,ferrari_f1
0,AE,32.0,8.0,0.0,0.0,12.0,8.0,0.0,74.0,0.0,...,43.0,0.0,0.0,0.0,37.0,20.5,0.0,0.0,20.0,0.0
1,AR,27.0,6.0,0.0,0.0,14.0,1.0,0.0,13.0,0.0,...,64.0,0.0,5.0,0.0,23.5,16.5,0.0,0.0,11.5,0.0
2,AT,31.0,7.0,0.0,0.0,2.0,2.5,0.0,19.0,0.0,...,49.5,0.0,4.0,0.0,43.0,20.0,0.0,0.0,28.0,0.0
3,AU,26.0,6.0,0.0,0.0,3.0,1.0,0.0,70.0,0.0,...,69.5,0.0,3.0,0.0,23.0,17.0,0.0,0.0,33.0,0.0
4,BD,21.0,4.0,0.0,0.0,15.0,10.0,0.0,41.0,0.0,...,3.0,0.0,0.0,0.0,55.0,30.5,0.0,0.0,42.5,0.0
