In [132]:
import pandas as pd
import numpy as np

def build_index_matrix(df, index_map, country="Country", agg="median", min_present=1):
    if country not in df.columns:
        raise ValueError(f"'{country}' column not found. Add it first.")

    # 1) Korunacak kolonlar (Country + varsa date)
    keep_cols = [country]
    if "date" in df.columns:
        keep_cols.append("date")

    # 2) Sadece feature kolonlarını numeric'e çevir
    feature_cols = [c for c in df.columns if c not in keep_cols]
    X_num = df[feature_cols].apply(pd.to_numeric, errors="coerce")

    # 3) Çıktıyı oluştur (Country/date korunur)
    out = df[keep_cols].copy()

    # 4) Index hesapla
    report_rows = []
    for idx_name, cols in index_map.items():
        present = [c for c in cols if c in X_num.columns]
        missing = [c for c in cols if c not in X_num.columns]

        report_rows.append({
            "index": idx_name,
            "n_keywords": len(cols),
            "n_present": len(present),
            "n_missing": len(missing),
            "missing": missing
        })

        if len(present) < min_present:
            out[idx_name] = np.nan
            continue

        if agg == "median":
            out[idx_name] = X_num[present].median(axis=1, skipna=True)
        elif agg == "mean":
            out[idx_name] = X_num[present].mean(axis=1, skipna=True)
        else:
            raise ValueError("agg must be 'median' or 'mean'")

    index_report = pd.DataFrame(report_rows)
    return out, index_report


In [133]:
print([c for c in df.columns if c.lower() in ["country", "geo"]])
print(df.columns[:20])
print("Country in columns?", "Country" in df.columns)


['Country']
Index(['date', 'bitcoin', 'ethereum', 'crypto_wallet', 'memecoin', 'binance',
       'web3', 'real_estate', 'mortgage', 'housing_market', 'property_tax',
       'housing_bubble', 'inflation', 'cost_of_living', 'price_hike',
       'shrinkflation', 'recession', 'layoff', 'job_cuts', 'economic_crisis'],
      dtype='object')
Country in columns? True


In [134]:
df_indexed, index_report = build_index_matrix(df, INDEX_MAP, country="Country", agg="median", min_present=1)

print("Indexed shape:", df_indexed.shape)
display(df_indexed.head())
display(index_report.tail(10))


Indexed shape: (11060, 32)


Unnamed: 0,Country,date,finance_crypto_speculation_index,finance_real_estate_pressure_index,finance_macro_pressure_index,luxury_status_consumption_index,luxury_aesthetic_body_index,luxury_elite_mobility_index,crime_gun_violence_index,crime_organized_crime_index,...,tech_digital_intelligence_index,tech_security_privacy_index,entertainment_streaming_consumption_index,entertainment_short_video_index,health_mental_health_index,health_healthcare_access_index,health_addiction_recovery_index,health_sleep_crisis_index,sports_body_image_index,sports_elite_spectator_sports_index
0,US,2021-12-26,1.5,0.0,2.0,4.5,8.0,1.0,0.0,1.0,...,0.0,0.0,8.0,0.0,2.0,0.0,0.0,0.0,0.0,8.5
1,US,2022-01-02,2.0,0.0,3.0,4.0,8.0,1.0,0.0,1.0,...,0.0,0.0,7.0,0.0,3.0,0.0,0.0,0.0,0.0,8.0
2,US,2022-01-09,1.5,0.0,3.0,4.0,8.0,0.0,0.0,2.0,...,0.0,0.0,7.0,0.0,3.0,0.0,0.0,0.0,0.0,7.5
3,US,2022-01-16,1.5,0.0,3.0,4.0,8.0,0.0,0.0,2.0,...,0.0,0.0,6.0,0.0,3.0,0.0,0.0,0.0,0.0,6.5
4,US,2022-01-23,1.5,0.0,3.0,7.0,9.0,0.0,0.0,2.0,...,0.0,0.0,5.0,0.0,3.0,0.0,0.0,0.0,0.0,7.0


Unnamed: 0,index,n_keywords,n_present,n_missing,missing
20,tech_digital_intelligence_index,10,10,0,[]
21,tech_security_privacy_index,7,7,0,[]
22,entertainment_streaming_consumption_index,5,5,0,[]
23,entertainment_short_video_index,3,3,0,[]
24,health_mental_health_index,7,7,0,[]
25,health_healthcare_access_index,8,8,0,[]
26,health_addiction_recovery_index,5,5,0,[]
27,health_sleep_crisis_index,5,5,0,[]
28,sports_body_image_index,9,9,0,[]
29,sports_elite_spectator_sports_index,6,6,0,[]


In [135]:
print("Old feature count:", df.shape[1]-1)
print("New index count:", df_indexed.shape[1]-1)


Old feature count: 179
New index count: 31


In [136]:
index_cols = [c for c in df_indexed.columns if c not in ["date", "Country"]]

country_index_matrix_median = (
    df_indexed
    .groupby("Country")[index_cols]
    .median()
    .reset_index()
)


In [137]:
df_indexed.to_csv("data/processed/country_index_matrix_median.csv", index=False)
index_report.to_csv("data/processed/index_build_report.csv", index=False)
print("Saved outputs to data/processed/")


Saved outputs to data/processed/


In [138]:
import pandas as pd

df = pd.read_csv("data/processed/labeled_time_series.csv")

# standardize: geo -> Country
if "geo" in df.columns and "Country" not in df.columns:
    df = df.rename(columns={"geo":"Country"})

cols = set(df.columns)

report = []
for idx, keys in INDEX_MAP.items():
    present = [k for k in keys if k in cols]
    missing = [k for k in keys if k not in cols]
    report.append((idx, len(keys), len(present), len(missing), missing[:8]))

report_df = pd.DataFrame(report, columns=["index","n_keywords","n_present","n_missing","missing_sample"])
report_df["coverage"] = report_df["n_present"]/report_df["n_keywords"]

display(report_df.sort_values("coverage"))



Unnamed: 0,index,n_keywords,n_present,n_missing,missing_sample,coverage
0,finance_crypto_speculation_index,6,6,0,[],1.0
27,health_sleep_crisis_index,5,5,0,[],1.0
26,health_addiction_recovery_index,5,5,0,[],1.0
25,health_healthcare_access_index,8,8,0,[],1.0
24,health_mental_health_index,7,7,0,[],1.0
23,entertainment_short_video_index,3,3,0,[],1.0
22,entertainment_streaming_consumption_index,5,5,0,[],1.0
21,tech_security_privacy_index,7,7,0,[],1.0
20,tech_digital_intelligence_index,10,10,0,[],1.0
19,environment_pollution_index,6,6,0,[],1.0


In [139]:
# Sadece endeks sütunlarını seç ve varyans hesapla
index_cols = [c for c in df_indexed.columns if c not in ["date", "Country"]]
variances = df_indexed[index_cols].var().sort_values(ascending=False)

# Görselleştirmek için DataFrame'e çevir
variance_report = pd.DataFrame({'Index': variances.index, 'Variance': variances.values})
print(variance_report)

                                        Index    Variance
0             luxury_status_consumption_index  174.595765
1                 social_dating_culture_index   83.176027
2                   crime_violent_crime_index   53.070194
3                   social_work_culture_index   51.785011
4                 crime_sexual_violence_index   38.867697
5                     social_corruption_index   36.968992
6                   health_sleep_crisis_index   35.396001
7         sports_elite_spectator_sports_index   24.039823
8                 environment_pollution_index   23.303454
9         environment_energy_transition_index   20.091348
10            social_migration_pressure_index   19.622708
11                luxury_aesthetic_body_index   19.591802
12                tech_security_privacy_index   17.139538
13                     crime_cybercrime_index   14.577100
14           finance_crypto_speculation_index   10.674338
15               finance_macro_pressure_index    7.821488
16  entertainm

In [140]:
# 1. Endeks matrisini yükle
df_indices = pd.read_csv('data/processed/country_index_matrix_median.csv')

# 2. Sayısal sütunları seç (Country ve date hariç)
numeric_cols = [c for c in df_indices.columns if c not in ['Country', 'date']]

# 3. Varyansları hesapla ve eşik değeri (threshold) belirle
# 0.1 altındaki varyanslar "ayrıştırıcı değil" olarak kabul edilir.
variances = df_indices[numeric_cols].var()
threshold = 0.1

low_variance_features = variances[variances < threshold].index.tolist()
high_variance_features = [c for c in df_indices.columns if c not in low_variance_features]

# 4. Düşük varyanslıları çıkar
df_high_var = df_indices[high_variance_features]

# 5. Yeni dosyayı kaydet
df_high_var.to_csv('data/processed/country_index_matrix_HIGH_VARIANCE.csv', index=False)

print(f"Eşik Değeri: {threshold}")
print(f"Çıkarılan Feature Sayısı: {len(low_variance_features)}")
print(f"Kalan Feature Sayısı: {len(high_variance_features) - 2} (Country ve date hariç)") # Country ve date hariç
print("\nElenen Düşük Varyanslı Feature'lar:")
for f in low_variance_features:
    print(f"- {f}")

Eşik Değeri: 0.1
Çıkarılan Feature Sayısı: 6
Kalan Feature Sayısı: 24 (Country ve date hariç)

Elenen Düşük Varyanslı Feature'lar:
- crime_gun_violence_index
- social_identity_politics_index
- tech_digital_intelligence_index
- health_healthcare_access_index
- health_addiction_recovery_index
- sports_body_image_index


Varyansın yorumu
Kültürel ayrışma en çok tüketim, ilişki biçimleri, suç algısı, iş ve yaşam tarzı üzerinden oluyor.

Küresel ortak konular dünyanın her yerinde benzer ilgiye sahip olan sağlık ve spor benzeri alanlar, AI, quantum, BCI. Bu konular küresel hype.
Her yerde konuşuluyor Bunlar clusteringi çok az etkiler. Ülkeler arası bir farklılığa işaret etmez. 

düşük ama kabul edilebilir varyans, örn. crime_gun_violence_index (0.046) Beklenen şekilde: ABD çok farklı Ama çoğu ülkede ilgi benzer
bu yüzden düşük varyanslı featurelar elendi
