In [2]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# 📍 Set path to your zip file in Drive
zip_path = "/content/drive/MyDrive/Gladyshev/data/wiki_crop.tar"  # ← change this
extract_dir = "/content/drive/MyDrive/Gladyshev/data/wiki_crop_unzip"  # Output location in Colab

# 📦 Unzip
import zipfile
import os

os.makedirs(extract_dir, exist_ok=True)

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

print(f"✅ Unzipped to: {extract_dir}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


BadZipFile: File is not a zip file

In [3]:
import tarfile
import os

# 📍 Set your .tar(.gz) file path and target extraction directory
tar_path = "/content/drive/MyDrive/Gladyshev/data/wiki_crop.tar"  # ← change this
extract_path = "/content/drive/MyDrive/Gladyshev/data/wiki_crop_unzip"  # Output location in Colab

# ✅ Make the output directory if it doesn't exist
os.makedirs(extract_dir, exist_ok=True)

# 📦 Extract
with tarfile.open(tar_path, "r:*") as tar:
    tar.extractall(path=extract_dir)

print(f"✅ Extracted to: {extract_dir}")


✅ Extracted to: /content/drive/MyDrive/Gladyshev/data/wiki_crop_unzip


In [15]:
import tarfile
import os

# 📍 Set your .tar(.gz) file path and target extraction directory
tar_path = "/content/drive/MyDrive/Gladyshev/data/wiki.tar.gz"  # ← change this
extract_path = "/content/drive/MyDrive/Gladyshev/data/wiki_unzip"  # Output location in Colab

# ✅ Make the output directory if it doesn't exist
os.makedirs(extract_dir, exist_ok=True)

# 📦 Extract
with tarfile.open(tar_path, "r:*") as tar:
    tar.extractall(path=extract_dir)

print(f"✅ Extracted to: {extract_dir}")


✅ Extracted to: /content/drive/MyDrive/Gladyshev/data/wiki_crop_unzip


In [5]:
import scipy.io
import pandas as pd
import numpy as np
from datetime import datetime
import os

# === 1. Load wiki.mat ===
wiki_mat_path = "/content/drive/MyDrive/Gladyshev/data/wiki_crop_unzip/wiki_crop/wiki.mat"
mat = scipy.io.loadmat(wiki_mat_path)

# Extract fields
wiki = mat["wiki"][0, 0]
dob = wiki["dob"][0]
photo_taken = wiki["photo_taken"][0]
full_path = wiki["full_path"][0]
gender = wiki["gender"][0] if "gender" in wiki.dtype.names else np.full(len(dob), np.nan)
name = wiki["name"][0]
face_score = wiki["face_score"][0]
second_face_score = wiki["second_face_score"][0]

# Flatten and decode
def extract_str(mat_arr):
    return [str(el[0]) for el in mat_arr]

df_wiki = pd.DataFrame({
    "name": extract_str(name),
    "dob": dob,
    "photo_taken": photo_taken,
    "full_path": extract_str(full_path),
    "gender": gender,
    "face_score": face_score,
    "second_face_score": second_face_score
})

# === 2. Load names.basics.tsv ===
tsv_path = "/content/drive/MyDrive/Gladyshev/data/names.basics.tsv"
df_names = pd.read_csv(tsv_path, sep="\t", usecols=["primaryName", "deathYear"])
df_names = df_names.rename(columns={"primaryName": "name"})
df_names = df_names[df_names["deathYear"] != "\\N"]  # Filter missing
df_names["deathYear"] = df_names["deathYear"].astype(int)

# === 3. Merge and Compute ===
df_merged = pd.merge(df_wiki, df_names, on="name", how="left")

# Convert MATLAB datenum to year
def matlab_datenum_to_year(matlab_dn):
    if np.isnan(matlab_dn):
        return np.nan
    return datetime.fromordinal(int(matlab_dn)) .year

df_merged["birth_year"] = df_merged["dob"].apply(matlab_datenum_to_year)

# Compute age at death and time to death
df_merged["age_at_death"] = df_merged["deathYear"] - df_merged["birth_year"]
df_merged["time_to_death"] = df_merged["deathYear"] - df_merged["photo_taken"]

# === 4. Save as CSV ===
output_path = "/content/drive/MyDrive/Gladyshev/data/wiki_with_death.csv"
df_merged.to_csv(output_path, index=False)
print(f"✅ Saved merged CSV with shape {df_merged.shape} to: {output_path}")

IndexError: index 0 is out of bounds for axis 0 with size 0

In [6]:
# 📦 Install and import
import scipy.io
import pandas as pd
import numpy as np
from datetime import datetime
import os

# 📂 Paths
wiki_mat_path = "/content/drive/MyDrive/Gladyshev/data/wiki_crop_unzip/wiki_crop/wiki.mat"
tsv_path = "/content/drive/MyDrive/Gladyshev/data/names.basics.tsv"
output_csv_path = "/content/drive/MyDrive/Gladyshev/data/wiki_with_death.csv"

# 📥 Load .mat file
mat = scipy.io.loadmat(wiki_mat_path)
wiki_data = mat["wiki"]

# 🔍 Extract fields
dob = wiki_data["dob"][0, 0].squeeze()                  # Date of birth as MATLAB serial date
photo_taken = wiki_data["photo_taken"][0, 0].squeeze()  # Year photo taken
imdb_ids = wiki_data["imdb_id"][0, 0].squeeze()          # IMDb IDs
full_paths = wiki_data["full_path"][0, 0].squeeze()      # Image paths
names = wiki_data["name"][0, 0].squeeze()                # Names

# 📦 Decode
def decode_str_array(mat_arr):
    return [el[0] if len(el) > 0 else "" for el in mat_arr]

imdb_ids_decoded = decode_str_array(imdb_ids)
full_paths_decoded = decode_str_array(full_paths)
names_decoded = decode_str_array(names)

# 📅 Convert MATLAB serial date to datetime
def matlab_datenum_to_datetime(matlab_datenum):
    origin = datetime.fromordinal(1) + pd.to_timedelta(366, unit='D')
    return [origin + pd.to_timedelta(int(d), unit='D') if d > 0 else pd.NaT for d in matlab_datenum]

dob_dt = matlab_datenum_to_datetime(dob)

# 🧱 Build base DataFrame
df_wiki = pd.DataFrame({
    "imdb_id": imdb_ids_decoded,
    "name": names_decoded,
    "dob": dob_dt,
    "photo_taken": photo_taken,
    "image_path": full_paths_decoded
})

# 🧮 Compute age at photo
df_wiki["age_at_photo"] = df_wiki["photo_taken"] - df_wiki["dob"].dt.year

# 🧬 Load death data
df_death = pd.read_csv(tsv_path, sep='\t', usecols=["nconst", "deathYear"])
df_death = df_death.rename(columns={"nconst": "imdb_id", "deathYear": "death_year"})
df_death = df_death[df_death["death_year"] != "\\N"]
df_death["death_year"] = pd.to_numeric(df_death["death_year"])

# 🔗 Merge death year
df_merged = df_wiki.merge(df_death, on="imdb_id", how="left")

# ➕ Compute age at death & time to death
df_merged["age_at_death"] = df_merged["death_year"] - df_merged["dob"].dt.year
df_merged["time_to_death"] = df_merged["age_at_death"] - df_merged["age_at_photo"]

# 💾 Save final CSV
df_merged.to_csv(output_csv_path, index=False)
print(f"✅ Saved merged dataset to: {output_csv_path}")

# 📊 Summary
print(df_merged[["imdb_id", "name", "age_at_photo", "death_year", "age_at_death", "time_to_death"]].head())


ValueError: no field of name imdb_id

In [7]:
import scipy.io
import pandas as pd
import numpy as np
from datetime import datetime
import os

# === Paths ===
wiki_mat_path = "/content/drive/MyDrive/Gladyshev/data/wiki_crop_unzip/wiki_crop/wiki.mat"
tsv_path = "/content/drive/MyDrive/Gladyshev/data/names.basics.tsv"
output_csv_path = "/content/drive/MyDrive/Gladyshev/data/wiki_with_death.csv"

# === Load .mat file ===
mat = scipy.io.loadmat(wiki_mat_path)
wiki = mat["wiki"][0, 0]

# Extract fields
dob = wiki["dob"].squeeze()
photo_taken = wiki["photo_taken"].squeeze()
full_path = wiki["full_path"].squeeze()
gender = wiki["gender"].squeeze()
name = wiki["name"].squeeze()
face_score = wiki["face_score"].squeeze()

# === Decode text fields ===
def decode_str(arr):
    return [el[0] if len(el) > 0 else "" for el in arr]

names_decoded = decode_str(name)
paths_decoded = decode_str(full_path)

# === Convert dob from MATLAB datenum ===
def matlab_datenum_to_datetime(matlab_datenum):
    origin = datetime.fromordinal(1) + pd.to_timedelta(366, unit='D')
    return [origin + pd.to_timedelta(int(d), unit='D') if d > 0 else pd.NaT for d in matlab_datenum]

dob_converted = matlab_datenum_to_datetime(dob)

# === Build DataFrame ===
df_wiki = pd.DataFrame({
    "name": names_decoded,               # Likely nmXXXXXX
    "dob": dob_converted,
    "photo_taken": photo_taken,
    "gender": gender,
    "image_path": paths_decoded,
    "face_score": face_score
})

df_wiki["age_at_photo"] = df_wiki["photo_taken"] - df_wiki["dob"].dt.year

# === Load and merge with death year from IMDb ===
df_death = pd.read_csv(tsv_path, sep="\t", usecols=["nconst", "deathYear"])
df_death = df_death[df_death["deathYear"] != "\\N"]
df_death["deathYear"] = pd.to_numeric(df_death["deathYear"])

# Merge on `name` ↔ `nconst`
df_merged = df_wiki.merge(df_death, left_on="name", right_on="nconst", how="left")

# === Calculate age at death and time to death ===
df_merged["age_at_death"] = df_merged["deathYear"] - df_merged["dob"].dt.year
df_merged["time_to_death"] = df_merged["age_at_death"] - df_merged["age_at_photo"]

# === Save to CSV ===
df_merged.to_csv(output_csv_path, index=False)
print(f"✅ Saved merged data with death year to:\n{output_csv_path}")

# Preview
df_merged[["name", "photo_taken", "dob", "deathYear", "age_at_photo", "age_at_death", "time_to_death"]].head()


OutOfBoundsTimedelta: Cannot cast 723671 from D to 'ns' without overflow.

In [8]:
import scipy.io
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# === Load the MATLAB file ===
wiki_mat_path = "/content/drive/MyDrive/Gladyshev/data/wiki_crop_unzip/wiki_crop/wiki.mat"
mat = scipy.io.loadmat(wiki_mat_path)
wiki = mat["wiki"][0, 0]

# === Extract fields ===
dob = wiki["dob"][0]                     # MATLAB datenum
photo_taken = wiki["photo_taken"][0]    # Year of photo
full_path = [str(fp[0]) for fp in wiki["full_path"][0]]
gender = wiki["gender"][0]              # 0 = male, 1 = female
face_score = wiki["face_score"][0]
second_face_score = wiki["second_face_score"][0]
name = [str(n[0]) for n in wiki["name"][0]]

# === Convert MATLAB datenum to datetime safely ===
def matlab_datenum_to_date(datenum):
    # MATLAB datenum is days since 0000-01-01, but Python starts at 0001-01-01
    # So we subtract 366 days
    origin = datetime.fromordinal(1)
    dates = []
    for dn in datenum:
        try:
            if dn > 0:
                dt = origin + timedelta(days=float(dn) - 366)
                dates.append(dt.date())
            else:
                dates.append(pd.NaT)
        except:
            dates.append(pd.NaT)
    return dates

dob_converted = matlab_datenum_to_date(dob)

# === Load death years from tsv ===
tsv_path = "/content/drive/MyDrive/Gladyshev/data/names.basics.tsv"
df_death = pd.read_csv(tsv_path, sep="\t", usecols=["primaryName", "deathYear"])
df_death = df_death.dropna()
df_death["deathYear"] = pd.to_numeric(df_death["deathYear"], errors="coerce")

# === Create wiki DataFrame ===
df_wiki = pd.DataFrame({
    "name": name,
    "dob": dob_converted,
    "photo_taken": photo_taken,
    "full_path": full_path,
    "gender": gender,
    "face_score": face_score,
    "second_face_score": second_face_score
})

# === Merge death year using fuzzy name match ===
df = df_wiki.merge(df_death, left_on="name", right_on="primaryName", how="left")

# === Compute age at death and time to death ===
def compute_age_at_death(row):
    if pd.notna(row["dob"]) and pd.notna(row["deathYear"]):
        return row["deathYear"] - row["dob"].year
    return np.nan

def compute_time_to_death(row):
    if pd.notna(row["deathYear"]) and pd.notna(row["photo_taken"]):
        return row["deathYear"] - row["photo_taken"]
    return np.nan

df["age_at_death"] = df.apply(compute_age_at_death, axis=1)
df["time_to_death"] = df.apply(compute_time_to_death, axis=1)

# === Save to CSV ===
output_path = "/content/wiki_with_death_data.csv"
df.to_csv(output_path, index=False)
print(f"✅ Saved: {output_path}")


IndexError: index 0 is out of bounds for axis 0 with size 0

In [10]:
# 📦 Install necessary packages
!pip install -q wikipedia-api wikidata

# 📚 Imports
import scipy.io
import pandas as pd
import numpy as np
import requests
import wikipediaapi
from datetime import datetime, timedelta
from tqdm import tqdm

# === 📂 Load the MATLAB file ===
wiki_mat_path = "/content/drive/MyDrive/Gladyshev/data/wiki_crop_unzip/wiki_crop/wiki.mat"
mat = scipy.io.loadmat(wiki_mat_path)
wiki = mat["wiki"][0, 0]

# === 🔍 Extract fields ===
dob = wiki["dob"][0]                     # MATLAB datenum
photo_taken = wiki["photo_taken"][0]    # Year of photo
full_path = [str(fp[0]) for fp in wiki["full_path"][0]]
gender = wiki["gender"][0]              # 0 = male, 1 = female
face_score = wiki["face_score"][0]
second_face_score = wiki["second_face_score"][0]
names = [str(n[0]) for n in wiki["name"][0]]

# === 🔁 MATLAB to Python datetime conversion ===
def matlab_datenum_to_date(datenum):
    origin = datetime.fromordinal(1)
    return [
        (origin + timedelta(days=float(d) - 366)).date() if d > 0 else pd.NaT
        for d in datenum
    ]

dob_converted = matlab_datenum_to_date(dob)

# === 📐 Initial DataFrame ===
df = pd.DataFrame({
    "name": names,
    "dob": dob_converted,
    "photo_taken": photo_taken,
    "full_path": full_path,
    "gender": gender,
    "face_score": face_score,
    "second_face_score": second_face_score
})

# === 🌐 Wikidata setup ===
wiki_wiki = wikipediaapi.Wikipedia('en')

def get_wikidata_qid(name):
    page = wiki_wiki.page(name)
    if not page.exists():
        return None
    try:
        wikibase = page.wikibase
        return wikibase
    except:
        return None

def fetch_death_year_and_cause(qid):
    url = f"https://www.wikidata.org/wiki/Special:EntityData/{qid}.json"
    r = requests.get(url)
    if r.status_code != 200:
        return None, None
    try:
        data = r.json()
        entity = data["entities"][qid]

        claims = entity.get("claims", {})
        death_year = None
        cause = None

        # Get death year
        if "P570" in claims:
            date_str = claims["P570"][0]["mainsnak"]["datavalue"]["value"]["time"]
            death_year = int(date_str[1:5])

        # Get cause of death
        if "P509" in claims:
            cause_id = claims["P509"][0]["mainsnak"]["datavalue"]["value"]["id"]
            cause_label_url = f"https://www.wikidata.org/wiki/Special:EntityData/{cause_id}.json"
            cause_data = requests.get(cause_label_url).json()
            cause_label = cause_data["entities"][cause_id]["labels"]["en"]["value"]
            cause = cause_label

        return death_year, cause
    except:
        return None, None

# === 🔎 Add death data ===
death_years = []
cause_of_deaths = []

print("🔄 Fetching Wikidata info...")
for name in tqdm(df["name"]):
    qid = get_wikidata_qid(name)
    if qid:
        death, cause = fetch_death_year_and_cause(qid)
    else:
        death, cause = None, None
    death_years.append(death)
    cause_of_deaths.append(cause)

df["death_year"] = death_years
df["cause_of_death"] = cause_of_deaths

# === 🧮 Compute age at death and time to death ===
def compute_age_at_death(row):
    if pd.notna(row["dob"]) and pd.notna(row["death_year"]):
        return row["death_year"] - row["dob"].year
    return np.nan

def compute_time_to_death(row):
    if pd.notna(row["death_year"]) and pd.notna(row["photo_taken"]):
        return row["death_year"] - row["photo_taken"]
    return np.nan

df["age_at_death"] = df.apply(compute_age_at_death, axis=1)
df["time_to_death"] = df.apply(compute_time_to_death, axis=1)

# === 💾 Save to CSV ===
output_path = "/content/wiki_with_wikidata.csv"
df.to_csv(output_path, index=False)
print(f"✅ Saved merged dataset with death info to: {output_path}")


IndexError: index 0 is out of bounds for axis 0 with size 0

In [11]:
# 📦 Install required packages
!pip install -q wikipedia-api

# 📚 Imports
import scipy.io
import pandas as pd
import numpy as np
import requests
import wikipediaapi
from datetime import datetime, timedelta
from tqdm import tqdm

# === Load the MATLAB file ===
wiki_mat_path = "/content/drive/MyDrive/Gladyshev/data/wiki_crop_unzip/wiki_crop/wiki.mat"
mat = scipy.io.loadmat(wiki_mat_path, struct_as_record=False, squeeze_me=True)
wiki = mat["wiki"]

# === Extract and parse fields ===
def extract_field(field):
    try:
        return getattr(wiki, field)
    except AttributeError:
        return None

dob = extract_field("dob")
photo_taken = extract_field("photo_taken")
full_path = [str(fp) for fp in extract_field("full_path")]
gender = extract_field("gender")
face_score = extract_field("face_score")
second_face_score = extract_field("second_face_score")
names = [str(n) for n in extract_field("name")]

# === Convert MATLAB datenum to datetime safely ===
def matlab_datenum_to_date(datenum):
    origin = datetime.fromordinal(1)
    return [
        (origin + timedelta(days=float(d) - 366)).date() if d > 0 else pd.NaT
        for d in datenum
    ]

dob_converted = matlab_datenum_to_date(dob)

# === Initial DataFrame ===
df = pd.DataFrame({
    "name": names,
    "dob": dob_converted,
    "photo_taken": photo_taken,
    "full_path": full_path,
    "gender": gender,
    "face_score": face_score,
    "second_face_score": second_face_score
})

# === Setup Wikipedia/Wikidata API ===
wiki_wiki = wikipediaapi.Wikipedia(
    language='en',
    user_agent='WikiMortalityBot/1.0 (contact: your_email@example.com)'  # Customize if desired
)

def get_wikidata_qid(name):
    page = wiki_wiki.page(name)
    if page.exists() and hasattr(page, "wikibase"):
        return page.wikibase
    return None

def fetch_death_year_and_cause(qid):
    url = f"https://www.wikidata.org/wiki/Special:EntityData/{qid}.json"
    r = requests.get(url)
    if r.status_code != 200:
        return None, None
    try:
        data = r.json()
        entity = data["entities"][qid]
        claims = entity.get("claims", {})
        death_year = None
        cause = None

        if "P570" in claims:
            date_str = claims["P570"][0]["mainsnak"]["datavalue"]["value"]["time"]
            death_year = int(date_str[1:5])

        if "P509" in claims:
            cause_id = claims["P509"][0]["mainsnak"]["datavalue"]["value"]["id"]
            cause_url = f"https://www.wikidata.org/wiki/Special:EntityData/{cause_id}.json"
            cause_data = requests.get(cause_url).json()
            cause_label = cause_data["entities"][cause_id]["labels"]["en"]["value"]
            cause = cause_label

        return death_year, cause
    except:
        return None, None

# === Query and append death info ===
death_years = []
cause_of_deaths = []

print("🔄 Fetching Wikidata info...")
for name in tqdm(df["name"]):
    qid = get_wikidata_qid(name)
    if qid:
        death, cause = fetch_death_year_and_cause(qid)
    else:
        death, cause = None, None
    death_years.append(death)
    cause_of_deaths.append(cause)

df["death_year"] = death_years
df["cause_of_death"] = cause_of_deaths

# === Compute derived columns ===
def compute_age_at_death(row):
    if pd.notna(row["dob"]) and pd.notna(row["death_year"]):
        return row["death_year"] - row["dob"].year
    return np.nan

def compute_time_to_death(row):
    if pd.notna(row["death_year"]) and pd.notna(row["photo_taken"]):
        return row["death_year"] - row["photo_taken"]
    return np.nan

df["age_at_death"] = df.apply(compute_age_at_death, axis=1)
df["time_to_death"] = df.apply(compute_time_to_death, axis=1)

# === Save result ===
output_path = "/content/wiki_with_death_info.csv"
df.to_csv(output_path, index=False)
print(f"✅ Saved to {output_path}")


AssertionError: Please, be nice to Wikipedia and specify user agent - https://meta.wikimedia.org/wiki/User-Agent_policy. Current user_agent: 'en' is not sufficient. Use Wikipedia(user_agent='your-user-agent', language='en')

In [None]:
# 📦 Install required packages
!pip install -q wikipedia-api

In [12]:
# 📚 Imports
import scipy.io
import pandas as pd
import numpy as np
import requests
import wikipediaapi
from datetime import datetime, timedelta
from tqdm import tqdm

# === Load the MATLAB file ===
wiki_mat_path = "/content/drive/MyDrive/Gladyshev/data/wiki_crop_unzip/wiki_crop/wiki.mat"
mat = scipy.io.loadmat(wiki_mat_path, struct_as_record=False, squeeze_me=True)
wiki = mat["wiki"]

# === Extract and parse fields ===
def extract_field(field):
    try:
        return getattr(wiki, field)
    except AttributeError:
        return None

dob = extract_field("dob")
photo_taken = extract_field("photo_taken")
full_path = [str(fp) for fp in extract_field("full_path")]
gender = extract_field("gender")
face_score = extract_field("face_score")
second_face_score = extract_field("second_face_score")
names = [str(n) for n in extract_field("name")]

# === Convert MATLAB datenum to datetime safely ===
def matlab_datenum_to_date(datenum):
    origin = datetime.fromordinal(1)
    return [
        (origin + timedelta(days=float(d) - 366)).date() if d > 0 else pd.NaT
        for d in datenum
    ]

dob_converted = matlab_datenum_to_date(dob)

# === Initial DataFrame ===
df = pd.DataFrame({
    "name": names,
    "dob": dob_converted,
    "photo_taken": photo_taken,
    "full_path": full_path,
    "gender": gender,
    "face_score": face_score,
    "second_face_score": second_face_score
})

# === Setup Wikipedia/Wikidata API ===
wiki_wiki = wikipediaapi.Wikipedia(
    language='en',
    user_agent='WikiMortalityBot/1.0 (contact: your_email@example.com)'  # Customize if desired
)

def get_wikidata_qid(name):
    page = wiki_wiki.page(name)
    if page.exists() and hasattr(page, "wikibase"):
        return page.wikibase
    return None

def fetch_death_year_and_cause(qid):
    url = f"https://www.wikidata.org/wiki/Special:EntityData/{qid}.json"
    r = requests.get(url)
    if r.status_code != 200:
        return None, None
    try:
        data = r.json()
        entity = data["entities"][qid]
        claims = entity.get("claims", {})
        death_year = None
        cause = None

        if "P570" in claims:
            date_str = claims["P570"][0]["mainsnak"]["datavalue"]["value"]["time"]
            death_year = int(date_str[1:5])

        if "P509" in claims:
            cause_id = claims["P509"][0]["mainsnak"]["datavalue"]["value"]["id"]
            cause_url = f"https://www.wikidata.org/wiki/Special:EntityData/{cause_id}.json"
            cause_data = requests.get(cause_url).json()
            cause_label = cause_data["entities"][cause_id]["labels"]["en"]["value"]
            cause = cause_label

        return death_year, cause
    except:
        return None, None

# === Query and append death info ===
death_years = []
cause_of_deaths = []

print("🔄 Fetching Wikidata info...")
for name in tqdm(df["name"]):
    qid = get_wikidata_qid(name)
    if qid:
        death, cause = fetch_death_year_and_cause(qid)
    else:
        death, cause = None, None
    death_years.append(death)
    cause_of_deaths.append(cause)

df["death_year"] = death_years
df["cause_of_death"] = cause_of_deaths

# === Compute derived columns ===
def compute_age_at_death(row):
    if pd.notna(row["dob"]) and pd.notna(row["death_year"]):
        return row["death_year"] - row["dob"].year
    return np.nan

def compute_time_to_death(row):
    if pd.notna(row["death_year"]) and pd.notna(row["photo_taken"]):
        return row["death_year"] - row["photo_taken"]
    return np.nan

df["age_at_death"] = df.apply(compute_age_at_death, axis=1)
df["time_to_death"] = df.apply(compute_time_to_death, axis=1)

# === Save result ===
output_path = "/content/wiki_with_death_info.csv"
df.to_csv(output_path, index=False)
print(f"✅ Saved to {output_path}")

🔄 Fetching Wikidata info...


100%|██████████| 62328/62328 [2:00:47<00:00,  8.60it/s]


✅ Saved to /content/wiki_with_death_info.csv


In [13]:
import pandas as pd

# Load the previously saved CSV
df = pd.read_csv("/content/wiki_with_death_info.csv")

# Ensure the critical fields exist and are not null
filtered_df = df[
    df["death_year"].notna() &
    df["age_at_death"].notna() &
    df["time_to_death"].notna()
]

# Optional: remove implausible ages (e.g., <0 or >120)
filtered_df = filtered_df[
    (filtered_df["age_at_death"] > 0) &
    (filtered_df["age_at_death"] < 120)
]

# Save the filtered dataset
filtered_output_path = "/content/wiki_filtered_deaths.csv"
filtered_df.to_csv(filtered_output_path, index=False)
print(f"✅ Filtered dataset saved to: {filtered_output_path}")
print(f"📊 Rows retained: {len(filtered_df)}")

✅ Filtered dataset saved to: /content/wiki_filtered_deaths.csv
📊 Rows retained: 0


In [14]:
from google.colab import files

# Paths to the CSV files
full_csv = "/content/wiki_with_death_info.csv"
filtered_csv = "/content/wiki_filtered_deaths.csv"

# Download both files
print("⬇️ Downloading full dataset...")
files.download(full_csv)

print("⬇️ Downloading filtered dataset...")
files.download(filtered_csv)

⬇️ Downloading full dataset...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

⬇️ Downloading filtered dataset...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# 📦 Install necessary packages
!pip install -q wikipedia-api

# 📚 Imports
import scipy.io
import pandas as pd
import numpy as np
import requests
import wikipediaapi
from datetime import datetime, timedelta
from tqdm import tqdm

# === Load the MATLAB file ===
wiki_mat_path = "/content/drive/MyDrive/Gladyshev/data/wiki_crop_unzip/wiki_crop/wiki.mat"
mat = scipy.io.loadmat(wiki_mat_path, struct_as_record=False, squeeze_me=True)
wiki = mat["wiki"]

# === Extract and parse fields ===
def extract_field(field):
    try:
        return getattr(wiki, field)
    except AttributeError:
        return None

dob = extract_field("dob")
photo_taken = extract_field("photo_taken")
full_path = [str(fp) for fp in extract_field("full_path")]
gender = extract_field("gender")
face_score = extract_field("face_score")
second_face_score = extract_field("second_face_score")
names = [str(n) for n in extract_field("name")]

# === Convert MATLAB datenum to datetime safely ===
def matlab_datenum_to_date(datenum):
    origin = datetime.fromordinal(1)
    return [
        (origin + timedelta(days=float(d) - 366)).date() if d > 0 else pd.NaT
        for d in datenum
    ]

dob_converted = matlab_datenum_to_date(dob)

# === Initial DataFrame ===
df = pd.DataFrame({
    "name": names,
    "dob": dob_converted,
    "photo_taken": photo_taken,
    "full_path": full_path,
    "gender": gender,
    "face_score": face_score,
    "second_face_score": second_face_score
})

# === Setup Wikipedia/Wikidata API ===
wiki_wiki = wikipediaapi.Wikipedia(
    language='en',
    user_agent='WikiMortalityBot/1.0 (contact: your_email@example.com)'  # Replace contact
)

def get_wikidata_qid(name):
    page = wiki_wiki.page(name)
    if page.exists() and hasattr(page, "wikibase"):
        return page.wikibase
    return None

def fetch_death_year_and_cause(qid):
    url = f"https://www.wikidata.org/wiki/Special:EntityData/{qid}.json"
    r = requests.get(url)
    if r.status_code != 200:
        return None, None
    try:
        data = r.json()
        entity = data["entities"][qid]
        claims = entity.get("claims", {})
        death_year = None
        cause = None

        if "P570" in claims:
            date_str = claims["P570"][0]["mainsnak"]["datavalue"]["value"]["time"]
            death_year = int(date_str[1:5])

        if "P509" in claims:
            cause_id = claims["P509"][0]["mainsnak"]["datavalue"]["value"]["id"]
            cause_url = f"https://www.wikidata.org/wiki/Special:EntityData/{cause_id}.json"
            cause_data = requests.get(cause_url).json()
            cause_label = cause_data["entities"][cause_id]["labels"]["en"]["value"]
            cause = cause_label

        return death_year, cause
    except:
        return None, None

# === Query and append death info ===
death_years = []
cause_of_deaths = []

print("🔄 Fetching Wikidata info...")
for name in tqdm(df["name"]):
    qid = get_wikidata_qid(name)
    if qid:
        death, cause = fetch_death_year_and_cause(qid)
    else:
        death, cause = None, None
    death_years.append(death)
    cause_of_deaths.append(cause)

df["death_year"] = death_years
df["cause_of_death"] = cause_of_deaths

# === Compute derived columns ===
def compute_age_at_death(row):
    if pd.notna(row["dob"]) and pd.notna(row["death_year"]):
        return row["death_year"] - row["dob"].year
    return np.nan

def compute_time_to_death(row):
    if pd.notna(row["death_year"]) and pd.notna(row["photo_taken"]):
        return row["death_year"] - row["photo_taken"]
    return np.nan

df["age_at_death"] = df.apply(compute_age_at_death, axis=1)
df["time_to_death"] = df.apply(compute_time_to_death, axis=1)

# === Filter rows with valid death info ===
df_filtered = df[
    df["death_year"].notna() &
    df["age_at_death"].notna() &
    df["time_to_death"].notna()
]

# === Save both versions ===
df.to_csv("/content/wiki_with_death_info_full.csv", index=False)
df_filtered.to_csv("/content/wiki_with_death_info_filtered.csv", index=False)

# === Print first 10 rows of filtered dataset ===
print("✅ First 10 people with valid death data:")
df_filtered.head(10)
