In [41]:
import requests
import re
import csv
import time

def get_wikipedia_page(title):
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "parse",
        "page": title,
        "format": "json",
        "prop": "wikitext"
    }
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; MyProjectBot/1.0; +https://yourdomain.com)"
    }

    response = requests.get(url, params=params, headers=headers)

    try:
        data = response.json()
    except Exception as e:
        print(f"❌ JSON decode error for {title}: {e}")
        print("Raw response text:", response.text)
        return None

    if 'error' in data:
        print(f"❌ Wikipedia API error for {title}: {data['error']['info']}")
        return None

    return data['parse']['wikitext']['*']

def extract_infobox(wikitext):
    if not wikitext:
        return None
    match = re.search(r"\{\{Infobox(.+?)\n\}\}", wikitext, re.DOTALL)
    if match:
        return match.group(0)
    else:
        match = re.search(r"\{\{Infobox(.+?)\}\}", wikitext, re.DOTALL)
        return match.group(0) if match else None

def extract_attributes(infobox_text):
    if not infobox_text:
        return {}

    attributes = {}
    lines = infobox_text.split("\n")
    for line in lines:
        if line.startswith("|"):
            parts = line.split("=", 1)
            if len(parts) == 2:
                key = parts[0].strip("| ").lower()
                value = parts[1].strip()
                attributes[key] = value
    return attributes

def save_to_csv(data, filename):
    if not data:
        print("No data to save.")
        return

    keys = set()
    for item in data:
        keys.update(item.keys())
    keys = sorted(list(keys))

    with open(filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=keys)
        writer.writeheader()
        for row in data:
            writer.writerow(row)

    print(f"✅ Dataset '{filename}' faylga saqlandi.")

if __name__ == "__main__":
    persons = [
        "Isaac Newton", "Marie Curie", "Albert Einstein", "Ada Lovelace", "Leonardo da Vinci",
        "Galileo Galilei", "Nikola Tesla", "Charles Darwin", "Thomas Edison", "Aristotle",
        "Plato", "Socrates", "Johannes Kepler", "Alan Turing", "Stephen Hawking",
        "Carl Sagan", "Benjamin Franklin", "James Clerk Maxwell", "Dmitri Mendeleev", "Gregor Mendel",
        "Louis Pasteur", "Alexander Fleming", "Michael Faraday", "Johann Wolfgang von Goethe", "Sigmund Freud",
        "Max Planck", "Niels Bohr", "Rosalind Franklin", "Rachel Carson", "Marie Stopes",
        "Hypatia", "Emmy Noether", "Grace Hopper", "Chien-Shiung Wu", "Lise Meitner",
        "Jane Goodall", "Margaret Mead", "Carl Linnaeus", "Alfred Nobel", "Andrei Sakharov",
        "Richard Feynman", "Werner Heisenberg", "Erwin Schrödinger", "Niels Henrik Abel", "Kurt Gödel",
        "John von Neumann", "Florence Nightingale", "Mary Anning", "Catherine the Great", "Cleopatra",
        "Queen Victoria", "Joan of Arc", "Elizabeth I", "Marie Antoinette", "Simone de Beauvoir",
        "Virginia Woolf", "Frida Kahlo", "Georgia O'Keeffe", "Amelia Earhart", "Harriet Tubman",
        "Sojourner Truth", "Malala Yousafzai", "Mother Teresa", "Eleanor Roosevelt", "Margaret Thatcher",
        "Indira Gandhi", "Angela Merkel", "Sappho", "Homer", "Virgil",
        "Dante Alighieri", "Geoffrey Chaucer", "Miguel de Cervantes", "William Shakespeare", "Johann Sebastian Bach",
        "Ludwig van Beethoven", "Wolfgang Amadeus Mozart", "Franz Schubert", "Pyotr Ilyich Tchaikovsky", "Igor Stravinsky",
        "Claude Debussy", "Johannes Brahms", "Richard Wagner", "Gustav Mahler", "Felix Mendelssohn",
        "Sergei Rachmaninoff", "Antonio Vivaldi", "Johann Strauss II", "Maurice Ravel", "Giacomo Puccini",
        "Gustav Holst", "Aaron Copland", "Duke Ellington", "Ella Fitzgerald", "Louis Armstrong",
        "Billie Holiday", "Charlie Parker", "John Coltrane", "Dmitri Shostakovich", "Leonard Bernstein",
        "Miles Davis", "Thelonious Monk", "Charles Mingus", "Art Tatum", "Count Basie",
        "Herbie Hancock", "Wynton Marsalis", "Dizzy Gillespie", "Benny Goodman", "Glenn Miller",
        "George Gershwin", "Cole Porter", "Irving Berlin", "Stephen Foster", "Scott Joplin",
        "Bach", "Haydn", "Handel", "Mozart", "Beethoven",
        "Schubert", "Schumann", "Chopin", "Liszt", "Brahms",
        "Tchaikovsky", "Rachmaninoff", "Debussy", "Ravel", "Stravinsky",
        "Bartók", "Shostakovich", "Prokofiev", "Sibelius", "Vaughan Williams",
        "Copland", "Bernstein", "Glass", "Reich", "Adams",
        "Boulez", "Stockhausen", "Ligeti", "Messiaen", "Penderecki",
        "John Cage", "Philip Glass", "Arvo Pärt", "Kaija Saariaho", "Olga Neuwirth",
        "Anna Thorvaldsdottir", "Tania León", "Jennifer Higdon", "Missy Mazzoli", "Julia Wolfe",
        "Florence Price", "Fanny Mendelssohn", "Clara Schumann", "Ethel Smyth", "Amy Beach",
        "Lili Boulanger", "Germaine Tailleferre", "Nadia Boulanger", "Louise Farrenc", "Cécile Chaminade",
        "Pauline Viardot", "Teresa Carreño", "Margaret Bonds", "Undine Smith Moore", "William Grant Still",
        "Scott Joplin", "W.C. Handy", "James P. Johnson", "Jelly Roll Morton", "Fats Waller",
        "Duke Ellington", "Count Basie", "Benny Goodman", "Glenn Miller", "Charlie Parker",
        "Dizzy Gillespie", "Thelonious Monk", "Miles Davis", "John Coltrane", "Ornette Coleman",
        "Charles Mingus", "Art Blakey", "Herbie Hancock", "Wayne Shorter", "Chick Corea",
        "Keith Jarrett", "Pat Metheny", "Esperanza Spalding", "Kamasi Washington", "Snarky Puppy","Esperanza Spalding"
    ]


    

    dataset = []

    for idx, person in enumerate(persons, 1):
        print(f"{idx}/{len(persons)} 📥 {person} sahifasini yuklanmoqda...")
        wikitext = get_wikipedia_page(person)
        infobox = extract_infobox(wikitext)
        attributes = extract_attributes(infobox)

        if attributes:
            if "name" not in attributes:
                attributes["name"] = person
            dataset.append(attributes)
            print(f"✅ {person} infoboxi muvaffaqiyatli olindi.")
        else:
            print(f"⚠️ {person} sahifasida infobox topilmadi yoki bo'sh.")

        time.sleep(1)  # API limitlarini cheklash uchun

    save_to_csv(dataset, "historical_figures_dataset.csv")


1/196 📥 Isaac Newton sahifasini yuklanmoqda...
✅ Isaac Newton infoboxi muvaffaqiyatli olindi.
2/196 📥 Marie Curie sahifasini yuklanmoqda...
✅ Marie Curie infoboxi muvaffaqiyatli olindi.
3/196 📥 Albert Einstein sahifasini yuklanmoqda...
✅ Albert Einstein infoboxi muvaffaqiyatli olindi.
4/196 📥 Ada Lovelace sahifasini yuklanmoqda...
✅ Ada Lovelace infoboxi muvaffaqiyatli olindi.
5/196 📥 Leonardo da Vinci sahifasini yuklanmoqda...
✅ Leonardo da Vinci infoboxi muvaffaqiyatli olindi.
6/196 📥 Galileo Galilei sahifasini yuklanmoqda...
✅ Galileo Galilei infoboxi muvaffaqiyatli olindi.
7/196 📥 Nikola Tesla sahifasini yuklanmoqda...
✅ Nikola Tesla infoboxi muvaffaqiyatli olindi.
8/196 📥 Charles Darwin sahifasini yuklanmoqda...
✅ Charles Darwin infoboxi muvaffaqiyatli olindi.
9/196 📥 Thomas Edison sahifasini yuklanmoqda...
✅ Thomas Edison infoboxi muvaffaqiyatli olindi.
10/196 📥 Aristotle sahifasini yuklanmoqda...
✅ Aristotle infoboxi muvaffaqiyatli olindi.
11/196 📥 Plato sahifasini yuklanmoqda..

In [43]:
import pandas as pd 
df = pd.read_csv(r"C:\Users\Rasulbek907\Desktop\Final_Project\Notebooks\historical_figures_dataset.csv")
df.head()

Unnamed: 0,1blankname,1namedata,academic_advisors,agent,alias,allegiance,alma mater,alma_mater,alongside,alongside1,...,venerated_in,vicepresident,vicepresident1,w,website,work_institutions,workplaces,works,years_active,yearsactive
0,,,{{unbulleted list | [[Isaac Barrow]]<ref>Feing...,,,,,,"[[Henry Boyle, 1st Baron Carleton|Henry Boyle]]",Henry Boyle,...,,,,,,,{{hlist|[[University of Cambridge]]|[[Royal So...,,,
1,,,,,,,,[[University of Paris]],,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,{{circa|1470–1519}},


In [44]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 154 entries, 0 to 153
Columns: 289 entries, 1blankname to yearsactive
dtypes: float64(17), object(272)
memory usage: 347.8+ KB


In [49]:
print(df.columns.tolist())

['1blankname', '1namedata', 'academic_advisors', 'agent', 'alias', 'allegiance', 'alma mater', 'alma_mater', 'alongside', 'alongside1', 'alt', 'appointer1', 'appointer2', 'associated_acts', 'attributes', 'author_abbrev_bot', 'author_abbrev_zoo', 'awards', 'background', 'baptised', 'beatified_by', 'beatified_date', 'beatified_place', 'birth_date', 'birth_name', 'birth_place', 'birthname', 'branch', 'burial', 'burial_date', 'burial_place', 'canonized_by', 'canonized_date', 'canonized_place', 'caption', 'child', 'children', 'church', 'citizenship', 'consort', 'constituency12', 'constituency13', 'constituency14', 'constituency15', 'cor-type', 'cor-type1', 'coregency', 'coronation', 'coronation1', 'current_member_of', 'death_cause', 'death_date', 'death_place', 'denomination', 'deputy', 'disappeared_date', 'disappeared_place', 'disappeared_status', 'discipline', 'discography', 'doctoral_advisor', 'doctoral_students', 'dynasty', 'education', 'embed', 'embed_title', 'employer', 'era', 'family

In [51]:
important_columns = [
    "name",                 # Ism (asosiy identifikator)
    "birth_date",           # Tug‘ilgan sana
    "birth_place",          # Tug‘ilgan joy
    "death_date",           # Vafot sanasi
    "death_place",          # Vafot joyi
    "nationality",          # Millati
    "occupation",           # Kasbi yoki lavozimi
    "years_active",         # Faoliyat yillari
    "known_for",            # Mashhurlikka sabab bo‘lgan ishlari
    "awards",               # Mukofotlar
    "alma_mater",           # O‘qigan universiteti (o‘quv yurti)
    "education",            # Ta’limi
    "employer",             # Ishlagan tashkilotlari
    "notable_works",        # Mashhur asarlari yoki ishlari
    "field",                # Soha (ilmiy, san’at, siyosat va h.k.)    
    "spouse",               # Turmush o‘rtog‘i
    "children",             # Farzandlari
    "parents",              # Ota-onasi
    "religion",             # Dini    
    "genre",                # Janri (san’at yoki adabiyotda)   
]


In [52]:
df_filtered = df[important_columns]


In [54]:
df_filtered.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 154 entries, 0 to 153
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   name           154 non-null    object
 1   birth_date     147 non-null    object
 2   birth_place    147 non-null    object
 3   death_date     132 non-null    object
 4   death_place    130 non-null    object
 5   nationality    7 non-null      object
 6   occupation     72 non-null     object
 7   years_active   34 non-null     object
 8   known_for      25 non-null     object
 9   awards         13 non-null     object
 10  alma_mater     19 non-null     object
 11  education      33 non-null     object
 12  employer       1 non-null      object
 13  notable_works  4 non-null      object
 14  field          10 non-null     object
 15  spouse         58 non-null     object
 16  children       35 non-null     object
 17  parents        10 non-null     object
 18  religion       4 non-null     

In [None]:
df_filtered.to_csv(r"C:\Users\Rasulbek907\Desktop\Final_Project\Data\Raw_data\hf_dataset1.csv", index=False, encoding="utf-8")


In [23]:
import requests
import re
import csv
import time

def get_wikipedia_page(title):
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "parse",
        "page": title,
        "format": "json",
        "prop": "wikitext"
    }
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; MyProjectBot/1.0; +https://yourdomain.com)"
    }

    response = requests.get(url, params=params, headers=headers)

    try:
        data = response.json()
    except Exception as e:
        print(f"❌ JSON decode error for {title}: {e}")
        print("Raw response text:", response.text)
        return None

    if 'error' in data:
        print(f"❌ Wikipedia API error for {title}: {data['error']['info']}")
        return None

    return data['parse']['wikitext']['*']

def extract_infobox(wikitext):
    if not wikitext:
        return None
    match = re.search(r"\{\{Infobox(.+?)\n\}\}", wikitext, re.DOTALL)
    if match:
        return match.group(0)
    else:
        match = re.search(r"\{\{Infobox(.+?)\}\}", wikitext, re.DOTALL)
        return match.group(0) if match else None

def extract_attributes(infobox_text):
    if not infobox_text:
        return {}

    attributes = {}
    current_key = None
    current_value_lines = []

    lines = infobox_text.split("\n")
    for line in lines[1:]:  # 1-qator: {{Infobox ... shuning uchun o'tkazamiz
        line = line.strip()
        if line.startswith("|"):
            # Yangi atribut boshlanmoqda, oldingisini saqlaymiz
            if current_key is not None:
                attributes[current_key] = " ".join(current_value_lines).strip()
            parts = line.split("=", 1)
            if len(parts) == 2:
                current_key = parts[0].strip("| ").lower()
                current_value_lines = [parts[1].strip()]
            else:
                current_key = None
                current_value_lines = []
        else:
            # Agar chiziq | bilan boshlanmasa, oldingi key qiymatining davomidir
            if current_key is not None:
                current_value_lines.append(line)

    # Oxirgi atributni qo'shamiz
    if current_key is not None:
        attributes[current_key] = " ".join(current_value_lines).strip()

    return attributes

def save_to_csv(data, filename):
    if not data:
        print("No data to save.")
        return

    keys = set()
    for item in data:
        keys.update(item.keys())
    keys = sorted(list(keys))

    with open(filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=keys)
        writer.writeheader()
        for row in data:
            writer.writerow(row)

    print(f"✅ Dataset '{filename}' faylga saqlandi.")

if __name__ == "__main__":
    persons = [
    "Isaac Newton", "Marie Curie", "Albert Einstein", "Ada Lovelace", "Leonardo da Vinci",
    "Galileo Galilei", "Nikola Tesla", "Charles Darwin", "Thomas Edison", "Aristotle",
    "Plato", "Socrates", "Johannes Kepler", "Alan Turing", "Stephen Hawking",
    "Carl Sagan", "Benjamin Franklin", "James Clerk Maxwell", "Dmitri Mendeleev", "Gregor Mendel",
    "Louis Pasteur", "Alexander Fleming", "Michael Faraday", "Johann Wolfgang von Goethe", "Sigmund Freud",
    "Max Planck", "Niels Bohr", "Rosalind Franklin", "Rachel Carson", "Marie Stopes",
    "Hypatia", "Emmy Noether", "Grace Hopper", "Chien-Shiung Wu", "Lise Meitner",
    "Jane Goodall", "Margaret Mead", "Carl Linnaeus", "Alfred Nobel", "Andrei Sakharov",
    "Richard Feynman", "Werner Heisenberg", "Erwin Schrödinger", "Niels Henrik Abel", "Kurt Gödel",
    "John von Neumann", "Florence Nightingale", "Mary Anning", "Catherine the Great", "Cleopatra",
    "Queen Victoria", "Joan of Arc", "Elizabeth I", "Marie Antoinette", "Simone de Beauvoir",
    "Virginia Woolf", "Frida Kahlo", "Georgia O'Keeffe", "Amelia Earhart", "Harriet Tubman",
    "Sojourner Truth", "Malala Yousafzai", "Mother Teresa", "Eleanor Roosevelt", "Margaret Thatcher",
    "Indira Gandhi", "Angela Merkel", "Sappho", "Homer", "Virgil",
    "Dante Alighieri", "Geoffrey Chaucer", "Miguel de Cervantes", "William Shakespeare", "Johann Sebastian Bach",
    "Ludwig van Beethoven", "Wolfgang Amadeus Mozart", "Franz Schubert", "Pyotr Ilyich Tchaikovsky", "Igor Stravinsky",
    "Claude Debussy", "Johannes Brahms", "Richard Wagner", "Gustav Mahler", "Felix Mendelssohn",
    "Sergei Rachmaninoff", "Antonio Vivaldi", "Johann Strauss II", "Maurice Ravel", "Giacomo Puccini",
    "Gustav Holst", "Aaron Copland", "Duke Ellington", "Ella Fitzgerald", "Louis Armstrong",
    "Billie Holiday", "Charlie Parker", "John Coltrane", "Dmitri Shostakovich", "Leonard Bernstein",
    "Miles Davis", "Thelonious Monk", "Charles Mingus", "Art Tatum", "Count Basie",
    "Herbie Hancock", "Wynton Marsalis", "Dizzy Gillespie", "Benny Goodman", "Glenn Miller",
    "George Gershwin", "Cole Porter", "Irving Berlin", "Stephen Foster", "Scott Joplin",
    "Bach", "Haydn", "Handel", "Mozart", "Beethoven",
    "Schubert", "Schumann", "Chopin", "Liszt", "Brahms",
    "Tchaikovsky", "Rachmaninoff", "Debussy", "Ravel", "Stravinsky",
    "Bartók", "Shostakovich", "Prokofiev", "Sibelius", "Vaughan Williams",
    "Copland", "Bernstein", "Glass", "Reich", "Adams",
    "Boulez", "Stockhausen", "Ligeti", "Messiaen", "Penderecki",
    "John Cage", "Philip Glass", "Arvo Pärt", "Kaija Saariaho", "Olga Neuwirth",
    "Anna Thorvaldsdottir", "Tania León", "Jennifer Higdon", "Missy Mazzoli", "Julia Wolfe",
    "Florence Price", "Fanny Mendelssohn", "Clara Schumann", "Ethel Smyth", "Amy Beach",
    "Lili Boulanger", "Germaine Tailleferre", "Nadia Boulanger", "Louise Farrenc", "Cécile Chaminade",
    "Pauline Viardot", "Teresa Carreño", "Margaret Bonds", "Undine Smith Moore", "William Grant Still",
    "Scott Joplin", "W.C. Handy", "James P. Johnson", "Jelly Roll Morton", "Fats Waller",
    "Duke Ellington", "Count Basie", "Benny Goodman", "Glenn Miller", "Charlie Parker",
    "Dizzy Gillespie", "Thelonious Monk", "Miles Davis", "John Coltrane", "Ornette Coleman",
    "Charles Mingus", "Art Blakey", "Herbie Hancock", "Wayne Shorter", "Chick Corea",
    "Keith Jarrett", "Pat Metheny", "Esperanza Spalding", "Kamasi Washington", "Snarky Puppy",
    "Esperanza Spalding", "Esperanza Spalding", "Esperanza Spalding", "Esperanza Spalding", "Esperanza Spalding"

    ]

    dataset = []

    for idx, person in enumerate(persons, 1):
        print(f"{idx}/{len(persons)} 📥 {person} sahifasini yuklanmoqda...")
        wikitext = get_wikipedia_page(person)
        infobox = extract_infobox(wikitext)
        attributes = extract_attributes(infobox)

        if attributes:
            if "name" not in attributes:
                attributes["name"] = person
            dataset.append(attributes)
            print(f"✅ {person} infoboxi muvaffaqiyatli olindi.")
        else:
            print(f"⚠️ {person} sahifasida infobox topilmadi yoki bo'sh.")

        time.sleep(1)  # API limitlarini cheklash uchun

    save_to_csv(dataset, "historical_figures_dataset2.csv")


1/200 📥 Isaac Newton sahifasini yuklanmoqda...
✅ Isaac Newton infoboxi muvaffaqiyatli olindi.
2/200 📥 Marie Curie sahifasini yuklanmoqda...
✅ Marie Curie infoboxi muvaffaqiyatli olindi.
3/200 📥 Albert Einstein sahifasini yuklanmoqda...
✅ Albert Einstein infoboxi muvaffaqiyatli olindi.
4/200 📥 Ada Lovelace sahifasini yuklanmoqda...
✅ Ada Lovelace infoboxi muvaffaqiyatli olindi.
5/200 📥 Leonardo da Vinci sahifasini yuklanmoqda...
✅ Leonardo da Vinci infoboxi muvaffaqiyatli olindi.
6/200 📥 Galileo Galilei sahifasini yuklanmoqda...
✅ Galileo Galilei infoboxi muvaffaqiyatli olindi.
7/200 📥 Nikola Tesla sahifasini yuklanmoqda...
✅ Nikola Tesla infoboxi muvaffaqiyatli olindi.
8/200 📥 Charles Darwin sahifasini yuklanmoqda...
✅ Charles Darwin infoboxi muvaffaqiyatli olindi.
9/200 📥 Thomas Edison sahifasini yuklanmoqda...
✅ Thomas Edison infoboxi muvaffaqiyatli olindi.
10/200 📥 Aristotle sahifasini yuklanmoqda...
✅ Aristotle infoboxi muvaffaqiyatli olindi.
11/200 📥 Plato sahifasini yuklanmoqda..

In [45]:
import pandas as pd 
df2 = pd.read_csv(r"C:\Users\Rasulbek907\Desktop\Final_Project\Notebooks\historical_figures_dataset2.csv")
df2.head()


Unnamed: 0,1blankname,1namedata,27 others<ref name,[[bernard carr]]<ref name,[[bruce allen (physicist)|bruce allen]]<ref name,[[christophe galfard]]<ref name,[[christopher pope (physicist)|christopher pope]]|[[marika taylor]]<ref name,[[don page (physicist)|don page]]<ref name,[[fay dowker]]<ref name,[[gary gibbons]]<ref name,...,workplaces,works,years_active,yearsactive,{{marriage|[[jane hawking|jane wilde]]|14 july 1965|1995|end,{{marriage|[[ray brown (musician)|ray brown]]|1947|1953|end,{{marriage|[[yekaterina nosenko]]|24 january 1906|2 march 1939|end,{{marriage|benny kornegay|1941|1942|end,{{marriage|elaine mason|16 september 1995|2006|end,"{{marriage|mary stilwell|december 25, 1871|august 9, 1884|end"
0,,,,,,,,,,,...,{{hlist|[[University of Cambridge]]|[[Royal So...,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,{{circa|1470–1519}},,,,,,,


In [56]:
important_columns = [
    "name",                 # Ism (asosiy identifikator)
    "birth_date",           # Tug‘ilgan sana
    "birth_place",          # Tug‘ilgan joy
    "death_date",           # Vafot sanasi
    "death_place",          # Vafot joyi
    "nationality",          # Millati
    "occupation",           # Kasbi yoki lavozimi
    "years_active",         # Faoliyat yillari
    "known_for",            # Mashhurlikka sabab bo‘lgan ishlari
    "awards",               # Mukofotlar
    "alma_mater",           # O‘qigan universiteti (o‘quv yurti)
    "education",            # Ta’limi
    "employer",             # Ishlagan tashkilotlari
    "notable_works",        # Mashhur asarlari yoki ishlari
    "field",                # Soha (ilmiy, san’at, siyosat va h.k.)    
    "spouse",               # Turmush o‘rtog‘i
    "children",             # Farzandlari
    "parents",              # Ota-onasi
    "religion",             # Dini    
    "genre",                # Janri (san’at yoki adabiyotda)   
]
df_filtered2 = df2[important_columns]
df_filtered2.to_csv(r"C:\Users\Rasulbek907\Desktop\Final_Project\Data\Raw_data\hf_dataset2.csv", index=False, encoding="utf-8")

In [57]:
df_filtered2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 158 entries, 0 to 157
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   name           158 non-null    object
 1   birth_date     152 non-null    object
 2   birth_place    152 non-null    object
 3   death_date     133 non-null    object
 4   death_place    131 non-null    object
 5   nationality    7 non-null      object
 6   occupation     78 non-null     object
 7   years_active   40 non-null     object
 8   known_for      25 non-null     object
 9   awards         14 non-null     object
 10  alma_mater     19 non-null     object
 11  education      34 non-null     object
 12  employer       1 non-null      object
 13  notable_works  5 non-null      object
 14  field          10 non-null     object
 15  spouse         59 non-null     object
 16  children       35 non-null     object
 17  parents        10 non-null     object
 18  religion       4 non-null     

In [24]:
import requests
import re
import csv
import time

def get_wikipedia_page(title):
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "parse",
        "page": title,
        "format": "json",
        "prop": "wikitext"
    }
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; MyProjectBot/1.0; +https://yourdomain.com)"
    }

    response = requests.get(url, params=params, headers=headers)

    try:
        data = response.json()
    except Exception as e:
        print(f"❌ JSON decode error for {title}: {e}")
        print("Raw response text:", response.text)
        return None

    if 'error' in data:
        print(f"❌ Wikipedia API error for {title}: {data['error']['info']}")
        return None

    return data['parse']['wikitext']['*']

def extract_infobox(wikitext):
    if not wikitext:
        return None
    match = re.search(r"\{\{Infobox(.+?)\n\}\}", wikitext, re.DOTALL)
    if match:
        return match.group(0)
    else:
        match = re.search(r"\{\{Infobox(.+?)\}\}", wikitext, re.DOTALL)
        return match.group(0) if match else None

def extract_attributes(infobox_text):
    if not infobox_text:
        return {}

    attributes = {}
    current_key = None
    current_value_lines = []

    lines = infobox_text.split("\n")
    for line in lines[1:]:  # 1-qator: {{Infobox ... shuning uchun o'tkazamiz
        line = line.strip()
        if line.startswith("|"):
            # Yangi atribut boshlanmoqda, oldingisini saqlaymiz
            if current_key is not None:
                attributes[current_key] = " ".join(current_value_lines).strip()
            parts = line.split("=", 1)
            if len(parts) == 2:
                current_key = parts[0].strip("| ").lower()
                current_value_lines = [parts[1].strip()]
            else:
                current_key = None
                current_value_lines = []
        else:
            # Agar chiziq | bilan boshlanmasa, oldingi key qiymatining davomidir
            if current_key is not None:
                current_value_lines.append(line)

    # Oxirgi atributni qo'shamiz
    if current_key is not None:
        attributes[current_key] = " ".join(current_value_lines).strip()

    return attributes

def save_to_csv(data, filename):
    if not data:
        print("No data to save.")
        return

    keys = set()
    for item in data:
        keys.update(item.keys())
    keys = sorted(list(keys))

    with open(filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=keys)
        writer.writeheader()
        for row in data:
            writer.writerow(row)

    print(f"✅ Dataset '{filename}' faylga saqlandi.")

if __name__ == "__main__":
    persons = [
    "Plato", "Socrates", "Aristotle", "Confucius", "Sun Tzu",
    "Mahatma Gandhi", "Martin Luther King Jr.", "Nelson Mandela", "Malcolm X", "Che Guevara",
    "Barack Obama", "Franklin D. Roosevelt", "Winston Churchill", "Theodore Roosevelt", "John F. Kennedy",
    "Abraham Lincoln", "George Washington", "Thomas Jefferson", "Benjamin Franklin", "Theodore Roosevelt",
    "Franklin D. Roosevelt", "Woodrow Wilson", "Harry Truman", "Dwight D. Eisenhower", "Ronald Reagan",
    "Richard Nixon", "Jimmy Carter", "Bill Clinton", "George H.W. Bush", "George W. Bush",
    "Donald Trump", "Joe Biden", "Angela Merkel", "Margaret Thatcher", "Indira Gandhi",
    "Golda Meir", "Jacinda Ardern", "Sukarno", "Sukarno", "Sukarno",
    "Napoleon Bonaparte", "Julius Caesar", "Alexander the Great", "Genghis Khan", "Attila the Hun",
    "Charlemagne", "Otto von Bismarck", "Queen Elizabeth II", "Catherine the Great", "Marie Antoinette",
    "Cleopatra", "Joan of Arc", "Empress Wu Zetian", "Emperor Meiji", "Peter the Great",
    "Ivan the Terrible", "Ivan IV of Russia", "Harold Godwinson", "William the Conqueror", "Henry VIII",
    "Elizabeth I", "Mary Queen of Scots", "Queen Victoria", "Isabella I of Castile", "Louis XIV",
    "Frederick the Great", "Joseph Stalin", "Vladimir Lenin", "Leon Trotsky", "Mikhail Gorbachev",
    "Boris Yeltsin", "Vladimir Putin", "Kim Il-sung", "Kim Jong-il", "Kim Jong-un",
    "Fidel Castro", "Che Guevara", "Ho Chi Minh", "Pol Pot", "Mao Zedong",
    "Deng Xiaoping", "Sun Yat-sen", "Chiang Kai-shek", "Shinzo Abe", "Emperor Hirohito",
    "Akihito", "Narendra Modi", "Yitzhak Rabin", "Benjamin Netanyahu", "Golda Meir",
    "Menachem Begin", "Shimon Peres", "Mahmoud Abbas", "Hassan Nasrallah", "Bashar al-Assad",
    "Recep Tayyip Erdoğan", "Abdel Fattah el-Sisi", "Mohammed bin Salman", "Sheikh Hasina", "Imran Khan",
    "Aung San Suu Kyi", "Jacinda Ardern", "Sanna Marin", "Sviatlana Tsikhanouskaya", "Volodymyr Zelensky",
    "Justin Trudeau", "Emmanuel Macron", "Boris Johnson", "Theresa May", "Nicola Sturgeon",
    "Alex Salmond", "David Cameron", "Gordon Brown", "Tony Blair", "Margaret Thatcher",
    "John Major", "Harold Wilson", "Winston Churchill", "Clement Attlee", "Neville Chamberlain",
    "Stanley Baldwin", "Bonar Law", "Andrew Bonar Law", "James Callaghan", "Edward Heath",
    "Harold Macmillan", "Alec Douglas-Home", "Anthony Eden", "Neville Chamberlain", "Sir Robert Walpole",
    "Benjamin Disraeli", "William Gladstone", "David Lloyd George", "Herbert Asquith", "H. H. Asquith",
    "Winston Churchill", "Clement Attlee", "Harold Wilson", "Edward Heath", "James Callaghan",
    "Margaret Thatcher", "John Major", "Tony Blair", "Gordon Brown", "David Cameron",
    "Theresa May", "Boris Johnson", "Liz Truss", "Rishi Sunak", "Barack Obama",
    "Joe Biden", "Donald Trump", "George W. Bush", "Bill Clinton", "George H.W. Bush",
    "Ronald Reagan", "Jimmy Carter", "Richard Nixon", "Lyndon B. Johnson", "John F. Kennedy",
    "Dwight D. Eisenhower", "Harry S. Truman", "Franklin D. Roosevelt", "Woodrow Wilson", "Theodore Roosevelt",
    "Abraham Lincoln", "Thomas Jefferson", "Benjamin Franklin", "John Adams", "James Madison",
    "James Monroe", "Andrew Jackson", "Martin Van Buren", "William Henry Harrison", "John Tyler",
    "James K. Polk", "Zachary Taylor", "Millard Fillmore", "Franklin Pierce", "James Buchanan",
    "Abraham Lincoln", "Andrew Johnson", "Ulysses S. Grant", "Rutherford B. Hayes", "James A. Garfield",
    "Chester A. Arthur", "Grover Cleveland", "Benjamin Harrison", "Grover Cleveland", "William McKinley",
    "Theodore Roosevelt", "William Howard Taft", "Woodrow Wilson", "Warren G. Harding", "Calvin Coolidge",
    "Herbert Hoover", "Franklin D. Roosevelt", "Harry S. Truman", "Dwight D. Eisenhower", "John F. Kennedy",
    "Lyndon B. Johnson", "Richard Nixon", "Gerald Ford", "Jimmy Carter", "Ronald Reagan",
    "George H.W. Bush", "Bill Clinton", "George W. Bush", "Barack Obama", "Donald Trump"

    ]

    dataset = []

    for idx, person in enumerate(persons, 1):
        print(f"{idx}/{len(persons)} 📥 {person} sahifasini yuklanmoqda...")
        wikitext = get_wikipedia_page(person)
        infobox = extract_infobox(wikitext)
        attributes = extract_attributes(infobox)

        if attributes:
            if "name" not in attributes:
                attributes["name"] = person
            dataset.append(attributes)
            print(f"✅ {person} infoboxi muvaffaqiyatli olindi.")
        else:
            print(f"⚠️ {person} sahifasida infobox topilmadi yoki bo'sh.")

        time.sleep(1)  # API limitlarini cheklash uchun

    save_to_csv(dataset, "historical_figures_dataset3.csv")


1/210 📥 Plato sahifasini yuklanmoqda...
✅ Plato infoboxi muvaffaqiyatli olindi.
2/210 📥 Socrates sahifasini yuklanmoqda...
✅ Socrates infoboxi muvaffaqiyatli olindi.
3/210 📥 Aristotle sahifasini yuklanmoqda...
✅ Aristotle infoboxi muvaffaqiyatli olindi.
4/210 📥 Confucius sahifasini yuklanmoqda...
✅ Confucius infoboxi muvaffaqiyatli olindi.
5/210 📥 Sun Tzu sahifasini yuklanmoqda...
✅ Sun Tzu infoboxi muvaffaqiyatli olindi.
6/210 📥 Mahatma Gandhi sahifasini yuklanmoqda...
✅ Mahatma Gandhi infoboxi muvaffaqiyatli olindi.
7/210 📥 Martin Luther King Jr. sahifasini yuklanmoqda...
✅ Martin Luther King Jr. infoboxi muvaffaqiyatli olindi.
8/210 📥 Nelson Mandela sahifasini yuklanmoqda...
✅ Nelson Mandela infoboxi muvaffaqiyatli olindi.
9/210 📥 Malcolm X sahifasini yuklanmoqda...
✅ Malcolm X infoboxi muvaffaqiyatli olindi.
10/210 📥 Che Guevara sahifasini yuklanmoqda...
✅ Che Guevara infoboxi muvaffaqiyatli olindi.
11/210 📥 Barack Obama sahifasini yuklanmoqda...
✅ Barack Obama infoboxi muvaffaqiya

In [47]:
df3 = pd.read_csv(r"C:\Users\Rasulbek907\Desktop\Final_Project\Notebooks\historical_figures_dataset3.csv")
df3.head()


Unnamed: 0,100s/50s1,1blankname,1blankname1,1blankname2,1blankname3,1blankname4,1blankname5,1blankname6,1namedata,1namedata1,...,y2,year1,year2,year3,years_active,{{marriage|fleur cates|1981|1988|end,{{marriage|miriam weizmann|1972|1978|end,{{native name|ja| 明仁|italics,{{tlit|zh|[[tan yankai]]|i,{{tlit|zh|t. v. soong|i
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,Húng Yāu,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,


In [69]:
important_columns = [
    "name",                 # Ism (asosiy identifikator)
    "birth_date",           # Tug‘ilgan sana
    "birth_place",          # Tug‘ilgan joy
    "death_date",           # Vafot sanasi
    "death_place",          # Vafot joyi
    "nationality",          # Millati
    "occupation",           # Kasbi yoki lavozimi
    "years_active",         # Faoliyat yillari
    "known_for",            # Mashhurlikka sabab bo‘lgan ishlari
    "awards",               # Mukofotlar
    "alma_mater",           # O‘qigan universiteti (o‘quv yurti)
    "education",            # Ta’lim    
    "notable_works",        # Mashhur asarlari yoki ishlari    
    "spouse",               # Turmush o‘rtog‘i
    "children",             # Farzandlari
    "parents",              # Ota-onasi
    "religion"             # Dini    
]
df_filtered3 = df3[important_columns]
df_filtered3.to_csv(r"C:\Users\Rasulbek907\Desktop\Final_Project\Data\Raw_data\hf_dataset3.csv", index=False, encoding="utf-8")

In [48]:
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188 entries, 0 to 187
Columns: 552 entries, 100s/50s1 to {{tlit|zh|t. v. soong|i
dtypes: float64(19), object(533)
memory usage: 810.9+ KB


In [25]:
import requests
import re
import csv
import time

def get_wikipedia_page(title):
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "parse",
        "page": title,
        "format": "json",
        "prop": "wikitext"
    }
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; MyProjectBot/1.0; +https://yourdomain.com)"
    }

    response = requests.get(url, params=params, headers=headers)

    try:
        data = response.json()
    except Exception as e:
        print(f"❌ JSON decode error for {title}: {e}")
        print("Raw response text:", response.text)
        return None

    if 'error' in data:
        print(f"❌ Wikipedia API error for {title}: {data['error']['info']}")
        return None

    return data['parse']['wikitext']['*']

def extract_infobox(wikitext):
    if not wikitext:
        return None
    match = re.search(r"\{\{Infobox(.+?)\n\}\}", wikitext, re.DOTALL)
    if match:
        return match.group(0)
    else:
        match = re.search(r"\{\{Infobox(.+?)\}\}", wikitext, re.DOTALL)
        return match.group(0) if match else None

def extract_attributes(infobox_text):
    if not infobox_text:
        return {}

    attributes = {}
    current_key = None
    current_value_lines = []

    lines = infobox_text.split("\n")
    for line in lines[1:]:  # 1-qator: {{Infobox ... shuning uchun o'tkazamiz
        line = line.strip()
        if line.startswith("|"):
            # Yangi atribut boshlanmoqda, oldingisini saqlaymiz
            if current_key is not None:
                attributes[current_key] = " ".join(current_value_lines).strip()
            parts = line.split("=", 1)
            if len(parts) == 2:
                current_key = parts[0].strip("| ").lower()
                current_value_lines = [parts[1].strip()]
            else:
                current_key = None
                current_value_lines = []
        else:
            # Agar chiziq | bilan boshlanmasa, oldingi key qiymatining davomidir
            if current_key is not None:
                current_value_lines.append(line)

    # Oxirgi atributni qo'shamiz
    if current_key is not None:
        attributes[current_key] = " ".join(current_value_lines).strip()

    return attributes

def save_to_csv(data, filename):
    if not data:
        print("No data to save.")
        return

    keys = set()
    for item in data:
        keys.update(item.keys())
    keys = sorted(list(keys))

    with open(filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=keys)
        writer.writeheader()
        for row in data:
            writer.writerow(row)

    print(f"✅ Dataset '{filename}' faylga saqlandi.")

if __name__ == "__main__":
    persons = [
    "William Shakespeare", "Christopher Marlowe", "John Milton", "Geoffrey Chaucer", "Dante Alighieri",
    "Homer", "Virgil", "Sappho", "Ovid", "Horace",
    "John Keats", "Percy Bysshe Shelley", "Lord Byron", "William Wordsworth", "Samuel Taylor Coleridge",
    "T.S. Eliot", "Ezra Pound", "Wallace Stevens", "Robert Frost", "Emily Dickinson",
    "Maya Angelou", "Langston Hughes", "Allen Ginsberg", "Sylvia Plath", "Elizabeth Bishop",
    "Sylvia Plath", "Robert Lowell", "Adrienne Rich", "Gwendolyn Brooks", "Billy Collins",
    "Seamus Heaney", "Ted Hughes", "Carol Ann Duffy", "Margaret Atwood", "Louise Glück",
    "Alice Walker", "Toni Morrison", "Maya Angelou", "Zora Neale Hurston", "James Baldwin",
    "Richard Wright", "Ralph Ellison", "Ta-Nehisi Coates", "Chimamanda Ngozi Adichie", "Jhumpa Lahiri",
    "Salman Rushdie", "Haruki Murakami", "Gabriel García Márquez", "Isabel Allende", "Mario Vargas Llosa",
    "Jorge Luis Borges", "Octavio Paz", "Pablo Neruda", "Federico García Lorca", "Alejo Carpentier",
    "Julio Cortázar", "Carlos Fuentes", "Juan Rulfo", "José Martí", "Miguel de Cervantes",
    "Lope de Vega", "Pedro Calderón de la Barca", "Garcilaso de la Vega", "Luis de Góngora", "Francisco de Quevedo",
    "William Faulkner", "Ernest Hemingway", "F. Scott Fitzgerald", "John Steinbeck", "Toni Morrison",
    "Harper Lee", "Mark Twain", "Edgar Allan Poe", "Nathaniel Hawthorne", "Herman Melville",
    "Emily Dickinson", "Walt Whitman", "Ralph Waldo Emerson", "Henry David Thoreau", "Louisa May Alcott",
    "Louisa May Alcott", "Bronte sisters", "Charlotte Bronte", "Emily Bronte", "Anne Bronte",
    "Jane Austen", "Mary Shelley", "Percy Bysshe Shelley", "Lord Byron", "John Keats",
    "Samuel Taylor Coleridge", "William Wordsworth", "William Blake", "Alexander Pope", "Jonathan Swift",
    "John Donne", "Geoffrey Chaucer", "Edmund Spenser", "Ben Jonson", "Christopher Marlowe",
    "Thomas Hardy", "D.H. Lawrence", "E.M. Forster", "Virginia Woolf", "James Joyce",
    "Franz Kafka", "Albert Camus", "Jean-Paul Sartre", "Simone de Beauvoir", "Marcel Proust",
    "Marcel Proust", "Gustave Flaubert", "Victor Hugo", "Émile Zola", "Honoré de Balzac",
    "Alexandre Dumas", "Jules Verne", "H.G. Wells", "J.R.R. Tolkien", "C.S. Lewis",
    "George Orwell", "Aldous Huxley", "Ray Bradbury", "Philip K. Dick", "Isaac Asimov",
    "Arthur C. Clarke", "Robert A. Heinlein", "Ursula K. Le Guin", "Margaret Atwood", "William Gibson",
    "Neil Gaiman", "Terry Pratchett", "Douglas Adams", "J.K. Rowling", "Stephen King",
    "Agatha Christie", "Arthur Conan Doyle", "Edgar Allan Poe", "H.P. Lovecraft", "Raymond Chandler",
    "Dashiell Hammett", "James Ellroy", "Patricia Highsmith", "Gillian Flynn", "Stieg Larsson",
    "Jo Nesbø", "Michael Connelly", "Dennis Lehane", "George R.R. Martin", "Brandon Sanderson"



    ]

    dataset = []

    for idx, person in enumerate(persons, 1):
        print(f"{idx}/{len(persons)} 📥 {person} sahifasini yuklanmoqda...")
        wikitext = get_wikipedia_page(person)
        infobox = extract_infobox(wikitext)
        attributes = extract_attributes(infobox)

        if attributes:
            if "name" not in attributes:
                attributes["name"] = person
            dataset.append(attributes)
            print(f"✅ {person} infoboxi muvaffaqiyatli olindi.")
        else:
            print(f"⚠️ {person} sahifasida infobox topilmadi yoki bo'sh.")

        time.sleep(1)  # API limitlarini cheklash uchun

    save_to_csv(dataset, "historical_figures_dataset4.csv")


1/150 📥 William Shakespeare sahifasini yuklanmoqda...
✅ William Shakespeare infoboxi muvaffaqiyatli olindi.
2/150 📥 Christopher Marlowe sahifasini yuklanmoqda...
✅ Christopher Marlowe infoboxi muvaffaqiyatli olindi.
3/150 📥 John Milton sahifasini yuklanmoqda...
✅ John Milton infoboxi muvaffaqiyatli olindi.
4/150 📥 Geoffrey Chaucer sahifasini yuklanmoqda...
✅ Geoffrey Chaucer infoboxi muvaffaqiyatli olindi.
5/150 📥 Dante Alighieri sahifasini yuklanmoqda...
✅ Dante Alighieri infoboxi muvaffaqiyatli olindi.
6/150 📥 Homer sahifasini yuklanmoqda...
✅ Homer infoboxi muvaffaqiyatli olindi.
7/150 📥 Virgil sahifasini yuklanmoqda...
✅ Virgil infoboxi muvaffaqiyatli olindi.
8/150 📥 Sappho sahifasini yuklanmoqda...
⚠️ Sappho sahifasida infobox topilmadi yoki bo'sh.
9/150 📥 Ovid sahifasini yuklanmoqda...
✅ Ovid infoboxi muvaffaqiyatli olindi.
10/150 📥 Horace sahifasini yuklanmoqda...
✅ Horace infoboxi muvaffaqiyatli olindi.
11/150 📥 John Keats sahifasini yuklanmoqda...
✅ John Keats infoboxi muvaffa

In [59]:
df4 = pd.read_csv(r"C:\Users\Rasulbek907\Desktop\Final_Project\Notebooks\historical_figures_dataset4.csv")
df4.head()
df4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 133 entries, 0 to 132
Data columns (total 100 columns):
 #   Column                                       Non-Null Count  Dtype  
---  ------                                       --------------  -----  
 0   [[presidential medal of freedom]]<ref name   1 non-null      object 
 1   academic_advisors                            1 non-null      object 
 2   alma_mater                                   34 non-null     object 
 3   alt                                          24 non-null     object 
 4   awards                                       26 non-null     object 
 5   baptised                                     3 non-null      object 
 6   birth_date                                   124 non-null    object 
 7   birth_name                                   82 non-null     object 
 8   birth_place                                  126 non-null    object 
 9   burial_place                                 2 non-null      object 
 10  c

In [68]:
important_columns = [
    "name",                 # Ism (asosiy identifikator)
    "birth_date",           # Tug‘ilgan sana
    "birth_place",          # Tug‘ilgan joy
    "death_date",           # Vafot sanasi
    "death_place",          # Vafot joyi
    "nationality",          # Millati
    "occupation",           # Kasbi yoki lavozimi
    "years_active",         # Faoliyat yillari    
    "awards",               # Mukofotlar
    "alma_mater",           # O‘qigan universiteti (o‘quv yurti)
    "education",            # Ta’limi
    "employer",             # Ishlagan tashkilotlari
    "notable_works",        # Mashhur asarlari yoki ishlari    
    "spouse",               # Turmush o‘rtog‘i
    "children",             # Farzandlari
    "parents",              # Ota-onasi
    "religion",             # Dini    
    "genre"                # Janri (san’at yoki adabiyotda)   
]
df_filtered4 = df4[important_columns]
df_filtered4.to_csv(r"C:\Users\Rasulbek907\Desktop\Final_Project\Data\Raw_data\hf_dataset4.csv", index=False, encoding="utf-8")

In [26]:
import requests
import re
import csv
import time

def get_wikipedia_page(title):
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "parse",
        "page": title,
        "format": "json",
        "prop": "wikitext"
    }
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; MyProjectBot/1.0; +https://yourdomain.com)"
    }

    response = requests.get(url, params=params, headers=headers)

    try:
        data = response.json()
    except Exception as e:
        print(f"❌ JSON decode error for {title}: {e}")
        print("Raw response text:", response.text)
        return None

    if 'error' in data:
        print(f"❌ Wikipedia API error for {title}: {data['error']['info']}")
        return None

    return data['parse']['wikitext']['*']

def extract_infobox(wikitext):
    if not wikitext:
        return None
    match = re.search(r"\{\{Infobox(.+?)\n\}\}", wikitext, re.DOTALL)
    if match:
        return match.group(0)
    else:
        match = re.search(r"\{\{Infobox(.+?)\}\}", wikitext, re.DOTALL)
        return match.group(0) if match else None

def extract_attributes(infobox_text):
    if not infobox_text:
        return {}

    attributes = {}
    current_key = None
    current_value_lines = []

    lines = infobox_text.split("\n")
    for line in lines[1:]:  # 1-qator: {{Infobox ... shuning uchun o'tkazamiz
        line = line.strip()
        if line.startswith("|"):
            # Yangi atribut boshlanmoqda, oldingisini saqlaymiz
            if current_key is not None:
                attributes[current_key] = " ".join(current_value_lines).strip()
            parts = line.split("=", 1)
            if len(parts) == 2:
                current_key = parts[0].strip("| ").lower()
                current_value_lines = [parts[1].strip()]
            else:
                current_key = None
                current_value_lines = []
        else:
            # Agar chiziq | bilan boshlanmasa, oldingi key qiymatining davomidir
            if current_key is not None:
                current_value_lines.append(line)

    # Oxirgi atributni qo'shamiz
    if current_key is not None:
        attributes[current_key] = " ".join(current_value_lines).strip()

    return attributes

def save_to_csv(data, filename):
    if not data:
        print("No data to save.")
        return

    keys = set()
    for item in data:
        keys.update(item.keys())
    keys = sorted(list(keys))

    with open(filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=keys)
        writer.writeheader()
        for row in data:
            writer.writerow(row)

    print(f"✅ Dataset '{filename}' faylga saqlandi.")

if __name__ == "__main__":
    persons = [
    "Barack Obama", "Joe Biden", "Donald Trump", "George W. Bush", "Bill Clinton",
    "George H.W. Bush", "Ronald Reagan", "Jimmy Carter", "Richard Nixon", "Lyndon B. Johnson",
    "John F. Kennedy", "Dwight D. Eisenhower", "Harry S. Truman", "Franklin D. Roosevelt", "Woodrow Wilson",
    "Theodore Roosevelt", "Abraham Lincoln", "Thomas Jefferson", "Benjamin Franklin", "John Adams",
    "James Madison", "James Monroe", "Andrew Jackson", "Martin Van Buren", "William Henry Harrison",
    "John Tyler", "James K. Polk", "Zachary Taylor", "Millard Fillmore", "Franklin Pierce",
    "James Buchanan", "Andrew Johnson", "Ulysses S. Grant", "Rutherford B. Hayes", "James A. Garfield",
    "Chester A. Arthur", "Grover Cleveland", "Benjamin Harrison", "Grover Cleveland", "William McKinley",
    "Theodore Roosevelt", "William Howard Taft", "Woodrow Wilson", "Warren G. Harding", "Calvin Coolidge",
    "Herbert Hoover", "Franklin D. Roosevelt", "Harry S. Truman", "Dwight D. Eisenhower", "John F. Kennedy",
    "Lyndon B. Johnson", "Richard Nixon", "Gerald Ford", "Jimmy Carter", "Ronald Reagan",
    "George H.W. Bush", "Bill Clinton", "George W. Bush", "Barack Obama", "Donald Trump",
    "Angela Merkel", "Margaret Thatcher", "Indira Gandhi", "Golda Meir", "Jacinda Ardern",
    "Sukarno", "Napoleon Bonaparte", "Julius Caesar", "Alexander the Great", "Genghis Khan",
    "Attila the Hun", "Charlemagne", "Otto von Bismarck", "Queen Elizabeth II", "Catherine the Great",
    "Marie Antoinette", "Cleopatra", "Joan of Arc", "Empress Wu Zetian", "Emperor Meiji",
    "Peter the Great", "Ivan the Terrible", "Ivan IV of Russia", "Harold Godwinson", "William the Conqueror",
    "Henry VIII", "Elizabeth I", "Mary Queen of Scots", "Queen Victoria", "Isabella I of Castile",
    "Louis XIV", "Frederick the Great", "Joseph Stalin", "Vladimir Lenin", "Leon Trotsky",
    "Mikhail Gorbachev", "Boris Yeltsin", "Vladimir Putin", "Kim Il-sung", "Kim Jong-il",
    "Kim Jong-un", "Fidel Castro", "Che Guevara", "Ho Chi Minh", "Pol Pot",
    "Mao Zedong", "Deng Xiaoping", "Sun Yat-sen", "Chiang Kai-shek", "Shinzo Abe",
    "Emperor Hirohito", "Akihito", "Narendra Modi", "Yitzhak Rabin", "Benjamin Netanyahu",
    "Golda Meir", "Menachem Begin", "Shimon Peres", "Mahmoud Abbas", "Hassan Nasrallah",
    "Bashar al-Assad", "Recep Tayyip Erdoğan", "Abdel Fattah el-Sisi", "Mohammed bin Salman", "Sheikh Hasina",
    "Imran Khan", "Aung San Suu Kyi", "Jacinda Ardern", "Sanna Marin", "Sviatlana Tsikhanouskaya",
    "Volodymyr Zelensky", "Justin Trudeau", "Emmanuel Macron", "Boris Johnson", "Theresa May",
    "Nicola Sturgeon", "Alex Salmond", "David Cameron", "Gordon Brown", "Tony Blair",
    "Margaret Thatcher", "John Major", "Harold Wilson", "Winston Churchill", "Clement Attlee",
    "Neville Chamberlain", "Stanley Baldwin", "Bonar Law", "Andrew Bonar Law", "James Callaghan",
    "Edward Heath", "Harold Macmillan", "Alec Douglas-Home", "Anthony Eden", "Neville Chamberlain",
    "Sir Robert Walpole", "Benjamin Disraeli", "William Gladstone", "David Lloyd George", "Herbert Asquith",
    "H. H. Asquith", "Winston Churchill", "Clement Attlee", "Harold Wilson", "Edward Heath",
    "James Callaghan", "Margaret Thatcher", "John Major", "Tony Blair", "Gordon Brown"

    ]

    dataset = []

    for idx, person in enumerate(persons, 1):
        print(f"{idx}/{len(persons)} 📥 {person} sahifasini yuklanmoqda...")
        wikitext = get_wikipedia_page(person)
        infobox = extract_infobox(wikitext)
        attributes = extract_attributes(infobox)

        if attributes:
            if "name" not in attributes:
                attributes["name"] = person
            dataset.append(attributes)
            print(f"✅ {person} infoboxi muvaffaqiyatli olindi.")
        else:
            print(f"⚠️ {person} sahifasida infobox topilmadi yoki bo'sh.")

        time.sleep(1)  # API limitlarini cheklash uchun

    save_to_csv(dataset, "historical_figures_dataset5.csv")


1/170 📥 Barack Obama sahifasini yuklanmoqda...
✅ Barack Obama infoboxi muvaffaqiyatli olindi.
2/170 📥 Joe Biden sahifasini yuklanmoqda...
✅ Joe Biden infoboxi muvaffaqiyatli olindi.
3/170 📥 Donald Trump sahifasini yuklanmoqda...
⚠️ Donald Trump sahifasida infobox topilmadi yoki bo'sh.
4/170 📥 George W. Bush sahifasini yuklanmoqda...
✅ George W. Bush infoboxi muvaffaqiyatli olindi.
5/170 📥 Bill Clinton sahifasini yuklanmoqda...
✅ Bill Clinton infoboxi muvaffaqiyatli olindi.
6/170 📥 George H.W. Bush sahifasini yuklanmoqda...
⚠️ George H.W. Bush sahifasida infobox topilmadi yoki bo'sh.
7/170 📥 Ronald Reagan sahifasini yuklanmoqda...
✅ Ronald Reagan infoboxi muvaffaqiyatli olindi.
8/170 📥 Jimmy Carter sahifasini yuklanmoqda...
✅ Jimmy Carter infoboxi muvaffaqiyatli olindi.
9/170 📥 Richard Nixon sahifasini yuklanmoqda...
✅ Richard Nixon infoboxi muvaffaqiyatli olindi.
10/170 📥 Lyndon B. Johnson sahifasini yuklanmoqda...
✅ Lyndon B. Johnson infoboxi muvaffaqiyatli olindi.
11/170 📥 John F. Ke

In [61]:
df5 = pd.read_csv(r"C:\Users\Rasulbek907\Desktop\Final_Project\Notebooks\historical_figures_dataset5.csv")
df5.head()
df5.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151 entries, 0 to 150
Columns: 461 entries, 100s/50s1 to {{tlit|zh|t. v. soong|i
dtypes: float64(18), object(443)
memory usage: 544.0+ KB


In [67]:
important_columns = [
    "name",                 # Ism (asosiy identifikator)
    "birth_date",           # Tug‘ilgan sana
    "birth_place",          # Tug‘ilgan joy
    "death_date",           # Vafot sanasi
    "death_place",          # Vafot joyi
    "nationality",          # Millati
    "occupation",           # Kasbi yoki lavozimi
    "years_active",         # Faoliyat yillari
    "known_for",            # Mashhurlikka sabab bo‘lgan ishlari
    "awards",               # Mukofotlar
    "alma_mater",           # O‘qigan universiteti (o‘quv yurti)
    "education",            # Ta’limi       
    "spouse",               # Turmush o‘rtog‘i
    "children",             # Farzandlari
    "parents",              # Ota-onasi
    "religion"             # Dini   
    
]
df_filtered5 = df5[important_columns]
df_filtered5.to_csv(r"C:\Users\Rasulbek907\Desktop\Final_Project\Data\Raw_data\hf_dataset5.csv", index=False, encoding="utf-8")

In [27]:
import requests
import re
import csv
import time

def get_wikipedia_page(title):
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "parse",
        "page": title,
        "format": "json",
        "prop": "wikitext"
    }
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; MyProjectBot/1.0; +https://yourdomain.com)"
    }

    response = requests.get(url, params=params, headers=headers)

    try:
        data = response.json()
    except Exception as e:
        print(f"❌ JSON decode error for {title}: {e}")
        print("Raw response text:", response.text)
        return None

    if 'error' in data:
        print(f"❌ Wikipedia API error for {title}: {data['error']['info']}")
        return None

    return data['parse']['wikitext']['*']

def extract_infobox(wikitext):
    if not wikitext:
        return None
    match = re.search(r"\{\{Infobox(.+?)\n\}\}", wikitext, re.DOTALL)
    if match:
        return match.group(0)
    else:
        match = re.search(r"\{\{Infobox(.+?)\}\}", wikitext, re.DOTALL)
        return match.group(0) if match else None

def extract_attributes(infobox_text):
    if not infobox_text:
        return {}

    attributes = {}
    current_key = None
    current_value_lines = []

    lines = infobox_text.split("\n")
    for line in lines[1:]:  # 1-qator: {{Infobox ... shuning uchun o'tkazamiz
        line = line.strip()
        if line.startswith("|"):
            # Yangi atribut boshlanmoqda, oldingisini saqlaymiz
            if current_key is not None:
                attributes[current_key] = " ".join(current_value_lines).strip()
            parts = line.split("=", 1)
            if len(parts) == 2:
                current_key = parts[0].strip("| ").lower()
                current_value_lines = [parts[1].strip()]
            else:
                current_key = None
                current_value_lines = []
        else:
            # Agar chiziq | bilan boshlanmasa, oldingi key qiymatining davomidir
            if current_key is not None:
                current_value_lines.append(line)

    # Oxirgi atributni qo'shamiz
    if current_key is not None:
        attributes[current_key] = " ".join(current_value_lines).strip()

    return attributes

def save_to_csv(data, filename):
    if not data:
        print("No data to save.")
        return

    keys = set()
    for item in data:
        keys.update(item.keys())
    keys = sorted(list(keys))

    with open(filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=keys)
        writer.writeheader()
        for row in data:
            writer.writerow(row)

    print(f"✅ Dataset '{filename}' faylga saqlandi.")

if __name__ == "__main__":
    persons = [
    "Leonardo da Vinci", "Michelangelo", "Raphael", "Donatello", "Sandro Botticelli",
    "Caravaggio", "Titian", "Peter Paul Rubens", "Rembrandt", "Johannes Vermeer",
    "Claude Monet", "Edgar Degas", "Paul Cézanne", "Vincent van Gogh", "Pablo Picasso",
    "Salvador Dalí", "Henri Matisse", "Jackson Pollock", "Andy Warhol", "Frida Kahlo",
    "Georgia O'Keeffe", "Mark Rothko", "Wassily Kandinsky", "Joan Miró", "Paul Klee",
    "Diego Rivera", "Fernando Botero", "Jean-Michel Basquiat", "Banksy", "Yayoi Kusama",
    "Ai Weiwei", "Takashi Murakami", "Jeff Koons", "Damien Hirst", "Anish Kapoor",
    "Antoni Gaudí", "Frank Lloyd Wright", "Le Corbusier", "Zaha Hadid", "Norman Foster",
    "I.M. Pei", "Renzo Piano", "Oscar Niemeyer", "Ludwig Mies van der Rohe", "Louis Sullivan",
    "Walter Gropius", "Eero Saarinen", "Philip Johnson", "Richard Meier", "Santiago Calatrava",
    "Frank Gehry", "Bjarke Ingels", "Rem Koolhaas", "Tadao Ando", "Kengo Kuma",
    "Arata Isozaki", "Kazuyo Sejima", "Shigeru Ban", "Sou Fujimoto", "Kisho Kurokawa",
    "Alvar Aalto", "Eero Saarinen", "Gerrit Rietveld", "Le Corbusier", "Mies van der Rohe",
    "Carlo Scarpa", "Renzo Piano", "Richard Rogers", "Norman Foster", "Zaha Hadid",
    "Oscar Niemeyer", "Frank Gehry", "Toyo Ito", "Kengo Kuma", "Shigeru Ban",
    "Tadao Ando", "Kazuyo Sejima", "Arata Isozaki", "Kisho Kurokawa", "Bjarke Ingels",
    "Antoni Gaudí", "Le Corbusier", "Frank Lloyd Wright", "Louis Sullivan", "Eero Saarinen",
    "Philip Johnson", "Richard Meier", "Santiago Calatrava", "Rem Koolhaas", "Norman Foster",
    "Zaha Hadid", "Frank Gehry", "Toyo Ito", "Kengo Kuma", "Shigeru Ban",
    "Tadao Ando", "Kazuyo Sejima", "Arata Isozaki", "Kisho Kurokawa", "Bjarke Ingels",
    "Gustav Klimt", "Egons Schiele", "Oskar Kokoschka", "Alphonse Mucha", "Henri de Toulouse-Lautrec",
    "Édouard Manet", "Pierre-Auguste Renoir", "Camille Pissarro", "Paul Gauguin", "Georges Seurat",
    "Henri Rousseau", "Marc Chagall", "Franz Marc", "Wassily Kandinsky", "Paul Klee"

    ]

    dataset = []

    for idx, person in enumerate(persons, 1):
        print(f"{idx}/{len(persons)} 📥 {person} sahifasini yuklanmoqda...")
        wikitext = get_wikipedia_page(person)
        infobox = extract_infobox(wikitext)
        attributes = extract_attributes(infobox)

        if attributes:
            if "name" not in attributes:
                attributes["name"] = person
            dataset.append(attributes)
            print(f"✅ {person} infoboxi muvaffaqiyatli olindi.")
        else:
            print(f"⚠️ {person} sahifasida infobox topilmadi yoki bo'sh.")

        time.sleep(1)  # API limitlarini cheklash uchun

    save_to_csv(dataset, "historical_figures_dataset6.csv")


1/115 📥 Leonardo da Vinci sahifasini yuklanmoqda...
✅ Leonardo da Vinci infoboxi muvaffaqiyatli olindi.
2/115 📥 Michelangelo sahifasini yuklanmoqda...
✅ Michelangelo infoboxi muvaffaqiyatli olindi.
3/115 📥 Raphael sahifasini yuklanmoqda...
✅ Raphael infoboxi muvaffaqiyatli olindi.
4/115 📥 Donatello sahifasini yuklanmoqda...
✅ Donatello infoboxi muvaffaqiyatli olindi.
5/115 📥 Sandro Botticelli sahifasini yuklanmoqda...
✅ Sandro Botticelli infoboxi muvaffaqiyatli olindi.
6/115 📥 Caravaggio sahifasini yuklanmoqda...
✅ Caravaggio infoboxi muvaffaqiyatli olindi.
7/115 📥 Titian sahifasini yuklanmoqda...
✅ Titian infoboxi muvaffaqiyatli olindi.
8/115 📥 Peter Paul Rubens sahifasini yuklanmoqda...
✅ Peter Paul Rubens infoboxi muvaffaqiyatli olindi.
9/115 📥 Rembrandt sahifasini yuklanmoqda...
✅ Rembrandt infoboxi muvaffaqiyatli olindi.
10/115 📥 Johannes Vermeer sahifasini yuklanmoqda...
✅ Johannes Vermeer infoboxi muvaffaqiyatli olindi.
11/115 📥 Claude Monet sahifasini yuklanmoqda...
✅ Claude Mo

In [63]:
df6 = pd.read_csv(r"C:\Users\Rasulbek907\Desktop\Final_Project\Notebooks\historical_figures_dataset6.csv")
df6.head()
df6.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 111 entries, 0 to 110
Columns: 105 entries, access-date to years_active
dtypes: float64(19), object(86)
memory usage: 91.2+ KB


In [65]:
important_columns = [
    "name",                 # Ism (asosiy identifikator)
    "birth_date",           # Tug‘ilgan sana
    "birth_place",          # Tug‘ilgan joy
    "death_date",           # Vafot sanasi
    "death_place",          # Vafot joyi
    "nationality",          # Millati
    "occupation",           # Kasbi yoki lavozimi
    "years_active",         # Faoliyat yillari
    "known_for",            # Mashhurlikka sabab bo‘lgan ishlari
    "awards",               # Mukofotlar
    "alma_mater",           # O‘qigan universiteti (o‘quv yurti)
    "education",            # Ta’limi    
    "notable_works",        # Mashhur asarlari yoki ishlari
    "field",                # Soha (ilmiy, san’at, siyosat va h.k.)    
    "spouse",               # Turmush o‘rtog‘i
    "children",             # Farzandlari
    "parents"              # Ota-onasi
]
df_filtered6 = df6[important_columns]
df_filtered6.to_csv(r"C:\Users\Rasulbek907\Desktop\Final_Project\Data\Raw_data\hf_dataset6.csv", index=False, encoding="utf-8")

In [28]:
import requests
import re
import csv
import time

def get_wikipedia_page(title):
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "parse",
        "page": title,
        "format": "json",
        "prop": "wikitext"
    }
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; MyProjectBot/1.0; +https://yourdomain.com)"
    }

    response = requests.get(url, params=params, headers=headers)

    try:
        data = response.json()
    except Exception as e:
        print(f"❌ JSON decode error for {title}: {e}")
        print("Raw response text:", response.text)
        return None

    if 'error' in data:
        print(f"❌ Wikipedia API error for {title}: {data['error']['info']}")
        return None

    return data['parse']['wikitext']['*']

def extract_infobox(wikitext):
    if not wikitext:
        return None
    match = re.search(r"\{\{Infobox(.+?)\n\}\}", wikitext, re.DOTALL)
    if match:
        return match.group(0)
    else:
        match = re.search(r"\{\{Infobox(.+?)\}\}", wikitext, re.DOTALL)
        return match.group(0) if match else None

def extract_attributes(infobox_text):
    if not infobox_text:
        return {}

    attributes = {}
    current_key = None
    current_value_lines = []

    lines = infobox_text.split("\n")
    for line in lines[1:]:  # 1-qator: {{Infobox ... shuning uchun o'tkazamiz
        line = line.strip()
        if line.startswith("|"):
            # Yangi atribut boshlanmoqda, oldingisini saqlaymiz
            if current_key is not None:
                attributes[current_key] = " ".join(current_value_lines).strip()
            parts = line.split("=", 1)
            if len(parts) == 2:
                current_key = parts[0].strip("| ").lower()
                current_value_lines = [parts[1].strip()]
            else:
                current_key = None
                current_value_lines = []
        else:
            # Agar chiziq | bilan boshlanmasa, oldingi key qiymatining davomidir
            if current_key is not None:
                current_value_lines.append(line)

    # Oxirgi atributni qo'shamiz
    if current_key is not None:
        attributes[current_key] = " ".join(current_value_lines).strip()

    return attributes

def save_to_csv(data, filename):
    if not data:
        print("No data to save.")
        return

    keys = set()
    for item in data:
        keys.update(item.keys())
    keys = sorted(list(keys))

    with open(filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=keys)
        writer.writeheader()
        for row in data:
            writer.writerow(row)

    print(f"✅ Dataset '{filename}' faylga saqlandi.")

if __name__ == "__main__":
    persons = [
    "Alan Turing", "Marie Curie", "Nikola Tesla", "Stephen Hawking", "Richard Feynman",
    "Rosalind Franklin", "Galileo Galilei", "James Clerk Maxwell", "Dmitri Mendeleev", "Gregor Mendel",
    "Louis Pasteur", "Alexander Fleming", "Michael Faraday", "Johannes Kepler", "Carl Sagan",
    "Benjamin Franklin", "George Washington Carver", "John von Neumann", "Emmy Noether", "Grace Hopper",
    "Chien-Shiung Wu", "Lise Meitner", "Jane Goodall", "Margaret Mead", "Carl Linnaeus",
    "Alfred Nobel", "Andrei Sakharov", "Werner Heisenberg", "Erwin Schrödinger", "Niels Henrik Abel",
    "Kurt Gödel", "Florence Nightingale", "Mary Anning", "Catherine the Great", "Cleopatra",
    "Queen Victoria", "Joan of Arc", "Elizabeth I", "Marie Antoinette", "Simone de Beauvoir",
    "Virginia Woolf", "Frida Kahlo", "Georgia O'Keeffe", "Amelia Earhart", "Harriet Tubman",
    "Sojourner Truth", "Malala Yousafzai", "Mother Teresa", "Eleanor Roosevelt", "Margaret Thatcher",
    "Indira Gandhi", "Angela Merkel", "Sappho", "Homer", "Virgil",
    "Dante Alighieri", "Geoffrey Chaucer", "Miguel de Cervantes", "William Shakespeare", "Johann Sebastian Bach",
    "Ludwig van Beethoven", "Wolfgang Amadeus Mozart", "Franz Schubert", "Pyotr Ilyich Tchaikovsky", "Igor Stravinsky",
    "Claude Debussy", "Johannes Brahms", "Richard Wagner", "Gustav Mahler", "Felix Mendelssohn",
    "Sergei Rachmaninoff", "Antonio Vivaldi", "Johann Strauss II", "Maurice Ravel", "Giacomo Puccini",
    "Gustav Holst", "Aaron Copland", "Duke Ellington", "Ella Fitzgerald", "Louis Armstrong",
    "Billie Holiday", "Charlie Parker", "John Coltrane", "Dmitri Shostakovich", "Leonard Bernstein",
    "Miles Davis", "Thelonious Monk", "Charles Mingus", "Art Tatum", "Count Basie",
    "Herbie Hancock", "Wynton Marsalis", "Dizzy Gillespie", "Benny Goodman", "Glenn Miller",
    "George Gershwin", "Cole Porter", "Irving Berlin", "Stephen Foster", "Scott Joplin",
    "Bach", "Haydn", "Handel", "Mozart", "Beethoven",
    "Schubert", "Schumann", "Chopin", "Liszt", "Brahms",
    "Tchaikovsky", "Rachmaninoff", "Debussy", "Ravel", "Stravinsky",
    "Bartók", "Shostakovich", "Prokofiev", "Sibelius", "Vaughan Williams",
    "Copland", "Bernstein", "Glass", "Reich", "Adams",
    "Boulez", "Stockhausen", "Ligeti", "Messiaen", "Penderecki",
    "John Cage", "Philip Glass", "Arvo Pärt", "Kaija Saariaho", "Olga Neuwirth",
    "Anna Thorvaldsdottir", "Tania León", "Jennifer Higdon", "Missy Mazzoli", "Julia Wolfe",
    "Florence Price", "Fanny Mendelssohn", "Clara Schumann", "Ethel Smyth", "Amy Beach",
    "Lili Boulanger", "Germaine Tailleferre", "Nadia Boulanger", "Louise Farrenc", "Cécile Chaminade",
    "Pauline Viardot", "Teresa Carreño", "Margaret Bonds", "Undine Smith Moore", "William Grant Still",
    "Scott Joplin", "W.C. Handy", "James P. Johnson", "Jelly Roll Morton", "Fats Waller",
    "Duke Ellington", "Count Basie", "Benny Goodman", "Glenn Miller", "Charlie Parker",
    "Dizzy Gillespie", "Thelonious Monk", "Miles Davis", "John Coltrane", "Ornette Coleman",
    "Charles Mingus", "Art Blakey", "Herbie Hancock", "Wayne Shorter", "Chick Corea",
    "Keith Jarrett", "Pat Metheny", "Kamasi Washington", "Snarky Puppy"

    ]

    dataset = []

    for idx, person in enumerate(persons, 1):
        print(f"{idx}/{len(persons)} 📥 {person} sahifasini yuklanmoqda...")
        wikitext = get_wikipedia_page(person)
        infobox = extract_infobox(wikitext)
        attributes = extract_attributes(infobox)

        if attributes:
            if "name" not in attributes:
                attributes["name"] = person
            dataset.append(attributes)
            print(f"✅ {person} infoboxi muvaffaqiyatli olindi.")
        else:
            print(f"⚠️ {person} sahifasida infobox topilmadi yoki bo'sh.")

        time.sleep(1)  # API limitlarini cheklash uchun

    save_to_csv(dataset, "historical_figures_dataset7.csv")


1/179 📥 Alan Turing sahifasini yuklanmoqda...
✅ Alan Turing infoboxi muvaffaqiyatli olindi.
2/179 📥 Marie Curie sahifasini yuklanmoqda...
✅ Marie Curie infoboxi muvaffaqiyatli olindi.
3/179 📥 Nikola Tesla sahifasini yuklanmoqda...
✅ Nikola Tesla infoboxi muvaffaqiyatli olindi.
4/179 📥 Stephen Hawking sahifasini yuklanmoqda...
✅ Stephen Hawking infoboxi muvaffaqiyatli olindi.
5/179 📥 Richard Feynman sahifasini yuklanmoqda...
✅ Richard Feynman infoboxi muvaffaqiyatli olindi.
6/179 📥 Rosalind Franklin sahifasini yuklanmoqda...
✅ Rosalind Franklin infoboxi muvaffaqiyatli olindi.
7/179 📥 Galileo Galilei sahifasini yuklanmoqda...
✅ Galileo Galilei infoboxi muvaffaqiyatli olindi.
8/179 📥 James Clerk Maxwell sahifasini yuklanmoqda...
✅ James Clerk Maxwell infoboxi muvaffaqiyatli olindi.
9/179 📥 Dmitri Mendeleev sahifasini yuklanmoqda...
✅ Dmitri Mendeleev infoboxi muvaffaqiyatli olindi.
10/179 📥 Gregor Mendel sahifasini yuklanmoqda...
✅ Gregor Mendel infoboxi muvaffaqiyatli olindi.
11/179 📥 Lo

In [70]:
df7 = pd.read_csv(r"C:\Users\Rasulbek907\Desktop\Final_Project\Notebooks\historical_figures_dataset7.csv")
df7.head()
df7.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 137 entries, 0 to 136
Columns: 297 entries, 1blankname to {{marriage|elaine mason|16 september 1995|2006|end
dtypes: float64(15), object(282)
memory usage: 318.0+ KB


In [71]:
important_columns = [
    "name",                 # Ism (asosiy identifikator)
    "birth_date",           # Tug‘ilgan sana
    "birth_place",          # Tug‘ilgan joy
    "death_date",           # Vafot sanasi
    "death_place",          # Vafot joyi
    "nationality",          # Millati
    "occupation",           # Kasbi yoki lavozimi
    "years_active",         # Faoliyat yillari
    "known_for",            # Mashhurlikka sabab bo‘lgan ishlari
    "awards",               # Mukofotlar
    "alma_mater",           # O‘qigan universiteti (o‘quv yurti)
    "education",            # Ta’limi
    "employer",             # Ishlagan tashkilotlari
    "notable_works",        # Mashhur asarlari yoki ishlari
    "field",                # Soha (ilmiy, san’at, siyosat va h.k.)    
    "spouse",               # Turmush o‘rtog‘i
    "children",             # Farzandlari
    "parents",              # Ota-onasi
    "religion",             # Dini    
    "genre",                # Janri (san’at yoki adabiyotda)   
]
df_filtered7 = df7[important_columns]
df_filtered7.to_csv(r"C:\Users\Rasulbek907\Desktop\Final_Project\Data\Raw_data\hf_dataset7.csv", index=False, encoding="utf-8")

In [29]:
import requests
import re
import csv
import time

def get_wikipedia_page(title):
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "parse",
        "page": title,
        "format": "json",
        "prop": "wikitext"
    }
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; MyProjectBot/1.0; +https://yourdomain.com)"
    }

    response = requests.get(url, params=params, headers=headers)

    try:
        data = response.json()
    except Exception as e:
        print(f"❌ JSON decode error for {title}: {e}")
        print("Raw response text:", response.text)
        return None

    if 'error' in data:
        print(f"❌ Wikipedia API error for {title}: {data['error']['info']}")
        return None

    return data['parse']['wikitext']['*']

def extract_infobox(wikitext):
    if not wikitext:
        return None
    match = re.search(r"\{\{Infobox(.+?)\n\}\}", wikitext, re.DOTALL)
    if match:
        return match.group(0)
    else:
        match = re.search(r"\{\{Infobox(.+?)\}\}", wikitext, re.DOTALL)
        return match.group(0) if match else None

def extract_attributes(infobox_text):
    if not infobox_text:
        return {}

    attributes = {}
    current_key = None
    current_value_lines = []

    lines = infobox_text.split("\n")
    for line in lines[1:]:  # 1-qator: {{Infobox ... shuning uchun o'tkazamiz
        line = line.strip()
        if line.startswith("|"):
            # Yangi atribut boshlanmoqda, oldingisini saqlaymiz
            if current_key is not None:
                attributes[current_key] = " ".join(current_value_lines).strip()
            parts = line.split("=", 1)
            if len(parts) == 2:
                current_key = parts[0].strip("| ").lower()
                current_value_lines = [parts[1].strip()]
            else:
                current_key = None
                current_value_lines = []
        else:
            # Agar chiziq | bilan boshlanmasa, oldingi key qiymatining davomidir
            if current_key is not None:
                current_value_lines.append(line)

    # Oxirgi atributni qo'shamiz
    if current_key is not None:
        attributes[current_key] = " ".join(current_value_lines).strip()

    return attributes

def save_to_csv(data, filename):
    if not data:
        print("No data to save.")
        return

    keys = set()
    for item in data:
        keys.update(item.keys())
    keys = sorted(list(keys))

    with open(filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=keys)
        writer.writeheader()
        for row in data:
            writer.writerow(row)

    print(f"✅ Dataset '{filename}' faylga saqlandi.")

if __name__ == "__main__":
    persons = [
    "Leonardo da Vinci", "Michelangelo", "Raphael", "Donatello", "Sandro Botticelli",
    "Caravaggio", "Titian", "Peter Paul Rubens", "Rembrandt", "Johannes Vermeer",
    "Claude Monet", "Edgar Degas", "Paul Cézanne", "Vincent van Gogh", "Pablo Picasso",
    "Salvador Dalí", "Henri Matisse", "Jackson Pollock", "Andy Warhol", "Mark Rothko",
    "Wassily Kandinsky", "Joan Miró", "Paul Klee", "Diego Rivera", "Fernando Botero",
    "Jean-Michel Basquiat", "Banksy", "Yayoi Kusama", "Ai Weiwei", "Takashi Murakami",
    "Jeff Koons", "Damien Hirst", "Anish Kapoor", "Antoni Gaudí", "Frank Lloyd Wright",
    "Le Corbusier", "Zaha Hadid", "Norman Foster", "I.M. Pei", "Renzo Piano",
    "Oscar Niemeyer", "Ludwig Mies van der Rohe", "Louis Sullivan", "Walter Gropius", "Eero Saarinen",
    "Philip Johnson", "Richard Meier", "Santiago Calatrava", "Frank Gehry", "Bjarke Ingels",
    "Rem Koolhaas", "Tadao Ando", "Kengo Kuma", "Arata Isozaki", "Kazuyo Sejima",
    "Shigeru Ban", "Sou Fujimoto", "Kisho Kurokawa", "Alvar Aalto", "Gerrit Rietveld",
    "Carlo Scarpa", "Richard Rogers", "Toyo Ito", "Gustav Klimt", "Egons Schiele",
    "Oskar Kokoschka", "Alphonse Mucha", "Henri de Toulouse-Lautrec", "Édouard Manet", "Pierre-Auguste Renoir",
    "Camille Pissarro", "Paul Gauguin", "Georges Seurat", "Henri Rousseau", "Marc Chagall",
    "Franz Marc", "Paul Klee", "Balthus", "Edward Hopper", "Georgia O'Keeffe",
    "John Singer Sargent", "Winslow Homer", "Mary Cassatt", "J.M.W. Turner", "Thomas Gainsborough",
    "Jean-Auguste-Dominique Ingres", "Gustave Courbet", "Eugène Delacroix", "Élisabeth Vigée Le Brun", "Jean-Baptiste-Camille Corot",
    "Thomas Eakins", "Albrecht Dürer", "Hans Holbein the Younger", "Hieronymus Bosch", "Pieter Bruegel the Elder",
    "Jan van Eyck", "Raphael", "Michelangelo", "Donatello", "Leonardo da Vinci",
    "Francisco Goya", "Diego Velázquez", "El Greco", "Caravaggio", "Rembrandt",
    "Johannes Vermeer", "Caspar David Friedrich", "Gustave Moreau", "William-Adolphe Bouguereau", "Jean-Léon Gérôme",
    "Paul Signac", "Georges Braque", "Marc Chagall", "Jean Dubuffet", "Robert Rauschenberg",
    "Frank Stella", "Jasper Johns", "Roy Lichtenstein", "Claes Oldenburg", "David Hockney",
    "Bridget Riley", "Yayoi Kusama", "Takashi Murakami", "Jeff Koons", "Damien Hirst",
    "Anish Kapoor", "Richard Serra", "Antoni Gaudí", "Zaha Hadid", "Norman Foster",
    "Frank Gehry", "Bjarke Ingels", "Rem Koolhaas", "Tadao Ando", "Kengo Kuma",
    "Arata Isozaki", "Kazuyo Sejima", "Shigeru Ban", "Sou Fujimoto", "Kisho Kurokawa"

    ]

    dataset = []

    for idx, person in enumerate(persons, 1):
        print(f"{idx}/{len(persons)} 📥 {person} sahifasini yuklanmoqda...")
        wikitext = get_wikipedia_page(person)
        infobox = extract_infobox(wikitext)
        attributes = extract_attributes(infobox)

        if attributes:
            if "name" not in attributes:
                attributes["name"] = person
            dataset.append(attributes)
            print(f"✅ {person} infoboxi muvaffaqiyatli olindi.")
        else:
            print(f"⚠️ {person} sahifasida infobox topilmadi yoki bo'sh.")

        time.sleep(1)  # API limitlarini cheklash uchun

    save_to_csv(dataset, "historical_figures_dataset8.csv")


1/140 📥 Leonardo da Vinci sahifasini yuklanmoqda...
✅ Leonardo da Vinci infoboxi muvaffaqiyatli olindi.
2/140 📥 Michelangelo sahifasini yuklanmoqda...
✅ Michelangelo infoboxi muvaffaqiyatli olindi.
3/140 📥 Raphael sahifasini yuklanmoqda...
✅ Raphael infoboxi muvaffaqiyatli olindi.
4/140 📥 Donatello sahifasini yuklanmoqda...
✅ Donatello infoboxi muvaffaqiyatli olindi.
5/140 📥 Sandro Botticelli sahifasini yuklanmoqda...
✅ Sandro Botticelli infoboxi muvaffaqiyatli olindi.
6/140 📥 Caravaggio sahifasini yuklanmoqda...
✅ Caravaggio infoboxi muvaffaqiyatli olindi.
7/140 📥 Titian sahifasini yuklanmoqda...
✅ Titian infoboxi muvaffaqiyatli olindi.
8/140 📥 Peter Paul Rubens sahifasini yuklanmoqda...
✅ Peter Paul Rubens infoboxi muvaffaqiyatli olindi.
9/140 📥 Rembrandt sahifasini yuklanmoqda...
✅ Rembrandt infoboxi muvaffaqiyatli olindi.
10/140 📥 Johannes Vermeer sahifasini yuklanmoqda...
✅ Johannes Vermeer infoboxi muvaffaqiyatli olindi.
11/140 📥 Claude Monet sahifasini yuklanmoqda...
✅ Claude Mo

In [72]:
df8 = pd.read_csv(r"C:\Users\Rasulbek907\Desktop\Final_Project\Notebooks\historical_figures_dataset8.csv")
df8.head()
df8.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 135 entries, 0 to 134
Columns: 110 entries, [[parsons school of design]]<ref>{{cite web |url to {{marriage|isabel wilson|1949|1965|end
dtypes: float64(21), object(89)
memory usage: 116.1+ KB


In [101]:
important_columns = [
    "name",                 # Ism (asosiy identifikator)
    "birth_date",           # Tug‘ilgan sana
    "birth_place",          # Tug‘ilgan joy
    "death_date",           # Vafot sanasi
    "death_place",          # Vafot joyi
    "nationality",          # Millati
    "occupation",           # Kasbi yoki lavozimi
    "years_active",         # Faoliyat yillari
    "known_for",            # Mashhurlikka sabab bo‘lgan ishlari
    "awards",               # Mukofotlar
    "alma_mater",           # O‘qigan universiteti (o‘quv yurti)
    "education",            # Ta’limi
   
    "notable_works",        # Mashhur asarlari yoki ishlari
    "field",                # Soha (ilmiy, san’at, siyosat va h.k.)    
    "spouse",               # Turmush o‘rtog‘i
    "children",             # Farzandlari
    "parents"              # Ota-onasi
   
   
]
df_filtered8 = df8[important_columns]
df_filtered8.to_csv(r"C:\Users\Rasulbek907\Desktop\Final_Project\Data\Raw_data\hf_dataset8.csv", index=False, encoding="utf-8")

In [30]:
import requests
import re
import csv
import time

def get_wikipedia_page(title):
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "parse",
        "page": title,
        "format": "json",
        "prop": "wikitext"
    }
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; MyProjectBot/1.0; +https://yourdomain.com)"
    }

    response = requests.get(url, params=params, headers=headers)

    try:
        data = response.json()
    except Exception as e:
        print(f"❌ JSON decode error for {title}: {e}")
        print("Raw response text:", response.text)
        return None

    if 'error' in data:
        print(f"❌ Wikipedia API error for {title}: {data['error']['info']}")
        return None

    return data['parse']['wikitext']['*']

def extract_infobox(wikitext):
    if not wikitext:
        return None
    match = re.search(r"\{\{Infobox(.+?)\n\}\}", wikitext, re.DOTALL)
    if match:
        return match.group(0)
    else:
        match = re.search(r"\{\{Infobox(.+?)\}\}", wikitext, re.DOTALL)
        return match.group(0) if match else None

def extract_attributes(infobox_text):
    if not infobox_text:
        return {}

    attributes = {}
    current_key = None
    current_value_lines = []

    lines = infobox_text.split("\n")
    for line in lines[1:]:  # 1-qator: {{Infobox ... shuning uchun o'tkazamiz
        line = line.strip()
        if line.startswith("|"):
            # Yangi atribut boshlanmoqda, oldingisini saqlaymiz
            if current_key is not None:
                attributes[current_key] = " ".join(current_value_lines).strip()
            parts = line.split("=", 1)
            if len(parts) == 2:
                current_key = parts[0].strip("| ").lower()
                current_value_lines = [parts[1].strip()]
            else:
                current_key = None
                current_value_lines = []
        else:
            # Agar chiziq | bilan boshlanmasa, oldingi key qiymatining davomidir
            if current_key is not None:
                current_value_lines.append(line)

    # Oxirgi atributni qo'shamiz
    if current_key is not None:
        attributes[current_key] = " ".join(current_value_lines).strip()

    return attributes

def save_to_csv(data, filename):
    if not data:
        print("No data to save.")
        return

    keys = set()
    for item in data:
        keys.update(item.keys())
    keys = sorted(list(keys))

    with open(filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=keys)
        writer.writeheader()
        for row in data:
            writer.writerow(row)

    print(f"✅ Dataset '{filename}' faylga saqlandi.")

if __name__ == "__main__":
    persons = [
    "Ada Lovelace", "Grace Hopper", "Hedy Lamarr", "Katherine Johnson", "Dorothy Vaughan",
    "Mary Jackson", "Mae Jemison", "Sally Ride", "Valentina Tereshkova", "Peggy Whitson",
    "Christa McAuliffe", "Kalpana Chawla", "Sunita Williams", "Jessica Meir", "Jessica Watkins",
    "Eileen Collins", "Mae Carol Jemison", "Nichelle Nichols", "Jocelyn Bell Burnell", "Chien-Shiung Wu",
    "Barbara McClintock", "Rosalyn Yalow", "Rita Levi-Montalcini", "Marie Curie", "Dorothy Hodgkin",
    "Lise Meitner", "Gertrude Elion", "Tu Youyou", "Françoise Barré-Sinoussi", "Emmanuelle Charpentier",
    "Jennifer Doudna", "May-Britt Moser", "Edvard I. Moser", "Elizabeth Blackburn", "Carol Greider",
    "Jack Szostak", "Françoise Barre-Sinoussi", "Ada Yonath", "Tu Youyou", "Shinya Yamanaka",
    "Craig Venter", "Katalin Karikó", "Sarah Gilbert", "Özlem Türeci", "Uğur Şahin",
    "Fei-Fei Li", "Geoffrey Hinton", "Yoshua Bengio", "Yann LeCun", "Andrew Ng",
    "Tim Berners-Lee", "Vint Cerf", "Linus Torvalds", "Dennis Ritchie", "Ken Thompson",
    "Bjarne Stroustrup", "James Gosling", "Guido van Rossum", "Brendan Eich", "Margaret Hamilton",
    "Radia Perlman", "Ada Lovelace", "Katherine Johnson", "Grace Hopper", "Hedy Lamarr",
    "Mary Jackson", "Dorothy Vaughan", "Sally Ride", "Mae Jemison", "Valentina Tereshkova",
    "Christa McAuliffe", "Kalpana Chawla", "Jessica Meir", "Jessica Watkins", "Eileen Collins",
    "Mae Carol Jemison", "Nichelle Nichols", "Jocelyn Bell Burnell", "Barbara McClintock", "Rosalyn Yalow",
    "Rita Levi-Montalcini", "Dorothy Hodgkin", "Gertrude Elion", "Françoise Barré-Sinoussi", "Emmanuelle Charpentier",
    "Jennifer Doudna", "May-Britt Moser", "Edvard I. Moser", "Elizabeth Blackburn", "Carol Greider",
    "Jack Szostak", "Ada Yonath", "Shinya Yamanaka", "Craig Venter", "Katalin Karikó",
    "Sarah Gilbert", "Özlem Türeci", "Uğur Şahin", "Fei-Fei Li", "Geoffrey Hinton",
    "Yoshua Bengio", "Yann LeCun", "Andrew Ng", "Tim Berners-Lee", "Vint Cerf"
]

    dataset = []

    for idx, person in enumerate(persons, 1):
        print(f"{idx}/{len(persons)} 📥 {person} sahifasini yuklanmoqda...")
        wikitext = get_wikipedia_page(person)
        infobox = extract_infobox(wikitext)
        attributes = extract_attributes(infobox)

        if attributes:
            if "name" not in attributes:
                attributes["name"] = person
            dataset.append(attributes)
            print(f"✅ {person} infoboxi muvaffaqiyatli olindi.")
        else:
            print(f"⚠️ {person} sahifasida infobox topilmadi yoki bo'sh.")

        time.sleep(1)  # API limitlarini cheklash uchun

    save_to_csv(dataset, "historical_figures_dataset9.csv")


1/105 📥 Ada Lovelace sahifasini yuklanmoqda...
✅ Ada Lovelace infoboxi muvaffaqiyatli olindi.
2/105 📥 Grace Hopper sahifasini yuklanmoqda...
✅ Grace Hopper infoboxi muvaffaqiyatli olindi.
3/105 📥 Hedy Lamarr sahifasini yuklanmoqda...
✅ Hedy Lamarr infoboxi muvaffaqiyatli olindi.
4/105 📥 Katherine Johnson sahifasini yuklanmoqda...
✅ Katherine Johnson infoboxi muvaffaqiyatli olindi.
5/105 📥 Dorothy Vaughan sahifasini yuklanmoqda...
✅ Dorothy Vaughan infoboxi muvaffaqiyatli olindi.
6/105 📥 Mary Jackson sahifasini yuklanmoqda...
⚠️ Mary Jackson sahifasida infobox topilmadi yoki bo'sh.
7/105 📥 Mae Jemison sahifasini yuklanmoqda...
✅ Mae Jemison infoboxi muvaffaqiyatli olindi.
8/105 📥 Sally Ride sahifasini yuklanmoqda...
✅ Sally Ride infoboxi muvaffaqiyatli olindi.
9/105 📥 Valentina Tereshkova sahifasini yuklanmoqda...
✅ Valentina Tereshkova infoboxi muvaffaqiyatli olindi.
10/105 📥 Peggy Whitson sahifasini yuklanmoqda...
✅ Peggy Whitson infoboxi muvaffaqiyatli olindi.
11/105 📥 Christa McAuli

In [74]:
df9 = pd.read_csv(r"C:\Users\Rasulbek907\Desktop\Final_Project\Notebooks\historical_figures_dataset9.csv")
df9.head()
df9.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89 entries, 0 to 88
Columns: 105 entries, academic_advisors to {{marriage|foster johnson|1951|1951|end
dtypes: float64(13), object(92)
memory usage: 73.1+ KB


In [100]:
important_columns = [
    "name",                 # Ism (asosiy identifikator)
    "birth_date",           # Tug‘ilgan sana
    "birth_place",          # Tug‘ilgan joy
    "death_date",           # Vafot sanasi
    "death_place",          # Vafot joyi
    "nationality",          # Millati
    "occupation",           # Kasbi yoki lavozimi
    "years_active",         # Faoliyat yillari
    "known_for",            # Mashhurlikka sabab bo‘lgan ishlari
    "awards",               # Mukofotlar
    "alma_mater",           # O‘qigan universiteti (o‘quv yurti)
    "education",            # Ta’limi
    "employer",             # Ishlagan tashkilotlari   
    "field",                # Soha (ilmiy, san’at, siyosat va h.k.)    
    "spouse",               # Turmush o‘rtog‘i
    "children",             # Farzandlari
    "parents",              # Ota-onasi
    "religion"             # Dini    
   
]
df_filtered9 = df9[important_columns]
df_filtered9.to_csv(r"C:\Users\Rasulbek907\Desktop\Final_Project\Data\Raw_data\hf_dataset9.csv", index=False, encoding="utf-8")

In [31]:
import requests
import re
import csv
import time

def get_wikipedia_page(title):
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "parse",
        "page": title,
        "format": "json",
        "prop": "wikitext"
    }
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; MyProjectBot/1.0; +https://yourdomain.com)"
    }

    response = requests.get(url, params=params, headers=headers)

    try:
        data = response.json()
    except Exception as e:
        print(f"❌ JSON decode error for {title}: {e}")
        print("Raw response text:", response.text)
        return None

    if 'error' in data:
        print(f"❌ Wikipedia API error for {title}: {data['error']['info']}")
        return None

    return data['parse']['wikitext']['*']

def extract_infobox(wikitext):
    if not wikitext:
        return None
    match = re.search(r"\{\{Infobox(.+?)\n\}\}", wikitext, re.DOTALL)
    if match:
        return match.group(0)
    else:
        match = re.search(r"\{\{Infobox(.+?)\}\}", wikitext, re.DOTALL)
        return match.group(0) if match else None

def extract_attributes(infobox_text):
    if not infobox_text:
        return {}

    attributes = {}
    current_key = None
    current_value_lines = []

    lines = infobox_text.split("\n")
    for line in lines[1:]:  # 1-qator: {{Infobox ... shuning uchun o'tkazamiz
        line = line.strip()
        if line.startswith("|"):
            # Yangi atribut boshlanmoqda, oldingisini saqlaymiz
            if current_key is not None:
                attributes[current_key] = " ".join(current_value_lines).strip()
            parts = line.split("=", 1)
            if len(parts) == 2:
                current_key = parts[0].strip("| ").lower()
                current_value_lines = [parts[1].strip()]
            else:
                current_key = None
                current_value_lines = []
        else:
            # Agar chiziq | bilan boshlanmasa, oldingi key qiymatining davomidir
            if current_key is not None:
                current_value_lines.append(line)

    # Oxirgi atributni qo'shamiz
    if current_key is not None:
        attributes[current_key] = " ".join(current_value_lines).strip()

    return attributes

def save_to_csv(data, filename):
    if not data:
        print("No data to save.")
        return

    keys = set()
    for item in data:
        keys.update(item.keys())
    keys = sorted(list(keys))

    with open(filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=keys)
        writer.writeheader()
        for row in data:
            writer.writerow(row)

    print(f"✅ Dataset '{filename}' faylga saqlandi.")

if __name__ == "__main__":
    persons = [
    "Martin Luther King Jr.", "Nelson Mandela", "Mahatma Gandhi", "Malcolm X", "Rosa Parks",
    "Susan B. Anthony", "Frederick Douglass", "Harriet Tubman", "Sojourner Truth", "Angela Davis",
    "Cesar Chavez", "Aung San Suu Kyi", "Lech Wałęsa", "Vaclav Havel", "Eleanor Roosevelt",
    "Desmond Tutu", "Dalai Lama", "Ban Ki-moon", "Kofi Annan", "Gro Harlem Brundtland",
    "Margaret Thatcher", "Indira Gandhi", "Golda Meir", "Benazir Bhutto", "Jacinda Ardern",
    "Sanna Marin", "Theresa May", "Angela Merkel", "Hillary Clinton", "Elizabeth Warren",
    "Kamala Harris", "Michelle Obama", "Malala Yousafzai", "AOC", "Ilhan Omar",
    "Rashida Tlaib", "Alexandria Ocasio-Cortez", "Nancy Pelosi", "Condoleezza Rice", "Madeleine Albright",
    "Sonia Gandhi", "Pratibha Patil", "Sheikh Hasina", "Ellen Johnson Sirleaf", "Wangari Maathai",
    "Rigoberta Menchú", "Leymah Gbowee", "Leyla Zana", "Tawakkol Karman", "Nadia Murad",
    "Malalai Joya", "Shirin Ebadi", "Tansu Çiller", "Dalia Grybauskaitė", "Kolinda Grabar-Kitarović",
    "Atifete Jahjaga", "Beata Szydło", "Sviatlana Tsikhanouskaya", "Svetlana Tikhanovskaya", "Yulia Tymoshenko",
    "Julia Gillard", "Jacinda Ardern", "Theresa May", "Angela Merkel", "Margaret Thatcher",
    "Indira Gandhi", "Golda Meir", "Benazir Bhutto", "Aung San Suu Kyi", "Michelle Bachelet",
    "Michelle Obama", "Kamala Harris", "Elizabeth Warren", "Hillary Clinton", "Nancy Pelosi",
    "Condoleezza Rice", "Madeleine Albright", "Sonia Gandhi", "Pratibha Patil", "Sheikh Hasina",
    "Ellen Johnson Sirleaf", "Wangari Maathai", "Rigoberta Menchú", "Leymah Gbowee", "Leyla Zana",
    "Tawakkol Karman", "Nadia Murad", "Malalai Joya", "Shirin Ebadi", "Tansu Çiller"
]

    dataset = []

    for idx, person in enumerate(persons, 1):
        print(f"{idx}/{len(persons)} 📥 {person} sahifasini yuklanmoqda...")
        wikitext = get_wikipedia_page(person)
        infobox = extract_infobox(wikitext)
        attributes = extract_attributes(infobox)

        if attributes:
            if "name" not in attributes:
                attributes["name"] = person
            dataset.append(attributes)
            print(f"✅ {person} infoboxi muvaffaqiyatli olindi.")
        else:
            print(f"⚠️ {person} sahifasida infobox topilmadi yoki bo'sh.")

        time.sleep(1)  # API limitlarini cheklash uchun

    save_to_csv(dataset, "historical_figures_dataset10.csv")


1/90 📥 Martin Luther King Jr. sahifasini yuklanmoqda...
✅ Martin Luther King Jr. infoboxi muvaffaqiyatli olindi.
2/90 📥 Nelson Mandela sahifasini yuklanmoqda...
✅ Nelson Mandela infoboxi muvaffaqiyatli olindi.
3/90 📥 Mahatma Gandhi sahifasini yuklanmoqda...
✅ Mahatma Gandhi infoboxi muvaffaqiyatli olindi.
4/90 📥 Malcolm X sahifasini yuklanmoqda...
✅ Malcolm X infoboxi muvaffaqiyatli olindi.
5/90 📥 Rosa Parks sahifasini yuklanmoqda...
✅ Rosa Parks infoboxi muvaffaqiyatli olindi.
6/90 📥 Susan B. Anthony sahifasini yuklanmoqda...
✅ Susan B. Anthony infoboxi muvaffaqiyatli olindi.
7/90 📥 Frederick Douglass sahifasini yuklanmoqda...
✅ Frederick Douglass infoboxi muvaffaqiyatli olindi.
8/90 📥 Harriet Tubman sahifasini yuklanmoqda...
✅ Harriet Tubman infoboxi muvaffaqiyatli olindi.
9/90 📥 Sojourner Truth sahifasini yuklanmoqda...
✅ Sojourner Truth infoboxi muvaffaqiyatli olindi.
10/90 📥 Angela Davis sahifasini yuklanmoqda...
✅ Angela Davis infoboxi muvaffaqiyatli olindi.
11/90 📥 Cesar Chavez 

In [76]:
df10 = pd.read_csv(r"C:\Users\Rasulbek907\Desktop\Final_Project\Notebooks\historical_figures_dataset10.csv")
df10.head()
df10.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85 entries, 0 to 84
Columns: 293 entries, 1blankname to years_active
dtypes: float64(10), object(283)
memory usage: 194.7+ KB


In [99]:
important_columns = [
    "name",                 # Ism (asosiy identifikator)
    "birth_date",           # Tug‘ilgan sana
    "birth_place",          # Tug‘ilgan joy
    "death_date",           # Vafot sanasi
    "death_place",          # Vafot joyi
    "nationality",          # Millati
    "occupation",           # Kasbi yoki lavozimi
    "years_active",         # Faoliyat yillari
    "known_for",            # Mashhurlikka sabab bo‘lgan ishlari
    "awards",               # Mukofotlar
    "alma_mater",           # O‘qigan universiteti (o‘quv yurti)
    "education",            # Ta’limi    
    "notable_works",        # Mashhur asarlari yoki ishlari
    "field",                # Soha (ilmiy, san’at, siyosat va h.k.)    
    "spouse",               # Turmush o‘rtog‘i
    "children",             # Farzandlari
    "parents"              # Ota-onasi
    
   
]
df_filtered10 = df10[important_columns]
df_filtered10.to_csv(r"C:\Users\Rasulbek907\Desktop\Final_Project\Data\Raw_data\hf_dataset10.csv", index=False, encoding="utf-8")

In [32]:
import requests
import re
import csv
import time

def get_wikipedia_page(title):
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "parse",
        "page": title,
        "format": "json",
        "prop": "wikitext"
    }
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; MyProjectBot/1.0; +https://yourdomain.com)"
    }

    response = requests.get(url, params=params, headers=headers)

    try:
        data = response.json()
    except Exception as e:
        print(f"❌ JSON decode error for {title}: {e}")
        print("Raw response text:", response.text)
        return None

    if 'error' in data:
        print(f"❌ Wikipedia API error for {title}: {data['error']['info']}")
        return None

    return data['parse']['wikitext']['*']

def extract_infobox(wikitext):
    if not wikitext:
        return None
    match = re.search(r"\{\{Infobox(.+?)\n\}\}", wikitext, re.DOTALL)
    if match:
        return match.group(0)
    else:
        match = re.search(r"\{\{Infobox(.+?)\}\}", wikitext, re.DOTALL)
        return match.group(0) if match else None

def extract_attributes(infobox_text):
    if not infobox_text:
        return {}

    attributes = {}
    current_key = None
    current_value_lines = []

    lines = infobox_text.split("\n")
    for line in lines[1:]:  # 1-qator: {{Infobox ... shuning uchun o'tkazamiz
        line = line.strip()
        if line.startswith("|"):
            # Yangi atribut boshlanmoqda, oldingisini saqlaymiz
            if current_key is not None:
                attributes[current_key] = " ".join(current_value_lines).strip()
            parts = line.split("=", 1)
            if len(parts) == 2:
                current_key = parts[0].strip("| ").lower()
                current_value_lines = [parts[1].strip()]
            else:
                current_key = None
                current_value_lines = []
        else:
            # Agar chiziq | bilan boshlanmasa, oldingi key qiymatining davomidir
            if current_key is not None:
                current_value_lines.append(line)

    # Oxirgi atributni qo'shamiz
    if current_key is not None:
        attributes[current_key] = " ".join(current_value_lines).strip()

    return attributes

def save_to_csv(data, filename):
    if not data:
        print("No data to save.")
        return

    keys = set()
    for item in data:
        keys.update(item.keys())
    keys = sorted(list(keys))

    with open(filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=keys)
        writer.writeheader()
        for row in data:
            writer.writerow(row)

    print(f"✅ Dataset '{filename}' faylga saqlandi.")

if __name__ == "__main__":
    persons = [
    "Leonardo da Vinci", "Michelangelo", "Raphael", "Donatello", "Sandro Botticelli",
    "Caravaggio", "Titian", "Peter Paul Rubens", "Rembrandt", "Johannes Vermeer",
    "Claude Monet", "Edgar Degas", "Paul Cézanne", "Vincent van Gogh", "Pablo Picasso",
    "Salvador Dalí", "Henri Matisse", "Jackson Pollock", "Andy Warhol", "Frida Kahlo",
    "Georgia O'Keeffe", "Mark Rothko", "Wassily Kandinsky", "Joan Miró", "Paul Klee",
    "Diego Rivera", "Fernando Botero", "Jean-Michel Basquiat", "Banksy", "Yayoi Kusama",
    "Ai Weiwei", "Takashi Murakami", "Jeff Koons", "Damien Hirst", "Anish Kapoor",
    "Antoni Gaudí", "Frank Lloyd Wright", "Le Corbusier", "Zaha Hadid", "Norman Foster",
    "I.M. Pei", "Renzo Piano", "Oscar Niemeyer", "Ludwig Mies van der Rohe", "Louis Sullivan",
    "Walter Gropius", "Eero Saarinen", "Philip Johnson", "Richard Meier", "Santiago Calatrava",
    "Frank Gehry", "Bjarke Ingels", "Rem Koolhaas", "Tadao Ando", "Kengo Kuma",
    "Arata Isozaki", "Kazuyo Sejima", "Shigeru Ban", "Sou Fujimoto", "Kisho Kurokawa",
    "Alvar Aalto", "Gerrit Rietveld", "Carlo Scarpa", "Richard Rogers", "Toyo Ito",
    "Gustav Klimt", "Egons Schiele", "Oskar Kokoschka", "Alphonse Mucha", "Henri de Toulouse-Lautrec",
    "Édouard Manet", "Pierre-Auguste Renoir", "Camille Pissarro", "Paul Gauguin", "Georges Seurat",
    "Henri Rousseau", "Marc Chagall", "Franz Marc", "Balthus", "Edward Hopper",
    "John Singer Sargent", "Winslow Homer", "Mary Cassatt", "J.M.W. Turner", "Thomas Gainsborough"

]

    dataset = []

    for idx, person in enumerate(persons, 1):
        print(f"{idx}/{len(persons)} 📥 {person} sahifasini yuklanmoqda...")
        wikitext = get_wikipedia_page(person)
        infobox = extract_infobox(wikitext)
        attributes = extract_attributes(infobox)

        if attributes:
            if "name" not in attributes:
                attributes["name"] = person
            dataset.append(attributes)
            print(f"✅ {person} infoboxi muvaffaqiyatli olindi.")
        else:
            print(f"⚠️ {person} sahifasida infobox topilmadi yoki bo'sh.")

        time.sleep(1)  # API limitlarini cheklash uchun

    save_to_csv(dataset, "historical_figures_dataset11.csv")


1/85 📥 Leonardo da Vinci sahifasini yuklanmoqda...
✅ Leonardo da Vinci infoboxi muvaffaqiyatli olindi.
2/85 📥 Michelangelo sahifasini yuklanmoqda...
✅ Michelangelo infoboxi muvaffaqiyatli olindi.
3/85 📥 Raphael sahifasini yuklanmoqda...
✅ Raphael infoboxi muvaffaqiyatli olindi.
4/85 📥 Donatello sahifasini yuklanmoqda...
✅ Donatello infoboxi muvaffaqiyatli olindi.
5/85 📥 Sandro Botticelli sahifasini yuklanmoqda...
✅ Sandro Botticelli infoboxi muvaffaqiyatli olindi.
6/85 📥 Caravaggio sahifasini yuklanmoqda...
✅ Caravaggio infoboxi muvaffaqiyatli olindi.
7/85 📥 Titian sahifasini yuklanmoqda...
✅ Titian infoboxi muvaffaqiyatli olindi.
8/85 📥 Peter Paul Rubens sahifasini yuklanmoqda...
✅ Peter Paul Rubens infoboxi muvaffaqiyatli olindi.
9/85 📥 Rembrandt sahifasini yuklanmoqda...
✅ Rembrandt infoboxi muvaffaqiyatli olindi.
10/85 📥 Johannes Vermeer sahifasini yuklanmoqda...
✅ Johannes Vermeer infoboxi muvaffaqiyatli olindi.
11/85 📥 Claude Monet sahifasini yuklanmoqda...
✅ Claude Monet infobox

In [78]:
df11 = pd.read_csv(r"C:\Users\Rasulbek907\Desktop\Final_Project\Notebooks\historical_figures_dataset11.csv")
df11.head()
df11.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81 entries, 0 to 80
Columns: 106 entries, access-date to years_active
dtypes: float64(19), object(87)
memory usage: 67.2+ KB


In [98]:
important_columns = [
    "name",                 # Ism (asosiy identifikator)
    "birth_date",           # Tug‘ilgan sana
    "birth_place",          # Tug‘ilgan joy
    "death_date",           # Vafot sanasi
    "death_place",          # Vafot joyi
    "nationality",          # Millati
    "occupation",           # Kasbi yoki lavozimi
    "years_active",         # Faoliyat yillari
    "known_for",            # Mashhurlikka sabab bo‘lgan ishlari
    "awards",               # Mukofotlar
    "alma_mater",           # O‘qigan universiteti (o‘quv yurti)
    "education",            # Ta’limi    
    "notable_works",        # Mashhur asarlari yoki ishlari
    "field",                # Soha (ilmiy, san’at, siyosat va h.k.)    
    "spouse",               # Turmush o‘rtog‘i
    "children",             # Farzandlari
    "parents"              # Ota-onasi         
]
df_filtered11 = df11[important_columns]
df_filtered11.to_csv(r"C:\Users\Rasulbek907\Desktop\Final_Project\Data\Raw_data\hf_dataset11.csv", index=False, encoding="utf-8")

In [34]:
import requests
import re
import csv
import time

def get_wikipedia_page(title):
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "parse",
        "page": title,
        "format": "json",
        "prop": "wikitext"
    }
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; MyProjectBot/1.0; +https://yourdomain.com)"
    }

    response = requests.get(url, params=params, headers=headers)

    try:
        data = response.json()
    except Exception as e:
        print(f"❌ JSON decode error for {title}: {e}")
        print("Raw response text:", response.text)
        return None

    if 'error' in data:
        print(f"❌ Wikipedia API error for {title}: {data['error']['info']}")
        return None

    return data['parse']['wikitext']['*']

def extract_infobox(wikitext):
    if not wikitext:
        return None
    match = re.search(r"\{\{Infobox(.+?)\n\}\}", wikitext, re.DOTALL)
    if match:
        return match.group(0)
    else:
        match = re.search(r"\{\{Infobox(.+?)\}\}", wikitext, re.DOTALL)
        return match.group(0) if match else None

def extract_attributes(infobox_text):
    if not infobox_text:
        return {}

    attributes = {}
    current_key = None
    current_value_lines = []

    lines = infobox_text.split("\n")
    for line in lines[1:]:  # 1-qator: {{Infobox ... shuning uchun o'tkazamiz
        line = line.strip()
        if line.startswith("|"):
            # Yangi atribut boshlanmoqda, oldingisini saqlaymiz
            if current_key is not None:
                attributes[current_key] = " ".join(current_value_lines).strip()
            parts = line.split("=", 1)
            if len(parts) == 2:
                current_key = parts[0].strip("| ").lower()
                current_value_lines = [parts[1].strip()]
            else:
                current_key = None
                current_value_lines = []
        else:
            # Agar chiziq | bilan boshlanmasa, oldingi key qiymatining davomidir
            if current_key is not None:
                current_value_lines.append(line)

    # Oxirgi atributni qo'shamiz
    if current_key is not None:
        attributes[current_key] = " ".join(current_value_lines).strip()

    return attributes

def save_to_csv(data, filename):
    if not data:
        print("No data to save.")
        return

    keys = set()
    for item in data:
        keys.update(item.keys())
    keys = sorted(list(keys))

    with open(filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=keys)
        writer.writeheader()
        for row in data:
            writer.writerow(row)

    print(f"✅ Dataset '{filename}' faylga saqlandi.")

if __name__ == "__main__":
    persons = [
    "Albert Schweitzer", "Ada Yonath", "Abdus Salam", "Agatha Christie", "Alan Kay",
    "Alessandro Volta", "Alfred Wegener", "Anaximander", "Anne Frank", "Antoni van Leeuwenhoek",
    "Arthur Conan Doyle", "Augustus Caesar", "Aurelia Brown", "Azim Premji", "Bessie Coleman",
    "Blaise Pascal", "Carl Bosch", "Carl Friedrich Gauss", "Charles Dickens", "Chiang Kai-shek",
    "Coco Chanel", "Daniel Kahneman", "David Attenborough", "Diego Maradona", "Dorothy Hodgkin",
    "Doris Lessing", "Edmund Hillary", "Edmund Husserl", "Edward Jenner", "Eleanor of Aquitaine",
    "Elizabeth Blackwell", "Emperor Akbar", "Florence Griffith Joyner", "Gertrude Stein", "Grace Kelly",
    "Harvey Milk", "Heinrich Heine", "Henrietta Lacks", "Hermann Hesse", "Hermann Weyl",
    "Ibn Battuta", "Imhotep", "Ingrid Bergman", "Isaac Asimov", "Jackie Robinson",
    "Jacques Cousteau", "Jane Austen", "Jawaharlal Nehru", "Johann Gutenberg", "Josephine Baker",
    "Julia Child", "Karl Marx", "Kobe Bryant", "Langston Hughes", "Leo Tolstoy",
    "Linus Pauling", "Lou Andreas‑Salomé", "Max Born", "Michel de Montaigne", "Muhammad Ali",
    "Murasaki Shikibu", "Naguib Mahfouz", "Niccolò Machiavelli", "Oscar Wilde", "P.T. Barnum",
    "Percy Bysshe Shelley", "Rabindranath Tagore", "Rosa Luxemburg", "Srinivasa Ramanujan", "Simón Bolívar",
    "Sojourner Truth", "Sylvia Plath", "Thich Nhat Hanh", "Thomas Paine", "Ursula Le Guin",
    "Voltaire", "Walt Whitman", "Wangari Maathai", "William Blake", "Wilma Rudolph",
    "Zora Neale Hurston", "Zhuangzi", "Zenobia", "Boudica", "Tecumseh",
    "Sacagawea", "Pocahontas", "Davy Crockett", "Daniel Boone", "Harriet Beecher Stowe",
    "Marguerite de Navarre", "Isadora Duncan", "Louise Michel", "Ada Lovelace", "Mary Wollstonecraft",
    "Empress Dowager Cixi", "Katherine of Aragon", "Anne Boleyn", "Mary I of England", "Mary Queen of Scots",
    "Joan Chen", "Oprah Winfrey", "Emma Watson", "Natalie Portman", "Greta Thunberg",
    "Julia Gillard", "Kamala Harris", "Michelle Obama", "Elizabeth Warren", "Hillary Clinton",
    "Nancy Pelosi", "Condoleezza Rice", "Sonia Gandhi", "Sheikh Hasina", "Pratibha Patil",
    "Ellen Johnson Sirleaf", "Rigoberta Menchú", "Leymah Gbowee", "Leyla Zana", "Tawakkol Karman",
    "Nadia Murad", "Malalai Joya", "Shirin Ebadi", "Tansu Çiller", "Beata Szydło",
    "Sviatlana Tsikhanouskaya", "Yulia Tymoshenko", "Kolinda Grabar‑Kitarović", "Atifete Jahjaga", "Micronesia Leader",
    "Jacinda Ardern", "Sanna Marin", "Theresa May", "Liz Truss", "Rishi Sunak",
    "Justin Trudeau", "Emmanuel Macron", "Boris Johnson", "Narendra Modi", "Recep Tayyip Erdoğan",
    "Volodymyr Zelenskyy", "Imran Khan", "Sheikh Hasina", "Aung San Suu Kyi", "Moon Jae-in",
    "Park Geun-hye", "Angela Merkel", "Margaret Thatcher", "Indira Gandhi", "Golda Meir",
    "Benazir Bhutto", "Catherine the Great", "Elizabeth I", "Cleopatra", "Marie Antoinette",
    "Queen Victoria", "Joan of Arc", "Simone de Beauvoir", "Virginia Woolf", "Frida Kahlo",
    "Georgia O'Keeffe", "Amelia Earhart", "Harriet Tubman", "Malala Yousafzai", "Mother Teresa",
    "Eleanor Roosevelt", "Margaret Thatcher", "Angela Merkel", "Michelle Bachelet", "Aung San Suu Kyi",
    "Ellen Ochoa", "Valentina Tereshkova", "Mae Jemison", "Kalpana Chawla", "Sunita Williams",
    "Jessica Meir", "Christa McAuliffe", "Katherine Johnson", "Dorothy Vaughan", "Mary Jackson"

]

    dataset = []

    for idx, person in enumerate(persons, 1):
        print(f"{idx}/{len(persons)} 📥 {person} sahifasini yuklanmoqda...")
        wikitext = get_wikipedia_page(person)
        infobox = extract_infobox(wikitext)
        attributes = extract_attributes(infobox)

        if attributes:
            if "name" not in attributes:
                attributes["name"] = person
            dataset.append(attributes)
            print(f"✅ {person} infoboxi muvaffaqiyatli olindi.")
        else:
            print(f"⚠️ {person} sahifasida infobox topilmadi yoki bo'sh.")

        time.sleep(1)  # API limitlarini cheklash uchun

    save_to_csv(dataset, "historical_figures_dataset12.csv")


1/180 📥 Albert Schweitzer sahifasini yuklanmoqda...
✅ Albert Schweitzer infoboxi muvaffaqiyatli olindi.
2/180 📥 Ada Yonath sahifasini yuklanmoqda...
✅ Ada Yonath infoboxi muvaffaqiyatli olindi.
3/180 📥 Abdus Salam sahifasini yuklanmoqda...
✅ Abdus Salam infoboxi muvaffaqiyatli olindi.
4/180 📥 Agatha Christie sahifasini yuklanmoqda...
✅ Agatha Christie infoboxi muvaffaqiyatli olindi.
5/180 📥 Alan Kay sahifasini yuklanmoqda...
✅ Alan Kay infoboxi muvaffaqiyatli olindi.
6/180 📥 Alessandro Volta sahifasini yuklanmoqda...
✅ Alessandro Volta infoboxi muvaffaqiyatli olindi.
7/180 📥 Alfred Wegener sahifasini yuklanmoqda...
✅ Alfred Wegener infoboxi muvaffaqiyatli olindi.
8/180 📥 Anaximander sahifasini yuklanmoqda...
✅ Anaximander infoboxi muvaffaqiyatli olindi.
9/180 📥 Anne Frank sahifasini yuklanmoqda...
✅ Anne Frank infoboxi muvaffaqiyatli olindi.
10/180 📥 Antoni van Leeuwenhoek sahifasini yuklanmoqda...
⚠️ Antoni van Leeuwenhoek sahifasida infobox topilmadi yoki bo'sh.
11/180 📥 Arthur Conan

In [80]:
df12 = pd.read_csv(r"C:\Users\Rasulbek907\Desktop\Final_Project\Notebooks\historical_figures_dataset12.csv")
df12.head()
df12.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 163 entries, 0 to 162
Columns: 535 entries, 1blankname to {{tlit|zh|t. v. soong|i
dtypes: float64(59), object(476)
memory usage: 681.4+ KB


In [81]:
important_columns = [
    "name",                 # Ism (asosiy identifikator)
    "birth_date",           # Tug‘ilgan sana
    "birth_place",          # Tug‘ilgan joy
    "death_date",           # Vafot sanasi
    "death_place",          # Vafot joyi
    "nationality",          # Millati
    "occupation",           # Kasbi yoki lavozimi
    "years_active",         # Faoliyat yillari
    "known_for",            # Mashhurlikka sabab bo‘lgan ishlari
    "awards",               # Mukofotlar
    "alma_mater",           # O‘qigan universiteti (o‘quv yurti)
    "education",            # Ta’limi
    "employer",             # Ishlagan tashkilotlari
    "notable_works",        # Mashhur asarlari yoki ishlari
    "field",                # Soha (ilmiy, san’at, siyosat va h.k.)    
    "spouse",               # Turmush o‘rtog‘i
    "children",             # Farzandlari
    "parents",              # Ota-onasi
    "religion",             # Dini    
    "genre",                # Janri (san’at yoki adabiyotda)   
]
df_filtered12 = df12[important_columns]
df_filtered12.to_csv(r"C:\Users\Rasulbek907\Desktop\Final_Project\Data\Raw_data\hf_dataset12.csv", index=False, encoding="utf-8")

In [35]:
import requests
import re
import csv
import time

def get_wikipedia_page(title):
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "parse",
        "page": title,
        "format": "json",
        "prop": "wikitext"
    }
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; MyProjectBot/1.0; +https://yourdomain.com)"
    }

    response = requests.get(url, params=params, headers=headers)

    try:
        data = response.json()
    except Exception as e:
        print(f"❌ JSON decode error for {title}: {e}")
        print("Raw response text:", response.text)
        return None

    if 'error' in data:
        print(f"❌ Wikipedia API error for {title}: {data['error']['info']}")
        return None

    return data['parse']['wikitext']['*']

def extract_infobox(wikitext):
    if not wikitext:
        return None
    match = re.search(r"\{\{Infobox(.+?)\n\}\}", wikitext, re.DOTALL)
    if match:
        return match.group(0)
    else:
        match = re.search(r"\{\{Infobox(.+?)\}\}", wikitext, re.DOTALL)
        return match.group(0) if match else None

def extract_attributes(infobox_text):
    if not infobox_text:
        return {}

    attributes = {}
    current_key = None
    current_value_lines = []

    lines = infobox_text.split("\n")
    for line in lines[1:]:  # 1-qator: {{Infobox ... shuning uchun o'tkazamiz
        line = line.strip()
        if line.startswith("|"):
            # Yangi atribut boshlanmoqda, oldingisini saqlaymiz
            if current_key is not None:
                attributes[current_key] = " ".join(current_value_lines).strip()
            parts = line.split("=", 1)
            if len(parts) == 2:
                current_key = parts[0].strip("| ").lower()
                current_value_lines = [parts[1].strip()]
            else:
                current_key = None
                current_value_lines = []
        else:
            # Agar chiziq | bilan boshlanmasa, oldingi key qiymatining davomidir
            if current_key is not None:
                current_value_lines.append(line)

    # Oxirgi atributni qo'shamiz
    if current_key is not None:
        attributes[current_key] = " ".join(current_value_lines).strip()

    return attributes

def save_to_csv(data, filename):
    if not data:
        print("No data to save.")
        return

    keys = set()
    for item in data:
        keys.update(item.keys())
    keys = sorted(list(keys))

    with open(filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=keys)
        writer.writeheader()
        for row in data:
            writer.writerow(row)

    print(f"✅ Dataset '{filename}' faylga saqlandi.")

if __name__ == "__main__":
    persons = [
    "Ada Yonath", "Abdus Salam", "Agatha Christie", "Alan Kay", "Alessandro Volta",
    "Alfred Wegener", "Anaximander", "Anne Frank", "Antoni van Leeuwenhoek", "Arthur Conan Doyle",
    "Augustus Caesar", "Azim Premji", "Bessie Coleman", "Blaise Pascal", "Carl Bosch",
    "Carl Friedrich Gauss", "Charles Dickens", "Chiang Kai-shek", "Coco Chanel", "Daniel Kahneman",
    "David Attenborough", "Diego Maradona", "Dorothy Hodgkin", "Doris Lessing", "Edmund Hillary",
    "Edmund Husserl", "Edward Jenner", "Eleanor of Aquitaine", "Elizabeth Blackwell", "Emperor Akbar",
    "Florence Griffith Joyner", "Gertrude Stein", "Grace Kelly", "Harvey Milk", "Heinrich Heine",
    "Henrietta Lacks", "Hermann Hesse", "Hermann Weyl", "Ibn Battuta", "Imhotep",
    "Ingrid Bergman", "Isaac Asimov", "Jackie Robinson", "Jacques Cousteau", "Jane Austen",
    "Jawaharlal Nehru", "Johann Gutenberg", "Josephine Baker", "Julia Child", "Karl Marx",
    "Kobe Bryant", "Langston Hughes", "Leo Tolstoy", "Linus Pauling", "Lou Andreas‑Salomé",
    "Max Born", "Michel de Montaigne", "Muhammad Ali", "Murasaki Shikibu", "Naguib Mahfouz",
    "Niccolò Machiavelli", "Oscar Wilde", "P.T. Barnum", "Percy Bysshe Shelley", "Rabindranath Tagore",
    "Rosa Luxemburg", "Srinivasa Ramanujan", "Simón Bolívar", "Sylvia Plath", "Thich Nhat Hanh",
    "Thomas Paine", "Ursula Le Guin", "Voltaire", "Walt Whitman", "Wangari Maathai",
    "William Blake", "Wilma Rudolph", "Zora Neale Hurston", "Zhuangzi", "Zenobia",
    "Boudica", "Tecumseh", "Sacagawea", "Pocahontas", "Davy Crockett",
    "Daniel Boone", "Harriet Beecher Stowe", "Marguerite de Navarre", "Isadora Duncan", "Louise Michel",
    "Mary Wollstonecraft", "Empress Dowager Cixi", "Katherine of Aragon", "Anne Boleyn", "Mary I of England",
    "Mary Queen of Scots", "Joan Chen", "Oprah Winfrey", "Emma Watson", "Natalie Portman",
    "Greta Thunberg", "Julia Gillard", "Kamala Harris", "Michelle Obama", "Elizabeth Warren",
    "Hillary Clinton", "Nancy Pelosi", "Condoleezza Rice", "Sonia Gandhi", "Sheikh Hasina",
    "Pratibha Patil", "Ellen Johnson Sirleaf", "Rigoberta Menchú", "Leymah Gbowee", "Leyla Zana",
    "Tawakkol Karman", "Nadia Murad", "Malalai Joya", "Shirin Ebadi", "Tansu Çiller",
    "Beata Szydło", "Sviatlana Tsikhanouskaya", "Yulia Tymoshenko", "Kolinda Grabar‑Kitarović", "Atifete Jahjaga",
    "Micronesia Leader", "Sanna Marin", "Theresa May", "Liz Truss", "Rishi Sunak",
    "Justin Trudeau", "Emmanuel Macron", "Boris Johnson", "Narendra Modi", "Recep Tayyip Erdoğan",
    "Volodymyr Zelenskyy", "Moon Jae-in", "Park Geun-hye", "Michelle Bachelet", "Ellen Ochoa",
    "Valentina Tereshkova", "Mae Jemison", "Kalpana Chawla", "Sunita Williams", "Jessica Meir",
    "Christa McAuliffe", "Mae Carol Jemison", "Nichelle Nichols", "Jocelyn Bell Burnell", "Barbara McClintock",
    "Rosalyn Yalow", "Rita Levi-Montalcini", "Gertrude Elion", "Ada Yonath", "Shinya Yamanaka",
    "Craig Venter", "Katalin Karikó", "Sarah Gilbert", "Özlem Türeci", "Uğur Şahin",
    "Fei-Fei Li", "Yoshua Bengio", "Yann LeCun", "Andrew Ng", "Tim Berners-Lee",
    "Vint Cerf", "Linus Torvalds", "Dennis Ritchie", "Ken Thompson", "Bjarne Stroustrup",
    "James Gosling", "Guido van Rossum", "Brendan Eich", "Margaret Hamilton", "Radia Perlman",
    "Katherine Johnson", "Grace Hopper"

]

    dataset = []

    for idx, person in enumerate(persons, 1):
        print(f"{idx}/{len(persons)} 📥 {person} sahifasini yuklanmoqda...")
        wikitext = get_wikipedia_page(person)
        infobox = extract_infobox(wikitext)
        attributes = extract_attributes(infobox)

        if attributes:
            if "name" not in attributes:
                attributes["name"] = person
            dataset.append(attributes)
            print(f"✅ {person} infoboxi muvaffaqiyatli olindi.")
        else:
            print(f"⚠️ {person} sahifasida infobox topilmadi yoki bo'sh.")

        time.sleep(1)  # API limitlarini cheklash uchun

    save_to_csv(dataset, "historical_figures_dataset13.csv")


1/177 📥 Ada Yonath sahifasini yuklanmoqda...
✅ Ada Yonath infoboxi muvaffaqiyatli olindi.
2/177 📥 Abdus Salam sahifasini yuklanmoqda...
✅ Abdus Salam infoboxi muvaffaqiyatli olindi.
3/177 📥 Agatha Christie sahifasini yuklanmoqda...
✅ Agatha Christie infoboxi muvaffaqiyatli olindi.
4/177 📥 Alan Kay sahifasini yuklanmoqda...
✅ Alan Kay infoboxi muvaffaqiyatli olindi.
5/177 📥 Alessandro Volta sahifasini yuklanmoqda...
✅ Alessandro Volta infoboxi muvaffaqiyatli olindi.
6/177 📥 Alfred Wegener sahifasini yuklanmoqda...
✅ Alfred Wegener infoboxi muvaffaqiyatli olindi.
7/177 📥 Anaximander sahifasini yuklanmoqda...
✅ Anaximander infoboxi muvaffaqiyatli olindi.
8/177 📥 Anne Frank sahifasini yuklanmoqda...
✅ Anne Frank infoboxi muvaffaqiyatli olindi.
9/177 📥 Antoni van Leeuwenhoek sahifasini yuklanmoqda...
⚠️ Antoni van Leeuwenhoek sahifasida infobox topilmadi yoki bo'sh.
10/177 📥 Arthur Conan Doyle sahifasini yuklanmoqda...
✅ Arthur Conan Doyle infoboxi muvaffaqiyatli olindi.
11/177 📥 Augustus C

In [82]:
df13 = pd.read_csv(r"C:\Users\Rasulbek907\Desktop\Final_Project\Notebooks\historical_figures_dataset13.csv")
df13.head()
df13.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 158 entries, 0 to 157
Columns: 486 entries, 1blankname to {{tlit|zh|t. v. soong|i
dtypes: float64(57), object(429)
memory usage: 600.0+ KB


In [83]:
important_columns = [
    "name",                 # Ism (asosiy identifikator)
    "birth_date",           # Tug‘ilgan sana
    "birth_place",          # Tug‘ilgan joy
    "death_date",           # Vafot sanasi
    "death_place",          # Vafot joyi
    "nationality",          # Millati
    "occupation",           # Kasbi yoki lavozimi
    "years_active",         # Faoliyat yillari
    "known_for",            # Mashhurlikka sabab bo‘lgan ishlari
    "awards",               # Mukofotlar
    "alma_mater",           # O‘qigan universiteti (o‘quv yurti)
    "education",            # Ta’limi
    "employer",             # Ishlagan tashkilotlari
    "notable_works",        # Mashhur asarlari yoki ishlari
    "field",                # Soha (ilmiy, san’at, siyosat va h.k.)    
    "spouse",               # Turmush o‘rtog‘i
    "children",             # Farzandlari
    "parents",              # Ota-onasi
    "religion",             # Dini    
    "genre",                # Janri (san’at yoki adabiyotda)   
]
df_filtered13 = df13[important_columns]
df_filtered13.to_csv(r"C:\Users\Rasulbek907\Desktop\Final_Project\Data\Raw_data\hf_dataset13.csv", index=False, encoding="utf-8")

In [36]:
import requests
import re
import csv
import time

def get_wikipedia_page(title):
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "parse",
        "page": title,
        "format": "json",
        "prop": "wikitext"
    }
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; MyProjectBot/1.0; +https://yourdomain.com)"
    }

    response = requests.get(url, params=params, headers=headers)

    try:
        data = response.json()
    except Exception as e:
        print(f"❌ JSON decode error for {title}: {e}")
        print("Raw response text:", response.text)
        return None

    if 'error' in data:
        print(f"❌ Wikipedia API error for {title}: {data['error']['info']}")
        return None

    return data['parse']['wikitext']['*']

def extract_infobox(wikitext):
    if not wikitext:
        return None
    match = re.search(r"\{\{Infobox(.+?)\n\}\}", wikitext, re.DOTALL)
    if match:
        return match.group(0)
    else:
        match = re.search(r"\{\{Infobox(.+?)\}\}", wikitext, re.DOTALL)
        return match.group(0) if match else None

def extract_attributes(infobox_text):
    if not infobox_text:
        return {}

    attributes = {}
    current_key = None
    current_value_lines = []

    lines = infobox_text.split("\n")
    for line in lines[1:]:  # 1-qator: {{Infobox ... shuning uchun o'tkazamiz
        line = line.strip()
        if line.startswith("|"):
            # Yangi atribut boshlanmoqda, oldingisini saqlaymiz
            if current_key is not None:
                attributes[current_key] = " ".join(current_value_lines).strip()
            parts = line.split("=", 1)
            if len(parts) == 2:
                current_key = parts[0].strip("| ").lower()
                current_value_lines = [parts[1].strip()]
            else:
                current_key = None
                current_value_lines = []
        else:
            # Agar chiziq | bilan boshlanmasa, oldingi key qiymatining davomidir
            if current_key is not None:
                current_value_lines.append(line)

    # Oxirgi atributni qo'shamiz
    if current_key is not None:
        attributes[current_key] = " ".join(current_value_lines).strip()

    return attributes

def save_to_csv(data, filename):
    if not data:
        print("No data to save.")
        return

    keys = set()
    for item in data:
        keys.update(item.keys())
    keys = sorted(list(keys))

    with open(filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=keys)
        writer.writeheader()
        for row in data:
            writer.writerow(row)

    print(f"✅ Dataset '{filename}' faylga saqlandi.")

if __name__ == "__main__":
    persons = [
    "Molly Wright", "Tobias Smiljan", "Zhen Zhang", "Ravi Kumar", "Alia Hassan",
    "Bjorn Svensson", "Carmen Silva", "Diego Alvarez", "Elena Petrova", "Finn O’Connor",
    "Giulia Romano", "Hassan Khalid", "Irene Novak", "Jamal Malik", "Katerina Ivanova",
    "Luca Ferrari", "Marisol Reyes", "Nadia Khan", "Omar Aziz", "Priya Singh",
    "Quinn O’Brien", "Rafael Costa", "Sara Yamada", "Tariq Yusuf", "Ursula Klein",
    "Victor Hugo","Wendy Li", "Xavier Chen", "Yasmin Ali", "Zoe Martin",
    "Arjun Mehta","Beatriz Costa","Caleb Brooks","Danielle Jones","Emilia Novak",
    "Felipe Ortega","Giovanni Rossi","Hana Lee","Ismail Khan","Jade Tran",
    "Kai Nguyen","Lina Petrova","Marco Santoro","Nora Müller","Omar Farouk",
    "Paula González","Quim Puig","Rina Sato","Stefan Müller","Thalia Papadopoulos",
    "Ugo Bianchi","Valentin Popov","Wang Mei","Ximena Rojas","Youssef Said",
    "Zarif Ahmed","Amina Yusuf","Bashir Ali","Cecilia Vega","Diego Fernandez",
    "Elif Demir","Farid Rahman","Gabriela Silva","Hiroshi Tanaka","Ines Pereira",
    "Jonas Schmidt","Kim Seo-yeon","Leonardo Silva","Mariam Hussein","Nikhil Rao",
    "Olivia Brown","Pavel Novák","Rocio Morales","Santiago Diaz","Tatiana Petrova",
    "Usman Ali","Vera Petrovna","Will Kim","Xinyi Wang","Yara Haddad",
    "Zainab Ahmed","Amelia Lopez","Bruna Santos","Carla Moreira","Dimitri Ivanov",
    "Eva Novak","Felix Bauer","Georgina Ruiz","Haruki Yamamoto","Ibrahim Ali",
    "Jasmine Zhang","Khalid Hussain","Lauren Thomas","Matteo Bianchi","Naira Karimova",
    "Oscar Nunez","Paula Silva","Ruben Gómez","Sara Johansson","Takeshi Yamamoto",
    "Ursula Klein","Veronica Ruiz","Walter Johnson","Xia Li","Yuki Kobayashi",
    "Zoe Smith","Alex Johnson","Bella Wang","Chris Martinez","Dana Davis",
    "Ethan Clark","Faith Lee","George White","Hannah Brown","Ian Miller",
    "Julia Wilson","Kevin Anderson","Laura Moore","Michael Taylor","Natalie Lee",
    "Oliver Hall","Penelope Wright","Quentin Lewis","Rachel Evans","Samuel Scott",
    "Teresa Green","Ulysses King","Victoria Adams","Wesley Young","Xenia Bishop",
    "Yvonne Carter","Zachary Davis","Adrian Murphy","Bianca Kelly","Calvin Hill",
    "Diana Rivera","Edward Brooks","Fiona Reed","Gregory Foster","Helena Schwartz",
    "Ivan Petrov","Joanna Allen","Kyle Hudson","Lilian Parker","Marcus King",
    "Natalia Ortiz","Owen Brooks"

]

    dataset = []

    for idx, person in enumerate(persons, 1):
        print(f"{idx}/{len(persons)} 📥 {person} sahifasini yuklanmoqda...")
        wikitext = get_wikipedia_page(person)
        infobox = extract_infobox(wikitext)
        attributes = extract_attributes(infobox)

        if attributes:
            if "name" not in attributes:
                attributes["name"] = person
            dataset.append(attributes)
            print(f"✅ {person} infoboxi muvaffaqiyatli olindi.")
        else:
            print(f"⚠️ {person} sahifasida infobox topilmadi yoki bo'sh.")

        time.sleep(1)  # API limitlarini cheklash uchun

    save_to_csv(dataset, "historical_figures_dataset14.csv")


1/147 📥 Molly Wright sahifasini yuklanmoqda...
⚠️ Molly Wright sahifasida infobox topilmadi yoki bo'sh.
2/147 📥 Tobias Smiljan sahifasini yuklanmoqda...
❌ Wikipedia API error for Tobias Smiljan: The page you specified doesn't exist.
⚠️ Tobias Smiljan sahifasida infobox topilmadi yoki bo'sh.
3/147 📥 Zhen Zhang sahifasini yuklanmoqda...
⚠️ Zhen Zhang sahifasida infobox topilmadi yoki bo'sh.
4/147 📥 Ravi Kumar sahifasini yuklanmoqda...
⚠️ Ravi Kumar sahifasida infobox topilmadi yoki bo'sh.
5/147 📥 Alia Hassan sahifasini yuklanmoqda...
❌ Wikipedia API error for Alia Hassan: The page you specified doesn't exist.
⚠️ Alia Hassan sahifasida infobox topilmadi yoki bo'sh.
6/147 📥 Bjorn Svensson sahifasini yuklanmoqda...
⚠️ Bjorn Svensson sahifasida infobox topilmadi yoki bo'sh.
7/147 📥 Carmen Silva sahifasini yuklanmoqda...
✅ Carmen Silva infoboxi muvaffaqiyatli olindi.
8/147 📥 Diego Alvarez sahifasini yuklanmoqda...
⚠️ Diego Alvarez sahifasida infobox topilmadi yoki bo'sh.
9/147 📥 Elena Petrova

In [84]:
df14 = pd.read_csv(r"C:\Users\Rasulbek907\Desktop\Final_Project\Notebooks\historical_figures_dataset14.csv")
df14.head()
df14.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36 entries, 0 to 35
Columns: 275 entries, 1blankname1 to youthyears2
dtypes: float64(83), object(192)
memory usage: 77.5+ KB


In [85]:
important_columns = [
    "name",                 # Ism (asosiy identifikator)
    "birth_date",           # Tug‘ilgan sana
    "birth_place",          # Tug‘ilgan joy
    "death_date",           # Vafot sanasi
    "death_place",          # Vafot joyi
    "nationality",          # Millati
    "occupation",           # Kasbi yoki lavozimi
    "years_active",         # Faoliyat yillari
    "known_for",            # Mashhurlikka sabab bo‘lgan ishlari
    "awards",               # Mukofotlar
    "alma_mater",           # O‘qigan universiteti (o‘quv yurti)
    "education",            # Ta’limi
    "employer",             # Ishlagan tashkilotlari
    "notable_works",        # Mashhur asarlari yoki ishlari
    "field",                # Soha (ilmiy, san’at, siyosat va h.k.)    
    "spouse",               # Turmush o‘rtog‘i
    "children",             # Farzandlari
    "parents",              # Ota-onasi
    "religion",             # Dini    
    "genre",                # Janri (san’at yoki adabiyotda)   
]
df_filtered14 = df14[important_columns]
df_filtered14.to_csv(r"C:\Users\Rasulbek907\Desktop\Final_Project\Data\Raw_data\hf_dataset14.csv", index=False, encoding="utf-8")

In [37]:
import requests
import re
import csv
import time

def get_wikipedia_page(title):
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "parse",
        "page": title,
        "format": "json",
        "prop": "wikitext"
    }
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; MyProjectBot/1.0; +https://yourdomain.com)"
    }

    response = requests.get(url, params=params, headers=headers)

    try:
        data = response.json()
    except Exception as e:
        print(f"❌ JSON decode error for {title}: {e}")
        print("Raw response text:", response.text)
        return None

    if 'error' in data:
        print(f"❌ Wikipedia API error for {title}: {data['error']['info']}")
        return None

    return data['parse']['wikitext']['*']

def extract_infobox(wikitext):
    if not wikitext:
        return None
    match = re.search(r"\{\{Infobox(.+?)\n\}\}", wikitext, re.DOTALL)
    if match:
        return match.group(0)
    else:
        match = re.search(r"\{\{Infobox(.+?)\}\}", wikitext, re.DOTALL)
        return match.group(0) if match else None

def extract_attributes(infobox_text):
    if not infobox_text:
        return {}

    attributes = {}
    current_key = None
    current_value_lines = []

    lines = infobox_text.split("\n")
    for line in lines[1:]:  # 1-qator: {{Infobox ... shuning uchun o'tkazamiz
        line = line.strip()
        if line.startswith("|"):
            # Yangi atribut boshlanmoqda, oldingisini saqlaymiz
            if current_key is not None:
                attributes[current_key] = " ".join(current_value_lines).strip()
            parts = line.split("=", 1)
            if len(parts) == 2:
                current_key = parts[0].strip("| ").lower()
                current_value_lines = [parts[1].strip()]
            else:
                current_key = None
                current_value_lines = []
        else:
            # Agar chiziq | bilan boshlanmasa, oldingi key qiymatining davomidir
            if current_key is not None:
                current_value_lines.append(line)

    # Oxirgi atributni qo'shamiz
    if current_key is not None:
        attributes[current_key] = " ".join(current_value_lines).strip()

    return attributes

def save_to_csv(data, filename):
    if not data:
        print("No data to save.")
        return

    keys = set()
    for item in data:
        keys.update(item.keys())
    keys = sorted(list(keys))

    with open(filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=keys)
        writer.writeheader()
        for row in data:
            writer.writerow(row)

    print(f"✅ Dataset '{filename}' faylga saqlandi.")

if __name__ == "__main__":
    persons = [
    "Carlos Vega","Daniel Dias","Elena Kim","Fernando Silva","Gabrielle Chen",
    "Hassan Al‑Amir","Isabella Rossi","Javier Morales","Ksenia Petrova","Lars Svensson",
    "Marta Kowalska","Nicolas Dubois","Oksana Ivanova","Pablo Morales","Renée Lefèvre",
    "Samuel Cohen","Tatiana Ivanova","Umar Khan","Valeria Torres","Wyatt Miller",
    "Ximena Martínez","Youssef El‑Sayed","Zainab Khan","Abel Cruz","Bianca Rossi",
    "Cesar Silva","Dalia Petrova","Elias Andersson","Fatima Zahra","Giorgio Lombardi",
    "Helena Petrova","Ismail Elamin","Jade Nguyen","Karl Johansson","Leila Haddad",
    "Manuel García","Nina Ivanova","Oliver Svensson","Pedro Castillo","Qamar Ali",
    "Rita Gomes","Stefan Müller","Thomas Müller","Ulf Andersson","Veronica Ruiz",
    "William Brown","Xavier Lopez","Yani Zhang","Zara Ahmed","Aamir Khan",
    "Bruno Silva","Cristina Fernandes","Diego Santos","Eduardo Pereira","Fabiana Ribeiro",
    "Giulia Ferrari","Haruka Sato","Ibrahim Hassan","Janelle Smith","Kristof Kovacs",
    "Laura Martinez","Miguel Rodriguez","Noah Anderson","Olga Sergeeva","Patricia Alvarez",
    "Quincy Lee","Renata Souza","Silvia Moreno","Thiago Silva","Ulrich Müller",
    "Valentina Russo","Walter Schmidt","Xavier Chen","Yara García","Zoe Smith",
    "Aisha Ibrahim","Boris Petrov","Clara Müller","David López","Emma Novak",
    "Felipe Díaz","Guadalupe Cruz","Hiro Tanaka","Ivy Chen","Jack Wilson",
    "Kaitlyn Adams","Leonardo Silva","María González","Nikita Ivanov","Olga Petrova",
    "Paola Ferrari","Quinn Thompson","Reina Ortiz","Selena Gomez","Tobias Müller",
    "Ulrike Schmidt","Valentin Popov","Winston Jones","Xia Zhou","Yuki Yamamoto",
    "Zoey Brown","Amir Ali","Bella Martinez","Cindy Huang","Diego Luna",
    "Elena Petrova","Farhan Masood","Gita Patel","Hao Nguyen","Ingrid Olsen",
    "James Brown","Kara Johnson","Luis Fernández","Maria Silva","Noor Khan",
    "Oleg Smirnov","Priya Sharma","Quentin Gray","Raja Singh","Sofia Hernandez",
    "Taylor Smith","Ulises Martinez","Vivian Chen","William Clark","Xiang Li"


]

    dataset = []

    for idx, person in enumerate(persons, 1):
        print(f"{idx}/{len(persons)} 📥 {person} sahifasini yuklanmoqda...")
        wikitext = get_wikipedia_page(person)
        infobox = extract_infobox(wikitext)
        attributes = extract_attributes(infobox)

        if attributes:
            if "name" not in attributes:
                attributes["name"] = person
            dataset.append(attributes)
            print(f"✅ {person} infoboxi muvaffaqiyatli olindi.")
        else:
            print(f"⚠️ {person} sahifasida infobox topilmadi yoki bo'sh.")

        time.sleep(1)  # API limitlarini cheklash uchun

    save_to_csv(dataset, "historical_figures_dataset15.csv")


1/125 📥 Carlos Vega sahifasini yuklanmoqda...
⚠️ Carlos Vega sahifasida infobox topilmadi yoki bo'sh.
2/125 📥 Daniel Dias sahifasini yuklanmoqda...
✅ Daniel Dias infoboxi muvaffaqiyatli olindi.
3/125 📥 Elena Kim sahifasini yuklanmoqda...
❌ Wikipedia API error for Elena Kim: The page you specified doesn't exist.
⚠️ Elena Kim sahifasida infobox topilmadi yoki bo'sh.
4/125 📥 Fernando Silva sahifasini yuklanmoqda...
⚠️ Fernando Silva sahifasida infobox topilmadi yoki bo'sh.
5/125 📥 Gabrielle Chen sahifasini yuklanmoqda...
❌ Wikipedia API error for Gabrielle Chen: The page you specified doesn't exist.
⚠️ Gabrielle Chen sahifasida infobox topilmadi yoki bo'sh.
6/125 📥 Hassan Al‑Amir sahifasini yuklanmoqda...
❌ Wikipedia API error for Hassan Al‑Amir: The page you specified doesn't exist.
⚠️ Hassan Al‑Amir sahifasida infobox topilmadi yoki bo'sh.
7/125 📥 Isabella Rossi sahifasini yuklanmoqda...
❌ Wikipedia API error for Isabella Rossi: The page you specified doesn't exist.
⚠️ Isabella Rossi sa

In [86]:
df15 = pd.read_csv(r"C:\Users\Rasulbek907\Desktop\Final_Project\Notebooks\historical_figures_dataset15.csv")
df15.head()
df15.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29 entries, 0 to 28
Columns: 215 entries, alias to youthyears3
dtypes: float64(44), object(171)
memory usage: 48.8+ KB


In [97]:
important_columns = [
    "name",                 # Ism (asosiy identifikator)
    "birth_date",           # Tug‘ilgan sana
    "birth_place",          # Tug‘ilgan joy
    "death_date",           # Vafot sanasi
    "death_place",          # Vafot joyi
    "nationality",          # Millati
    "occupation",           # Kasbi yoki lavozimi
    "years_active",         # Faoliyat yillari
    "known_for",            # Mashhurlikka sabab bo‘lgan ishlari
    "awards",               # Mukofotlar
    "alma_mater",           # O‘qigan universiteti (o‘quv yurti)
    "education",            # Ta’limi    
    "notable_works",        # Mashhur asarlari yoki ishlari
    "field",                # Soha (ilmiy, san’at, siyosat va h.k.)    
    "spouse",               # Turmush o‘rtog‘i
    "children"             # Farzandlari       
]
df_filtered15 = df15[important_columns]
df_filtered15.to_csv(r"C:\Users\Rasulbek907\Desktop\Final_Project\Data\Raw_data\hf_dataset15.csv", index=False, encoding="utf-8")

In [38]:
import requests
import re
import csv
import time

def get_wikipedia_page(title):
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "parse",
        "page": title,
        "format": "json",
        "prop": "wikitext"
    }
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; MyProjectBot/1.0; +https://yourdomain.com)"
    }

    response = requests.get(url, params=params, headers=headers)

    try:
        data = response.json()
    except Exception as e:
        print(f"❌ JSON decode error for {title}: {e}")
        print("Raw response text:", response.text)
        return None

    if 'error' in data:
        print(f"❌ Wikipedia API error for {title}: {data['error']['info']}")
        return None

    return data['parse']['wikitext']['*']

def extract_infobox(wikitext):
    if not wikitext:
        return None
    match = re.search(r"\{\{Infobox(.+?)\n\}\}", wikitext, re.DOTALL)
    if match:
        return match.group(0)
    else:
        match = re.search(r"\{\{Infobox(.+?)\}\}", wikitext, re.DOTALL)
        return match.group(0) if match else None

def extract_attributes(infobox_text):
    if not infobox_text:
        return {}

    attributes = {}
    current_key = None
    current_value_lines = []

    lines = infobox_text.split("\n")
    for line in lines[1:]:  # 1-qator: {{Infobox ... shuning uchun o'tkazamiz
        line = line.strip()
        if line.startswith("|"):
            # Yangi atribut boshlanmoqda, oldingisini saqlaymiz
            if current_key is not None:
                attributes[current_key] = " ".join(current_value_lines).strip()
            parts = line.split("=", 1)
            if len(parts) == 2:
                current_key = parts[0].strip("| ").lower()
                current_value_lines = [parts[1].strip()]
            else:
                current_key = None
                current_value_lines = []
        else:
            # Agar chiziq | bilan boshlanmasa, oldingi key qiymatining davomidir
            if current_key is not None:
                current_value_lines.append(line)

    # Oxirgi atributni qo'shamiz
    if current_key is not None:
        attributes[current_key] = " ".join(current_value_lines).strip()

    return attributes

def save_to_csv(data, filename):
    if not data:
        print("No data to save.")
        return

    keys = set()
    for item in data:
        keys.update(item.keys())
    keys = sorted(list(keys))

    with open(filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=keys)
        writer.writeheader()
        for row in data:
            writer.writerow(row)

    print(f"✅ Dataset '{filename}' faylga saqlandi.")

if __name__ == "__main__":
    persons = [
    "Aaron Johnson","Bethany Clark","Chloe Davis","Dylan Wilson","Ella Martinez",
    "Frederick Brown","Gloria Lewis","Harrison Turner","Isla Robinson","Jacob White",
    "Keira Moore","Leo Bailey","Maya Rodriguez","Nathan Campbell","Olivia Green",
    "Parker Adams","Quinn Lewis","Riley Walker","Sophia Hill","Tyler Hall",
    "Uma Brooks","Violet Young","Wyatt King","Xander Lee","Yara Torres",
    "Zane Scott","Aria Davis","Brady Harris","Carter White","Delilah Thomas",
    "Evan Lee","Faith Young","Grayson Brown","Hailey Smith","Ian Walker",
    "Jade Turner","Kylian Lewis","Luna Clark","Milo Reed","Nova Martinez",
    "Owen Wright","Paige Moore","Ryder Johnson","Stella Brown","Trent Hall",
    "Uma Green","Vienna Adams","Willow Young","Xander Scott","Yasmine Lewis",
    "Zoe Clark","Axel Davis","Brooke Harris","Colin White","Daniel Green",
    "Eliza King","Finn Young","Grace Brooks","Henry Smith","Isabel Moore",
    "Jack Turner","Kara Brown","Landon Davis","Mia Lee","Noah Wright",
    "Olivia Hill","Parker White","Quinn Young","Ruby Clark","Samuel Scott",
    "Tessa Adams","Ulysses Brown","Vivian Johnson","Wesley Clark","Xavier Davis",
    "Yvonne White","Zachary Young","Amber Brown","Bryce Lewis","Crystal Clark",
    "Damian White","Eden Davis","Freya Moore","Gavin Brown","Hazel Clark",
    "Ian Young","Jasmine Scott","Kaden Adams","Layla Smith","Miles Johnson",
    "Nina Clark","Orion Davis","Peyton Young","Quincy Hill","Rosie Moore",
    "Sebastian Brown","Tara White","Uriah Green","Valentina Davis","Weston Brown",
    "Xenia Moore","Yara Clark","Zayne Johnson","Aria Hill","Blake White",
    "Cecilia Moore","Dylan Brown","Elodie Clark","Felix Davis","Grace White",
    "Hudson Adams","Iris Young","Jace Brown","Kylie Clark","Luke Green",
    "Madison Moore","Nolan Brown","Olivia Clark","Phoenix Davis","Quinn White",
    "Riley Young","Scarlett Hill","Thomas Brown","Uma Clark","Violet Moore"


]

    dataset = []

    for idx, person in enumerate(persons, 1):
        print(f"{idx}/{len(persons)} 📥 {person} sahifasini yuklanmoqda...")
        wikitext = get_wikipedia_page(person)
        infobox = extract_infobox(wikitext)
        attributes = extract_attributes(infobox)

        if attributes:
            if "name" not in attributes:
                attributes["name"] = person
            dataset.append(attributes)
            print(f"✅ {person} infoboxi muvaffaqiyatli olindi.")
        else:
            print(f"⚠️ {person} sahifasida infobox topilmadi yoki bo'sh.")

        time.sleep(1)  # API limitlarini cheklash uchun

    save_to_csv(dataset, "historical_figures_dataset16.csv")


1/125 📥 Aaron Johnson sahifasini yuklanmoqda...
⚠️ Aaron Johnson sahifasida infobox topilmadi yoki bo'sh.
2/125 📥 Bethany Clark sahifasini yuklanmoqda...
❌ Wikipedia API error for Bethany Clark: The page you specified doesn't exist.
⚠️ Bethany Clark sahifasida infobox topilmadi yoki bo'sh.
3/125 📥 Chloe Davis sahifasini yuklanmoqda...
❌ Wikipedia API error for Chloe Davis: The page you specified doesn't exist.
⚠️ Chloe Davis sahifasida infobox topilmadi yoki bo'sh.
4/125 📥 Dylan Wilson sahifasini yuklanmoqda...
⚠️ Dylan Wilson sahifasida infobox topilmadi yoki bo'sh.
5/125 📥 Ella Martinez sahifasini yuklanmoqda...
❌ Wikipedia API error for Ella Martinez: The page you specified doesn't exist.
⚠️ Ella Martinez sahifasida infobox topilmadi yoki bo'sh.
6/125 📥 Frederick Brown sahifasini yuklanmoqda...
⚠️ Frederick Brown sahifasida infobox topilmadi yoki bo'sh.
7/125 📥 Gloria Lewis sahifasini yuklanmoqda...
⚠️ Gloria Lewis sahifasida infobox topilmadi yoki bo'sh.
8/125 📥 Harrison Turner sah

In [88]:
df16 = pd.read_csv(r"C:\Users\Rasulbek907\Desktop\Final_Project\Notebooks\historical_figures_dataset16.csv")
df16.head()
df16.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16 entries, 0 to 15
Columns: 223 entries, 100s/50s1 to youthyears1
dtypes: float64(102), object(121)
memory usage: 28.0+ KB


In [96]:
important_columns = [
    "name",                 # Ism (asosiy identifikator)
    "birth_date",           # Tug‘ilgan sana
    "birth_place",          # Tug‘ilgan joy
    "death_date",           # Vafot sanasi
    "death_place",          # Vafot joyi
    "nationality",          # Millati
    "occupation",           # Kasbi yoki lavozimi
    "years_active",         # Faoliyat yillari
    "known_for",            # Mashhurlikka sabab bo‘lgan ishlari
    "awards",               # Mukofotlar
    "alma_mater",           # O‘qigan universiteti (o‘quv yurti)
    "education",            # Ta’limi    
    "field",                # Soha (ilmiy, san’at, siyosat va h.k.)    
    "spouse",               # Turmush o‘rtog‘i
    "children",             # Farzandlari   
    "genre"               # Janri (san’at yoki adabiyotda)   
]
df_filtered16 = df16[important_columns]
df_filtered16.to_csv(r"C:\Users\Rasulbek907\Desktop\Final_Project\Data\Raw_data\hf_dataset16.csv", index=False, encoding="utf-8")

In [39]:
import requests
import re
import csv
import time

def get_wikipedia_page(title):
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "parse",
        "page": title,
        "format": "json",
        "prop": "wikitext"
    }
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; MyProjectBot/1.0; +https://yourdomain.com)"
    }

    response = requests.get(url, params=params, headers=headers)

    try:
        data = response.json()
    except Exception as e:
        print(f"❌ JSON decode error for {title}: {e}")
        print("Raw response text:", response.text)
        return None

    if 'error' in data:
        print(f"❌ Wikipedia API error for {title}: {data['error']['info']}")
        return None

    return data['parse']['wikitext']['*']

def extract_infobox(wikitext):
    if not wikitext:
        return None
    match = re.search(r"\{\{Infobox(.+?)\n\}\}", wikitext, re.DOTALL)
    if match:
        return match.group(0)
    else:
        match = re.search(r"\{\{Infobox(.+?)\}\}", wikitext, re.DOTALL)
        return match.group(0) if match else None

def extract_attributes(infobox_text):
    if not infobox_text:
        return {}

    attributes = {}
    current_key = None
    current_value_lines = []

    lines = infobox_text.split("\n")
    for line in lines[1:]:  # 1-qator: {{Infobox ... shuning uchun o'tkazamiz
        line = line.strip()
        if line.startswith("|"):
            # Yangi atribut boshlanmoqda, oldingisini saqlaymiz
            if current_key is not None:
                attributes[current_key] = " ".join(current_value_lines).strip()
            parts = line.split("=", 1)
            if len(parts) == 2:
                current_key = parts[0].strip("| ").lower()
                current_value_lines = [parts[1].strip()]
            else:
                current_key = None
                current_value_lines = []
        else:
            # Agar chiziq | bilan boshlanmasa, oldingi key qiymatining davomidir
            if current_key is not None:
                current_value_lines.append(line)

    # Oxirgi atributni qo'shamiz
    if current_key is not None:
        attributes[current_key] = " ".join(current_value_lines).strip()

    return attributes

def save_to_csv(data, filename):
    if not data:
        print("No data to save.")
        return

    keys = set()
    for item in data:
        keys.update(item.keys())
    keys = sorted(list(keys))

    with open(filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=keys)
        writer.writeheader()
        for row in data:
            writer.writerow(row)

    print(f"✅ Dataset '{filename}' faylga saqlandi.")

if __name__ == "__main__":
    persons = [
    "Alisher Navoiy", "Abdulla Qodiriy", "Erkin Vohidov", "Cholpon", "Gafur G'ulom",
    "Abdulla Oripov", "Mirzo Ulug'bek", "Babur", "Islom Karimov", "Shavkat Mirziyoyev",
    "Oybek", "Abdurahmon Jomiy", "Sadriddin Ayniy", "Muhammad Yusuf", "Muhammad al-Xorazmiy",
    "Zulfiya", "Bahrom Rahmonov", "Usmon Nosir", "Mirtemir", "Muhammad Ali",
    "Olimjon", "Xurshid Davron", "Jalol Ikromiy", "Husayn Boyqaro", "Rustam Inoyatov",
    "Davron Ergashev", "Dilshodbek Ruzmetov", "Abduqahhor Hojiakbarov", "Sanjar Tursunov", "Jasur Hasanov",
    "Shuhrat Mirkholdiriy", "Oybek Xudoyberdiyev", "Sardor Rashidov", "Odil Ahmedov", "Jahon Tursunov",
    "Abdusalom Abdullayev", "Nigina Amonkulova", "Munisa Rizayeva", "Yulduz Usmonova", "Sevara Nazarkhan",
    "Dilorom Soatova", "Lola Yuldasheva", "Farrux Zokirov", "Botir Qodirov", "Ulug'bek Rahmatullayev",
    "Rustam Mamatkulov", "Azizbek Haydarov", "Maksim Shatskix", "Mirjalol Qosimov", "Jaloliddin Masharipov",
    "Sanjar Xomidov", "Ilhom Mo'minjonov", "Abbosbek Makhstaliev", "Odiljon Hamrobekov", "Dostonbek Khamdamov",
    "Sherzodbek Safarov", "Islom Tuhtakov", "Kamoliddin Murzoev", "Otabek Shukurov", "Bekzod Abdurahmonov",
    "Anvarjon Soliev", "Rustam Komilov", "Anvarjon Nazarov", "Dilshod Vasiyev", "Shavkat Salomov",
    "Azamat Abduraimov", "Ulugbek Bakayev", "Alibek Davronov", "Rustam Zabirov", "Oybek Kilichev",
    "Doston Ibragimov", "Sardor Jumaev", "Ibrohim Yusupov", "Sherzod Tursunov", "Mirjalol Qosimov Jr",
    "Akmal Shorakhmedov", "Bekzod Vasiyev", "Odil Ahmedov", "Islom Tukhtahujaev", "Sherzodbek Ziyoev",
    "Nodirbek Abdusattorov", "Jasur Mirzayev", "Umidjon Azimov", "Shahzodbek Nurmatov", "Davron Kholmurodov",
    "Rustamjon Ashurmatov", "Anvarjon Begmatov", "Oybek Toshpulatov", "Farruh Juraev", "Azizbek Turgunov",
    "Azamat Ishmuradov", "Bekzod Sattorov", "Rustam Ashurmatov", "Sherzodbek Yusufov", "Muhammadkhuja Djalilov",
    "Akmal Ziyoev", "Jahongir Ergashev", "Shakhzodbek Muminov", "Zokir Almatov", "Bekzod Rakhmonov",
    "Shakhboz Ergashev", "Sanjar Shaimardonov", "Ravshan Ermatov", "Jahongir Abdullaev", "Bekzod Gulyamov",
    "Mavlon Nazarov", "Islom Kobilov", "Jahongir Jalolov", "Nuriddin Akramov", "Temur Juraev",
    "Dostonbek Tursunov", "Otabek Shukurov", "Nematjon Tokhtakhodjaev", "Akmal Shokirov", "Mirjalol Qosimov",
    "Ulugbek Baqoev", "Rustamjon Ashurmatov", "Dilmurod Nazarov", "Bekzod Vasiyev", "Jahongir Mirzayev",
    "Sardor Jumaev", "Odiljon Hamrobekov", "Alisher Mirzo", "Dostonbek Khamdamov", "Sherzodbek Ziyoev",
    "Shavkat Mirziyoyev", "Abdulla Aripov", "Rustam Azimov", "Erkin Halilov", "Abdukahhor Tojiboev",
    "Fazliddin G'aniyev", "Islom Karimov", "Shavkat Mirziyoyev", "Rustam Azimov", "Nigmatilla Yuldashev",
    "Abdulla Aripov", "Mirziyoyev", "Sodiq Safoyev", "Erkin Ganiyev", "Abdulla Aripov",
    "Rustam Azimov", "Shavkat Mirziyoyev", "Zoir Mirzayev", "Alisher Sa'dullayev", "Javlon Vahabov",
    "Rustam Azimov", "Ravshanbek Tursunov", "Bekzod Qodirov", "Sherzodbek Shamsiev", "Oybek Sa'dullayev",
    "Islom Karimov", "Abdulla Oripov", "Muhammad Yusuf", "Erkin Vohidov", "Mirtemir",
    "Rustamjon Ashurmatov", "Akmal Shorakhmedov", "Odil Ahmedov", "Sanjar Tursunov", "Jasur Hasanov",
    "Ulugbek Bakayev", "Alibek Davronov", "Maksim Shatskix", "Sardor Jumaev", "Davron Ergashev",
    "Shuhrat Mirkholdiriy", "Sevara Nazarkhan", "Munisa Rizayeva", "Yulduz Usmonova", "Nigina Amonkulova",
    "Dilorom Soatova", "Lola Yuldasheva", "Farrux Zokirov", "Botir Qodirov", "Oybek Xudoyberdiyev",
    "Olimjon", "Xurshid Davron", "Jalol Ikromiy", "Husayn Boyqaro", "Rustam Inoyatov",
    "Mirzo Ulug'bek", "Babur", "Alisher Navoiy", "Abdulla Qodiriy", "Cholpon",
    "Gafur G'ulom", "Abdulla Oripov", "Sadriddin Ayniy", "Muhammad Yusuf", "Muhammad al-Xorazmiy",
    "Zulfiya", "Bahrom Rahmonov", "Usmon Nosir", "Mirtemir", "Muhammad Ali",
    "Olimjon", "Xurshid Davron", "Jalol Ikromiy", "Husayn Boyqaro", "Rustam Inoyatov",
    "Davron Ergashev", "Dilshodbek Ruzmetov", "Abduqahhor Hojiakbarov", "Sanjar Tursunov", "Jasur Hasanov",
    "Shuhrat Mirkholdiriy", "Oybek Xudoyberdiyev", "Sardor Rashidov", "Odil Ahmedov", "Jahon Tursunov",
    "Abdusalom Abdullayev", "Nigina Amonkulova", "Munisa Rizayeva", "Yulduz Usmonova", "Sevara Nazarkhan",
    "Dilorom Soatova", "Lola Yuldasheva", "Farrux Zokirov", "Botir Qodirov", "Ulug'bek Rahmatullayev",
    "Rustam Mamatkulov", "Azizbek Haydarov", "Maksim Shatskix", "Mirjalol Qosimov", "Jaloliddin Masharipov",
    "Sanjar Xomidov", "Ilhom Mo'minjonov", "Abbosbek Makhstaliev", "Odiljon Hamrobekov", "Dostonbek Khamdamov",
    "Sherzodbek Safarov", "Islom Tuhtakov", "Kamoliddin Murzoev", "Otabek Shukurov", "Bekzod Abdurahmonov"


]

    dataset = []

    for idx, person in enumerate(persons, 1):
        print(f"{idx}/{len(persons)} 📥 {person} sahifasini yuklanmoqda...")
        wikitext = get_wikipedia_page(person)
        infobox = extract_infobox(wikitext)
        attributes = extract_attributes(infobox)

        if attributes:
            if "name" not in attributes:
                attributes["name"] = person
            dataset.append(attributes)
            print(f"✅ {person} infoboxi muvaffaqiyatli olindi.")
        else:
            print(f"⚠️ {person} sahifasida infobox topilmadi yoki bo'sh.")

        time.sleep(1)  # API limitlarini cheklash uchun

    save_to_csv(dataset, "historical_figures_dataset17.csv")


1/235 📥 Alisher Navoiy sahifasini yuklanmoqda...
⚠️ Alisher Navoiy sahifasida infobox topilmadi yoki bo'sh.
2/235 📥 Abdulla Qodiriy sahifasini yuklanmoqda...
✅ Abdulla Qodiriy infoboxi muvaffaqiyatli olindi.
3/235 📥 Erkin Vohidov sahifasini yuklanmoqda...
✅ Erkin Vohidov infoboxi muvaffaqiyatli olindi.
4/235 📥 Cholpon sahifasini yuklanmoqda...
⚠️ Cholpon sahifasida infobox topilmadi yoki bo'sh.
5/235 📥 Gafur G'ulom sahifasini yuklanmoqda...
❌ Wikipedia API error for Gafur G'ulom: The page you specified doesn't exist.
⚠️ Gafur G'ulom sahifasida infobox topilmadi yoki bo'sh.
6/235 📥 Abdulla Oripov sahifasini yuklanmoqda...
⚠️ Abdulla Oripov sahifasida infobox topilmadi yoki bo'sh.
7/235 📥 Mirzo Ulug'bek sahifasini yuklanmoqda...
⚠️ Mirzo Ulug'bek sahifasida infobox topilmadi yoki bo'sh.
8/235 📥 Babur sahifasini yuklanmoqda...
✅ Babur infoboxi muvaffaqiyatli olindi.
9/235 📥 Islom Karimov sahifasini yuklanmoqda...
⚠️ Islom Karimov sahifasida infobox topilmadi yoki bo'sh.
10/235 📥 Shavkat M

In [90]:
df17 = pd.read_csv(r"C:\Users\Rasulbek907\Desktop\Final_Project\Notebooks\historical_figures_dataset17.csv")
df17.head()
df17.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88 entries, 0 to 87
Columns: 239 entries, 1blankname to youthyears4
dtypes: float64(71), object(168)
memory usage: 164.4+ KB


In [95]:
important_columns = [
    "name",                 # Ism (asosiy identifikator)
    "birth_date",           # Tug‘ilgan sana
    "birth_place",          # Tug‘ilgan joy
    "death_date",           # Vafot sanasi
    "death_place",          # Vafot joyi
    "nationality",          # Millati
    "occupation",           # Kasbi yoki lavozimi
    "years_active",         # Faoliyat yillari    
    "awards",               # Mukofotlar
    "alma_mater",           # O‘qigan universiteti (o‘quv yurti)    
    "spouse",               # Turmush o‘rtog‘i
    "children",             # Farzandlari    
    "genre"                # Janri (san’at yoki adabiyotda)   
]
df_filtered17 = df17[important_columns]
df_filtered17.to_csv(r"C:\Users\Rasulbek907\Desktop\Final_Project\Data\Raw_data\hf_dataset17.csv", index=False, encoding="utf-8")

In [40]:
import requests
import re
import csv
import time

def get_wikipedia_page(title):
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "parse",
        "page": title,
        "format": "json",
        "prop": "wikitext"
    }
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; MyProjectBot/1.0; +https://yourdomain.com)"
    }

    response = requests.get(url, params=params, headers=headers)

    try:
        data = response.json()
    except Exception as e:
        print(f"❌ JSON decode error for {title}: {e}")
        print("Raw response text:", response.text)
        return None

    if 'error' in data:
        print(f"❌ Wikipedia API error for {title}: {data['error']['info']}")
        return None

    return data['parse']['wikitext']['*']

def extract_infobox(wikitext):
    if not wikitext:
        return None
    match = re.search(r"\{\{Infobox(.+?)\n\}\}", wikitext, re.DOTALL)
    if match:
        return match.group(0)
    else:
        match = re.search(r"\{\{Infobox(.+?)\}\}", wikitext, re.DOTALL)
        return match.group(0) if match else None

def extract_attributes(infobox_text):
    if not infobox_text:
        return {}

    attributes = {}
    current_key = None
    current_value_lines = []

    lines = infobox_text.split("\n")
    for line in lines[1:]:  # 1-qator: {{Infobox ... shuning uchun o'tkazamiz
        line = line.strip()
        if line.startswith("|"):
            # Yangi atribut boshlanmoqda, oldingisini saqlaymiz
            if current_key is not None:
                attributes[current_key] = " ".join(current_value_lines).strip()
            parts = line.split("=", 1)
            if len(parts) == 2:
                current_key = parts[0].strip("| ").lower()
                current_value_lines = [parts[1].strip()]
            else:
                current_key = None
                current_value_lines = []
        else:
            # Agar chiziq | bilan boshlanmasa, oldingi key qiymatining davomidir
            if current_key is not None:
                current_value_lines.append(line)

    # Oxirgi atributni qo'shamiz
    if current_key is not None:
        attributes[current_key] = " ".join(current_value_lines).strip()

    return attributes

def save_to_csv(data, filename):
    if not data:
        print("No data to save.")
        return

    keys = set()
    for item in data:
        keys.update(item.keys())
    keys = sorted(list(keys))

    with open(filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=keys)
        writer.writeheader()
        for row in data:
            writer.writerow(row)

    print(f"✅ Dataset '{filename}' faylga saqlandi.")

if __name__ == "__main__":
    persons = [
    "Al-Biruniy", "Ibn Sino", "Al-Khorezmi", "Al-Farghani", "Ulug'bek",
    "Abu Rayhon Beruniy", "Avicenna", "Abu Ali ibn Sino", "Al-Maqdisi", "Al-Masudi",
    "Ibn al-Nadim", "Al-Jahiz", "Ibn al-Haytham", "Ibn Khaldun", "Al-Tabari",
    "Al-Kindi", "Al-Razi", "Ibn Rushd", "Al-Zahrawi", "Ibn al-Baitar",
    "Al-Masihi", "Al-Battani", "Al-Idrisi", "Ibn al-Shatir", "Nasir al-Din al-Tusi",
    "Omar Khayyam", "Al-Tusi", "Al-Qushji", "Al-Shirazi", "Ibn Majid",
    "Ibn al-Jazzar", "Al-Maturidi", "Al-Bukhari", "Ibn Hajar al-Asqalani", "Al-Ghazali",
    "Al-Farabi", "Ibn Rushd", "Ibn Arabi", "Ibn Tufail", "Ibn Battuta",
    "Ibn Sina", "Al-Khwarizmi", "Muhammad al-Khwarizmi", "Al-Zamakhshari", "Al-Nafis",
    "Abu al-Qasim al-Zahrawi", "Al-Tamimi", "Al-Kashani", "Al-Sijzi", "Ibn Yunus",
    "Ibn al-Awwam", "Al-Rashid", "Al-Sufi", "Al-Khazini", "Al-Baghdadi",
    "Al-Kashi", "Al-Majriti", "Ibn Juljul", "Al-Hakim al-Tirmidhi", "Ibn Abi Usaybi'a",
    "Al-Qushayri", "Al-Samarqandi", "Al-Maqdisi", "Ibn Zaydun", "Al-Farghani",
    "Al-Nisaburi", "Al-Shahrastani", "Ibn al-Muqaffa", "Al-Hasan al-Basri", "Ibn al-Nafis",
    "Ibn al-Nafis", "Al-Tamimi", "Al-Khwarizmi", "Ibn al-Jawzi", "Ibn al-Nadim",
    "Al-Muqaddasi", "Ibn al-Nafis", "Al-Balkhi", "Al-Razi", "Ibn Sina",
    "Al-Farabi", "Al-Tusi", "Al-Khwarizmi", "Al-Masudi", "Ibn Rushd",
    "Al-Biruni", "Ibn al-Haytham", "Al-Khazini", "Nasir al-Din al-Tusi", "Al-Idrisi",
    "Ibn Battuta", "Ibn Khaldun", "Al-Jahiz", "Ibn Sina", "Al-Farabi",
    "Al-Ma'arri", "Al-Tabari", "Al-Khwarizmi", "Ibn Rushd", "Al-Ghazali",
    "Ibn al-Nafis", "Al-Kindi", "Ibn Arabi", "Al-Sufi", "Ibn al-Awwam",
    "Al-Kashi", "Al-Samarqandi", "Ibn Juljul", "Al-Majriti", "Ibn Abi Usaybi'a",
    "Al-Qushayri", "Al-Nisaburi", "Al-Shahrastani", "Al-Tamimi", "Ibn al-Muqaffa",
    "Al-Hasan al-Basri", "Ibn Zaydun", "Al-Farghani", "Ibn Yunus", "Ibn al-Jazzar",
    "Ibn al-Shatir", "Al-Baghdadi", "Al-Kashani", "Al-Maturidi", "Ibn Hajar al-Asqalani",
    "Ibn Abi Usaybi'a", "Al-Rashid", "Al-Sufi", "Ibn al-Jawzi", "Al-Muqaddasi",
    "Al-Balkhi", "Al-Masihi", "Al-Zamakhshari", "Al-Nafis", "Ibn Khaldun",
    "Nasir al-Din al-Tusi", "Omar Khayyam", "Al-Tusi", "Al-Qushji", "Al-Shirazi",
    "Ibn Majid", "Abu Rayhon Beruniy", "Avicenna", "Al-Khorezmi", "Al-Farghani",
    "Ulug'bek", "Ibn Sina", "Al-Maqdisi", "Al-Masudi", "Ibn al-Nadim",
    "Al-Jahiz", "Ibn al-Haytham", "Ibn Khaldun", "Al-Tabari", "Al-Kindi",
    "Al-Razi", "Ibn Rushd", "Al-Zahrawi", "Ibn al-Baitar", "Al-Masihi",
    "Al-Battani", "Al-Idrisi", "Ibn al-Shatir", "Nasir al-Din al-Tusi", "Omar Khayyam",
    "Al-Tusi", "Al-Qushji", "Al-Shirazi", "Ibn Majid", "Ibn al-Jazzar",
    "Al-Maturidi", "Al-Bukhari", "Ibn Hajar al-Asqalani", "Al-Ghazali", "Al-Farabi",
    "Ibn Rushd", "Ibn Arabi", "Ibn Tufail", "Ibn Battuta", "Ibn Sina",
    "Al-Khwarizmi", "Muhammad al-Khwarizmi", "Al-Zamakhshari", "Al-Nafis", "Abu al-Qasim al-Zahrawi",
    "Al-Tamimi", "Al-Kashani", "Al-Sijzi", "Ibn Yunus", "Ibn al-Awwam",
    "Al-Rashid", "Al-Sufi", "Al-Khazini", "Al-Baghdadi", "Al-Kashi",
    "Al-Majriti", "Ibn Juljul", "Al-Hakim al-Tirmidhi", "Ibn Abi Usaybi'a", "Al-Qushayri",
    "Al-Samarqandi", "Al-Maqdisi", "Ibn Zaydun", "Al-Farghani", "Al-Nisaburi",
    "Al-Shahrastani", "Ibn al-Muqaffa", "Al-Hasan al-Basri", "Ibn al-Nafis", "Ibn al-Nafis",
    "Al-Tamimi", "Al-Khwarizmi", "Ibn al-Jawzi", "Ibn al-Nadim", "Al-Muqaddasi"

]

    dataset = []

    for idx, person in enumerate(persons, 1):
        print(f"{idx}/{len(persons)} 📥 {person} sahifasini yuklanmoqda...")
        wikitext = get_wikipedia_page(person)
        infobox = extract_infobox(wikitext)
        attributes = extract_attributes(infobox)

        if attributes:
            if "name" not in attributes:
                attributes["name"] = person
            dataset.append(attributes)
            print(f"✅ {person} infoboxi muvaffaqiyatli olindi.")
        else:
            print(f"⚠️ {person} sahifasida infobox topilmadi yoki bo'sh.")

        time.sleep(1)  

    save_to_csv(dataset, "historical_figures_dataset18.csv")


1/215 📥 Al-Biruniy sahifasini yuklanmoqda...
❌ Wikipedia API error for Al-Biruniy: The page you specified doesn't exist.
⚠️ Al-Biruniy sahifasida infobox topilmadi yoki bo'sh.
2/215 📥 Ibn Sino sahifasini yuklanmoqda...
❌ Wikipedia API error for Ibn Sino: The page you specified doesn't exist.
⚠️ Ibn Sino sahifasida infobox topilmadi yoki bo'sh.
3/215 📥 Al-Khorezmi sahifasini yuklanmoqda...
❌ Wikipedia API error for Al-Khorezmi: The page you specified doesn't exist.
⚠️ Al-Khorezmi sahifasida infobox topilmadi yoki bo'sh.
4/215 📥 Al-Farghani sahifasini yuklanmoqda...
✅ Al-Farghani infoboxi muvaffaqiyatli olindi.
5/215 📥 Ulug'bek sahifasini yuklanmoqda...
❌ Wikipedia API error for Ulug'bek: The page you specified doesn't exist.
⚠️ Ulug'bek sahifasida infobox topilmadi yoki bo'sh.
6/215 📥 Abu Rayhon Beruniy sahifasini yuklanmoqda...
❌ Wikipedia API error for Abu Rayhon Beruniy: The page you specified doesn't exist.
⚠️ Abu Rayhon Beruniy sahifasida infobox topilmadi yoki bo'sh.
7/215 📥 Avice

In [92]:
df18 = pd.read_csv(r"C:\Users\Rasulbek907\Desktop\Final_Project\Notebooks\historical_figures_dataset18.csv")
df18.head()
df18.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110 entries, 0 to 109
Data columns (total 91 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   alma_mater         0 non-null      float64
 1   alt                6 non-null      object 
 2   awards             0 non-null      float64
 3   birth_date         107 non-null    object 
 4   birth_name         12 non-null     object 
 5   birth_place        92 non-null     object 
 6   birthname          3 non-null      object 
 7   caption            63 non-null     object 
 8   children           6 non-null      object 
 9   citizenship        4 non-null      object 
 10  creed              32 non-null     object 
 11  death_cause        0 non-null      float64
 12  death_date         107 non-null    object 
 13  death_place        79 non-null     object 
 14  denomination       41 non-null     object 
 15  education          0 non-null      float64
 16  embed              0 non-n

In [94]:
important_columns = [
    "name",                 # Ism (asosiy identifikator)
    "birth_date",           # Tug‘ilgan sana
    "birth_place",          # Tug‘ilgan joy
    "death_date",           # Vafot sanasi
    "death_place",          # Vafot joyi
    "nationality",          # Millati
    "occupation",           # Kasbi yoki lavozimi
    "years_active",         # Faoliyat yillari
    "known_for",            # Mashhurlikka sabab bo‘lgan ishlari
    "awards",               # Mukofotlar
    "alma_mater",           # O‘qigan universiteti (o‘quv yurti)
    "education",            # Ta’limi    
    "notable_works",        # Mashhur asarlari yoki ishlari
    "field",                # Soha (ilmiy, san’at, siyosat va h.k.)    
    "spouse",               # Turmush o‘rtog‘i
    "children",             # Farzandlari    
    "religion",             # Dini    
    "genre"                # Janri (san’at yoki adabiyotda)   
]
df_filtered18 = df18[important_columns]
df_filtered18.to_csv(r"C:\Users\Rasulbek907\Desktop\Final_Project\Data\Raw_data\hf_dataset18.csv", index=False, encoding="utf-8")

# 2648 ta  ism berildi 