# References

Site used to extract the names: https://babynames.extraprepare.com/

Code to extract names from the site: https://github.com/memr5/Machine-Learning-Portfolio/blob/master/Deep%20Learning/Indian%20Baby%20Names%20Generator/Scraper.py

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import string

# Url of the website
base_url = 'https://babynames.extraprepare.com/'

data_dict = {"Name":[],"Gender":[]} # 0 for male, 1 for female

# Scrapping Male Names

In [1]:

for gender in ['boy']:
    print(f"Scrapping Indian {gender}s Names")

    for initial in string.ascii_lowercase:
        # To make a request to the website
        url = base_url+f"{gender}-{initial}.php"
        request = requests.get(url)

        # To create a BeautifulSoup object with content(Source code) of the website
        soup = BeautifulSoup(request.content,'html.parser')

        for names in soup.findAll('h3'):
            if ";" in names.text:
                for name in names.text.split("; "):
                    data_dict["Name"].append(name)
                    data_dict["Gender"].append(0)
            else:
                data_dict["Name"].append(names.text)
                data_dict["Gender"].append(0)

        for i in soup.findAll('td',attrs={'align':'center'}):
            if "Page" in i.text:
                for page in range(1,len(i.text[6:].split("\xa0"))-1):
                    request = requests.get(url+f"?page={page}")

                    # To create a BeautifulSoup object with content(Source code) of the website
                    soup = BeautifulSoup(request.content,'html.parser')

                    for names in soup.findAll('h3'):
                        if ";" in names.text:
                            for name in names.text.split("; "):
                                data_dict["Name"].append(names.text)
                                data_dict["Gender"].append(0)
                        else:
                            data_dict["Name"].append(names.text)
                            data_dict["Gender"].append(0)

Scrapping Indian boys Names


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


In [2]:
male = len(data_dict["Name"])
print("Number of male names in the list:",len(data_dict["Name"]))

Number of male names in the list: 30167


# Scrapping Female Names

In [3]:
for gender in ['girl']:
    print(f"Scrapping Indian {gender}s Names")

    for initial in string.ascii_lowercase:
        # To make a request to the website
        url = base_url+f"{gender}-{initial}.php"
        request = requests.get(url)

        # To create a BeautifulSoup object with content(Source code) of the website
        soup = BeautifulSoup(request.content,'html.parser')

        for names in soup.findAll('h3'):
            if ";" in names.text:
                for name in names.text.split("; "):
                    data_dict["Name"].append(name)
                    data_dict["Gender"].append(1)
            else:
                data_dict["Name"].append(names.text)
                data_dict["Gender"].append(1)

        for i in soup.findAll('td',attrs={'align':'center'}):
            if "Page" in i.text:
                for page in range(1,len(i.text[6:].split("\xa0"))-1):
                    request = requests.get(url+f"?page={page}")

                    # To create a BeautifulSoup object with content(Source code) of the website
                    soup = BeautifulSoup(request.content,'html.parser')

                    for names in soup.findAll('h3'):
                        if ";" in names.text:
                            for name in names.text.split("; "):
                                data_dict["Name"].append(names.text)
                                data_dict["Gender"].append(1)
                        else:
                            data_dict["Name"].append(names.text)
                            data_dict["Gender"].append(1)

Scrapping Indian girls Names


In [4]:
print("Number of female names in the list:",len(data_dict["Name"]) - male)

Number of female names in the list: 25494


In [5]:
data_df = pd.DataFrame.from_dict(data_dict)

# Generated dataframe

In [6]:
data_df

Unnamed: 0,Name,Gender
0,Aaban,0
1,Aabharan,0
2,Aabhas,0
3,Aabhat,0
4,Aabheer,0
...,...,...
55656,Zumathy,1
55657,Zurika,1
55658,Zuruthi,1
55659,Zuruthika,1


In [7]:
a = data_df["Name"].value_counts()

In [8]:
a

Airaawat; Airawat; Eirawat; Erawat; Irawat      5
Brajraaj; Brajraj; Brijraaj; Brijraj            4
Brajamohan; Brajmohan; Brijamohan; Brijmohan    4
Jasbir                                          3
Dharmpal                                        3
                                               ..
Vytheeswaran                                    1
Ratanjali                                       1
Basavaraj                                       1
Warshaya                                        1
Tharmila                                        1
Name: Name, Length: 54995, dtype: int64

### We can observe that there are still some entries with ';' in it. Now we will again run split based on ';'

In [9]:
data_df["Name"]

0            Aaban
1         Aabharan
2           Aabhas
3           Aabhat
4          Aabheer
           ...    
55656      Zumathy
55657       Zurika
55658      Zuruthi
55659    Zuruthika
55660       Zuvaka
Name: Name, Length: 55661, dtype: object

In [10]:
new_dict = {"Name":[],"Gender":[]} # 0 for male, 1 for female

In [11]:
for i in range(len(data_df)):
    if ";" in data_dict["Name"][i]:
        for name in data_dict["Name"][i].split("; "):
            if(data_dict["Name"][i].isalpha()):
                new_dict["Name"].append(name)
                new_dict["Gender"].append(data_dict["Gender"][i])
    elif(data_dict["Name"][i].isalpha()): # also removing any entries with any special characters in it
        new_dict["Name"].append(data_dict["Name"][i])
        new_dict["Gender"].append(data_dict["Gender"][i])

In [12]:
new_df = pd.DataFrame.from_dict(new_dict)

In [13]:
new_df["Name"].value_counts()

Gyan            3
Rajbir          3
Jasbir          3
Devi            3
Jasveer         3
               ..
Vadhanasri      1
Kanthamani      1
Haashni         1
Ambaajeeraav    1
Tharmila        1
Name: Name, Length: 53925, dtype: int64

### We have some repeated entries. We only want unique names in our dataset. Therefore we will remove all the duplicate entries.

In [14]:
new_df = new_df.drop_duplicates()

In [15]:
new_df["Name"].value_counts()

Bhupinder      2
Harjeet        2
Champabati     2
Giaan          2
Gemini         2
              ..
Chintana       1
Vennela        1
Lavakushan     1
Kaashinaath    1
Tharmila       1
Name: Name, Length: 53925, dtype: int64

### Now the only duplicates name are left which are classified both as male and female

## Exporting data to a csv file

In [20]:
new_df.to_csv('Gender_Data.csv', index=False)