In [0]:
#*********
# Author: Sheikh Hanif
#********

# importing necessary modules
import pandas as pd
from requests import get
from bs4 import BeautifulSoup
import re
import string
import nltk
import random

In [0]:
# mounting google drive
from google.colab import drive
drive.mount('/content/drive')

In [0]:
def get_names():
    """
    This function will scrape names from the website
    """
    # set to store male and female names. we are using set because we don't want duplicate entry
    male = set()
    female = set()
    
    # url for male and female
    murl = "https://www.babynamesdirect.com/baby-names/bengali/boy/"
    furl = "https://www.babynamesdirect.com/baby-names/bengali/girl/"
    
    # lowercase letter and a string with all numbers we need to loop through according to the 
    # website pattern. 
    lc = string.ascii_lowercase
    num = '123456789'
    
    # here we are constructing urls based on the format provided in the website. for 
    # each pattern (page) we get names and save to our set. we loop through all possible 
    # pattern (pages)
    
    # first we will extract male names. here we use beautiful soup to extract information we want. 
    for c in lc:
        for n in num:
            url = murl + c + '/' + n
            response = get(url)

            html_soup = BeautifulSoup(response.text, 'html.parser')
            con_name = html_soup.find_all('li', class_='ntr')

            for names in con_name:
                name = names.a
                if name:
                    male.add(name.text)
                    
                    
    # Now we will extract female names. here we use beautiful soup as well to extract information we want. 
    for c in lc:
        for n in num:
            url = furl + c + '/' + n
            response = get(url)

            html_soup = BeautifulSoup(response.text, 'html.parser')
            con_name = html_soup.find_all('li', class_='ntr')

            for names in con_name:
                name = names.a
                if name:
                    female.add(name.text)

    return male, female

In [0]:
def create_df():
    
    # getting names by calling the function
    male_names, female_names = get_names()

    # converting our set object into list as it will make easy for us to work with pandas dataframe.
    males = list(male_names)
    females = list(female_names)
    
    male_df = pd.DataFrame({
        'male': males,  
    })
    
    # male_df.head()
    
    female_df = pd.DataFrame({
        'female': females
    })
    
    # female_df.head()
    
    # let's save the dataframe in local machine. We don't want 
    # to request for data from the website again and again. It's not fair
    male_df.to_csv('/content/drive/My Drive/Colab Notebooks/Bengali_Names/bengali_names_male.csv')
    female_df.to_csv('/content/drive/My Drive/Colab Notebooks/Bengali_Names/bengali_names_female.csv')

    return male_df, female_df
  
  
  
create_df()

In [0]:
male = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Bengali_Names/bengali_names_male.csv')

In [0]:
male.describe()

Unnamed: 0.1,Unnamed: 0
count,6093.0
mean,3046.0
std,1759.041927
min,0.0
25%,1523.0
50%,3046.0
75%,4569.0
max,6092.0


In [0]:
female = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Bengali_Names/bengali_names_female.csv')
female.describe()

Unnamed: 0.1,Unnamed: 0
count,6151.0
mean,3075.0
std,1775.785085
min,0.0
25%,1537.5
50%,3075.0
75%,4612.5
max,6150.0


In [0]:
female.head()

Unnamed: 0.1,Unnamed: 0,female
0,0,Dipawali
1,1,Nilaya
2,2,Shaboni
3,3,Savli
4,4,Sabida


In [0]:
def gen_feature(word):
    # defining the last alphabet of the word as gender feature 
    return {'last_letter': word[-1]}

In [0]:
def model():
  """
  This will return a naivebayes classifer model with will 
  classify bengali name between male and female
  """
  
  # loading data 
  male_df = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Bengali_Names/bengali_names_male.csv')
  female_df = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Bengali_Names/bengali_names_female.csv')

  label_names = ([ (name, 'male') for name in male_df['male']] 
                + [(name, 'female') for name in female_df['female']])

  # suffling labeled data
  random.shuffle(label_names)

  # extracting feature
  feature_set = [ (gen_feature(n), gender) for (n, gender) in label_names]

  # splliting train_test
  train_set, test_set = feature_set[:5000], feature_set[5000:]

  # choosing classifier
  classifier = nltk.NaiveBayesClassifier.train(train_set)
  
  return classifier


# testing model
name_classifier = model()
name_classifier.classify(gen_feature('salma'))

In [0]:
male.head()

Unnamed: 0.1,Unnamed: 0,male
0,0,Bhaskarjit
1,1,Rijurekh
2,2,Dillu
3,3,Ravindra
4,4,Bachchu
