In [1]:
import pandas as pd

In [None]:
import os.path
import codecs


class NoCountryError(Exception):
    """Signals that a lookup was requested for a country that is not supported."""


class Detector:
    """Guess the gender associated with a first name.

    The name dictionary file is parsed once at construction time into
    ``self.names``, a mapping ``name -> {gender: country_values}`` where
    ``country_values`` is the raw per-country frequency string taken from the
    data line.  Lookups are then answered from that in-memory mapping.
    """

    # Order matters: the position of a country in this list is used as the
    # character index into a line's country-values string (see get_gender).
    # NOTE(review): "bosniaand" looks like a truncated identifier; it is left
    # unchanged because callers pass these exact strings.
    COUNTRIES = u"""great_britain ireland usa italy malta portugal spain france
                   belgium luxembourg the_netherlands east_frisia germany austria
                   swiss iceland denmark norway sweden finland estonia latvia
                   lithuania poland czech_republic slovakia hungary romania
                   bulgaria bosniaand croatia kosovo macedonia montenegro serbia
                   slovenia albania greece russia belarus moldova ukraine armenia
                   azerbaijan georgia the_stans turkey arabia israel china india
                   japan korea vietnam other_countries
                 """.split()

    # Maps the gender code in the first field of a data line to the label
    # stored in ``self.names`` and returned by get_gender.
    _GENDER_CODES = {
        "M": u"male",
        "1M": u"mostly_male",
        "?M": u"mostly_male",
        "F": u"female",
        "1F": u"mostly_female",
        "?F": u"mostly_female",
        "?": u"andy",
    }

    def __init__(self,
                 case_sensitive=True):
        """Create a detector and load ``data/nam_dict.txt``.

        Args:
            case_sensitive: when false, names are lower-cased both while
                loading the data file and when looking a name up.
        """
        self.case_sensitive = case_sensitive
        self._parse("data/nam_dict.txt")

    def _parse(self, filename):
        """Reset ``self.names`` and feed every line of *filename* to _eat_name_line.

        The file is read as UTF-8; each line is stripped of surrounding
        whitespace before being parsed.
        """
        self.names = {}
        with codecs.open(filename, encoding="utf-8") as f:
            for line in f:
                self._eat_name_line(line.strip())

    def _eat_name_line(self, line):
        """Parse one line of the data file and record it in ``self.names``.

        Blank lines and lines whose first character is ``#`` or ``=`` are
        ignored.  For any other line the first whitespace-separated field is
        the gender code, the second is the name, and the characters from
        index 30 up to (but excluding) the last character are stored as the
        country values.

        Raises:
            ValueError: if the line has fewer than two fields or the gender
                code is not one of the known codes.
        """
        # Guard against empty/whitespace-only lines before indexing line[0].
        if not line.strip() or line[0] in "#=":
            return

        parts = line.split()
        if len(parts) < 2:
            raise ValueError("Malformed name line: %r" % line)

        country_values = line[30:-1]
        name = parts[1]
        if not self.case_sensitive:
            name = name.lower()

        gender = self._GENDER_CODES.get(parts[0])
        if gender is None:
            # Raising a plain string is itself a TypeError on Python 3, so
            # raise a real exception carrying the intended message.
            raise ValueError("Not sure what to do with a sex of %s" % parts[0])
        self._set(name, gender, country_values)

    def _set(self, name, gender, country_values):
        """Store *country_values* for the (*name*, *gender*) pair.

        A ``+`` in *name* is expanded into three variants: the ``+`` removed,
        replaced by a space, and replaced by a hyphen.  Each variant is
        stored separately.
        """
        if '+' in name:
            for replacement in ['', ' ', '-']:
                self._set(name.replace('+', replacement), gender, country_values)
        else:
            self.names.setdefault(name, {})[gender] = country_values

    def _most_popular_gender(self, name, counter):
        """Return the gender of *name* that scores highest under *counter*.

        *counter* is called with each gender's country-values string and must
        return a ``(count, tie)`` pair.  The gender with the largest count
        wins, with ``tie`` breaking equal counts.  ``u"andy"`` is returned
        when no gender reaches a count above zero.
        """
        max_count, max_tie = (0, 0)
        best = u"andy"
        for gender, country_values in self.names[name].items():
            count, tie = counter(country_values)
            if count > max_count or (count == max_count and tie > max_tie):
                max_count, max_tie, best = count, tie, gender

        return best if max_count > 0 else u"andy"

    def get_gender(self, name, country=None):
        """Return the best gender label for *name*, optionally for one country.

        Args:
            name: the first name to look up (lower-cased first when the
                detector is not case sensitive).
            country: one of ``COUNTRIES``, or a false value to consider all
                countries together.

        Returns:
            ``u"unknown"`` when the name is not in the dictionary, otherwise
            the label chosen by _most_popular_gender.

        Raises:
            NoCountryError: if *country* is given but not in ``COUNTRIES``.
        """
        if not self.case_sensitive:
            name = name.lower()

        if name not in self.names:
            return u"unknown"
        elif not country:
            def counter(country_values):
                # Count the non-space entries and sum their decoded values:
                # characters with code point above 64 map to ord - 55
                # (so 'A' -> 10), all others to ord - 48 (so '1' -> 1).
                codes = [ord(ch) for ch in country_values.replace(" ", "")]
                return (len(codes),
                        sum(c - 55 if c > 64 else c - 48 for c in codes))
            return self._most_popular_gender(name, counter)
        elif country in self.__class__.COUNTRIES:
            index = self.__class__.COUNTRIES.index(country)

            def counter(country_values):
                # A space (code point 32) at the country's position scores 0.
                return (ord(country_values[index]) - 32, 0)
            return self._most_popular_gender(name, counter)
        else:
            raise NoCountryError("No such country: %s" % country)

In [7]:
import codecs

In [76]:
def eat_name_line(line):
    """Parse one line of the data file into a ``{"name", "gender"}`` dict.

    Blank lines and lines whose first character is ``#`` or ``=`` yield
    ``None``.  Otherwise the first whitespace-separated field is taken as the
    gender code and the second as the name.

    Returns:
        ``{"name": <name>, "gender": <label>}`` for a data line, or ``None``
        for a skipped line.

    Raises:
        ValueError: if the line has fewer than two fields or the gender code
            is not recognised.
    """
    # Guard against empty/whitespace-only lines before indexing line[0].
    if not line.strip() or line[0] in "#=":
        return None

    parts = line.split()
    if len(parts) < 2:
        raise ValueError("Malformed name line: %r" % line)

    gender_by_code = {
        "M": u"male",
        "1M": u"mostly_male",
        "?M": u"mostly_male",
        "F": u"female",
        "1F": u"mostly_female",
        "?F": u"mostly_female",
        "?": u"andy",
    }
    code, name = parts[0], parts[1]
    if code not in gender_by_code:
        # Previously an unknown code left `gender` unbound and surfaced as an
        # UnboundLocalError; fail with an explicit message instead.
        raise ValueError("Not sure what to do with a sex of %s" % code)
    return {"name": name, "gender": gender_by_code[code]}
# Read the whole name dictionary into memory as a list of lines
# (line terminators removed by splitlines()).
with open("data/nam_dict.txt", "r", encoding="utf-8") as f:
    data = f.read().splitlines()
final_data = []
# Only lines from index 293 onwards are parsed.
# NOTE(review): presumably this skips the file's header section - confirm
# against the data file before changing the offset.
for line in data[293:]:
    line_data = eat_name_line(line)
    # eat_name_line returns None for lines it skips; keep only parsed records.
    if line_data is not None:
        final_data.append(line_data)

In [77]:
# Build a DataFrame from the parsed records; each record is a dict with
# "name" and "gender" keys, which become the columns.
df = pd.DataFrame(final_data)

In [85]:
# Bare expression: in a notebook cell this renders the DataFrame as the cell output.
df

Unnamed: 0,name,gender
0,Aad,male
1,Aadam,male
2,Aadje,female
3,Ådne,male
4,Aadu,male
...,...,...
47775,Zyra,female
47776,Zyrafete,female
47777,Zyrie,female
47778,Zyrjeta,female
