# Look for names that have swapped gender affiliation

I had always heard that certain primarily female names, such as Lindsey and Leslie, used to be primarily male names. I decided to look for some such names quantitatively.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output
# Show what is available in the input directory by running `ls` on it.
input_listing = check_output(["ls", "../input"])
print(input_listing.decode("utf8"))

# Any results you write to the current directory are saved as output.

## Read data and group by year and gender

In [None]:
# Load the names table from the input directory.
names_csv = "../input/NationalNames.csv"
df = pd.read_csv(names_csv)

# One group per (Year, Gender) pair; sort=False keeps the groups in order of
# first appearance instead of sorting the group keys.
gb = df.groupby(by=["Year", "Gender"], sort=False)

## Compute normalized counts as frequency per year and gender

In [None]:
# Freq_yearly: each row's Count as a percentage of the total Count within its
# (Year, Gender) group.
# transform("sum") broadcasts every group's total back onto the group's rows,
# so the division is a single vectorised operation rather than a Python
# lambda invoked once per group.
df["Freq_yearly"] = 100.0 * df["Count"] / gb["Count"].transform("sum")

## For each name, compute the ratio of male frequency to female frequency per year

The inner join on (year, name) keeps only the name/year pairs in which the name was recorded for both genders in that year.

In [None]:
# Index by (Year, Name) so the female and male rows can be aligned by join.
df_mf = df.set_index(["Year", "Name"])

# Female frequencies stay a one-column DataFrame (left side of the join);
# male frequencies are a Series that receives the "_M" suffix on join.
female_freq = df_mf.loc[df_mf["Gender"] == "F", ["Freq_yearly"]]
male_freq = df_mf.loc[df_mf["Gender"] == "M", "Freq_yearly"]

# Inner join: only (Year, Name) keys present on both sides survive.
df_ratio = female_freq.join(male_freq, how="inner", rsuffix="_M")

# Ratio > 1 means the name was relatively more frequent among males that year.
df_ratio["Ratio"] = df_ratio["Freq_yearly_M"] / df_ratio["Freq_yearly"]

## Only consider names that were recorded for both genders in more than 90 years

In [None]:
# Keep a name only if it has more than this many rows (one row per year).
min_years = 90

# reset_index turns Year and Name back into ordinary columns so that the
# rows can be grouped by Name.
rows_by_name = df_ratio.reset_index().groupby("Name")
df_ratio = rows_by_name.filter(lambda rows: len(rows) > min_years)

## Find gender shifting names
The threshold `alpha` sets how many times more popular a name must be for one gender than for the other in a given year. A shifting name is one whose male-to-female frequency ratio was greater than `alpha` in at least one year (predominantly male) and less than `1/alpha` in a different year (predominantly female).

In [None]:
# Dominance threshold: how many times more frequent a name must be for one
# gender than for the other, in terms of yearly frequency.
alpha = 5.0


def _has_swapped(rows):
    """Return True if the name's ratio exceeded alpha and also fell below 1/alpha."""
    ratios = rows["Ratio"]
    return ratios.max() > alpha and ratios.min() < (1 / alpha)


df_ratio = df_ratio.groupby("Name").filter(_has_swapped)

# Number of remaining names, reported once per column.
df_ratio.groupby("Name").count().count()

## Plot the evolution of the shifting names over time

In [None]:
# One line chart per remaining name: male/female ratio against year, drawn
# on a logarithmic y-axis with fixed axis limits for every chart.
for name, name_rows in df_ratio.groupby("Name"):
    ax = name_rows.plot.line(
        x="Year",
        y="Ratio",
        title=name,
        logy=True,
        legend=False,
        xlim=(1880, 2015),
        ylim=(1e-3, 1e2),
    )
    ax.set_ylabel("Ratio of males to females")