In [None]:
import pandas as pd
import seaborn as sns
import numpy as np

from collections import Counter

import matplotlib.pyplot as plt
%matplotlib inline

# Data overview

In [None]:
# Read the names CSV into a DataFrame. Later cells in this notebook read its
# 'Name', 'Year', 'Gender' and 'Count' columns.
data = pd.read_csv('../input/NationalNames.csv')

In [None]:
# Preview the first few rows to check that the columns were parsed as expected.
data.head()

In [None]:
# Print the column dtypes, non-null counts and memory usage of the frame.
data.info()

# Top 25 most popular names

In [None]:
# Total 'Count' per name, summed over every row of `data` (all years and
# genders). A single vectorised groupby replaces the row-by-row iterrows()
# loop, which is very slow on a large frame. sort=False keeps the names in
# order of first appearance, i.e. the same insertion order the loop produced,
# so tie-breaking in Counter.most_common() downstream is unchanged.
names_dict = data.groupby('Name', sort=False)['Count'].sum().to_dict()

In [None]:
# Rank names by their total count and list the 25 highest, one per line.
name_counter = Counter(names_dict)
top_25 = name_counter.most_common(25)

print('Top 25 most popular names:')
for name, _total in top_25:
    print(name)

# Top 25 rarest names

In [None]:
# most_common() with no argument returns every (name, count) pair ordered from
# most to least frequent, so the rarest names sit at the tail of the list.
# Take the last 25 and walk them backwards to print the least common first.
# (The previous slice [:-25:-1] stopped one element early and printed only 24.)
print('Top 25 rare names:')
for pair in reversed(Counter(names_dict).most_common()[-25:]):
    print(pair[0])

# Average length of names

In [None]:
def average_length_data_transform():
    """Compute the average name length per year, separately for each gender.

    Reads the notebook-level ``data`` frame (columns 'Name', 'Year' and
    'Gender'). Every row contributes exactly once, so the averages are not
    weighted by 'Count'. Rows whose 'Gender' equals 'F' are treated as
    female; every other row is treated as male.

    Returns:
        A 4-tuple ``(female_average_length, female_average_name_length,
        male_average_length, male_average_name_length)`` where

        * each ``*_average_name_length`` is a dict mapping
          year -> ``[total_name_length, row_count]``;
        * each ``*_average_length`` is a list of per-year averages
          (``total_name_length / row_count``) in ascending year order, so
          it lines up with a sorted range of years when plotted.
    """
    # Name lengths are computed once for the whole column instead of inside a
    # Python-level iterrows() loop; map(len) still raises on a non-string
    # name, exactly as len(row['Name']) did.
    lengths = data[['Year']].assign(Length=data['Name'].map(len))
    is_female = data['Gender'] == 'F'

    def _length_stats_by_year(rows):
        """Return {year: [total_name_length, row_count]} sorted by year."""
        grouped = rows.groupby('Year')['Length']  # groupby sorts keys by default
        totals = grouped.sum()
        counts = grouped.size()
        # tolist() yields plain Python scalars for the keys and values.
        return {
            year: [total, count]
            for year, total, count in zip(
                totals.index.tolist(), totals.tolist(), counts.tolist()
            )
        }

    def _averages(stats):
        """Return the per-year mean lengths in the dict's (year-sorted) order."""
        return [float(total) / count for total, count in stats.values()]

    female_average_name_length = _length_stats_by_year(lengths[is_female])
    male_average_name_length = _length_stats_by_year(lengths[~is_female])

    return (
        _averages(female_average_name_length),
        female_average_name_length,
        _averages(male_average_name_length),
        male_average_name_length,
    )

In [None]:
# Unpack the per-gender averages (lists) and their underlying
# year -> [total_length, row_count] dicts.
(
    female_average_length,
    female_average_name_length,
    male_average_length,
    male_average_name_length,
) = average_length_data_transform()

In [None]:
# Draw the female and male average-length series against the years 1880-2014
# on a single set of axes, using the explicit Axes API throughout.
years = range(1880, 2015)
f, ax = plt.subplots(figsize=(10, 6))
ax.set_xlim([1880, 2014])

ax.plot(years, female_average_length, color='r', label='Average length of female names')
ax.plot(years, male_average_length, color='b', label='Average length of male names')

ax.set_title('Average length of names')
ax.set_xlabel('Year')
ax.set_ylabel('Length of name')
legend = ax.legend(loc='best', frameon=True, borderpad=1, borderaxespad=1)

# Percent of unique names

In [None]:
# For every year, map each name to its total 'Count' in that year.
# Counts are summed per name: a plain `dict[name] = count` assignment keeps
# only the last row seen for a name, which silently drops counts whenever a
# name occurs on more than one row within a year (e.g. one row per gender).
# Years with no rows map to an empty dict.
years = range(1880, 2015)

top_in_each_year = {
    each_year: (
        data.loc[data['Year'] == each_year]
        .groupby('Name', sort=False)['Count']
        .sum()
        .to_dict()
    )
    for each_year in years
}

In [None]:
# Per year: the total count over all names, and the total count over the 25
# most common names of that year. Both lists follow the iteration order of
# top_in_each_year.
all_sum = []
top_25_sum = []
for names_in_year in top_in_each_year.values():
    year_counter = Counter(names_in_year)
    all_sum.append(sum(year_counter.values()))
    top_25 = year_counter.most_common(25)
    top_25_sum.append(sum(count for _name, count in top_25))

In [None]:
# Per-year share (in percent) of the overall count that is accounted for by
# that year's 25 most common names.
top_25_totals = np.asarray(top_25_sum, dtype=float)
percent_unique_names = top_25_totals / np.asarray(all_sum) * 100

In [None]:
# Plot the per-year percentage series as a single black line, using the
# explicit Axes API throughout.
f, ax = plt.subplots(figsize=(10, 6))
ax.set_xlim([1880, 2014])

ax.plot(years, percent_unique_names, color='black', label='Percent of unique names')

ax.set_title('Percent of unique names')
ax.set_xlabel('Year')
ax.set_ylabel('Percent of unique names')
legend = ax.legend(loc='best', frameon=True, borderpad=1, borderaxespad=1)