# <u><strong>Influence of a French Reality Show on Names & More</strong></u>

<img src="https://i.imgur.com/MmwkTUF.png" align = "left">

 

# Table of contents

[1. The dataset](#1)<br><br>
[2. Number of names by year](#2)<br><br>
[3. Number of unique names](#3)<br><br>
[4. Names & Gender](#4)<br> 
.... [4.1. The most gender neutral names](#41)<br>
.... [4.2. The most "feminine" names, which are still (rarely) used for males](#42)<br>
.... [4.3. The "manliest" names, which are still (rarely) used for females](#43)<br><br>
[5. Influence of a French reality show (Loft Story) on names](#5)<br><br>
[6. Preferred letters for names in France](#6)

# <u>1. The dataset</u><a class="anchor" id="1"></a>

INSEE (the French national institute of statistics) has released datasets reporting the names given to French babies since 1900. This dataset makes it possible to track trends in how French babies are named.

- <strong>year:</strong> Year of birth
- <strong>name:</strong> Name
- <strong>sex:</strong> Sex
- <strong>count:</strong> Count

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the national baby-names CSV into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="../input/french-baby-names/national_names.csv")
df.head(n=5)

In [None]:
# Display pandas' summary statistics for the loaded dataset.
df.describe()

In [None]:
# Draw the boolean "is this cell missing?" mask of the whole table as a heatmap,
# so that any missing value would stand out visually.
sns.heatmap(data=df.isna())
plt.title(label="Missing values?")
plt.show()

The dataset seems to have no need for cleaning.

# <u>2. Number of names by year</u><a class="anchor" id="2"></a>

In [None]:
# Total of the "count" column per year, i.e. the number of recorded babies per year.
# The aggregation is named as the string "sum" instead of passing the np.sum
# callable: recent pandas versions emit a FutureWarning for the callable form.
count_by_year = pd.pivot_table(df, index="year", values="count", aggfunc="sum").reset_index()

plt.figure(figsize=(12, 6))
plt.fill_between(count_by_year["year"], count_by_year["count"], lw=5)
# Leave 100000 of headroom above the busiest year and two years of margin on the right.
plt.ylim(0, count_by_year["count"].max() + 100000)
plt.xlim(count_by_year["year"].min(), count_by_year["year"].max() + 2)
plt.title("Number of baby names by year", fontsize=18)
plt.xlabel("Year")
plt.ylabel("Count")
plt.show()

# <u>3. Number of unique names</u><a class="anchor" id="3"></a>

In [None]:
# Distinct names seen for each sex, as a pair: (male names, female names).
unique_names_sets = tuple(
    set(df.loc[df["sex"] == sex, "name"].unique()) for sex in ("M", "F")
)

# Sizes of three groups: male-only names, names shared by both sexes, female-only names.
unique_names = [
    len(unique_names_sets[0].difference(unique_names_sets[1])),
    len(unique_names_sets[0].intersection(unique_names_sets[1])),
    len(unique_names_sets[1].difference(unique_names_sets[0])),
]

# One bar per group.
plt.figure(figsize=(8, 5))
plt.bar(x=["Male", "Both", "Female"], height=unique_names, color=["black", "cyan", "pink"])
plt.title("Number of total unique names by Gender", fontsize=18)
plt.show()

The gender "Female" has more unique names.

In [None]:
# Per year, count the rows recorded for each sex; the "name" column of each
# grouped count is renamed to "Females" / "Males" respectively.
count_namesF = (
    df[df["sex"] == "F"]
    .groupby(by="year")
    .count()
    .rename(columns={"name": "Females"})
)
count_names = (
    df[df["sex"] == "M"]
    .groupby(by="year")
    .count()
    .rename(columns={"name": "Males"})
)
# Bring the female series alongside the male one, then keep only year / Males / Females.
count_names["Females"] = count_namesF["Females"]
count_names = count_names.drop(columns=["sex", "count"]).reset_index()

# Plot both series on one set of axes filling the whole figure.
fig = plt.figure(figsize=(12, 6))
ax = fig.add_axes([0, 0, 1, 1])
count_names.plot(x="year", y="Males", color="black", lw=5, ax=ax)
count_names.plot(x="year", y="Females", color="pink", lw=5, ax=ax)
ax.set_title("Number of unique names by gender and by year", fontsize=18)
ax.legend(fontsize=15)
ax.set_xlabel("Year", fontsize=18)
plt.show()

Surprisingly, the number of unique names follows the same upward and downward trends for both genders. There are always more unique names for the gender "Female".

# <u>4. Names & Gender</u><a class="anchor" id="4"></a>

In [None]:
# One-hot encode the "sex" column into the indicator columns sex_F and sex_M.
df = pd.get_dummies(df, columns=["sex"])

# Turn each indicator into a head-count: the row's "count" where the indicator
# is set, 0 otherwise.
df["sex_F"] = df["count"].mul(df["sex_F"])
df["sex_M"] = df["count"].mul(df["sex_M"])

# Totals per name over all years (the summed "year" column is meaningless, so drop it),
# restricted to names given to at least one girl and at least one boy.
group_name = df.groupby(by="name").sum().drop(columns="year")
given_to_both = group_name["sex_F"].gt(0) & group_name["sex_M"].gt(0)
both_sex = group_name.loc[given_to_both].copy()

# Number of boys per girl carrying each name.
both_sex["proportion_M/F"] = both_sex["sex_M"].div(both_sex["sex_F"])

# Gender-neutral names: ratio strictly between 0.95 and 1.05,
# and more than 40 girls with that name.
gender_neutral = both_sex.loc[
    both_sex["proportion_M/F"].gt(0.95)
    & both_sex["proportion_M/F"].lt(1.05)
    & both_sex["sex_F"].gt(40)
]

In [None]:
from wordcloud import WordCloud

def display_word(words, title = None):
    """Draw ``words`` as a word cloud in a new 12x12-inch matplotlib figure.

    Args:
        words: Text to draw. It is converted with ``str()`` and passed to
            ``WordCloud.generate``.
        title: Optional figure title, forwarded to ``plt.title``.
    """
    wordcloud = WordCloud(
        background_color='white',
        max_font_size=20,
        scale=3,
        random_state=0,  # fixed value, presumably so the layout is the same on every run
    ).generate(str(words))
    # The figure handle is not needed afterwards, so it is not bound to a name.
    plt.figure(figsize=(12, 12))
    plt.axis('off')
    plt.imshow(wordcloud)
    plt.title(title, fontsize = 40)
    plt.show()


## 4.1. The most gender neutral names<a class="anchor" id="41"></a>

In [None]:
# Join the gender-neutral names (the index of `gender_neutral`) into one
# space-separated string and draw it as a word cloud.
words = " ".join(gender_neutral.index)
display_word(words)

In [None]:
# Show the table of gender-neutral names: M/F proportion strictly between
# 0.95 and 1.05, and given to more than 40 girls.
gender_neutral

## 4.2. The most "feminine" names, which are still (rarely) used for males<a class="anchor" id="42"></a>

In [None]:
# The fifteen shared names with the lowest boys-per-girl ratio, drawn as a word cloud.
feminine_names = (
    both_sex.sort_values(by="proportion_M/F", ascending=True)
    .drop(columns="proportion_M/F")
    .head(15)
)
words = " ".join(feminine_names.index)
display_word(words)

In [None]:
# Show the table of the fifteen names selected for the word cloud.
feminine_names

## 4.3. The “manliest” names, which are still (rarely) used for females<a class="anchor" id="43"></a>

In [None]:
# The fifteen shared names with the highest boys-per-girl ratio, drawn as a word cloud.
manly_names = (
    both_sex.sort_values(by="proportion_M/F", ascending=False)
    .drop(columns="proportion_M/F")
    .head(15)
)
words = " ".join(manly_names.index)
display_word(words)

In [None]:
# Show the table of the fifteen names selected for the word cloud.
manly_names

# 5. Influence of a French reality show (Loft Story) on names<a class="anchor" id="5"></a>

<strong>Loft Story</strong> was a French reality show, the first French adaptation of the Big Brother franchise. It is generally considered the first show of its kind in France. Loft Story was presented by Benjamin Castaldi and broadcast on M6 from 26 April 2001 to 5 July 2001 for the first season and from 11 April 2002 to 4 July 2002 for the second season. (Source: <a href="https://en.wikipedia.org/wiki/Loft_Story_(French_TV_series)">Wikipedia</a>)



<img src="https://upload.wikimedia.org/wikipedia/en/thumb/3/32/Loft_Story_eye.jpg/250px-Loft_Story_eye.jpg" align = "left">


The show introduced the Big Brother-style reality TV concept to France and was very popular. The question is: can a show like this influence people to name their children after its participants?

To answer this question, we first need the names of the participants. The most important ones were Loana Petrucciani and Christophe Mercy, because they won the show. Christophe is a common French name, so we might not be able to see any influence, but Loana is uncommon. Another famous participant was Steevy, which isn't a common French name either.

Therefore, we will look at the number of children named "Loana" and "Steevy" to see whether the show had any influence.


<strong>Loana:</strong>

<img src="https://upload.wikimedia.org/wikipedia/commons/thumb/9/9a/Loana_salon_du_chocolat_2008.jpg/220px-Loana_salon_du_chocolat_2008.jpg" align = "left">


Source <a href="https://fr.wikipedia.org/wiki/Loana_Petrucciani">Wikipedia</a>

In [None]:
# Yearly number of babies named "Loana", with a dashed vertical marker at 2001.
fig = plt.figure(figsize=(12, 6))
ax = fig.add_axes([0, 0, 1, 1])
# A single colour keyword: `color` and `c` are aliases of the same property,
# so giving both with different values is contradictory. The marker is black.
ax.axvline(2001, color="black", lw=3, ls="--")
# [10:] skips the first ten matching rows by position.
# NOTE(review): presumably meant to hide the earliest years — confirm; if the
# name has rows for both sexes, the same year can also appear twice on the line.
df[df["name"] == "Loana"][10:].plot(x="year", y="count", ax=ax, lw=5)
plt.title("Number of kids called Loana by year", fontsize=18)
# The x-axis label doubles as a caption for the dashed marker; `position` is a
# hand-tuned value intended to place it next to the line.
ax.set_xlabel('Loft Story (2001)', position=(0.155, 2e6), horizontalalignment='left', fontsize=15)
plt.show()

We can see a clear increase in the use of the name "Loana" in 2001, when the reality show was on TV.

In [None]:
# Yearly number of babies named "Steevy", with a dashed vertical marker at 2001.
fig = plt.figure(figsize=(12, 6))
ax = fig.add_axes([0, 0, 1, 1])
# A single colour keyword: `color` and `c` are aliases of the same property,
# so giving both with different values is contradictory. The marker is black.
ax.axvline(2001, color="black", lw=3, ls="--")
# [10:] skips the first ten matching rows by position.
# NOTE(review): presumably meant to hide the earliest years — confirm; if the
# name has rows for both sexes, the same year can also appear twice on the line.
df[df["name"] == "Steevy"][10:].plot(x="year", y="count", ax=ax, lw=5)
plt.title("Number of kids called Steevy by year", fontsize=18)
# The x-axis label doubles as a caption for the dashed marker; `position` is a
# hand-tuned value intended to place it next to the line.
ax.set_xlabel('Loft Story (2001)', position=(0.33, 2e6), horizontalalignment='left', fontsize=15)
plt.show()

We can see a clear increase in the use of the name "Steevy" in 2001, when the reality show was on TV.

# 6. Preferred letters for names in France<a class="anchor" id="6"></a>

In [None]:
from collections import Counter

# Count how often each (lower-cased) character occurs across all given names,
# weighting every name by the number of babies who received it.
# Counts are first summed per distinct lower-cased name, so each name is scanned
# once; this avoids building one giant string in which every name is repeated
# `count` times.
letters_freq_name = Counter()
for name, total in df.groupby(df["name"].str.lower())["count"].sum().items():
    for letter in name:
        letters_freq_name[letter] += int(total)

The frequency of letters in French:<br>
<a href="https://fr.wikipedia.org/wiki/Fr%C3%A9quence_d%27apparition_des_lettres_en_fran%C3%A7ais">Source: Wikipedia</a>

In [None]:
# Reference table: number of occurrences of each letter in a French text corpus.
letters_freq = [['e',115024205],['a',67563628],['i',62672992],
['s',61882785],['n',60728196],['r',57656209],['t',56267109],['o',47724400],
['l',47171247],['u',42698875],['d',34914685],['c',30219574],['m',24894034],
['p',23647179],['g',11684140],['b',10817171],['v',10590858],['h',10583562],
['f',10579192],['q',6140307],['y',4351953],['x',3588990],['j',3276064],
['k',2747547],['w',1653435],['z',1433913]]

# Alphabetical table holding each letter's share of the corpus.
letters_freq = pd.DataFrame(letters_freq, columns=["letter", "count"])
letters_freq = letters_freq.sort_values("letter").reset_index(drop=True)
letters_freq["freq_french"] = letters_freq["count"] / letters_freq["count"].sum()
letters_freq.drop(columns="count", inplace=True)

# Share of each letter a-z among the letters counted in the names.
# Every letter gets a row (0 when it never occurs), so the table always has
# 26 rows; any other character present in the counter is ignored.
let = "abcdefghijklmnopqrstuvwxyz"
lst = [[letter, letters_freq_name.get(letter, 0)] for letter in let]
freq_name = pd.DataFrame(lst, columns=["letter", "count"])

freq_name["freq_in_names"] = freq_name["count"] / freq_name["count"].sum()

# Combine both frequencies. Values are matched on the letter itself rather than
# on row position, so a letter can never receive another letter's frequency.
letters_freq["freq_in_names"] = letters_freq["letter"].map(
    freq_name.set_index("letter")["freq_in_names"]
)

In [None]:
# Grouped bar chart: for every letter, its share in the French corpus (black)
# next to its share in names (pink).
labels = letters_freq["letter"].values
width = 0.35
x = np.arange(len(labels))

fig, ax = plt.subplots(figsize=(15, 8))
# The two bars of a letter sit half a bar-width left and right of its tick.
ax.bar(x - width / 2, letters_freq["freq_french"].values, width, label='French corpus', color="black")
ax.bar(x + width / 2, letters_freq["freq_in_names"].values, width, label='French names', color="pink")

ax.set_title('Frequency of letters\nin names VS in French corpus', fontsize=20)
ax.set_ylabel('')
ax.set_xticks(x)
ax.set_xticklabels(labels, fontsize=18)
ax.legend()

fig.tight_layout()
plt.show()

In [None]:
# Ratio of each letter's share in names to its share in the French corpus.
# A value of 1 means the letter is equally frequent in both.
letters_freq["difference"] = letters_freq["freq_in_names"] / letters_freq["freq_french"]

# Colour code: ratio above 1.5 -> fuchsia, above 0.67 (up to 1.5) -> silver,
# anything else -> purple.
colors = letters_freq["difference"].apply(
    lambda ratio: "FUCHSIA" if ratio > 1.5 else ("silver" if ratio > 0.67 else "purple")
).values

fig, ax = plt.subplots(1, 1, figsize=(15, 8))
ax.bar(x=letters_freq["letter"], height=letters_freq["difference"], color=colors)
# Reference line at ratio 1, spanning slightly beyond the 26 bars.
ax.hlines(y=1, xmin=-1, xmax=26, color="black", lw=3)
# The bars are plotted against the letters themselves, so the ticks already
# carry the right text; only their size is changed. Overwriting the labels with
# set_xticklabels without fixing the tick positions triggers a matplotlib warning.
ax.tick_params(axis="x", labelsize=18)
plt.title("Most/Less used letters in names\nin comparison to French corpus", fontsize=20)
# NOTE(review): "Igual" is presumably meant to read "Equal" — confirm before changing the label.
plt.text(-2, 0.85, "Igual\nuse", fontsize=18)
plt.show()

- <strong>The letters h, j and y are used far more often in French names</strong> than in the French corpus. Maybe French people like to have those letters in their names?

- <strong>The letters d, f, p, q, s, t, u, w and x are used less in French names</strong>.
