In [1]:
import pandas as pd
from glob import glob
import os
import re

First, we want to clean up the hundreds of .txt files in the name subdirectory by combining them all, in order, into one big .csv file.

In [2]:
# naming convention of the name txt files for each year of birth
pattern = os.path.join('names', 'yob*.txt')
files = sorted(glob(pattern))

if not files:
    raise FileNotFoundError(f"No files found with pattern: {pattern}")

dfs = []
for path in files:
    # extract yob from file name
    fname = os.path.basename(path)
    try:
        year = int(fname.replace('yob', '').replace('.txt', ''))
    except ValueError:
        # skip files that don't match the expected pattern
        print(f"Skipping unexpected filename: {fname}")
        continue

    # read file
    df = pd.read_csv(path, header=None, names=['name', 'sex', 'count'], dtype={'name': str, 'sex': str, 'count': int})
    # create year column using yob from file names
    df['year'] = year
    dfs.append(df)

# concatenate dbs, ensure data types are correct + consistent
all_names = pd.concat(dfs, ignore_index=True)
all_names['count'] = all_names['count'].astype(int)
all_names['year'] = all_names['year'].astype(int)

# sort by year and popularity
all_names = all_names.sort_values(['year', 'count'], ascending=[True, False]).reset_index(drop=True)

# write to csv
out_path = os.path.join('names', 'all_years.csv')
all_names.to_csv(out_path, index=False)
print(f"Wrote combined CSV with {len(all_names)} rows to: {out_path}")

Wrote combined CSV with 2149477 rows to: names/all_years.csv


Now, we need to load the TV shows dataset- allshows.txt. We need to read the file, clean up column names, remove streaming services/non-US shows, and extract the year from the start time for each show.

In [3]:
# latin1 encodinng helps us avoid weird characters that sometimes show up
shows = pd.read_csv("allshows.txt", encoding="latin1")
shows.head()

Unnamed: 0,title,directory,tvrage,TVmaze,start date,end date,number of episodes,run time,network,country,onhiatus,onhiatusdesc
0,A for Andromeda,AforAndromeda,764.0,6921.0,Oct 1961,Nov 1961,7 eps,45 min,BBC,UK,False,
1,ï¿½ La Carte,ALaCarte,,61712.0,May 2022,___ ____,6 eps,30 min,Allblk,US,True,
2,The A List,AList,,37579.0,Oct 2018,Jun 2021,21 eps,30 min,BBC iPlayer,UK,False,
3,A to Z,AtoZ,37968.0,92.0,Oct 2014,Jan 2015,13 eps,30 min,NBC,US,False,
4,The A Word,AWord,51488.0,11402.0,Mar 2016,Jun 2020,18 eps,60 min,BBC One,UK,False,


In [4]:
# store num of shows before filtering to track efficiency
before = len(shows)

# clean up column names
shows.columns = shows.columns.str.strip().str.lower().str.replace(" ", "_")

# keep only US shows
shows = shows[shows['country'] == "US"]

# remove streaming-exclusive platforms (case-insensitive)
streaming_keywords = ["netflix", "hulu", "amazon", "prime", "apple", "hbo max", "max", "paramount", "peacock", "roku", "allblk"]
shows = shows[~shows['network'].str.lower().str.contains("|".join(streaming_keywords), na=False)]

# remove non-scripted shows (case-insensitive)
non_scripted_keywords = [
    "late night", "latenight", "late-night",
    "late late show", "lateshow",
    "tonight show", "latenight",
    "good morning", "morning show", "daytime", "today show",
    "news", "documentary",
    "special", "report",
    "variety", "circus", "talent", "competition",
    "game show", "gameshow", "interview", "panel",
    "colbert", "fallon", "kimmel", "myers",
    "ellen", "oprah", "maury", "steve harvey",
    "trevor noah", "conan", "letterman", "leno", "carson",
    "corden", "movie", "daily"
]
shows = shows[~shows['title'].str.lower().str.contains("|".join(non_scripted_keywords), na=False)]

# remove anything that ends with " show" (case-insensitive)
shows = shows[~shows['title'].str.lower().str.endswith(" show")]

# remove invalid start dates
shows = shows[shows['start_date'].notna()]
shows = shows[~shows['start_date'].str.contains("___", na=False)]

def extract_year(date):
    # extract the last 4-digit number in a string so we can isolate just the year
    match = re.search(r"\b(19\d{2}|20\d{2})\b", str(date))
    return int(match.group()) if match else None

# extract start and end years
shows['start_year'] = shows['start_date'].apply(extract_year)
shows['end_year']   = shows['end_date'].apply(extract_year)

# remove rows where either start or end year is missing
shows = shows.dropna(subset=['start_year', 'end_year'])

# convert years to ints
shows['start_year'] = shows['start_year'].astype(int)
shows['end_year'] = shows['end_year'].astype(int)

# expand shows across all years aired
expanded_rows = []

for _, row in shows.iterrows():
    for y in range(row['start_year'], row['end_year'] + 1):
        new_row = row.copy()
        new_row['year'] = y
        expanded_rows.append(new_row)

expanded_shows = pd.DataFrame(expanded_rows)

# check if it worked
expanded_shows.head()

Unnamed: 0,title,directory,tvrage,tvmaze,start_date,end_date,number_of_episodes,run_time,network,country,onhiatus,onhiatusdesc,start_year,end_year,year
3,A to Z,AtoZ,37968.0,92.0,Oct 2014,Jan 2015,13 eps,30 min,NBC,US,False,,2014,2015,2014
3,A to Z,AtoZ,37968.0,92.0,Oct 2014,Jan 2015,13 eps,30 min,NBC,US,False,,2014,2015,2015
6,Aaahh!!! Real Monsters,AaahhRealMonsters,2470.0,8571.0,Oct 1994,Dec 1997,52 eps,30 min,Nick,US,False,,1994,1997,1994
6,Aaahh!!! Real Monsters,AaahhRealMonsters,2470.0,8571.0,Oct 1994,Dec 1997,52 eps,30 min,Nick,US,False,,1994,1997,1995
6,Aaahh!!! Real Monsters,AaahhRealMonsters,2470.0,8571.0,Oct 1994,Dec 1997,52 eps,30 min,Nick,US,False,,1994,1997,1996


In [5]:
after = len(shows)
print("Rows after filtering: ", after)
print("Rows removed: ", before - after)

Rows after filtering:  6899
Rows removed:  6156


For now, the best way we have found to measure popularity is based on how many episodes that show has. Intuitively, this makes sense- if a show is running for many years, they keep getting renewed for new seasons, meaning it is well-liked. An unpopular show will likely have less episodes. For each year, we say that the most popular show is the show with the most episodes that is still streaming during that year.

In [7]:
# extract the number of episodes for our popularity metric
def extract_eps(x):
    match = re.search(r"(\d+)", str(x))
    return int(match.group()) if match else None

expanded_shows['episodes'] = expanded_shows['number_of_episodes'].apply(extract_eps)
expanded_shows['episodes'] = expanded_shows['episodes'].fillna(0)

# find the most popular show each year
top_show_per_year = expanded_shows.sort_values(['year', 'episodes'], ascending=[True, False]).groupby('year').head(1)
top_show_per_year[['title', 'year', 'episodes']].head()

Unnamed: 0,title,year,episodes
6587,Lights Out (1946),1946,160.0
6587,Lights Out (1946),1947,160.0
10955,Studio One,1948,465.0
10955,Studio One,1949,465.0
10955,Studio One,1950,465.0


In [35]:
# manually account for outlier shows that weren't caught by previous filtering, yet still show up as most popular
outliers = ["Bozo's Circus (Chicago)", "Lights Out (1946)", "Walt Disney Presents", "Family Classics", 
            "ABC's Wide World of Entertainment", "WGN Presents", "Politically Incorrect", "Soundstage",
            "Chelsea Lately", "AM Chicago", "Soul Train", "Cops", "PM Magazine (Chicago)", "Mister Rogers' Neighborhood",
            "After Midnight", "Friday Night Videos", "Later with Bob Costas", "Hee Haw", "The Wonderful World of Disney (1961)",
            "Mystery!", "Power Rangers", "Craig of the Creek", "Unsolved Mysteries (1987)", "The Real World", "Modern Marvels", 
            "The Late Show Starring Joan Rivers", "Creature Features", "Garfield and Friends", "The Smurfs", "Beyblade", "How It's Made",
            "Animaniacs (1993)", "Forensic Files", "Mike & Maty", "The E! True Hollywood Story", "Are You Smarter Than a 5th Grader? (2007)",
            "@Midnight", "WGN Sunday Matinee", "Later with Greg Kinnear", "True Life", "World Poker Tour", "Trading Spaces", "What Not to Wear (US)",
            "Cash Cab", "Anderson", "The Bonnie Hunt Show (2008)", "MadTV", "Studio One", "The Philco Television Playhouse",
            "Lux Video Theatre", "Suspense (1949)", "Robert Montgomery Presents", "The Jack Benny Program", "Actors Studio",
            "The Ford Television Theatre"
            ]
expanded_shows = expanded_shows[~expanded_shows['title'].isin(outliers)]

top_show_per_year = expanded_shows.sort_values(
    ['year', 'episodes'], ascending=[True, False]
).groupby('year').head(1)

print(top_show_per_year[['year', 'title', 'episodes']].sort_values('year').to_string())

       year                              title  episodes
6725   1949             The Lone Ranger (1949)     221.0
6725   1950             The Lone Ranger (1949)     221.0
3220   1951                     Dragnet (1951)     276.0
2865   1952                  Death Valley Days     452.0
2865   1953                  Death Valley Days     452.0
6312   1954                      Lassie (1954)     591.0
4753   1955                           Gunsmoke     635.0
4753   1956                           Gunsmoke     635.0
4753   1957                           Gunsmoke     635.0
4753   1958                           Gunsmoke     635.0
4753   1959                           Gunsmoke     635.0
4753   1960                           Gunsmoke     635.0
4753   1961                           Gunsmoke     635.0
4753   1962                           Gunsmoke     635.0
4753   1963                           Gunsmoke     635.0
4753   1964                           Gunsmoke     635.0
4753   1965                    

We need to create an empty table to track the "influence" each show has had on baby names during that year. We will fill in the top 5 character names per show.

In [36]:
influence = top_show_per_year[['title', 'year']].copy()
influence['character_names'] = "" 
influence.head()

Unnamed: 0,title,year,character_names
6725,The Lone Ranger (1949),1949,
6725,The Lone Ranger (1949),1950,
3220,Dragnet (1951),1951,
2865,Death Valley Days,1952,
2865,Death Valley Days,1953,


Now, we need to prepare the baby name data by using within-year ranks.

In [10]:
all_names['rank'] = all_names.groupby('year')['count'].rank(ascending=False, method='dense')
all_names.head()

Unnamed: 0,name,sex,count,year,rank
0,John,M,9655,1880,1.0
1,William,M,9532,1880,2.0
2,Mary,F,7065,1880,3.0
3,James,M,5927,1880,4.0
4,Charles,M,5348,1880,5.0


Finally, we will merge each name/year row with the show of that year to join these two datasets.